Initial commit: digital-patients pipeline (clean, no large files)

Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
This commit is contained in:
2026-03-26 15:15:23 +01:00
commit 9e6a16c19b
45 changed files with 7207 additions and 0 deletions

15
k8s/README.md Normal file
View File

@@ -0,0 +1,15 @@
# access the current workspace
# (exec a bash shell in the running digital-patient-nextflow pod; the pod name is
# resolved by label selector, filtered to Running pods)
kubectl exec -it -n bioinformatics $(kubectl get pod -l app=digital-patient-nextflow -n bioinformatics | grep Run | awk '{ print $1 }') -- bash
# inside the workspace pod: launch the pipeline with the k8s profile
nextflow run test.nf -profile k8s
# DEPRECATE: cleanup error pods in bioinformatics
# (bulk-deletes Pending/Error pods, 10 deletions in parallel)
kubectl get pod -n bioinformatics | grep -E "Pending|Error" | awk '{print $1}' | xargs -P 10 -I {} kubectl delete pod -n bioinformatics {}
# sync data (/data/bugra/similarity-search/utility/vec_db/nf_fingerptint) from node to workspace
# (delete-then-apply re-runs the copy Job; a completed Job cannot be re-applied in place)
kubectl delete -f k8s/job-copy-node-to-pvc.yaml; kubectl apply -f k8s/job-copy-node-to-pvc.yaml
# run the actual nextflow jobs
kubectl apply -f k8s/job-nextflow-digital-patient.yaml
# [WARNING] destroy the current nextflow job
kubectl delete -f k8s/job-nextflow-digital-patient.yaml

View File

@@ -0,0 +1,72 @@
---
# Long-running workspace Deployment: keeps a Nextflow container alive
# (`sleep infinity`) so engineers can `kubectl exec` in and launch runs by hand.
# Fixes vs. exported original: restored YAML indentation, dropped the
# `creationTimestamp: null` kubectl-export artifact, and removed the deprecated
# `serviceAccount` field (superseded by `serviceAccountName`).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: digital-patient-nextflow
  namespace: bioinformatics
spec:
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: digital-patient-nextflow
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: digital-patient-nextflow
    spec:
      containers:
        - name: nextflow
          image: nextflow/nextflow:25.04.6
          imagePullPolicy: IfNotPresent
          # Idle forever; the pod is an interactive workspace, not a worker.
          command:
            - sleep
            - infinity
          workingDir: /mnt/dreamdock-data/digital-patient-data
          resources:
            limits:
              cpu: "2"
              memory: 4Gi
            requests:
              cpu: "1"
              memory: 2Gi
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - mountPath: /mnt/Avatar
              name: avatar-new-volume
            - mountPath: /mnt/dreamdock-data/
              name: dreamdock-volume
      dnsPolicy: ClusterFirst
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      # nextflow-sa must have RBAC to create pods for the k8s executor.
      serviceAccountName: nextflow-sa
      terminationGracePeriodSeconds: 30
      volumes:
        - name: avatar-new-volume
          persistentVolumeClaim:
            claimName: avatar-new  # CHANGE ME, RELATE TO nextflow.params.input
        - name: dreamdock-volume
          persistentVolumeClaim:
            claimName: dreamdock-data  # CHANGE ME, RELATE TO nextflow.params.input
# ---
# apiVersion: v1
# kind: PersistentVolumeClaim
# metadata:
#   name: digital-patient-data
#   namespace: bioinformatics
# spec:
#   accessModes:
#     - ReadWriteMany
#   resources:
#     requests:
#       storage: 6000Gi
#   storageClassName: truenas-nfs

18
k8s/gpu-test.yaml Normal file
View File

@@ -0,0 +1,18 @@
---
# One-shot smoke test for the NVIDIA device plugin / container runtime:
# schedules onto an RTX 3090 node and runs `nvidia-smi` once.
apiVersion: v1
kind: Pod
metadata:
  name: gpu-test-pod
  namespace: bioinformatics
spec:
  restartPolicy: Never
  nodeSelector:
    # Quoted deliberately — a bare `yes` would parse as boolean true.
    gpu: 'yes'
    gpu-type: 'geforce-rtx-3090'
  containers:
    - name: cuda-test
      # Standard, lightweight NVIDIA CUDA image to exercise the runtime.
      image: nvidia/cuda:11.8.0-base-ubuntu22.04
      command: ["nvidia-smi"]
      resources:
        limits:
          # Requesting a GPU is what triggers the device-plugin injection.
          nvidia.com/gpu: 1

View File

@@ -0,0 +1,54 @@
---
# One-shot Job: rsync the dataset from a hostPath on k8s-node23 into the
# dreamdock-data PVC (excluding git history and Nextflow work dirs).
# Fixes vs. exported original: restored YAML indentation, and `set -eu` so the
# script aborts on the first failure (previously a failed `apk add` or `mkdir`
# did not stop the script, and the Job's status reflected only the last command).
apiVersion: batch/v1
kind: Job
metadata:
  name: job-data-copy-digital-patient-node-to-pvc
  namespace: bioinformatics
spec:
  backoffLimit: 6
  completionMode: NonIndexed
  completions: 1
  manualSelector: false
  parallelism: 1
  suspend: false
  template:
    spec:
      containers:
        - name: data-copy
          image: alpine
          imagePullPolicy: Always
          command:
            - /bin/sh
            - -c
          args:
            - |
              set -eu
              apk add --no-cache rsync
              mkdir -p /target
              rsync -av --exclude .git/ --exclude .git-old/ --exclude work/ /source/ /target/digital-patient-data
          resources: {}
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - mountPath: /source
              name: k8s-node
            - mountPath: /target
              name: pvc-volume
      dnsPolicy: ClusterFirst
      # Pin to the node that physically holds the source hostPath.
      nodeSelector:
        kubernetes.io/hostname: k8s-node23
      restartPolicy: Never
      schedulerName: default-scheduler
      securityContext: {}
      terminationGracePeriodSeconds: 30
      # The source node is tainted for the similarity-search workload.
      tolerations:
        - effect: NoExecute
          key: omic-app
          operator: Equal
          value: similarity-search
      volumes:
        - name: pvc-volume
          persistentVolumeClaim:
            claimName: dreamdock-data
        - name: k8s-node
          hostPath:
            path: /data/bugra/digital_patient  ## CHANGE ME
            type: ""

View File

@@ -0,0 +1,52 @@
---
# Batch Job that runs the digital-patient Nextflow pipeline non-interactively
# (the headless counterpart of the exec-into-workspace flow in k8s/README.md).
apiVersion: batch/v1
kind: Job
metadata:
  name: job-nextflow-digital-patient
  namespace: bioinformatics
spec:
  backoffLimit: 1
  completionMode: NonIndexed
  completions: 1
  manualSelector: false
  parallelism: 1
  podReplacementPolicy: TerminatingOrFailed
  suspend: false
  template:
    spec:
      containers:
        - name: nextflow
          image: nextflow/nextflow:25.04.6
          imagePullPolicy: IfNotPresent
          # Run the pipeline from the staged project directory on the PVC.
          command:
            - /bin/bash
            - -c
            - cd /mnt/dreamdock-data/digital-patient-data && nextflow run test.nf -profile k8s
          resources:
            limits:
              cpu: "4"
              memory: 8Gi
            requests:
              cpu: "2"
              memory: 4Gi
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - mountPath: /mnt/Avatar
              name: avatar-nas-volume
            - mountPath: /mnt/dreamdock-data/
              name: dreamdock-volume
      dnsPolicy: ClusterFirst
      restartPolicy: Never
      schedulerName: default-scheduler
      securityContext: {}
      serviceAccount: nextflow-sa
      # nextflow-sa must be allowed to spawn task pods for the k8s executor.
      serviceAccountName: nextflow-sa
      terminationGracePeriodSeconds: 30
      volumes:
        - name: avatar-nas-volume
          hostPath:
            path: /mnt/Avatar-NAS
            type: Directory
        - name: dreamdock-volume
          persistentVolumeClaim:
            claimName: dreamdock-data  # CHANGE ME, RELATE TO nextflow.params.input