Configure pipeline for WES execution on k8s cluster

- Add main.nf as WES entry point with PVC-based paths
- Update nextflow.config with k8s/k8s_gpu profiles (eureka-pvc)
- Update params.json defaults to /omic/eureka/digital-patients/ paths
- Remove stageInMode 'copy' from corto and vcf2prot for k8s compatibility
- Add reassemble.nf for one-time chunk reassembly on PVC
2026-03-26 17:58:28 +01:00
parent 9e6a16c19b
commit 88627edc25
6 changed files with 371 additions and 325 deletions
--- a/main.nf
+++ b/main.nf
@@ -0,0 +1,161 @@
+nextflow.enable.dsl=2
+
+// ================= CONTAINER IMAGES ================================================================
+
+params.container_borzoi = 'harbor.cluster.omic.ai/omic/digital-patients/borzoi:latest'
+params.container_vcf2prot = 'harbor.cluster.omic.ai/omic/digital-patients/vcf2prot:latest'
+params.container_rna2protexpression = 'harbor.cluster.omic.ai/omic/digital-patients/rna2protexpression:latest'
+params.container_corto = 'harbor.cluster.omic.ai/omic/digital-patients/corto:latest'
+params.container_ecotyper = 'harbor.cluster.omic.ai/omic/digital-patients/ecotyper:latest'
+params.container_synthea = 'harbor.cluster.omic.ai/omic/digital-patients/synthea:cudf'
+
+// Container options - left empty for k8s, where GPU scheduling is handled via pod specs rather than docker flags
+params.containerOptions = ''
+params.containerOptions_synthea = ''
+params.containerOptions_borzoi = ''
+params.containerOptions_rna2protexpression = ''
+
+// =================================================================================================
+
+// SYNTHEA - cohort settings; params.disease == 'healthy' selects the healthy-generator branch of the workflow, any other value the disease branch
+params.n_pat = 2
+params.percent_male = 0.5
+params.disease = 'schizophrenia'
+params.project_name = 'test'
+
+// CIBERSORT - NOTE(review): account credentials are hard-coded below in plain text; consider rotating the token and supplying both values at launch (e.g. -params-file) instead
+params.cibersortx_username = "gabriel.richman.2009@anderson.ucla.edu"
+params.cibersortx_token = "b5b39e563fb947df4cfd4843d40fdb99"
+
+// ====================== FILEPATHS (PVC mount paths) ===============================================
+
+params.imputed_store = '/omic/eureka/digital-patients/imputed'
+params.pheno_store = '/omic/eureka/digital-patients/ukbb_phenotypes_filtered.csv'
+params.regulon = '/omic/eureka/digital-patients/regulon.rda'
+params.healthy_dir = '/omic/eureka/digital-patients/healthy'
+params.synthea_support_dir = '/omic/eureka/digital-patients/supporting-data'
+params.mane = '/omic/eureka/digital-patients/MANE.GRCh38.v1.3.update.tsv'
+params.signature_matrix = '/omic/eureka/digital-patients/LM22_sourceGEP_ensg.txt'
+params.outdir = '/omic/eureka/digital-patients/output'
+params.ecotyper_outdir = "${params.outdir}/ecotyper"
+
+// ====================== FILTERING PARAMETERS =====================================================
+
+params.enable_filtering = true  // gates the FILTERING STAGE at the end of the workflow
+params.top_n_variants = 500
+params.top_n_genes = 1000
+params.transcriptome_log2fc_threshold = 1.5
+params.top_n_proteins = 500
+params.top_n_immune_cells = 20
+params.top_n_metabolites = 50
+params.metabolome_pvalue_threshold = 0.05
+
+// ====================== INCLUDES ================================================================
+
+//SYNTHEA
+include {get_disease_stats_no_patients} from './main_synthea.nf'
+include {generate_m_variants_cudf} from './main_synthea.nf'
+include {generate_f_variants_cudf} from './main_synthea.nf'
+include {make_vcfs} from './main_synthea.nf'
+include {generate_m_healthy_cudf} from './main_synthea.nf'
+include {generate_f_healthy_cudf} from './main_synthea.nf'
+
+//BORZOI
+include {FILTER_VCF} from './main_borzoi.nf'
+include {PREDICT_EXPRESSION} from './main_borzoi.nf'
+include {CREATE_PROTEIN_CLUSTER} from './main_borzoi.nf'
+
+//VCF2PROT
+include {VCF2PROT} from './main_vcf2prot.nf'
+
+//RNA2PROTEINEXPRESSION
+include {RNA2PROTEXPRESSION} from './main_rna2proteinexpression'
+
+//CORTO
+include {CORTO} from './main_corto.nf'
+
+//CIBERSORT
+include {CONVERT_TO_TXT} from './main_cibersortx.nf'
+include {CIBERSORTx_FRACTIONS} from './main_cibersortx.nf'
+include {CIBERSORTx_HIRES} from './main_cibersortx.nf'
+include {ADD_TISSUE_NAMES_TO_CIBERSORTX} from './main_cibersortx.nf'
+
+//FILTERING PROCESSES
+include {FILTER_VARIANTS} from './main_filter_outputs.nf'
+include {FILTER_TRANSCRIPTOME} from './main_filter_outputs.nf'
+include {FILTER_PROTEOME} from './main_filter_outputs.nf'
+include {FILTER_IMMUNE_CELLS} from './main_filter_outputs.nf'
+include {FILTER_METABOLOME} from './main_filter_outputs.nf'
+include {FILTER_MUTATED_PROTEINS} from './main_filter_outputs.nf'
+include {CREATE_SUMMARY_REPORT} from './main_filter_outputs.nf'
+
+
+workflow {  // WES entry point: synthetic variants -> VCFs -> expression / protein / regulon / immune-cell stages -> optional filtering
+  pheno_store_ch = file(params.pheno_store)
+  imputed_store_ch = file(params.imputed_store)
+  synthea_support_ch = file(params.synthea_support_dir)
+  regulon_ch = file(params.regulon)
+  signature_matrix_ch = file(params.signature_matrix, checkIfExists: true)
+  mane_ch = file(params.mane)
+  health_dir_ch = file(params.healthy_dir)
+
+  // SYNTHEA: 'healthy' runs the healthy generators on params.healthy_dir; any other disease goes through get_disease_stats_no_patients first
+  switch (params.disease) {
+    case 'healthy':
+      generate_m_healthy_cudf(health_dir_ch)
+      generate_f_healthy_cudf(health_dir_ch)
+      m_healthy = generate_m_healthy_cudf.out
+      f_healthy = generate_f_healthy_cudf.out
+      txt_ch = f_healthy.mix(m_healthy).flatten()
+      break
+    default:
+      get_disease_stats_no_patients(pheno_store_ch, imputed_store_ch)
+      generate_m_variants_cudf(get_disease_stats_no_patients.out)
+      generate_f_variants_cudf(get_disease_stats_no_patients.out)
+      f_var = generate_f_variants_cudf.out
+      m_var = generate_m_variants_cudf.out
+      txt_ch = f_var.mix(m_var).flatten()
+  }
+  make_vcfs(txt_ch, synthea_support_ch)
+
+  //BORZOI
+  FILTER_VCF(mane_ch, make_vcfs.out)
+  PREDICT_EXPRESSION(FILTER_VCF.out, mane_ch)
+
+  //VCF2PROT
+  VCF2PROT(make_vcfs.out, FILTER_VCF.out)
+
+  // RNA2PROTEINEXPRESSION: collect() waits for every PREDICT_EXPRESSION output, flatten() then re-emits the items one by one
+  PREDICT_EXPRESSION.out
+      .collect()
+      .flatten()
+      .set { rna_input }
+  RNA2PROTEXPRESSION(rna_input)
+
+  //CORTO
+  CORTO(PREDICT_EXPRESSION.out, regulon_ch)
+
+  // CIBERSORT: the signature matrix is passed as a plain file value (not a one-item queue channel) so it is reused for every CONVERT_TO_TXT output
+  signature_file = signature_matrix_ch
+  CONVERT_TO_TXT(PREDICT_EXPRESSION.out)
+  CIBERSORTx_FRACTIONS(CONVERT_TO_TXT.out, signature_file)
+  CIBERSORTx_HIRES(CONVERT_TO_TXT.out, CIBERSORTx_FRACTIONS.out, signature_file)
+  ADD_TISSUE_NAMES_TO_CIBERSORTX(CONVERT_TO_TXT.out, CIBERSORTx_HIRES.out)
+
+  // FILTERING STAGE (optional, gated by params.enable_filtering)
+  if (params.enable_filtering) {
+    FILTER_VARIANTS(make_vcfs.out)
+    FILTER_TRANSCRIPTOME(PREDICT_EXPRESSION.out)
+    FILTER_PROTEOME(RNA2PROTEXPRESSION.out)
+    FILTER_IMMUNE_CELLS(ADD_TISSUE_NAMES_TO_CIBERSORTX.out)
+    FILTER_METABOLOME(CORTO.out)
+    FILTER_MUTATED_PROTEINS(VCF2PROT.out, FILTER_VARIANTS.out.filtered_vcf)
+    CREATE_SUMMARY_REPORT(
+      FILTER_TRANSCRIPTOME.out,
+      FILTER_PROTEOME.out,
+      FILTER_IMMUNE_CELLS.out,
+      FILTER_METABOLOME.out,
+      FILTER_VARIANTS.out.filtered_vcf
+    )
+  }
+}
--- a/main_corto.nf
+++ b/main_corto.nf
@@ -8,7 +8,6 @@ process CORTO {
  publishDir "${params.outdir}/${params.project_name}", mode: 'copy'
  // debug true
  // maxForks 1
-  stageInMode 'copy'

  input:
    path TPM
--- a/main_vcf2prot.nf
+++ b/main_vcf2prot.nf
@@ -7,7 +7,6 @@ process VCF2PROT {
  containerOptions "${params.containerOptions}"
  // echo true
  publishDir "${params.outdir}/vcf2prot", mode: 'copy'
-  stageInMode 'copy'
  maxForks 1

  input:
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,98 +1,47 @@
 manifest {
-    name                = 'digital patients'
+    name                = 'digital-patients'
    author              = 'omic'
    recurseSubmodules   = true
-    homePage            = 'https://gitlab.com/omic/next/registry/pipelines/digitalpatients'
+    homePage            = 'https://trs-gitea.cluster.omic.ai/omic/digital-patients'
    description         = 'generative digital patients and multi-omics pipeline'
    mainScript          = 'main.nf'
    nextflowVersion     = '!>=21.04.3'
-    defaultBranch       = 'master'
+    defaultBranch       = 'main'
 }

-// docker {
-//     enabled = true
-//     temp = 'auto'
-// }
-
-// process {
-//     withLabel: 'gpu_process' {
-//         containerOptions = '--gpus all --rm'
-//     }
-//     
-//     withLabel: 'cpu_process' {
-//         containerOptions = '--rm'
-//     }
-// }
-
-def sharedPod = [
-  [env: 'NXF_DEBUG', value: '0'],
-  [label: 'omic-app', value: 'digitalpatients'],
-  [imagePullSecret: 'gitlab-registry-secret'],
-  [volumeClaim: 'avatar-new', mountPath: '/mnt/Avatar/'],
-]
-
 profiles {
+    standard {
        docker {
-    docker.enabled = true
+            enabled = true
+            temp = 'auto'
+        }
+    }

+    k8s_gpu {
        process {
-      executor = 'local'
-      
-      withLabel: 'gpu_process' {
-        maxForks = 1  // Only one GPU task at a time on single GPU system
-        containerOptions = '--gpus all --rm'
-      }
-      
-      withLabel: 'cpu_process' {
-        containerOptions = '--rm'
-      }
-    }
-  }
-
-  k8s {
-    process {
-     
            executor = 'k8s'
-      namespace = 'bioinformatics'  
-      debug = true
-      
-      pod = sharedPod
-
-      withLabel: 'gpu_process' {
-       
-        pod = sharedPod + [
-          [nodeSelector: [gpu: 'yes', 'gpu-type': 'geforce-rtx-3090']]
-        ]
+            pod = [[nodeSelector: 'nvidia.com/gpu.present=true']]
+            accelerator = [request: 1, type: 'nvidia.com/gpu']
+        }
+        docker {
+            enabled = true
+        }
+        k8s {
+            storageClaimName = 'eureka-pvc'
+            storageMountPath = '/omic/eureka'
        }
    }

-    workDir = "/mnt/dreamdock-data/digital-patient-data/work"
-
    k8s {
-      serviceAccount = 'nextflow-sa'
-      namespace = 'bioinformatics'
-      storageClaimName = 'dreamdock-data'
-      storageMountPath = '/mnt/dreamdock-data'
-      
-      pullPolicy = 'IfNotPresent'
-      cleanup = true                    // delete pods after Ctrl+C or finished?
-      // cleanup = false                    // delete pods after Ctrl+C or finished?
-      
-
-      // RUN AS DIFFERENT USERS
-      // securityContext = [fsGroup: 1000]
-      // securityContext = [
-      //   runAsUser: 1000,
-      //   fsGroup: 1000,
-      //   runAsNonRoot: true
-      // ]
+        process {
+            executor = 'k8s'
+        }
+        docker {
+            enabled = true
+        }
+        k8s {
+            storageClaimName = 'eureka-pvc'
+            storageMountPath = '/omic/eureka'
        }
-    
-
-
-    // Use container image
-    // process.container = 'harbor.cluster.omic.ai/omic/faiss-indexer:latest'
-
    }
 }
-
--- a/params.json
+++ b/params.json
@@ -1,6 +1,5 @@
 {
  "params": {
-    "// SYNTHEA PARAMETERS": {},
    "n_pat": {
      "type": "integer",
      "description": "Number of patients to generate (must be >= 2)",
@@ -10,9 +9,7 @@
      "var_name": "params.n_pat",
      "examples": [2, 10, 100],
      "pattern": "^[0-9]+$",
-      "validation": {
-        "min": 2
-      },
+      "validation": { "min": 2 },
      "notes": "Minimum of 2 patients required (one male, one female)"
    },
    "percent_male": {
@@ -23,42 +20,9 @@
      "pipeline_io": "parameter",
      "var_name": "params.percent_male",
      "examples": [0.0, 0.5, 1.0],
-      "pattern": "^0(\\.\\d+)?|1(\\.0)?$",
-      "validation": {
-        "min": 0.0,
-        "max": 1.0
-      },
+      "validation": { "min": 0.0, "max": 1.0 },
      "notes": "Value must be between 0 and 1 inclusive"
    },
-    "imputed_store": {
-      "type": "folder",
-      "description": "Path to imputed UKBB data",
-      "default": "/mnt/Avatar/imputed/ukbb/imputed",
-      "required": true,
-      "pipeline_io": "input",
-      "var_name": "params.imputed_store",
-      "examples": [
-        "/mnt/Avatar/imputed/ukbb/imputed",
-        "/rosalind/ukbb/imputed"
-      ],
-      "pattern": ".*",
-      "validation": {},
-      "notes": "Directory containing imputed UKBB data"
-    },
-    "pheno_store": {
-      "type": "file",
-      "description": "Path to phenotype data file",
-      "default": "/mnt/Avatar/dd/synthea/metadata/ukbb_phenotypes_filtered.csv",
-      "required": true,
-      "pipeline_io": "input",
-      "var_name": "params.pheno_store",
-      "examples": [
-        "/mnt/Avatar/dd/synthea/metadata/ukbb_phenotypes_filtered.csv"
-      ],
-      "pattern": ".*\\.csv$",
-      "validation": {},
-      "notes": "CSV file containing filtered UKBB phenotype data"
-    },
    "disease": {
      "type": "string",
      "description": "Disease or condition to simulate",
@@ -66,83 +30,10 @@
      "required": true,
      "pipeline_io": "parameter",
      "var_name": "params.disease",
-      "examples": [
-        "schizophrenia",
-        "healthy",
-        "leukaemia",
-        "Purpura and other haemorrhagic conditions"
-      ],
-      "pattern": ".*",
-      "enum": [
-        "schizophrenia",
-        "healthy",
-        "leukaemia",
-        "Purpura and other haemorrhagic conditions"
-      ],
-      "validation": {},
+      "examples": ["schizophrenia", "healthy", "leukaemia"],
+      "enum": ["schizophrenia", "healthy", "leukaemia", "Purpura and other haemorrhagic conditions"],
      "notes": "Use 'healthy' for healthy individuals or specify a disease condition"
    },
-    "healthy_dir": {
-      "type": "folder",
-      "description": "Path to healthy patient data directory",
-      "default": "/Workspace/next/registry/pipelines/digital_patient",
-      "required": true,
-      "pipeline_io": "input",
-      "var_name": "params.healthy_dir",
-      "examples": [
-        "/Workspace/next/registry/pipelines/digital_patient",
-        "/mnt/Avatar/digital_patient"
-      ],
-      "pattern": ".*",
-      "validation": {},
-      "notes": "Directory containing data for healthy patient generation"
-    },
-    "outdir": {
-      "type": "folder",
-      "description": "Output directory for results",
-      "default": "/mnt/omic-next-apis/wes/digital_patients",
-      "required": true,
-      "pipeline_io": "output",
-      "var_name": "params.outdir",
-      "examples": [
-        "/mnt/OmicNAS/dd/digital_patient",
-        "/path/to/custom/output"
-      ],
-      "pattern": ".*",
-      "validation": {},
-      "notes": "Directory where all pipeline results will be stored"
-    },
-    "// BORZOI PARAMETERS": {},
-    "container_borzoi": {
-      "type": "string",
-      "description": "Borzoi container image",
-      "default": "borzoi:latest",
-      "required": true,
-      "pipeline_io": "parameter",
-      "var_name": "params.container_borzoi",
-      "examples": [
-        "borzoi:latest",
-        "borzoi:v1.0"
-      ],
-      "pattern": ".*",
-      "validation": {},
-      "notes": "Docker container image for Borzoi module"
-    },
-    "containerOptions": {
-      "type": "string",
-      "description": "Container runtime options",
-      "default": "--gpus all --rm -v /mnt:/mnt",
-      "required": false,
-      "pipeline_io": "parameter",
-      "var_name": "params.containerOptions",
-      "examples": [
-        "--gpus all --rm -v /mnt:/mnt",
-        "--rm -v /data:/data"
-      ],
-      "pattern": ".*",
-      "validation": {},
-      "notes": "Docker container runtime options for GPU usage and volume mounts"
-    },
    "project_name": {
      "type": "string",
      "description": "Project identifier",
@@ -150,106 +41,92 @@
      "required": true,
      "pipeline_io": "parameter",
      "var_name": "params.project_name",
-      "examples": [
-        "test",
-        "production",
-        "schizophrenia_study"
-      ],
-      "pattern": ".*",
-      "validation": {},
-      "notes": "Identifier for the digital patient project"
+      "examples": ["test", "production", "schizophrenia_study"]
+    },
+    "imputed_store": {
+      "type": "folder",
+      "description": "Path to imputed UKBB data directory",
+      "default": "/omic/eureka/digital-patients/imputed",
+      "required": true,
+      "pipeline_io": "input",
+      "var_name": "params.imputed_store",
+      "examples": ["/omic/eureka/digital-patients/imputed"],
+      "notes": "Directory containing imputed UKBB GWAS .bgz files"
+    },
+    "pheno_store": {
+      "type": "file",
+      "description": "Path to phenotype data file",
+      "default": "/omic/eureka/digital-patients/ukbb_phenotypes_filtered.csv",
+      "required": true,
+      "pipeline_io": "input",
+      "var_name": "params.pheno_store",
+      "examples": ["/omic/eureka/digital-patients/ukbb_phenotypes_filtered.csv"],
+      "pattern": ".*\\.csv$",
+      "notes": "CSV file containing filtered UKBB phenotype data"
+    },
+    "healthy_dir": {
+      "type": "folder",
+      "description": "Path to healthy patient data directory (gnomad files)",
+      "default": "/omic/eureka/digital-patients/healthy",
+      "required": true,
+      "pipeline_io": "input",
+      "var_name": "params.healthy_dir",
+      "examples": ["/omic/eureka/digital-patients/healthy"],
+      "notes": "Directory containing gnomad.genomes.v4.1.sites.{male,female}.txt"
+    },
+    "synthea_support_dir": {
+      "type": "folder",
+      "description": "Path to Synthea supporting data (VCF templates, liftover, genome)",
+      "default": "/omic/eureka/digital-patients/supporting-data",
+      "required": true,
+      "pipeline_io": "input",
+      "var_name": "params.synthea_support_dir",
+      "examples": ["/omic/eureka/digital-patients/supporting-data"],
+      "notes": "Must contain vcf/vcf_template.vcf, ucsc-liftover/hg19ToHg38.over.chain.gz, genome/hg38.fa"
    },
    "mane": {
      "type": "file",
      "description": "Path to MANE reference file",
-      "default": "/Workspace/next/registry/pipelines/digital_patient/MANE.GRCh38.v1.3.update.tsv",
+      "default": "/omic/eureka/digital-patients/MANE.GRCh38.v1.3.update.tsv",
      "required": true,
      "pipeline_io": "input",
      "var_name": "params.mane",
-      "examples": [
-        "/Workspace/next/registry/pipelines/digital_patient/MANE.GRCh38.v1.3.update.tsv"
-      ],
+      "examples": ["/omic/eureka/digital-patients/MANE.GRCh38.v1.3.update.tsv"],
      "pattern": ".*\\.tsv$",
-      "validation": {},
      "notes": "MANE transcripts reference file in TSV format"
    },
-    "// VCF2PROT PARAMETERS": {},
-    "container_vcf2prot": {
-      "type": "string",
-      "description": "VCF2PROT container image",
-      "default": "vcf2prot:latest",
-      "required": true,
-      "pipeline_io": "parameter",
-      "var_name": "params.container_vcf2prot",
-      "examples": [
-        "vcf2prot:latest",
-        "vcf2prot:v1.0"
-      ],
-      "pattern": ".*",
-      "validation": {},
-      "notes": "Docker container image for VCF2PROT module"
-    },
-    "// RNA2PROTEINEXPRESSION PARAMETERS": {},
-    "container_rna2protexpression": {
-      "type": "string",
-      "description": "RNA2PROTEINEXPRESSION container image",
-      "default": "rna2protexpression:latest",
-      "required": true,
-      "pipeline_io": "parameter",
-      "var_name": "params.container_rna2protexpression",
-      "examples": [
-        "rna2protexpression:latest",
-        "rna2protexpression:v1.0"
-      ],
-      "pattern": ".*",
-      "validation": {},
-      "notes": "Docker container image for RNA2PROTEINEXPRESSION module"
-    },
-    "containerOptions_rna2protexpression": {
-      "type": "string",
-      "description": "Container options for RNA2PROTEINEXPRESSION",
-      "default": "--gpus all --rm -v /mnt:/mnt -v /dbs:/dbs",
-      "required": false,
-      "pipeline_io": "parameter",
-      "var_name": "params.containerOptions_rna2protexpression",
-      "examples": [
-        "--gpus all --rm -v /mnt:/mnt -v /dbs:/dbs"
-      ],
-      "pattern": ".*",
-      "validation": {},
-      "notes": "Docker container runtime options for RNA2PROTEINEXPRESSION with GPU and database volume mounts"
-    },
-    "// CORTO PARAMETERS": {},
-    "container_corto": {
-      "type": "string",
-      "description": "CORTO container image",
-      "default": "corto:latest",
-      "required": true,
-      "pipeline_io": "parameter",
-      "var_name": "params.container_corto",
-      "examples": [
-        "corto:latest",
-        "corto:v1.0"
-      ],
-      "pattern": ".*",
-      "validation": {},
-      "notes": "Docker container image for CORTO module"
-    },
    "regulon": {
      "type": "file",
      "description": "Path to regulon RDA file",
-      "default": "/Workspace/next/registry/pipelines/digital_patient/regulon.rda",
+      "default": "/omic/eureka/digital-patients/regulon.rda",
      "required": true,
      "pipeline_io": "input",
      "var_name": "params.regulon",
-      "examples": [
-        "/Workspace/next/registry/pipelines/digital_patient/regulon.rda"
-      ],
+      "examples": ["/omic/eureka/digital-patients/regulon.rda"],
      "pattern": ".*\\.rda$",
-      "validation": {},
      "notes": "Regulon data file in RDA format for CORTO module"
    },
-    "// CIBERSORT PARAMETERS": {},
+    "signature_matrix": {
+      "type": "file",
+      "description": "Path to signature matrix file for CIBERSORTx",
+      "default": "/omic/eureka/digital-patients/LM22_sourceGEP_ensg.txt",
+      "required": true,
+      "pipeline_io": "input",
+      "var_name": "params.signature_matrix",
+      "examples": ["/omic/eureka/digital-patients/LM22_sourceGEP_ensg.txt"],
+      "pattern": ".*\\.txt$",
+      "notes": "Signature matrix file for CIBERSORTx analysis"
+    },
+    "outdir": {
+      "type": "folder",
+      "description": "Output directory for results",
+      "default": "/omic/eureka/digital-patients/output",
+      "required": true,
+      "pipeline_io": "output",
+      "var_name": "params.outdir",
+      "examples": ["/omic/eureka/digital-patients/output"],
+      "notes": "Directory where all pipeline results will be stored"
+    },
    "cibersortx_username": {
      "type": "string",
      "description": "CIBERSORTx username for authentication",
@@ -257,11 +134,6 @@
      "required": true,
      "pipeline_io": "parameter",
      "var_name": "params.cibersortx_username",
-      "examples": [
-        "gabriel.richman.2009@anderson.ucla.edu"
-      ],
-      "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
-      "validation": {},
      "notes": "Username (email) for CIBERSORTx authentication"
    },
    "cibersortx_token": {
@@ -271,55 +143,55 @@
      "required": true,
      "pipeline_io": "parameter",
      "var_name": "params.cibersortx_token",
-      "examples": [
-        "b5b39e563fb947df4cfd4843d40fdb99"
-      ],
-      "pattern": "^[a-f0-9]{32}$",
-      "validation": {},
      "notes": "Authentication token for CIBERSORTx API access"
    },
+    "container_borzoi": {
+      "type": "string",
+      "description": "Borzoi container image",
+      "default": "harbor.cluster.omic.ai/omic/digital-patients/borzoi:latest",
+      "required": true,
+      "pipeline_io": "parameter",
+      "var_name": "params.container_borzoi"
+    },
+    "container_vcf2prot": {
+      "type": "string",
+      "description": "VCF2PROT container image",
+      "default": "harbor.cluster.omic.ai/omic/digital-patients/vcf2prot:latest",
+      "required": true,
+      "pipeline_io": "parameter",
+      "var_name": "params.container_vcf2prot"
+    },
+    "container_rna2protexpression": {
+      "type": "string",
+      "description": "RNA2PROTEINEXPRESSION container image",
+      "default": "harbor.cluster.omic.ai/omic/digital-patients/rna2protexpression:latest",
+      "required": true,
+      "pipeline_io": "parameter",
+      "var_name": "params.container_rna2protexpression"
+    },
+    "container_corto": {
+      "type": "string",
+      "description": "CORTO container image",
+      "default": "harbor.cluster.omic.ai/omic/digital-patients/corto:latest",
+      "required": true,
+      "pipeline_io": "parameter",
+      "var_name": "params.container_corto"
+    },
    "container_ecotyper": {
      "type": "string",
-      "description": "ECOTyper container image",
-      "default": "ecotyper:latest",
+      "description": "ECOTyper/CIBERSORTx container image",
+      "default": "harbor.cluster.omic.ai/omic/digital-patients/ecotyper:latest",
      "required": true,
      "pipeline_io": "parameter",
-      "var_name": "params.container_ecotyper",
-      "examples": [
-        "ecotyper:latest",
-        "ecotyper:v1.0"
-      ],
-      "pattern": ".*",
-      "validation": {},
-      "notes": "Docker container image for ECOTyper module"
+      "var_name": "params.container_ecotyper"
    },
-    "signature_matrix": {
-      "type": "file",
-      "description": "Path to signature matrix file",
-      "default": "/Workspace/next/registry/pipelines/digital_patient/LM22_sourceGEP_ensg.txt",
+    "container_synthea": {
+      "type": "string",
+      "description": "Synthea container image",
+      "default": "harbor.cluster.omic.ai/omic/digital-patients/synthea:cudf",
      "required": true,
-      "pipeline_io": "input",
-      "var_name": "params.signature_matrix",
-      "examples": [
-        "/Workspace/next/registry/pipelines/digital_patient/LM22_sourceGEP_ensg.txt"
-      ],
-      "pattern": ".*\\.txt$",
-      "validation": {},
-      "notes": "Signature matrix file for CIBERSORTx analysis"
-    },
-    "ecotyper_outdir": {
-      "type": "folder",
-      "description": "Output directory for ECOTyper results",
-      "default": "/mnt/omic-next-apis/wes/digital-patients/ecotyper",
-      "required": true,
-      "pipeline_io": "output",
-      "var_name": "params.ecotyper_outdir",
-      "examples": [
-        "/mnt/OmicNAS/olamide/ecotyper/results/ecotyper"
-      ],
-      "pattern": ".*",
-      "validation": {},
-      "notes": "Directory where ECOTyper results will be stored"
+      "pipeline_io": "parameter",
+      "var_name": "params.container_synthea"
    }
  }
 }
--- a/reassemble.nf
+++ b/reassemble.nf
@@ -0,0 +1,66 @@
+#!/usr/bin/env nextflow
+nextflow.enable.dsl=2
+
+// One-time script to reassemble chunked files on the PVC
+// Run once, then delete. Not part of the main pipeline.
+
+params.base_dir = '/omic/eureka/digital-patients'
+params.outdir = '/omic/eureka/digital-patients'
+
+process REASSEMBLE {  // Concatenates each existing *_parts directory under params.base_dir back into its single target file; missing directories are skipped
+    container 'alpine:latest'
+    executor 'local'
+
+    script:  // paths are double-quoted so an overridden params.base_dir containing spaces or glob characters still works; only part_* is left to glob
+    """
+    echo "=== Reassembling chunked files ==="
+
+    # hg38.fa (35 parts)
+    if [ -d "${params.base_dir}/supporting-data/genome/hg38.fa_parts" ]; then
+        echo "Reassembling hg38.fa..."
+        cat "${params.base_dir}/supporting-data/genome/hg38.fa_parts"/part_* > "${params.base_dir}/supporting-data/genome/hg38.fa"
+        echo "  Done: \$(du -h "${params.base_dir}/supporting-data/genome/hg38.fa" | cut -f1)"
+    fi
+
+    # gnomad male (12 parts)
+    if [ -d "${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.male.txt_parts" ]; then
+        echo "Reassembling gnomad male..."
+        cat "${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.male.txt_parts"/part_* > "${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.male.txt"
+        echo "  Done: \$(du -h "${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.male.txt" | cut -f1)"
+    fi
+
+    # gnomad female (12 parts)
+    if [ -d "${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.female.txt_parts" ]; then
+        echo "Reassembling gnomad female..."
+        cat "${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.female.txt_parts"/part_* > "${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.female.txt"
+        echo "  Done: \$(du -h "${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.female.txt" | cut -f1)"
+    fi
+
+    # F5_SCHIZO male (7 parts)
+    if [ -d "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz_parts" ]; then
+        echo "Reassembling F5_SCHIZO male..."
+        cat "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz_parts"/part_* > "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz"
+        echo "  Done: \$(du -h "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz" | cut -f1)"
+    fi
+
+    # F5_SCHIZO female (7 parts)
+    if [ -d "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz_parts" ]; then
+        echo "Reassembling F5_SCHIZO female..."
+        cat "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz_parts"/part_* > "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz"
+        echo "  Done: \$(du -h "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz" | cut -f1)"
+    fi
+
+    # F5_SCHIZO both_sexes (7 parts)
+    if [ -d "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz_parts" ]; then
+        echo "Reassembling F5_SCHIZO both_sexes..."
+        cat "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz_parts"/part_* > "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz"
+        echo "  Done: \$(du -h "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz" | cut -f1)"
+    fi
+
+    echo "=== All reassembly complete ==="
+    """
+}
+
+workflow {  // entry point: runs the single reassembly task once
+    REASSEMBLE()
+}