From 88627edc254295c55c91bcb05e81ecbb425f2389 Mon Sep 17 00:00:00 2001 From: Olamide Isreal Date: Thu, 26 Mar 2026 17:58:28 +0100 Subject: [PATCH] Configure pipeline for WES execution on k8s cluster - Add main.nf as WES entry point with PVC-based paths - Update nextflow.config with k8s/k8s_gpu profiles (eureka-pvc) - Update params.json defaults to /omic/eureka/digital-patients/ paths - Remove stageInMode 'copy' from corto and vcf2prot for k8s compat - Add reassemble.nf for one-time chunk reassembly on PVC --- main.nf | 161 ++++++++++++++++++++++ main_corto.nf | 1 - main_vcf2prot.nf | 1 - nextflow.config | 115 +++++----------- params.json | 352 +++++++++++++++-------------------------------- reassemble.nf | 66 +++++++++ 6 files changed, 371 insertions(+), 325 deletions(-) create mode 100644 main.nf create mode 100644 reassemble.nf diff --git a/main.nf b/main.nf new file mode 100644 index 0000000..dc9ac33 --- /dev/null +++ b/main.nf @@ -0,0 +1,161 @@ +nextflow.enable.dsl=2 + +// ================= CONTAINER IMAGES ================================================================ + +params.container_borzoi = 'harbor.cluster.omic.ai/omic/digital-patients/borzoi:latest' // NOTE(review): the images below use mutable tags (:latest, :cudf); pin versions or digests if reproducible runs matter +params.container_vcf2prot = 'harbor.cluster.omic.ai/omic/digital-patients/vcf2prot:latest' +params.container_rna2protexpression = 'harbor.cluster.omic.ai/omic/digital-patients/rna2protexpression:latest' +params.container_corto = 'harbor.cluster.omic.ai/omic/digital-patients/corto:latest' +params.container_ecotyper = 'harbor.cluster.omic.ai/omic/digital-patients/ecotyper:latest' +params.container_synthea = 'harbor.cluster.omic.ai/omic/digital-patients/synthea:cudf' + +// Container options - empty for k8s (k8s handles GPU scheduling via pod specs) +params.containerOptions = '' // interpolated into the containerOptions directive of VCF2PROT (see main_vcf2prot.nf) +params.containerOptions_synthea = '' +params.containerOptions_borzoi = '' +params.containerOptions_rna2protexpression = '' + +// ================================================================================================= + +//
SYNTHEA +params.n_pat = 2 +params.percent_male = 0.5 +params.disease = 'schizophrenia' // 'healthy' selects the healthy-cohort branch of the workflow switch; any other value takes the disease branch +params.project_name = 'test' + +// CIBERSORT +params.cibersortx_username = "gabriel.richman.2009@anderson.ucla.edu" +params.cibersortx_token = System.getenv('CIBERSORTX_TOKEN') ?: '' // secret: never commit it; supply via --cibersortx_token, a params file, or the CIBERSORTX_TOKEN environment variable (and rotate the previously committed token) + +// ====================== FILEPATHS (PVC mount paths) =============================================== + +params.imputed_store = '/omic/eureka/digital-patients/imputed' +params.pheno_store = '/omic/eureka/digital-patients/ukbb_phenotypes_filtered.csv' +params.regulon = '/omic/eureka/digital-patients/regulon.rda' +params.healthy_dir = '/omic/eureka/digital-patients/healthy' +params.synthea_support_dir = '/omic/eureka/digital-patients/supporting-data' +params.mane = '/omic/eureka/digital-patients/MANE.GRCh38.v1.3.update.tsv' +params.signature_matrix = '/omic/eureka/digital-patients/LM22_sourceGEP_ensg.txt' +params.outdir = '/omic/eureka/digital-patients/output' +params.ecotyper_outdir = "${params.outdir}/ecotyper" + +// ====================== FILTERING PARAMETERS ===================================================== + +params.enable_filtering = true // gates the FILTERING STAGE block at the end of the workflow +params.top_n_variants = 500 +params.top_n_genes = 1000 +params.transcriptome_log2fc_threshold = 1.5 +params.top_n_proteins = 500 +params.top_n_immune_cells = 20 +params.top_n_metabolites = 50 +params.metabolome_pvalue_threshold = 0.05 + +// ====================== INCLUDES ================================================================ + +//SYNTHEA +include {get_disease_stats_no_patients} from './main_synthea.nf' +include {generate_m_variants_cudf} from './main_synthea.nf' +include {generate_f_variants_cudf} from './main_synthea.nf' +include {make_vcfs} from './main_synthea.nf' +include {generate_m_healthy_cudf} from './main_synthea.nf' +include {generate_f_healthy_cudf} from './main_synthea.nf' + +//BORZOI +include {FILTER_VCF} from './main_borzoi.nf' +include {PREDICT_EXPRESSION} from './main_borzoi.nf' +include {CREATE_PROTEIN_CLUSTER} from
'./main_borzoi.nf' + +//VCF2PROT +include {VCF2PROT} from './main_vcf2prot.nf' + +//RNA2PROTEINEXPRESSION +include {RNA2PROTEXPRESSION} from './main_rna2proteinexpression' // path omits the .nf suffix that the other includes spell out + +//CORTO +include {CORTO} from './main_corto.nf' + +//CIBERSORT +include {CONVERT_TO_TXT} from './main_cibersortx.nf' +include {CIBERSORTx_FRACTIONS} from './main_cibersortx.nf' +include {CIBERSORTx_HIRES} from './main_cibersortx.nf' +include {ADD_TISSUE_NAMES_TO_CIBERSORTX} from './main_cibersortx.nf' + +//FILTERING PROCESSES +include {FILTER_VARIANTS} from './main_filter_outputs.nf' +include {FILTER_TRANSCRIPTOME} from './main_filter_outputs.nf' +include {FILTER_PROTEOME} from './main_filter_outputs.nf' +include {FILTER_IMMUNE_CELLS} from './main_filter_outputs.nf' +include {FILTER_METABOLOME} from './main_filter_outputs.nf' +include {FILTER_MUTATED_PROTEINS} from './main_filter_outputs.nf' +include {CREATE_SUMMARY_REPORT} from './main_filter_outputs.nf' + + +workflow { // entry workflow: Synthea variant generation, then Borzoi, VCF2PROT, RNA2PROTEXPRESSION, CORTO, CIBERSORTx and optional filtering + pheno_store_ch = file(params.pheno_store) // despite the _ch suffix, these seven values are plain file objects returned by file(), not channels + imputed_store_ch = file(params.imputed_store) + synthea_support_ch = file(params.synthea_support_dir) + regulon_ch = file(params.regulon) + signature_matrix_ch = file(params.signature_matrix) + mane_ch = file(params.mane) + health_dir_ch = file(params.healthy_dir) + + //SYNTHEA + switch (params.disease) { + case 'healthy': // healthy cohort: the male and female generators both read healthy_dir + generate_m_healthy_cudf(health_dir_ch) + generate_f_healthy_cudf(health_dir_ch) + m_healthy = generate_m_healthy_cudf.out + f_healthy = generate_f_healthy_cudf.out + txt_ch = f_healthy.mix(m_healthy).flatten() // merge both sexes and emit each element as its own item + break + default: // any other disease value: compute stats from the phenotype and imputed stores, then generate variants per sex + get_disease_stats_no_patients(pheno_store_ch, imputed_store_ch) + generate_m_variants_cudf(get_disease_stats_no_patients.out) + generate_f_variants_cudf(get_disease_stats_no_patients.out) + f_var = generate_f_variants_cudf.out + m_var = generate_m_variants_cudf.out + txt_ch = f_var.mix(m_var).flatten() + } + make_vcfs(txt_ch, synthea_support_ch) + + //BORZOI + FILTER_VCF(mane_ch, make_vcfs.out) + PREDICT_EXPRESSION(FILTER_VCF.out,
mane_ch) + + //VCF2PROT + VCF2PROT(make_vcfs.out, FILTER_VCF.out) + + //RNA2PROTEINEXPRESSION + PREDICT_EXPRESSION.out + .collect() // waits for every prediction, then flatten re-emits the items one by one + .flatten() + .set { rna_input } + RNA2PROTEXPRESSION(rna_input) + + //CORTO + CORTO(PREDICT_EXPRESSION.out, regulon_ch) + + //CIBERSORT + signature_file = Channel.fromPath(signature_matrix_ch, checkIfExists: true).first() // value channel: a one-item queue channel would be exhausted after the first sample, so CIBERSORTx_FRACTIONS and CIBERSORTx_HIRES would run only once + CONVERT_TO_TXT(PREDICT_EXPRESSION.out) + CIBERSORTx_FRACTIONS(CONVERT_TO_TXT.out, signature_file) + CIBERSORTx_HIRES(CONVERT_TO_TXT.out, CIBERSORTx_FRACTIONS.out, signature_file) + ADD_TISSUE_NAMES_TO_CIBERSORTX(CONVERT_TO_TXT.out, CIBERSORTx_HIRES.out) + + // FILTERING STAGE + if (params.enable_filtering) { // optional post-processing; skipped entirely when enable_filtering is false + FILTER_VARIANTS(make_vcfs.out) + FILTER_TRANSCRIPTOME(PREDICT_EXPRESSION.out) + FILTER_PROTEOME(RNA2PROTEXPRESSION.out) + FILTER_IMMUNE_CELLS(ADD_TISSUE_NAMES_TO_CIBERSORTX.out) + FILTER_METABOLOME(CORTO.out) + FILTER_MUTATED_PROTEINS(VCF2PROT.out, FILTER_VARIANTS.out.filtered_vcf) + CREATE_SUMMARY_REPORT( + FILTER_TRANSCRIPTOME.out, + FILTER_PROTEOME.out, + FILTER_IMMUNE_CELLS.out, + FILTER_METABOLOME.out, + FILTER_VARIANTS.out.filtered_vcf + ) + } +} diff --git a/main_corto.nf b/main_corto.nf index 5c042e8..d029c78 100644 --- a/main_corto.nf +++ b/main_corto.nf @@ -8,7 +8,6 @@ process CORTO { publishDir "${params.outdir}/${params.project_name}", mode: 'copy' // debug true // maxForks 1 - stageInMode 'copy' input: path TPM diff --git a/main_vcf2prot.nf b/main_vcf2prot.nf index afbe508..f0ee7fc 100644 --- a/main_vcf2prot.nf +++ b/main_vcf2prot.nf @@ -7,7 +7,6 @@ process VCF2PROT { containerOptions "${params.containerOptions}" // echo true publishDir "${params.outdir}/vcf2prot", mode: 'copy' - stageInMode 'copy' maxForks 1 input: diff --git a/nextflow.config b/nextflow.config index 79c7888..84e42cf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,98 +1,47 @@ manifest { - name = 'digital patients' + name = 'digital-patients' author = 'omic' recurseSubmodules = true - homePage =
'https://gitlab.com/omic/next/registry/pipelines/digitalpatients' + homePage = 'https://trs-gitea.cluster.omic.ai/omic/digital-patients' description = 'generative digital patients and multi-omics pipeline' mainScript = 'main.nf' nextflowVersion = '!>=21.04.3' - defaultBranch = 'master' + defaultBranch = 'main' } -// docker { -// enabled = true -// temp = 'auto' -// } - -// process { -// withLabel: 'gpu_process' { -// containerOptions = '--gpus all --rm' -// } -// -// withLabel: 'cpu_process' { -// containerOptions = '--rm' -// } -// } - -def sharedPod = [ - [env: 'NXF_DEBUG', value: '0'], - [label: 'omic-app', value: 'digitalpatients'], - [imagePullSecret: 'gitlab-registry-secret'], - [volumeClaim: 'avatar-new', mountPath: '/mnt/Avatar/'], -] - profiles { - docker { - docker.enabled = true - - process { - executor = 'local' - - withLabel: 'gpu_process' { - maxForks = 1 // Only one GPU task at a time on single GPU system - containerOptions = '--gpus all --rm' - } - - withLabel: 'cpu_process' { - containerOptions = '--rm' - } - } - } - - k8s { - process { - - executor = 'k8s' - namespace = 'bioinformatics' - debug = true - - pod = sharedPod - - withLabel: 'gpu_process' { - - pod = sharedPod + [ - [nodeSelector: [gpu: 'yes', 'gpu-type': 'geforce-rtx-3090']] - ] - } + standard { // profile Nextflow applies when no -profile option is given; only enables Docker + docker { + enabled = true + temp = 'auto' + } } - workDir = "/mnt/dreamdock-data/digital-patient-data/work" + k8s_gpu { // Kubernetes profile that requests a GPU for every process + process { + executor = 'k8s' + pod = [[nodeSelector: 'nvidia.com/gpu.present=true']] // restrict task pods to nodes carrying this label + accelerator = [request: 1, type: 'nvidia.com/gpu'] // no withLabel selector, so the request applies to all processes + } + docker { + enabled = true + } + k8s { + storageClaimName = 'eureka-pvc' // PVC name; it is mounted at storageMountPath, matching the /omic/eureka paths in main.nf + storageMountPath = '/omic/eureka' + } + } k8s { - serviceAccount = 'nextflow-sa' - namespace = 'bioinformatics' - storageClaimName = 'dreamdock-data' - storageMountPath = '/mnt/dreamdock-data' - - pullPolicy = 'IfNotPresent' - cleanup = true // delete pods after Ctrl+C or finished? - // cleanup = false // delete pods after Ctrl+C or finished?
- - - // RUN AS DIFFERENT USERS - // securityContext = [fsGroup: 1000] - // securityContext = [ - // runAsUser: 1000, - // fsGroup: 1000, - // runAsNonRoot: true - // ] + process { + executor = 'k8s' // same as k8s_gpu but without the node selector and accelerator request + } + docker { + enabled = true + } + k8s { + storageClaimName = 'eureka-pvc' // same PVC and mount path as the k8s_gpu profile + storageMountPath = '/omic/eureka' + } } - - - - // Use container image - // process.container = 'harbor.cluster.omic.ai/omic/faiss-indexer:latest' - - } } - diff --git a/params.json b/params.json index 152470f..d438a4a 100644 --- a/params.json +++ b/params.json @@ -1,6 +1,5 @@ { "params": { - "// SYNTHEA PARAMETERS": {}, "n_pat": { "type": "integer", "description": "Number of patients to generate (must be >= 2)", @@ -10,9 +9,7 @@ "var_name": "params.n_pat", "examples": [2, 10, 100], "pattern": "^[0-9]+$", - "validation": { - "min": 2 - }, + "validation": { "min": 2 }, "notes": "Minimum of 2 patients required (one male, one female)" }, "percent_male": { @@ -23,42 +20,9 @@ "pipeline_io": "parameter", "var_name": "params.percent_male", "examples": [0.0, 0.5, 1.0], - "pattern": "^0(\\.\\d+)?|1(\\.0)?$", - "validation": { - "min": 0.0, - "max": 1.0 - }, + "validation": { "min": 0.0, "max": 1.0 }, "notes": "Value must be between 0 and 1 inclusive" }, - "imputed_store": { - "type": "folder", - "description": "Path to imputed UKBB data", - "default": "/mnt/Avatar/imputed/ukbb/imputed", - "required": true, - "pipeline_io": "input", - "var_name": "params.imputed_store", - "examples": [ - "/mnt/Avatar/imputed/ukbb/imputed", - "/rosalind/ukbb/imputed" - ], - "pattern": ".*", - "validation": {}, - "notes": "Directory containing imputed UKBB data" - }, - "pheno_store": { - "type": "file", - "description": "Path to phenotype data file", - "default": "/mnt/Avatar/dd/synthea/metadata/ukbb_phenotypes_filtered.csv", - "required": true, - "pipeline_io": "input", - "var_name": "params.pheno_store", - "examples": [ - "/mnt/Avatar/dd/synthea/metadata/ukbb_phenotypes_filtered.csv" - ], - "pattern": ".*\\.csv$", -
"validation": {}, - "notes": "CSV file containing filtered UKBB phenotype data" - }, "disease": { "type": "string", "description": "Disease or condition to simulate", @@ -66,83 +30,10 @@ "required": true, "pipeline_io": "parameter", "var_name": "params.disease", - "examples": [ - "schizophrenia", - "healthy", - "leukaemia", - "Purpura and other haemorrhagic conditions" - ], - "pattern": ".*", - "enum": [ - "schizophrenia", - "healthy", - "leukaemia", - "Purpura and other haemorrhagic conditions" - ], - "validation": {}, + "examples": ["schizophrenia", "healthy", "leukaemia"], + "enum": ["schizophrenia", "healthy", "leukaemia", "Purpura and other haemorrhagic conditions"], "notes": "Use 'healthy' for healthy individuals or specify a disease condition" }, - "healthy_dir": { - "type": "folder", - "description": "Path to healthy patient data directory", - "default": "/Workspace/next/registry/pipelines/digital_patient", - "required": true, - "pipeline_io": "input", - "var_name": "params.healthy_dir", - "examples": [ - "/Workspace/next/registry/pipelines/digital_patient", - "/mnt/Avatar/digital_patient" - ], - "pattern": ".*", - "validation": {}, - "notes": "Directory containing data for healthy patient generation" - }, - "outdir": { - "type": "folder", - "description": "Output directory for results", - "default": "/mnt/omic-next-apis/wes/digital_patients", - "required": true, - "pipeline_io": "output", - "var_name": "params.outdir", - "examples": [ - "/mnt/OmicNAS/dd/digital_patient", - "/path/to/custom/output" - ], - "pattern": ".*", - "validation": {}, - "notes": "Directory where all pipeline results will be stored" - }, - "// BORZOI PARAMETERS": {}, - "container_borzoi": { - "type": "string", - "description": "Borzoi container image", - "default": "borzoi:latest", - "required": true, - "pipeline_io": "parameter", - "var_name": "params.container_borzoi", - "examples": [ - "borzoi:latest", - "borzoi:v1.0" - ], - "pattern": ".*", - "validation": {}, - "notes": "Docker 
container image for Borzoi module" - }, - "containerOptions": { - "type": "string", - "description": "Container runtime options", - "default": "--gpus all --rm -v /mnt:/mnt", - "required": false, - "pipeline_io": "parameter", - "var_name": "params.containerOptions", - "examples": [ - "--gpus all --rm -v /mnt:/mnt", - "--rm -v /data:/data" - ], - "pattern": ".*", - "validation": {}, - "notes": "Docker container runtime options for GPU usage and volume mounts" - }, "project_name": { "type": "string", "description": "Project identifier", @@ -150,106 +41,92 @@ "required": true, "pipeline_io": "parameter", "var_name": "params.project_name", - "examples": [ - "test", - "production", - "schizophrenia_study" - ], - "pattern": ".*", - "validation": {}, - "notes": "Identifier for the digital patient project" + "examples": ["test", "production", "schizophrenia_study"] + }, + "imputed_store": { + "type": "folder", + "description": "Path to imputed UKBB data directory", + "default": "/omic/eureka/digital-patients/imputed", + "required": true, + "pipeline_io": "input", + "var_name": "params.imputed_store", + "examples": ["/omic/eureka/digital-patients/imputed"], + "notes": "Directory containing imputed UKBB GWAS .bgz files" + }, + "pheno_store": { + "type": "file", + "description": "Path to phenotype data file", + "default": "/omic/eureka/digital-patients/ukbb_phenotypes_filtered.csv", + "required": true, + "pipeline_io": "input", + "var_name": "params.pheno_store", + "examples": ["/omic/eureka/digital-patients/ukbb_phenotypes_filtered.csv"], + "pattern": ".*\\.csv$", + "notes": "CSV file containing filtered UKBB phenotype data" + }, + "healthy_dir": { + "type": "folder", + "description": "Path to healthy patient data directory (gnomad files)", + "default": "/omic/eureka/digital-patients/healthy", + "required": true, + "pipeline_io": "input", + "var_name": "params.healthy_dir", + "examples": ["/omic/eureka/digital-patients/healthy"], + "notes": "Directory containing 
gnomad.genomes.v4.1.sites.{male,female}.txt" + }, + "synthea_support_dir": { + "type": "folder", + "description": "Path to Synthea supporting data (VCF templates, liftover, genome)", + "default": "/omic/eureka/digital-patients/supporting-data", + "required": true, + "pipeline_io": "input", + "var_name": "params.synthea_support_dir", + "examples": ["/omic/eureka/digital-patients/supporting-data"], + "notes": "Must contain vcf/vcf_template.vcf, ucsc-liftover/hg19ToHg38.over.chain.gz, genome/hg38.fa" }, "mane": { "type": "file", "description": "Path to MANE reference file", - "default": "/Workspace/next/registry/pipelines/digital_patient/MANE.GRCh38.v1.3.update.tsv", + "default": "/omic/eureka/digital-patients/MANE.GRCh38.v1.3.update.tsv", "required": true, "pipeline_io": "input", "var_name": "params.mane", - "examples": [ - "/Workspace/next/registry/pipelines/digital_patient/MANE.GRCh38.v1.3.update.tsv" - ], + "examples": ["/omic/eureka/digital-patients/MANE.GRCh38.v1.3.update.tsv"], "pattern": ".*\\.tsv$", - "validation": {}, "notes": "MANE transcripts reference file in TSV format" }, - "// VCF2PROT PARAMETERS": {}, - "container_vcf2prot": { - "type": "string", - "description": "VCF2PROT container image", - "default": "vcf2prot:latest", - "required": true, - "pipeline_io": "parameter", - "var_name": "params.container_vcf2prot", - "examples": [ - "vcf2prot:latest", - "vcf2prot:v1.0" - ], - "pattern": ".*", - "validation": {}, - "notes": "Docker container image for VCF2PROT module" - }, - "// RNA2PROTEINEXPRESSION PARAMETERS": {}, - "container_rna2protexpression": { - "type": "string", - "description": "RNA2PROTEINEXPRESSION container image", - "default": "rna2protexpression:latest", - "required": true, - "pipeline_io": "parameter", - "var_name": "params.container_rna2protexpression", - "examples": [ - "rna2protexpression:latest", - "rna2protexpression:v1.0" - ], - "pattern": ".*", - "validation": {}, - "notes": "Docker container image for RNA2PROTEINEXPRESSION 
module" - }, - "containerOptions_rna2protexpression": { - "type": "string", - "description": "Container options for RNA2PROTEINEXPRESSION", - "default": "--gpus all --rm -v /mnt:/mnt -v /dbs:/dbs", - "required": false, - "pipeline_io": "parameter", - "var_name": "params.containerOptions_rna2protexpression", - "examples": [ - "--gpus all --rm -v /mnt:/mnt -v /dbs:/dbs" - ], - "pattern": ".*", - "validation": {}, - "notes": "Docker container runtime options for RNA2PROTEINEXPRESSION with GPU and database volume mounts" - }, - "// CORTO PARAMETERS": {}, - "container_corto": { - "type": "string", - "description": "CORTO container image", - "default": "corto:latest", - "required": true, - "pipeline_io": "parameter", - "var_name": "params.container_corto", - "examples": [ - "corto:latest", - "corto:v1.0" - ], - "pattern": ".*", - "validation": {}, - "notes": "Docker container image for CORTO module" - }, "regulon": { "type": "file", "description": "Path to regulon RDA file", - "default": "/Workspace/next/registry/pipelines/digital_patient/regulon.rda", + "default": "/omic/eureka/digital-patients/regulon.rda", "required": true, "pipeline_io": "input", "var_name": "params.regulon", - "examples": [ - "/Workspace/next/registry/pipelines/digital_patient/regulon.rda" - ], + "examples": ["/omic/eureka/digital-patients/regulon.rda"], "pattern": ".*\\.rda$", - "validation": {}, "notes": "Regulon data file in RDA format for CORTO module" }, - "// CIBERSORT PARAMETERS": {}, + "signature_matrix": { + "type": "file", + "description": "Path to signature matrix file for CIBERSORTx", + "default": "/omic/eureka/digital-patients/LM22_sourceGEP_ensg.txt", + "required": true, + "pipeline_io": "input", + "var_name": "params.signature_matrix", + "examples": ["/omic/eureka/digital-patients/LM22_sourceGEP_ensg.txt"], + "pattern": ".*\\.txt$", + "notes": "Signature matrix file for CIBERSORTx analysis" + }, + "outdir": { + "type": "folder", + "description": "Output directory for results", + 
"default": "/omic/eureka/digital-patients/output", + "required": true, + "pipeline_io": "output", + "var_name": "params.outdir", + "examples": ["/omic/eureka/digital-patients/output"], + "notes": "Directory where all pipeline results will be stored" + }, "cibersortx_username": { "type": "string", "description": "CIBERSORTx username for authentication", @@ -257,11 +134,6 @@ "required": true, "pipeline_io": "parameter", "var_name": "params.cibersortx_username", - "examples": [ - "gabriel.richman.2009@anderson.ucla.edu" - ], - "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$", - "validation": {}, "notes": "Username (email) for CIBERSORTx authentication" }, "cibersortx_token": { @@ -271,55 +143,55 @@ "required": true, "pipeline_io": "parameter", "var_name": "params.cibersortx_token", - "examples": [ - "b5b39e563fb947df4cfd4843d40fdb99" - ], - "pattern": "^[a-f0-9]{32}$", - "validation": {}, "notes": "Authentication token for CIBERSORTx API access" }, + "container_borzoi": { + "type": "string", + "description": "Borzoi container image", + "default": "harbor.cluster.omic.ai/omic/digital-patients/borzoi:latest", + "required": true, + "pipeline_io": "parameter", + "var_name": "params.container_borzoi" + }, + "container_vcf2prot": { + "type": "string", + "description": "VCF2PROT container image", + "default": "harbor.cluster.omic.ai/omic/digital-patients/vcf2prot:latest", + "required": true, + "pipeline_io": "parameter", + "var_name": "params.container_vcf2prot" + }, + "container_rna2protexpression": { + "type": "string", + "description": "RNA2PROTEINEXPRESSION container image", + "default": "harbor.cluster.omic.ai/omic/digital-patients/rna2protexpression:latest", + "required": true, + "pipeline_io": "parameter", + "var_name": "params.container_rna2protexpression" + }, + "container_corto": { + "type": "string", + "description": "CORTO container image", + "default": "harbor.cluster.omic.ai/omic/digital-patients/corto:latest", + "required": true, + "pipeline_io": 
"parameter", + "var_name": "params.container_corto" + }, "container_ecotyper": { "type": "string", - "description": "ECOTyper container image", - "default": "ecotyper:latest", + "description": "ECOTyper/CIBERSORTx container image", + "default": "harbor.cluster.omic.ai/omic/digital-patients/ecotyper:latest", "required": true, "pipeline_io": "parameter", - "var_name": "params.container_ecotyper", - "examples": [ - "ecotyper:latest", - "ecotyper:v1.0" - ], - "pattern": ".*", - "validation": {}, - "notes": "Docker container image for ECOTyper module" + "var_name": "params.container_ecotyper" }, - "signature_matrix": { - "type": "file", - "description": "Path to signature matrix file", - "default": "/Workspace/next/registry/pipelines/digital_patient/LM22_sourceGEP_ensg.txt", + "container_synthea": { + "type": "string", + "description": "Synthea container image", + "default": "harbor.cluster.omic.ai/omic/digital-patients/synthea:cudf", "required": true, - "pipeline_io": "input", - "var_name": "params.signature_matrix", - "examples": [ - "/Workspace/next/registry/pipelines/digital_patient/LM22_sourceGEP_ensg.txt" - ], - "pattern": ".*\\.txt$", - "validation": {}, - "notes": "Signature matrix file for CIBERSORTx analysis" - }, - "ecotyper_outdir": { - "type": "folder", - "description": "Output directory for ECOTyper results", - "default": "/mnt/omic-next-apis/wes/digital-patients/ecotyper", - "required": true, - "pipeline_io": "output", - "var_name": "params.ecotyper_outdir", - "examples": [ - "/mnt/OmicNAS/olamide/ecotyper/results/ecotyper" - ], - "pattern": ".*", - "validation": {}, - "notes": "Directory where ECOTyper results will be stored" + "pipeline_io": "parameter", + "var_name": "params.container_synthea" } } } diff --git a/reassemble.nf b/reassemble.nf new file mode 100644 index 0000000..dcfd6a9 --- /dev/null +++ b/reassemble.nf @@ -0,0 +1,66 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// One-time script to reassemble chunked files on the PVC +// Run 
once, then delete. Not part of the main pipeline. + +params.base_dir = '/omic/eureka/digital-patients' // root under which every *_parts directory and reassembled file lives +params.outdir = '/omic/eureka/digital-patients' // not referenced anywhere in this script; presumably kept for the launcher - TODO confirm + +process REASSEMBLE { // concatenates part_* chunks back into single files under params.base_dir; declares no inputs and no outputs + container 'alpine:latest' // NOTE(review): stock alpine ships no bash, which Nextflow task wrappers expect; confirm this image works when containers are enabled + executor 'local' // runs where Nextflow itself runs, so the absolute paths below must be visible there + + script: // NOTE(review): each cat relies on shell glob order for part_*; confirm the part names sort correctly (e.g. zero-padded) + """ + echo "=== Reassembling chunked files ===" + + # hg38.fa (35 parts) + if [ -d "${params.base_dir}/supporting-data/genome/hg38.fa_parts" ]; then + echo "Reassembling hg38.fa..." + cat ${params.base_dir}/supporting-data/genome/hg38.fa_parts/part_* > ${params.base_dir}/supporting-data/genome/hg38.fa + echo " Done: \$(du -h ${params.base_dir}/supporting-data/genome/hg38.fa | cut -f1)" + fi + + # gnomad male (12 parts) + if [ -d "${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.male.txt_parts" ]; then + echo "Reassembling gnomad male..." + cat ${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.male.txt_parts/part_* > ${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.male.txt + echo " Done: \$(du -h ${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.male.txt | cut -f1)" + fi + + # gnomad female (12 parts) + if [ -d "${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.female.txt_parts" ]; then + echo "Reassembling gnomad female..." + cat ${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.female.txt_parts/part_* > ${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.female.txt + echo " Done: \$(du -h ${params.base_dir}/healthy/gnomad.genomes.v4.1.sites.female.txt | cut -f1)" + fi + + # F5_SCHIZO male (7 parts) + if [ -d "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz_parts" ]; then + echo "Reassembling F5_SCHIZO male..."
+ cat ${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz_parts/part_* > ${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz + echo " Done: \$(du -h ${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz | cut -f1)" + fi + + # F5_SCHIZO female (7 parts) + if [ -d "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz_parts" ]; then + echo "Reassembling F5_SCHIZO female..." + cat ${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz_parts/part_* > ${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz + echo " Done: \$(du -h ${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz | cut -f1)" + fi + + # F5_SCHIZO both_sexes (7 parts) + if [ -d "${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz_parts" ]; then + echo "Reassembling F5_SCHIZO both_sexes..." + cat ${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz_parts/part_* > ${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz + echo " Done: \$(du -h ${params.base_dir}/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz | cut -f1)" + fi + + echo "=== All reassembly complete ===" + """ +} + +workflow { // single-step workflow: invoke REASSEMBLE once + REASSEMBLE() +}