Initial commit: digital-patients pipeline (clean, no large files)

Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
This commit is contained in:
2026-03-26 15:15:23 +01:00
commit 9e6a16c19b
45 changed files with 7207 additions and 0 deletions

125
test_gen_patient.nf Normal file
View File

@@ -0,0 +1,125 @@
nextflow.enable.dsl=2
// ---------------------------------------------------------------------------
// SYNTHEA: parameters for patient generation.
// Paths below are absolute and specific to the host they were written for.
// ---------------------------------------------------------------------------
//SYNTHEA
params.n_pat = 2 // number of patients to generate; must be >= 2 (one male, one female). Previously tried: 10
params.percent_male = 0.5 // share of male patients, expressed as a value between 0 and 1
params.age = '18-80' // age range of the population, written as '<min age>-<max age>'
// Disabled state/city settings, kept for reference together with the values tried so far:
// params.state = '"District of Columbia"' //'Hawaii' //'Texas' //'Georgia'
// params.city = '' //'Washington' //'' //'Honolulu' //'Houston' //'Atlanta'
params.imputed_store = '/mnt/Avatar/imputed/ukbb/imputed' // alternative location used before: '/rosalind/ukbb/imputed'
params.pheno_store = '/mnt/Avatar/dd/synthea/metadata/ukbb_phenotypes_filtered.csv'
// Keep both versions of disease params
params.disease = 'schizophrenia' // values used so far: 'schizophrenia', 'healthy', 'leukaemia', 'Purpura and other haemorrhagic conditions', 'tongue cancer', 'dementia', or a list such as ['tongue cancer', 'dementia', 'arthritis']
params.healthy_dir = '/Workspace/next/registry/pipelines/digital_patient' // previous value: '/mnt/Avatar/digital_patient'; original note: "data copy is here" (NOTE(review): unclear which of the two paths the note refers to)
//params.n_var = 100
params.outdir = '/mnt/OmicNAS/dd/digital_patient'
//BORZOI
// Container image and container runtime options grouped under the Borzoi step.
params.container_borzoi = 'borzoi:latest'
params.containerOptions = '--gpus all --rm -v /mnt:/mnt' // NOTE(review): looks like Docker-style flags (GPU access, /mnt bind mount) - confirm the container engine in use
params.project_name = 'test'
params.mane = '/Workspace/next/registry/pipelines/digital_patient/MANE.GRCh38.v1.3.update.tsv'
//VCF2PROT
params.container_vcf2prot = "vcf2prot:latest"
//RNA2PROTEINEXPRESSION
params.container_rna2protexpression = 'rna2protexpression:latest'
// Same options as params.containerOptions plus an additional /dbs bind mount.
params.containerOptions_rna2protexpression = '--gpus all --rm -v /mnt:/mnt -v /dbs:/dbs'
//CORTO
params.container_corto = 'corto:latest'
params.regulon = '/Workspace/next/registry/pipelines/digital_patient/regulon.rda'
//CIBERSORT
// CIBERSORTx credentials are secrets and must not be committed to the repository.
// They are read from the environment variables CIBERSORTX_USERNAME and
// CIBERSORTX_TOKEN, and can still be overridden on the command line with
// --cibersortx_username / --cibersortx_token. An empty string is used when
// the variable is not set.
// NOTE(review): the credentials that used to be hard-coded here remain in the
// git history; the token should be revoked and re-issued.
params.cibersortx_username = System.getenv('CIBERSORTX_USERNAME') ?: ''
params.cibersortx_token = System.getenv('CIBERSORTX_TOKEN') ?: ''
params.container_ecotyper = 'ecotyper:latest'
params.signature_matrix = "/Workspace/next/registry/pipelines/digital_patient/LM22_sourceGEP_ensg.txt"
params.ecotyper_outdir = "/mnt/OmicNAS/olamide/ecotyper/results/ecotyper"
//SYNTHEA
include {generate_patients} from './main_synthea.nf'
// include {get_disease_stats_no_patients} from './main_synthea.nf'
// include {generate_m_variants_cudf} from './main_synthea.nf'
// include {generate_f_variants_cudf} from './main_synthea.nf'
// include {make_vcfs} from './main_synthea.nf'
// include {generate_m_healthy_cudf} from './main_synthea.nf'
// include {generate_f_healthy_cudf} from './main_synthea.nf'
// //BORZOI
// include {FILTER_VCF} from './main_borzoi.nf'
// include {PREDICT_EXPRESSION} from './main_borzoi.nf'
// include {CREATE_PROTEIN_CLUSTER} from './main_borzoi.nf'
// //VCF2PROT
// include {VCF2PROT} from './main_vcf2prot.nf'
// //RNA2PROTEINEXPRESSION
// include {RNA2PROTEXPRESSION} from './main_rna2proteinexpression'
// //CORTO
// include {CORTO} from './main_corto.nf'
// //CIBERSORT
// include {CONVERT_TO_TXT} from './main_cibersortx.nf'
// include {CIBERSORTx_FRACTIONS} from './main_cibersortx.nf'
// include {CIBERSORTx_HIRES} from './main_cibersortx.nf'
// include {ADD_TISSUE_NAMES_TO_CIBERSORTX} from './main_cibersortx.nf'
workflow {
    // Entry workflow. Only the patient-generation step is active; the rest of
    // the pipeline is kept below as commented-out code.
    generate_patients()

    // ---- SYNTHEA -----------------------------------------------------------
    // switch (params.disease) {
    //     case 'healthy':
    //         // healthy
    //         generate_m_healthy_cudf(params.healthy_dir)
    //         generate_f_healthy_cudf(params.healthy_dir)
    //         m_healthy = generate_m_healthy_cudf.out
    //         f_healthy = generate_f_healthy_cudf.out
    //         txt_ch = f_healthy.mix(m_healthy).flatten()
    //         break
    //     default:
    //         // disease
    //         get_disease_stats_no_patients()
    //         generate_m_variants_cudf(get_disease_stats_no_patients.out)
    //         generate_f_variants_cudf(get_disease_stats_no_patients.out)
    //         f_var = generate_f_variants_cudf.out
    //         m_var = generate_m_variants_cudf.out
    //         txt_ch = f_var.mix(m_var).flatten()
    // }
    // make_vcfs(txt_ch)

    // ---- BORZOI ------------------------------------------------------------
    // Prepare the VCF file(s)
    // //vcf_ch = Channel.fromPath(make_vcfs.out)
    // This is used to create protein clusters on the same Borzoi input and to
    // split the ones too big for Borzoi (len = 524288).
    // This step needs to be run to create data for the next step only if new
    // reference/RNAs are going to be used.
    // //CREATE_PROTEIN_CLUSTER(params.mane)
    // FILTER_VCF(params.mane, make_vcfs.out)
    // PREDICT_EXPRESSION(FILTER_VCF.out, params.mane)

    // ---- VCF2PROT ----------------------------------------------------------
    // VCF2PROT(make_vcfs.out, FILTER_VCF.out)

    // ---- RNA2PROTEINEXPRESSION ---------------------------------------------
    // Updated RNA-to-protein expression step to run only a single process at
    // a time; avoids CUDA out-of-memory errors.
    // PREDICT_EXPRESSION.out
    //     .collect()
    //     .flatten()
    //     .set { rna_input }
    // RNA2PROTEXPRESSION(rna_input)

    // ---- CORTO -------------------------------------------------------------
    // CORTO(PREDICT_EXPRESSION.out, params.regulon)

    // ---- CIBERSORT ---------------------------------------------------------
    // signature_file = Channel.fromPath(params.signature_matrix, checkIfExists: true)
    // CONVERT_TO_TXT(PREDICT_EXPRESSION.out)
    // CIBERSORTx_FRACTIONS(CONVERT_TO_TXT.out, signature_file)
    // CIBERSORTx_HIRES(CONVERT_TO_TXT.out, CIBERSORTx_FRACTIONS.out, signature_file)
    // ADD_TISSUE_NAMES_TO_CIBERSORTX(CONVERT_TO_TXT.out, CIBERSORTx_HIRES.out)
}