Initial commit: digital-patients pipeline (clean, no large files)

Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
This commit is contained in:
2026-03-26 15:15:23 +01:00
commit 9e6a16c19b
45 changed files with 7207 additions and 0 deletions

481
main_synthea.nf.bk Normal file
View File

@@ -0,0 +1,481 @@
// Enable Nextflow DSL2 syntax (required for the named `emit:` outputs used by the processes below).
nextflow.enable.dsl=2
// Run Synthea inside the synthea:cudf container to generate a synthetic
// patient cohort, then copy the CSV exports back into the task work dir so
// Nextflow can stage them as outputs.
// Emits: (patients.csv, conditions.csv) tuple plus every exported file.
process generate_patients {
    container 'synthea:cudf'
    // NOTE(review): --privileged plus a full /mnt bind-mount is a very broad
    // grant; confirm the container actually needs more than `--gpus all`.
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/patients", mode: 'copy'
    echo true        // stream the task's stdout to the Nextflow console
    // errorStrategy 'ignore'

    output:
    // Synthea writes its CSVs under a per-run subdirectory, hence the '*/' glob.
    tuple path('*/patients.csv'), path('*/conditions.csv'), emit: patient_tuple
    path "*"

    script:
    """
    workdir=`pwd`
    cd /synthea
    # Generate params.n_pat patients with the requested age range / state /
    # city, exporting the CSV tables (patients.csv, conditions.csv, ...).
    ./run_synthea -p ${params.n_pat} -a ${params.age} ${params.state} ${params.city} --exporter.csv.export true
    cd /synthea/output
    # Copy every export back into the Nextflow task directory.
    cp -r * \$workdir
    """
}
// Stage UK Biobank imputed GWAS summary-statistic files for every disease in
// params.disease, without requiring a synthetic patient cohort.
// For each disease one male and one female TSV is produced; when a
// sex-specific file is missing, the both_sexes file (then the opposite-sex
// file) is used as a fallback.
process get_disease_stats_no_patients {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    echo true

    output:
    tuple path("*_female.tsv"), path("*_male.tsv")

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import os.path
    import subprocess

    import pandas as pd
    from fuzzywuzzy import process

    def stage_variants(ukbb_id, preference, suffix):
        # Copy and decompress the first available <id>.gwas.imputed_v3.<sex>.tsv.bgz,
        # trying the sexes in `preference` order, and rename it to <id><suffix>.
        for sex in preference:
            bgz_name = ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv.bgz'
            bgz_path = '${params.imputed_store}/' + bgz_name
            if not os.path.isfile(bgz_path):
                continue
            try:
                subprocess.run(['cp', bgz_path, '.'], check=True)
                # BUGFIX: the previous command passed a stray '.' argument to
                # bgzip and ignored its exit status.
                subprocess.run(['bgzip', '-d', bgz_name], check=True)
                subprocess.run(['mv', ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv', ukbb_id + suffix], check=True)
            except (OSError, subprocess.CalledProcessError) as exc:
                print('failed to stage', bgz_path, ':', exc)
            return
        print('imputed variants not present for', ukbb_id)

    # Phenotype metadata: one row per phenotype (code, description, sex, file).
    df_meta = pd.read_csv('${params.pheno_store}', sep=',', header=None,
                          names=['phenotype_code', 'phenotype_description', 'sex', 'file'],
                          engine='python')

    # params.disease renders as a Groovy list literal, e.g. "[asthma, gout]".
    for target in '${params.disease}'.replace("[", "").replace("]", "").replace(" ", "").split(','):
        # Fuzzy-match the requested disease against the UKBB phenotype
        # descriptions; extractOne returns (match, score, index) for a Series.
        ukbb_cond = process.extractOne(target, df_meta['phenotype_description'], score_cutoff=80)
        if ukbb_cond is None:
            # Nothing scored >= 80 -- previously this crashed with a TypeError.
            print('no UKBB phenotype matched', target)
            continue
        ukbb_id = df_meta.at[ukbb_cond[2], 'phenotype_code']
        # Male output: prefer male stats, fall back to both_sexes, then female.
        stage_variants(ukbb_id, ['male', 'both_sexes', 'female'], '_male.tsv')
        # Female output: prefer female stats, fall back to both_sexes, then male.
        stage_variants(ukbb_id, ['female', 'both_sexes', 'male'], '_female.tsv')
    """
}
// Stage the UK Biobank imputed GWAS file pair (male/female) for
// params.disease and pass the synthetic patient/condition CSVs through
// alongside them, so downstream processes receive cohort + stats together.
process get_disease_stats_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    echo true

    input:
    tuple path(pat_csv), path(cond_csv)

    output:
    tuple path(pat_csv), path(cond_csv), path("*_female.tsv"), path("*_male.tsv")

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import os.path
    import subprocess

    import cudf as pd
    from fuzzywuzzy import process

    def stage_variants(ukbb_id, preference, suffix):
        # Copy and decompress the first available <id>.gwas.imputed_v3.<sex>.tsv.bgz,
        # trying the sexes in `preference` order, and rename it to <id><suffix>.
        for sex in preference:
            bgz_name = ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv.bgz'
            bgz_path = '${params.imputed_store}/' + bgz_name
            if not os.path.isfile(bgz_path):
                continue
            try:
                subprocess.run(['cp', bgz_path, '.'], check=True)
                # BUGFIX: the previous command passed a stray '.' argument to
                # bgzip and ignored its exit status.
                subprocess.run(['bgzip', '-d', bgz_name], check=True)
                subprocess.run(['mv', ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv', ukbb_id + suffix], check=True)
            except (OSError, subprocess.CalledProcessError) as exc:
                print('failed to stage', bgz_path, ':', exc)
            return
        print('imputed variants not present for', ukbb_id)

    ################## IMPORT SYNTHETIC PATIENTS ##############################
    # Conditions table (used to find the right imputed-variant file).
    df_cond = pd.read_csv('${cond_csv}')
    # Patients table.
    df_pat = pd.read_csv('${pat_csv}')
    # Split the cohort by sex. The old try/except around these filters was dead
    # code: an empty selection does not raise, and the bare-string "messages"
    # were no-op expressions. NOTE(review): these frames are not used further
    # in this process; kept for parity with the original script.
    df_pat_m = df_pat.loc[df_pat['GENDER'] == 'M']
    df_pat_f = df_pat.loc[df_pat['GENDER'] == 'F']
    df_pat_bs = df_pat.loc[df_pat['GENDER'] == '']

    # Phenotype metadata: one row per phenotype (code, description, sex, file).
    df_meta = pd.read_csv('${params.pheno_store}', sep=',', header=None,
                          names=['phenotype_code', 'phenotype_description', 'sex', 'file'])
    print(df_meta)

    # Fuzzy-match the requested disease against the UKBB phenotype
    # descriptions (extractOne needs a host pandas Series, hence to_pandas()).
    ukbb_cond = process.extractOne('${params.disease}', df_meta['phenotype_description'].to_pandas(), score_cutoff=80)
    print(ukbb_cond)
    if ukbb_cond is None:
        # Nothing scored >= 80 -- previously this crashed with a TypeError.
        raise SystemExit('no UKBB phenotype matched the requested disease')
    ukbb_id = df_meta.at[ukbb_cond[2], 'phenotype_code']
    # Male output: prefer male stats, fall back to both_sexes, then female.
    stage_variants(ukbb_id, ['male', 'both_sexes', 'female'], '_male.tsv')
    # Female output: prefer female stats, fall back to both_sexes, then male.
    stage_variants(ukbb_id, ['female', 'both_sexes', 'male'], '_female.tsv')
    """
}
// For each male UKBB summary-stat TSV, sample per-patient genotypes on the
// GPU: every variant's allele is drawn with probability equal to its
// minor/major allele frequency, and only non-reference draws are written to
// <disease>_patient_<p>_variants.csv.
process generate_m_variants_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/variants", mode: 'copy'
    //errorStrategy 'ignore'
    echo true
    // memory '15.0 GB'
    maxForks 1        // serialize GPU tasks

    input:
    tuple path(dis_var_f), path(dis_var_m)

    output:
    path "*_variants.csv", emit: m_var

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import os
    import subprocess

    import cudf as pd
    import numpy as np

    ################## GENERATE VARIANTS USING UKBB IMPUTED DATA ##############
    # Number of male patients to simulate.
    n_patients = int(${params.n_pat} * ${params.percent_male})
    # BUGFIX: mkdir used to be re-run (and silently fail) on every iteration.
    os.makedirs('gzip', exist_ok=True)

    for target in '${dis_var_m}'.split(" "):
        # cudf.read_csv understands .gz but not the raw TSV layout we staged,
        # so copy the file aside and gzip it before loading.
        try:
            subprocess.run(['cp', target, 'gzip/'], check=True)
            subprocess.run(['gzip', 'gzip/' + target], check=True)
            df_var = pd.read_csv('gzip/' + target + '.gz', sep='\\t', header=0)
        except Exception as exc:
            # BUGFIX: the old bare except left df_var_m undefined and the
            # script then died with a NameError; skip this target instead.
            print('male variant file', target, 'could not be read:', exc)
            continue

        ## minor/major allele frequencies
        df_var['minor_AF'] = pd.to_numeric(df_var['minor_AF'])
        df_var['major_AF'] = 1 - df_var['minor_AF']
        ## split the "chr:pos:ref:alt" variant id into its parts
        df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True)
        df_var['chr'] = df_var['chr'].astype('str')
        df_var['pos'] = df_var['pos'].astype('int')
        ## drop low-confidence variants
        df_var = df_var.loc[df_var['low_confidence_variant'] == False]
        df_var = df_var.drop(['low_confidence_variant'], axis=1)
        ## determine which allele is the major one; boolean-mask assignment is
        ## done on a host pandas copy, then moved back to device memory
        host = df_var.to_pandas()
        host.loc[host['minor_allele'] == host['alt'], 'major_allele'] = host['ref']
        host.loc[host['minor_allele'] == host['ref'], 'major_allele'] = host['alt']
        df_var = pd.from_pandas(host)
        ## allele pairs and their sampling probabilities, as host arrays
        arr_var = df_var[['minor_allele', 'major_allele']].to_numpy()
        arr_prob = df_var[['minor_AF', 'major_AF']].to_numpy()
        ## per patient: weighted random draw of one allele per variant
        for p in range(n_patients):
            draw = (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)
            df_var['prediction'] = arr_var[np.arange(arr_var.shape[0]), draw]
            # keep only variants whose drawn allele differs from the reference
            df_out = df_var.loc[df_var['prediction'] != df_var['ref']]
            df_out = df_out[['chr', 'pos', 'ref', 'prediction']]
            df_out.to_csv(target.split('.')[0] + '_patient_' + str(p) + '_variants.csv', index=False)
    """
}
// For each female UKBB summary-stat TSV, sample per-patient genotypes on the
// GPU: every variant's allele is drawn with probability equal to its
// minor/major allele frequency, and only non-reference draws are written to
// <disease>_patient_<p>_variants.csv. Mirrors generate_m_variants_cudf.
process generate_f_variants_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/variants", mode: 'copy'
    maxForks 1        // serialize GPU tasks
    //errorStrategy 'ignore'
    echo true

    input:
    tuple path(dis_var_f), path(dis_var_m)

    output:
    path "*_variants.csv", emit: f_var

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import os
    import subprocess

    import cudf as pd
    import numpy as np

    ##################### GENERATE VARIANTS USING UKBB IMPUTED DATA ###########
    # Female patients are whatever remains after the male share.
    n_patients = ${params.n_pat} - int(${params.n_pat} * ${params.percent_male})
    # BUGFIX: mkdir used to be re-run (and silently fail) on every iteration.
    os.makedirs('gzip', exist_ok=True)

    for target in '${dis_var_f}'.split(" "):
        # cudf.read_csv understands .gz but not the raw TSV layout we staged,
        # so copy the file aside and gzip it before loading.
        try:
            subprocess.run(['cp', target, 'gzip/'], check=True)
            subprocess.run(['gzip', 'gzip/' + target], check=True)
            df_var = pd.read_csv('gzip/' + target + '.gz', sep='\\t', header=0)
        except Exception as exc:
            # BUGFIX: the old bare except left df_var_f undefined and the
            # script then died with a NameError; skip this target instead.
            print('female variant file', target, 'could not be read:', exc)
            continue

        ## minor/major allele frequencies
        df_var['minor_AF'] = pd.to_numeric(df_var['minor_AF'])
        df_var['major_AF'] = 1 - df_var['minor_AF']
        ## split the "chr:pos:ref:alt" variant id into its parts
        df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True)
        df_var['chr'] = df_var['chr'].astype('str')
        df_var['pos'] = df_var['pos'].astype('int')
        ## drop low-confidence variants
        df_var = df_var.loc[df_var['low_confidence_variant'] == False]
        df_var = df_var.drop(['low_confidence_variant'], axis=1)
        ## determine which allele is the major one; boolean-mask assignment is
        ## done on a host pandas copy, then moved back to device memory
        host = df_var.to_pandas()
        host.loc[host['minor_allele'] == host['alt'], 'major_allele'] = host['ref']
        host.loc[host['minor_allele'] == host['ref'], 'major_allele'] = host['alt']
        df_var = pd.from_pandas(host)
        ## allele pairs and their sampling probabilities, as host arrays
        ## (plain column selection, consistent with the male twin process)
        arr_var = df_var[['minor_allele', 'major_allele']].to_numpy()
        arr_prob = df_var[['minor_AF', 'major_AF']].to_numpy()
        ## per patient: weighted random draw of one allele per variant
        for p in range(n_patients):
            draw = (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)
            df_var['prediction'] = arr_var[np.arange(arr_var.shape[0]), draw]
            # keep only variants whose drawn allele differs from the reference
            df_out = df_var.loc[df_var['prediction'] != df_var['ref']]
            df_out = df_out[['chr', 'pos', 'ref', 'prediction']]
            df_out.to_csv(target.split('.')[0] + '_patient_' + str(p) + '_variants.csv', index=False)
    """
}
// Sample genotypes for healthy male patients from the gnomAD v4.1 site
// table: each site's allele is drawn with probability equal to its male
// (XY) allele frequency, and only non-reference draws are written out.
process generate_m_healthy_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    maxForks 1
    echo true

    input:
    tuple path(healthy_dir)

    output:
    path "*_variants.csv", emit: m_healthy

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf as pd
    import numpy as np

    # gnomAD site table for males; AF_XY carries the male allele frequency.
    sites = pd.read_csv('${healthy_dir}/gnomad.genomes.v4.1.sites.male.txt')
    sites['minor_AF'] = pd.to_numeric(sites['AF_XY'])
    sites['major_AF'] = 1 - sites[['minor_AF']]
    # Candidate alleles per site (ALT = minor, REF = major) and their weights.
    alleles = pd.DataFrame(data=sites, columns=['ALT', 'REF']).to_numpy()
    probs = sites[['minor_AF', 'major_AF']].to_numpy()
    male_count = int(${params.n_pat} * ${params.percent_male})
    for idx in range(male_count):
        # Weighted random choice per site via cumulative-probability threshold.
        choice = (np.random.rand(alleles.shape[0], 1) < probs.cumsum(axis=1)).argmax(axis=1)
        sites['prediction'] = alleles[np.arange(alleles.shape[0]), choice]
        # Keep only sites where the drawn allele differs from the reference.
        carriers = sites.loc[sites['prediction'] != sites['REF']]
        carriers = carriers[['#CHROM', 'POS', 'REF', 'prediction']]
        # When several mutations share a position, keep one at random.
        carriers = carriers.sample(frac=1).drop_duplicates(['#CHROM', 'POS']).sort_index()
        carriers = carriers.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref"})
        carriers.to_csv('healthy_male_patient_' + str(idx) + '_variants.csv', index=False)
    """
}
// Sample genotypes for healthy female patients from the gnomAD v4.1 site
// table: each site's allele is drawn with probability equal to its female
// (XX) allele frequency, and only non-reference draws are written out.
process generate_f_healthy_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    maxForks 1
    echo true

    input:
    tuple path(healthy_dir)

    output:
    path "*_variants.csv", emit: f_healthy

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf as pd
    import numpy as np

    # gnomAD site table for females; AF_XX carries the female allele frequency.
    sites = pd.read_csv('${healthy_dir}/gnomad.genomes.v4.1.sites.female.txt')
    sites['minor_AF'] = pd.to_numeric(sites['AF_XX'])
    sites['major_AF'] = 1 - sites[['minor_AF']]
    # Candidate alleles per site (ALT = minor, REF = major) and their weights.
    alleles = pd.DataFrame(data=sites, columns=['ALT', 'REF']).to_numpy()
    probs = sites[['minor_AF', 'major_AF']].to_numpy()
    # Female patients are whatever remains after the male share.
    female_count = ${params.n_pat} - int(${params.n_pat} * ${params.percent_male})
    for idx in range(female_count):
        # Weighted random choice per site via cumulative-probability threshold.
        choice = (np.random.rand(alleles.shape[0], 1) < probs.cumsum(axis=1)).argmax(axis=1)
        sites['prediction'] = alleles[np.arange(alleles.shape[0]), choice]
        # Keep only sites where the drawn allele differs from the reference.
        carriers = sites.loc[sites['prediction'] != sites['REF']]
        carriers = carriers[['#CHROM', 'POS', 'REF', 'prediction']]
        # When several mutations share a position, keep one at random.
        carriers = carriers.sample(frac=1).drop_duplicates(['#CHROM', 'POS']).sort_index()
        carriers = carriers.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref"})
        carriers.to_csv('healthy_female_patient_' + str(idx) + '_variants.csv', index=False)
    """
}
// Convert one per-patient variant CSV into a karyotypically sorted,
// hg38-lifted VCF using a header template plus GATK SortVcf / LiftoverVcf.
process make_vcfs {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/variants_vcfs", mode: 'copy'
    echo true
    maxForks 1

    input:
    path patient_vars

    output:
    path "${patient_vars.baseName}.vcf", emit: patient_vcfs
    //path "${patient_vars.baseName}.vcf.gz", emit: patient_gz
    //path "${patient_vars.baseName}.vcf.gz.tbi", emit: patient_index
    //path "${patient_vars.baseName}_filt.csv", emit: filtered_csv

    script:
    """
    ## only print disease-associated variants not found in the ref/major allele position as they will already be covered by the ref genome
    # NOTE(review): awk runs with the default whitespace FS here while the
    # input is comma-separated, so \$16 and \$18 are both empty and the test is
    # always true -- in practice this step only strips the CSV header line,
    # exactly as the original author suspects below. Confirm the intended
    # column comparison before changing it.
    #I don't think this is correct \$16 == \$18 cuz we changed csv file format
    #it only removes first line from csv
    awk '\$16 == \$18 { print \$0 }' ${patient_vars} | tail -n +2 > ${patient_vars.baseName}_filt.csv
    ## copy the variant template (the fixed VCF header lines) to the workdir
    cp /mnt/Avatar/dd/synthea/supporting-data/vcf/vcf_template.vcf .
    ## print the filtered variants to vcf format:
    ## column header row for the single sample NA00001, then one record per
    ## CSV row (chr,pos,ref,prediction -> CHROM/POS/REF/ALT) with a fixed
    ## heterozygous 0/1 genotype
    printf "#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tNA00001" >> vcf_template.vcf
    printf "\\n" >> vcf_template.vcf
    awk -F',' '{print "chr"\$1,\$2,".",\$3,\$4,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf
    #awk -F',' '{print "chr"\$13,\$14,".",\$15,\$16,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf
    #awk -F',' '{print \$13,\$14,".",\$15,\$16,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf
    # sort the genome karyotypically (in chromosomal order) and index
    #files are sorted already just for security
    java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar SortVcf -I vcf_template.vcf -O ${patient_vars.baseName}_sorted.vcf
    # Liftover from hg19 to hg38; records that cannot be lifted are written to
    # rejected_variants.vcf (removed below).
    java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar LiftoverVcf -I ${patient_vars.baseName}_sorted.vcf -O ${patient_vars.baseName}.vcf -C /mnt/Avatar/dd/synthea/supporting-data/ucsc-liftover/hg19ToHg38.over.chain.gz -REJECT rejected_variants.vcf -R /mnt/Avatar/dd/synthea/supporting-data/genome/hg38.fa
    #change this part to save on memory usage
    #bgzip < ${patient_vars.baseName}.vcf > ${patient_vars.baseName}.vcf.gz
    #gatk IndexFeatureFile -I ${patient_vars.baseName}.vcf.gz
    #delete unnecessary files
    rm vcf_template.vcf
    rm ${patient_vars.baseName}_filt.csv
    rm ${patient_vars.baseName}_sorted.vcf
    rm rejected_variants.vcf
    rm *vcf.idx
    """
}