nextflow.enable.dsl=2

/*
 * Synthetic-patient GWAS pipeline.
 *
 * Stages:
 *   1. generate_patients            - run Synthea to create synthetic patient CSVs
 *   2. get_disease_stats_*          - stage UKBB imputed-variant summary files per disease/sex
 *   3. generate_{m,f}_variants_cudf - sample per-patient disease variants (GPU, cuDF)
 *   4. generate_{m,f}_healthy_cudf  - sample per-patient background variants from gnomAD
 *   5. make_vcfs                    - convert per-patient variant CSVs to sorted, hg38-lifted VCFs
 */

// Runs Synthea inside the container and copies its CSV exports back into the task workdir.
process generate_patients {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/patients", mode: 'copy'
    echo true
    // errorStrategy 'ignore'

    output:
    tuple path('*/patients.csv'), path('*/conditions.csv'), emit: patient_tuple
    path "*"

    script:
    """
    workdir=`pwd`
    cd /synthea
    ./run_synthea -p ${params.n_pat} -a ${params.age} ${params.state} ${params.city} --exporter.csv.export true
    cd /synthea/output
    cp -r * \$workdir
    """
}

// Stages UKBB imputed GWAS summary files for each configured disease, without
// consulting generated patients. Produces <id>_male.tsv and <id>_female.tsv,
// falling back to the both_sexes / opposite-sex export when a sex-specific
// file is missing.
process get_disease_stats_no_patients {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    echo true

    output:
    tuple path("*_female.tsv"), path("*_male.tsv")

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import pandas as pd
    import subprocess
    import os.path
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process

    def fetch_imputed(ukbb_id, preference, suffix):
        # Copy and decompress the first available imputed-variant export for this
        # phenotype, trying each sex tag in 'preference' order, and rename the
        # result to <id>_<suffix>.tsv. Returns True when a file was staged.
        for sex in preference:
            src = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv.bgz'
            if os.path.isfile(src):
                local = ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv.bgz'
                subprocess.run(['cp', src, '.'])
                # bug fix: bgzip -d takes only the file to decompress; the old
                # trailing '.' argument made bgzip fail on the current directory
                subprocess.run(['bgzip', '-d', local])
                subprocess.run(['mv', ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv',
                                ukbb_id + '_' + suffix + '.tsv'])
                return True
        print('imputed variants not present')
        return False

    # phenotype metadata: code, description, sex, file
    df_meta = pd.read_csv('${params.pheno_store}', sep=',', delim_whitespace=False,
                          header=None,
                          names=['phenotype_code', 'phenotype_description', 'sex', 'file'],
                          engine='python')

    # params.disease arrives as a bracketed, comma-separated list
    for target in '${params.disease}'.replace("[", "").replace("]", "").replace(" ", "").split(','):
        # look up the UKBB condition from metadata via fuzzy matching
        ukbb_cond = process.extractOne(target, df_meta['phenotype_description'], score_cutoff=80)
        if ukbb_cond is None:
            # extractOne returns None when no candidate clears score_cutoff
            print('no phenotype match for', target)
            continue
        ukbb_id = df_meta.at[ukbb_cond[2], 'phenotype_code']

        # male stats: prefer male export, then both_sexes, then female
        try:
            fetch_imputed(ukbb_id, ['male', 'both_sexes', 'female'], 'male')
        except Exception as exc:
            print('no files exist:', exc)

        # female stats: prefer female export, then both_sexes, then male
        try:
            fetch_imputed(ukbb_id, ['female', 'both_sexes', 'male'], 'female')
        except Exception as exc:
            print('no files exist:', exc)
    """
}

// Same staging as above, but driven by a generated patient/condition pair and
// using cuDF for the metadata lookup. Passes the patient CSVs through so the
// downstream tuple keeps patients, conditions and both stats files together.
process get_disease_stats_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    echo true

    input:
    tuple path(pat_csv), path(cond_csv)

    output:
    tuple path(pat_csv), path(cond_csv), path("*_female.tsv"), path("*_male.tsv")

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    #import pandas as pd
    import cudf as pd
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process
    import numpy as np
    import subprocess
    import os.path

    def fetch_imputed(ukbb_id, preference, suffix):
        # Copy and decompress the first available imputed-variant export,
        # renamed to <id>_<suffix>.tsv; see get_disease_stats_no_patients.
        for sex in preference:
            src = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv.bgz'
            if os.path.isfile(src):
                local = ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv.bgz'
                subprocess.run(['cp', src, '.'])
                # bug fix: no trailing '.' -- bgzip -d takes only the file name
                subprocess.run(['bgzip', '-d', local])
                subprocess.run(['mv', ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv',
                                ukbb_id + '_' + suffix + '.tsv'])
                return True
        print('imputed variants not present')
        return False

    ################## IMPORT SYNTHETIC PATIENTS ##############################
    ## conditions, used to find the right imputed variant file
    df_cond = pd.read_csv('${cond_csv}', delim_whitespace=False)
    ## patients
    df_pat = pd.read_csv('${pat_csv}', delim_whitespace=False)

    # split by reported gender; a cohort may legitimately lack one of these
    # (bug fix: the old bare string expressions were never printed)
    try:
        df_pat_m = df_pat.loc[df_pat['GENDER'] == 'M']
    except Exception:
        print('No men in dataset')
    try:
        df_pat_f = df_pat.loc[df_pat['GENDER'] == 'F']
    except Exception:
        print('No women in dataset')
    try:
        df_pat_bs = df_pat.loc[df_pat['GENDER'] == '']
    except Exception:
        print('No intersex/undefined sex in dataset')

    df_meta = pd.read_csv('${params.pheno_store}', sep=',', delim_whitespace=False,
                          header=None,
                          names=['phenotype_code', 'phenotype_description', 'sex', 'file'])
    print(df_meta)

    ## look up the UKBB condition from metadata via fuzzy matching
    ## (extractOne needs host-memory strings, hence .to_pandas())
    ukbb_cond = process.extractOne('${params.disease}',
                                   df_meta['phenotype_description'].to_pandas(),
                                   score_cutoff=80)
    print(ukbb_cond)
    if ukbb_cond is None:
        # no candidate cleared score_cutoff; nothing to stage
        print('no phenotype match for ${params.disease}')
    else:
        ukbb_id = df_meta.at[ukbb_cond[2], 'phenotype_code']

        # male stats, with both_sexes / female fallback
        try:
            fetch_imputed(ukbb_id, ['male', 'both_sexes', 'female'], 'male')
        except Exception as exc:
            print('no files exist:', exc)

        # female stats, with both_sexes / male fallback
        try:
            fetch_imputed(ukbb_id, ['female', 'both_sexes', 'male'], 'female')
        except Exception as exc:
            print('no files exist:', exc)
    """
}

// Samples disease-associated variants for each synthetic male patient by
// weighted random choice over the UKBB allele frequencies. One CSV per patient.
process generate_m_variants_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/variants", mode: 'copy'
    //errorStrategy 'ignore'
    echo true
    // memory '15.0 GB'
    maxForks 1

    input:
    tuple path(dis_var_f), path(dis_var_m)

    output:
    path "*_variants.csv", emit: m_var

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf as pd
    import numpy as np
    import os
    import subprocess

    for target in '${dis_var_m}'.split(" "):
        try:
            # bug fix: plain 'mkdir gzip' failed on every iteration after the
            # first; makedirs(exist_ok=True) is idempotent
            os.makedirs('gzip', exist_ok=True)
            subprocess.run(['cp', target, 'gzip/'])
            # recompress the staged copy so cuDF can read it as gzip
            subprocess.run(['gzip', 'gzip/' + target])
            print(target + '.gz')
            df_var = pd.read_csv('gzip/' + target + '.gz', sep='\\t', header=0)
        except Exception as exc:
            print('the male variant file does not exist:', exc)
            continue

        ########## GENERATE VARIANTS USING UKBB IMPUTED DATA ##################
        ## derive major-allele frequency from the minor-allele frequency
        df_var['minor_AF'] = pd.to_numeric(df_var['minor_AF'])
        df_var['major_AF'] = 1 - df_var[['minor_AF']]

        ## split 'chr:pos:ref:alt' variant id into parts
        df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True)
        df_var['chr'] = df_var['chr'].astype('str')
        df_var['pos'] = df_var['pos'].astype('int')

        ## drop low-confidence variants
        df_var_hc = df_var.loc[df_var['low_confidence_variant'] == False]
        df_var_all = df_var_hc.drop(['low_confidence_variant'], axis=1)

        ## determine the major allele on the host (boolean-mask column
        ## assignment pattern not available on device)
        df_var_all_host = df_var_all.to_pandas()
        df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['alt'], 'major_allele'] = df_var_all_host['ref']
        df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['ref'], 'major_allele'] = df_var_all_host['alt']
        df_var_all = pd.from_pandas(df_var_all_host)

        ## allele choices and their probabilities, row-aligned
        arr_var = df_var_all[['minor_allele', 'major_allele']].to_numpy()
        arr_prob = df_var_all[['minor_AF', 'major_AF']].to_numpy()
        n_patients = int(${params.n_pat} * ${params.percent_male})

        ## per patient: weighted random choice of allele per variant row,
        ## then keep only rows where the prediction differs from the reference
        for p in range(n_patients):
            df_var_all['prediction'] = arr_var[
                np.arange(arr_var.shape[0]),
                (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)]
            df_var_final = df_var_all.loc[df_var_all['prediction'] != df_var_all['ref']]
            df_var_final = df_var_final[['chr', 'pos', 'ref', 'prediction']]
            df_var_final.to_csv(target.split('.')[0] + '_patient_' + str(p) + '_variants.csv', index=False)
    """
}

// Female counterpart of generate_m_variants_cudf: remaining patients after the
// male share are sampled against the female stats file.
process generate_f_variants_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/variants", mode: 'copy'
    maxForks 1
    //errorStrategy 'ignore'
    echo true

    input:
    tuple path(dis_var_f), path(dis_var_m)

    output:
    path "*_variants.csv", emit: f_var

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf as pd
    import numpy as np
    import os
    import subprocess

    for target in '${dis_var_f}'.split(" "):
        try:
            # idempotent staging dir (bug fix: old 'mkdir' failed after iter 1)
            os.makedirs('gzip', exist_ok=True)
            subprocess.run(['cp', target, 'gzip/'])
            subprocess.run(['gzip', 'gzip/' + target])
            print(target + '.gz')
            df_var = pd.read_csv('gzip/' + target + '.gz', sep='\\t', header=0)
        except Exception as exc:
            print('the female variant file does not exist:', exc)
            continue

        ########## GENERATE VARIANTS USING UKBB IMPUTED DATA ##################
        ## derive major-allele frequency from the minor-allele frequency
        df_var['minor_AF'] = pd.to_numeric(df_var['minor_AF'])
        df_var['major_AF'] = 1 - df_var[['minor_AF']]

        ## split 'chr:pos:ref:alt' variant id into parts
        df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True)
        df_var['chr'] = df_var['chr'].astype('str')
        df_var['pos'] = df_var['pos'].astype('int')

        ## drop low-confidence variants
        df_var_hc = df_var.loc[df_var['low_confidence_variant'] == False]
        df_var_all = df_var_hc.drop(['low_confidence_variant'], axis=1)

        ## determine the major allele on the host
        df_var_all_host = df_var_all.to_pandas()
        df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['alt'], 'major_allele'] = df_var_all_host['ref']
        df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['ref'], 'major_allele'] = df_var_all_host['alt']
        df_var_all = pd.from_pandas(df_var_all_host)

        ## allele choices and their probabilities (consistent with the male
        ## process: direct column selection instead of a DataFrame round-trip)
        arr_var = df_var_all[['minor_allele', 'major_allele']].to_numpy()
        arr_prob = df_var_all[['minor_AF', 'major_AF']].to_numpy()
        n_patients = ${params.n_pat} - int(${params.n_pat} * ${params.percent_male})

        ## weighted random allele per variant, one output CSV per patient
        for p in range(n_patients):
            df_var_all['prediction'] = arr_var[
                np.arange(arr_var.shape[0]),
                (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)]
            df_var_final = df_var_all.loc[df_var_all['prediction'] != df_var_all['ref']]
            df_var_final = df_var_final[['chr', 'pos', 'ref', 'prediction']]
            df_var_final.to_csv(target.split('.')[0] + '_patient_' + str(p) + '_variants.csv', index=False)
    """
}

// Samples background (healthy) variants for male patients from the gnomAD
// male-site table, using AF_XY as the minor-allele frequency.
process generate_m_healthy_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    maxForks 1
    echo true

    input:
    tuple path(healthy_dir)

    output:
    path "*_variants.csv", emit: m_healthy

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf as pd
    import numpy as np

    df_var = pd.read_csv('${healthy_dir}/gnomad.genomes.v4.1.sites.male.txt')
    ## AF_XY is the male allele frequency; treat it as the minor AF
    df_var['minor_AF'] = pd.to_numeric(df_var['AF_XY'])
    df_var['major_AF'] = 1 - df_var[['minor_AF']]

    ## allele choices (ALT = minor, REF = major) and their probabilities
    arr_var = df_var[['ALT', 'REF']].to_numpy()
    arr_prob = df_var[['minor_AF', 'major_AF']].to_numpy()
    n_patients = int(${params.n_pat} * ${params.percent_male})

    for p in range(n_patients):
        ## weighted random allele per site
        df_var['prediction'] = arr_var[
            np.arange(arr_var.shape[0]),
            (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)]
        df_var_final = df_var.loc[df_var['prediction'] != df_var['REF']]
        df_var_final = df_var_final[['#CHROM', 'POS', 'REF', 'prediction']]
        ## randomly keep one mutation when a position carries several
        df_var_final = df_var_final.sample(frac=1).drop_duplicates(['#CHROM', 'POS']).sort_index()
        df_var_final.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref"}, inplace=True)
        df_var_final.to_csv('healthy_male_patient_' + str(p) + '_variants.csv', index=False)
    """
}

// Female counterpart: gnomAD female sites, AF_XX, remaining patient count.
process generate_f_healthy_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    maxForks 1
    echo true

    input:
    tuple path(healthy_dir)

    output:
    path "*_variants.csv", emit: f_healthy

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf as pd
    import numpy as np

    df_var = pd.read_csv('${healthy_dir}/gnomad.genomes.v4.1.sites.female.txt')
    ## AF_XX is the female allele frequency; treat it as the minor AF
    df_var['minor_AF'] = pd.to_numeric(df_var['AF_XX'])
    df_var['major_AF'] = 1 - df_var[['minor_AF']]

    ## allele choices (ALT = minor, REF = major) and their probabilities
    arr_var = df_var[['ALT', 'REF']].to_numpy()
    arr_prob = df_var[['minor_AF', 'major_AF']].to_numpy()
    n_patients = ${params.n_pat} - int(${params.n_pat} * ${params.percent_male})

    for p in range(n_patients):
        ## weighted random allele per site
        df_var['prediction'] = arr_var[
            np.arange(arr_var.shape[0]),
            (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)]
        df_var_final = df_var.loc[df_var['prediction'] != df_var['REF']]
        df_var_final = df_var_final[['#CHROM', 'POS', 'REF', 'prediction']]
        ## randomly keep one mutation when a position carries several
        df_var_final = df_var_final.sample(frac=1).drop_duplicates(['#CHROM', 'POS']).sort_index()
        df_var_final.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref"}, inplace=True)
        df_var_final.to_csv('healthy_female_patient_' + str(p) + '_variants.csv', index=False)
    """
}

// Converts one per-patient variant CSV (chr,pos,ref,prediction) into a sorted,
// hg19->hg38 lifted VCF using the project template and GATK.
process make_vcfs {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/variants_vcfs", mode: 'copy'
    echo true
    maxForks 1

    input:
    path patient_vars

    output:
    path "${patient_vars.baseName}.vcf", emit: patient_vcfs
    //path "${patient_vars.baseName}.vcf.gz", emit: patient_gz
    //path "${patient_vars.baseName}.vcf.gz.tbi", emit: patient_index

    script:
    """
    ## strip the CSV header; the old awk '\$16 == \$18' filter compared two
    ## empty fields on a 4-column CSV, so it matched every row and only the
    ## header was dropped -- tail does the same thing explicitly
    tail -n +2 ${patient_vars} > ${patient_vars.baseName}_filt.csv

    ## copy the VCF header template to the workdir
    cp /mnt/Avatar/dd/synthea/supporting-data/vcf/vcf_template.vcf .

    ## append the column header line and the variants in VCF format
    printf "#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tNA00001" >> vcf_template.vcf
    printf "\\n" >> vcf_template.vcf
    awk -F',' '{print "chr"\$1,\$2,".",\$3,\$4,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf

    ## sort karyotypically (rows are pre-sorted; this is a safety net)
    java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar SortVcf -I vcf_template.vcf -O ${patient_vars.baseName}_sorted.vcf

    ## lift over hg19 -> hg38
    java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar LiftoverVcf -I ${patient_vars.baseName}_sorted.vcf -O ${patient_vars.baseName}.vcf -C /mnt/Avatar/dd/synthea/supporting-data/ucsc-liftover/hg19ToHg38.over.chain.gz -REJECT rejected_variants.vcf -R /mnt/Avatar/dd/synthea/supporting-data/genome/hg38.fa

    #change this part to save on memory usage
    #bgzip < ${patient_vars.baseName}.vcf > ${patient_vars.baseName}.vcf.gz
    #gatk IndexFeatureFile -I ${patient_vars.baseName}.vcf.gz

    ## delete intermediates; -f so a missing .idx does not fail the task
    rm vcf_template.vcf ${patient_vars.baseName}_filt.csv ${patient_vars.baseName}_sorted.vcf rejected_variants.vcf
    rm -f *vcf.idx
    """
}