Initial commit: digital-patients pipeline (clean, no large files)
Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
This commit is contained in:
515
main_synthea.nf.bk.2
Normal file
515
main_synthea.nf.bk.2
Normal file
@@ -0,0 +1,515 @@
|
||||
// Opt in to Nextflow DSL2 (named processes wired together in workflow blocks).
nextflow.enable.dsl=2
process generate_patients {

    // Run Synthea inside its container to synthesise a patient cohort, then
    // stage the CSV exports back into the task work directory so Nextflow
    // can collect them.
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/patients", mode: 'copy'
    // echo true
    // errorStrategy 'ignore'

    output:
    // patients.csv + conditions.csv from the Synthea CSV exporter
    tuple path('*/patients.csv'), path('*/conditions.csv'), emit: patient_tuple
    // plus everything else Synthea wrote, staged as-is
    path "*"

    script:
    """
    ## remember the task work dir before changing into the Synthea install dir
    task_dir=\$PWD
    cd /synthea
    ./run_synthea -p ${params.n_pat} -a ${params.age} ${params.state} ${params.city} --exporter.csv.export true

    ## copy every exported file back so the output globs above can match
    cd /synthea/output
    cp -r * \$task_dir
    """
}
|
||||
|
||||
process get_disease_stats_no_patients {

    // For each disease named in params.disease, fuzzy-match it to a UKBB
    // phenotype and stage sex-specific GWAS summary-stat tables as
    // <phenotype_code>_male.tsv and <phenotype_code>_female.tsv,
    // falling back to both_sexes / the other sex when a file is missing.
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    echo true

    output:
    tuple path("*_female.tsv"), path("*_male.tsv")

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import os.path
    import subprocess

    import pandas as pd
    from fuzzywuzzy import process

    STORE = '${params.imputed_store}'

    def stage_stats(ukbb_id, preference, out_suffix):
        '''Copy + decompress the first available summary-stat file.

        preference is an ordered list of UKBB file stems to try (e.g.
        ['male', 'both_sexes', 'female']); the first one that exists is
        decompressed and renamed to <ukbb_id><out_suffix>.
        Returns True on success, False when nothing could be staged.
        '''
        for stem in preference:
            src = STORE + '/' + ukbb_id + '.gwas.imputed_v3.' + stem + '.tsv.bgz'
            if not os.path.isfile(src):
                continue
            local = os.path.basename(src)
            try:
                subprocess.run(['cp', src, '.'], check=True)
                # bgzip -d takes only the file to decompress; the stray '.'
                # argument passed previously was an error.
                subprocess.run(['bgzip', '-d', local], check=True)
                subprocess.run(['mv',
                                ukbb_id + '.gwas.imputed_v3.' + stem + '.tsv',
                                ukbb_id + out_suffix], check=True)
            except (OSError, subprocess.CalledProcessError) as exc:
                print('failed to stage ' + src + ': ' + str(exc))
                return False
            return True
        print('imputed variants not present')
        return False

    # phenotype metadata: maps free-text disease names to UKBB phenotype codes
    df_meta = pd.read_csv('${params.pheno_store}', sep=',', header=None,
                          names=['phenotype_code', 'phenotype_description',
                                 'sex', 'file'],
                          engine='python')

    # params.disease may arrive as a bracketed, comma-separated list
    targets = '${params.disease}'.replace('[', '').replace(']', '').replace(' ', '').split(',')
    for target in targets:
        # fuzzy-match the requested disease against the UKBB descriptions
        ukbb_cond = process.extractOne(target, df_meta['phenotype_description'],
                                       score_cutoff=80)
        if ukbb_cond is None:
            # extractOne returns None below the cutoff; previously this
            # crashed inside a bare except
            print('no phenotype match for ' + target)
            continue
        ukbb_id = df_meta.at[ukbb_cond[2], 'phenotype_code']

        # male stats, falling back to both_sexes then female if missing
        stage_stats(ukbb_id, ['male', 'both_sexes', 'female'], '_male.tsv')
        # female stats, falling back to both_sexes then male if missing
        stage_stats(ukbb_id, ['female', 'both_sexes', 'male'], '_female.tsv')
    """
}
|
||||
|
||||
|
||||
process get_disease_stats_cudf {

    // Pass the patient/condition CSVs through while staging sex-specific
    // UKBB GWAS summary stats for params.disease (see
    // get_disease_stats_no_patients for the fallback order).
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    echo true

    input:
    tuple path(pat_csv), path(cond_csv)

    output:
    tuple path(pat_csv), path(cond_csv), path("*_female.tsv"), path("*_male.tsv")

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import os.path
    import subprocess

    import cudf
    from fuzzywuzzy import process

    STORE = '${params.imputed_store}'

    def stage_stats(ukbb_id, preference, out_suffix):
        '''Copy + decompress the first available summary-stat file.

        preference is an ordered list of UKBB file stems to try; the first
        one that exists is decompressed and renamed to <ukbb_id><out_suffix>.
        '''
        for stem in preference:
            src = STORE + '/' + ukbb_id + '.gwas.imputed_v3.' + stem + '.tsv.bgz'
            if not os.path.isfile(src):
                continue
            local = os.path.basename(src)
            try:
                subprocess.run(['cp', src, '.'], check=True)
                # bgzip -d takes only the file to decompress; the stray '.'
                # argument passed previously was an error.
                subprocess.run(['bgzip', '-d', local], check=True)
                subprocess.run(['mv',
                                ukbb_id + '.gwas.imputed_v3.' + stem + '.tsv',
                                ukbb_id + out_suffix], check=True)
            except (OSError, subprocess.CalledProcessError) as exc:
                print('failed to stage ' + src + ': ' + str(exc))
                return False
            return True
        print('imputed variants not present')
        return False

    ################## IMPORT SYNTHETIC PATIENTS ##############################
    # Read both staged CSVs so a malformed input fails here rather than
    # downstream; the frames themselves are only passed through.
    df_cond = cudf.read_csv('${cond_csv}')
    df_pat = cudf.read_csv('${pat_csv}')

    # phenotype metadata: maps free-text disease names to UKBB phenotype codes
    df_meta = cudf.read_csv('${params.pheno_store}', sep=',', header=None,
                            names=['phenotype_code', 'phenotype_description',
                                   'sex', 'file'])
    print(df_meta)

    # fuzzywuzzy needs host (pandas) data, and label lookup is simplest there
    meta_host = df_meta.to_pandas()

    # lookup ukbb condition from metadata using fuzzy matching
    ukbb_cond = process.extractOne('${params.disease}',
                                   meta_host['phenotype_description'],
                                   score_cutoff=80)
    if not ukbb_cond:
        raise ValueError(f"No match found for disease '${params.disease}' with score >= 80")
    print(ukbb_cond)
    ukbb_id = meta_host.at[ukbb_cond[2], 'phenotype_code']

    # male stats, falling back to both_sexes then female if missing
    stage_stats(ukbb_id, ['male', 'both_sexes', 'female'], '_male.tsv')
    # female stats, falling back to both_sexes then male if missing
    stage_stats(ukbb_id, ['female', 'both_sexes', 'male'], '_female.tsv')
    """
}
|
||||
|
||||
|
||||
process generate_m_variants_cudf {

    // Simulate per-patient disease variants for the MALE fraction of the
    // cohort from UKBB summary stats: one weighted allele draw per variant
    // per patient, keeping only non-reference calls.
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    echo true
    maxForks 1

    input:
    tuple path(dis_var_f), path(dis_var_m)
    output:
    path "*_variants.csv", emit: m_var

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import os
    import subprocess

    import cudf
    import numpy as np

    def process_variants(target_file):
        '''Write <stem>_patient_<p>_variants.csv files sampled from target_file.'''
        # gzip a copy so cudf can read the table as a compressed file
        os.makedirs('gzip', exist_ok=True)
        subprocess.run(['cp', target_file, 'gzip/'], check=True)
        subprocess.run(['gzip', 'gzip/' + target_file], check=True)
        print(f"Processing {target_file}.gz")

        df_var = cudf.read_csv('gzip/' + target_file + '.gz', sep='\\t', header=0)

        ################## GENERATE VARIANTS USING UKBB IMPUTED DATA ##########
        ## major allele frequency is the complement of the minor AF
        ## (use the Series, not a one-column DataFrame, for the assignment)
        df_var['minor_AF'] = cudf.to_numeric(df_var['minor_AF'])
        df_var['major_AF'] = 1 - df_var['minor_AF']

        ## the variant id encodes chr:pos:ref:alt
        df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True)
        df_var['chr'] = df_var['chr'].astype('str')
        df_var['pos'] = df_var['pos'].astype('int')

        ## filter out low confidence variants
        df_var_hc = df_var.loc[df_var['low_confidence_variant'] == False]
        df_var_all = df_var_hc.drop(['low_confidence_variant'], axis=1)

        ## determine the major allele: boolean-mask row assignment is done on
        ## a host (pandas) copy, then moved back to device memory
        host = df_var_all.to_pandas()
        host.loc[host['minor_allele'] == host['alt'], 'major_allele'] = host['ref']
        host.loc[host['minor_allele'] == host['ref'], 'major_allele'] = host['alt']
        df_var_all = cudf.from_pandas(host)

        ## allele choices and their sampling probabilities, row-aligned
        arr_var = df_var_all[['minor_allele', 'major_allele']].to_numpy()
        arr_prob = df_var_all[['minor_AF', 'major_AF']].to_numpy()

        n_patients = int(${params.n_pat} * ${params.percent_male})

        ## weighted random choice per row: first column of the cumulative-AF
        ## matrix the uniform draw falls below picks the allele
        for p in range(n_patients):
            draws = (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)
            df_var_all['prediction'] = arr_var[np.arange(arr_var.shape[0]), draws]
            ## reference calls are already covered by the reference genome
            df_var_final = df_var_all.loc[df_var_all['prediction'] != df_var_all['ref']]
            df_var_final = df_var_final[['chr', 'pos', 'ref', 'prediction']]
            ## randomly keep one mutation when a position carries several
            df_var_final = df_var_final.sample(frac=1).drop_duplicates(['chr', 'pos']).sort_index()
            output_file = f"{target_file.split('.')[0]}_patient_{p}_variants.csv"
            df_var_final.to_csv(output_file, index=False)

    # Main execution: only the MALE stats file(s) are processed here
    for target in '${dis_var_m}'.split():
        if not os.path.exists(target):
            raise FileNotFoundError(f"Input file {target} does not exist")
        process_variants(target)
    """
}
|
||||
|
||||
|
||||
process generate_f_variants_cudf {

    // Simulate per-patient disease variants for the FEMALE fraction of the
    // cohort from UKBB summary stats: one weighted allele draw per variant
    // per patient, keeping only non-reference calls.
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    maxForks 1
    echo true

    input:
    tuple path(dis_var_f), path(dis_var_m)
    output:
    path "*_variants.csv", emit: f_var

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import os
    import subprocess

    import cudf
    import numpy as np

    def process_variants(target_file):
        '''Write <stem>_patient_<p>_variants.csv files sampled from target_file.'''
        # gzip a copy so cudf can read the table as a compressed file
        os.makedirs('gzip', exist_ok=True)
        subprocess.run(['cp', target_file, 'gzip/'], check=True)
        subprocess.run(['gzip', 'gzip/' + target_file], check=True)
        print(f"Processing {target_file}.gz")

        df_var = cudf.read_csv('gzip/' + target_file + '.gz', sep='\\t', header=0)

        ################## GENERATE VARIANTS USING UKBB IMPUTED DATA ##########
        ## major allele frequency is the complement of the minor AF
        ## (use the Series, not a one-column DataFrame, for the assignment)
        df_var['minor_AF'] = cudf.to_numeric(df_var['minor_AF'])
        df_var['major_AF'] = 1 - df_var['minor_AF']

        ## the variant id encodes chr:pos:ref:alt
        df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True)
        df_var['chr'] = df_var['chr'].astype('str')
        df_var['pos'] = df_var['pos'].astype('int')

        ## filter out low confidence variants
        df_var_hc = df_var.loc[df_var['low_confidence_variant'] == False]
        df_var_all = df_var_hc.drop(['low_confidence_variant'], axis=1)

        ## determine the major allele: boolean-mask row assignment is done on
        ## a host (pandas) copy, then moved back to device memory
        host = df_var_all.to_pandas()
        host.loc[host['minor_allele'] == host['alt'], 'major_allele'] = host['ref']
        host.loc[host['minor_allele'] == host['ref'], 'major_allele'] = host['alt']
        df_var_all = cudf.from_pandas(host)

        ## allele choices and their sampling probabilities, row-aligned
        ## (direct column selection, consistent with the male process)
        arr_var = df_var_all[['minor_allele', 'major_allele']].to_numpy()
        arr_prob = df_var_all[['minor_AF', 'major_AF']].to_numpy()

        ## female patients = total minus the male fraction
        n_patients = ${params.n_pat} - int(${params.n_pat} * ${params.percent_male})

        ## weighted random choice per row: first column of the cumulative-AF
        ## matrix the uniform draw falls below picks the allele
        for p in range(n_patients):
            draws = (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)
            df_var_all['prediction'] = arr_var[np.arange(arr_var.shape[0]), draws]
            ## reference calls are already covered by the reference genome
            df_var_final = df_var_all.loc[df_var_all['prediction'] != df_var_all['ref']]
            df_var_final = df_var_final[['chr', 'pos', 'ref', 'prediction']]
            ## randomly keep one mutation when a position carries several
            df_var_final = df_var_final.sample(frac=1).drop_duplicates(['chr', 'pos']).sort_index()
            output_file = f"{target_file.split('.')[0]}_patient_{p}_variants.csv"
            df_var_final.to_csv(output_file, index=False)

    # Main execution: only the FEMALE stats file(s) are processed here
    for target in '${dis_var_f}'.split():
        if not os.path.exists(target):
            raise FileNotFoundError(f"Input file {target} does not exist")
        process_variants(target)
    """
}
|
||||
|
||||
process generate_m_healthy_cudf {

    // Simulate background ("healthy") variants for male patients by drawing
    // alleles from gnomAD v4.1 XY allele frequencies.
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    maxForks 1
    echo true

    input:
    tuple path(healthy_dir)

    output:
    path "*_variants.csv", emit: m_healthy

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf
    import numpy as np

    # gnomAD site table; expected columns include #CHROM, POS, REF, ALT, AF_XY
    # (assumed from usage below - confirm against the staged file)
    df_var = cudf.read_csv('${healthy_dir}/gnomad.genomes.v4.1.sites.male.txt')

    ## alternate-allele frequency for XY samples; major AF is its complement
    ## (use the Series, not a one-column DataFrame, for the assignment)
    df_var['minor_AF'] = cudf.to_numeric(df_var['AF_XY'])
    df_var['major_AF'] = 1 - df_var['minor_AF']

    ## allele choices and their sampling probabilities, row-aligned
    ## (direct column selection, consistent with the disease processes)
    arr_var = df_var[['ALT', 'REF']].to_numpy()
    arr_prob = df_var[['minor_AF', 'major_AF']].to_numpy()

    n_patients = int(${params.n_pat} * ${params.percent_male})
    ## weighted random choice per row, one draw per variant per patient
    for p in range(n_patients):
        draws = (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)
        df_var['prediction'] = arr_var[np.arange(arr_var.shape[0]), draws]
        ## reference calls are already covered by the reference genome
        df_var_final = df_var.loc[df_var['prediction'] != df_var['REF']]
        df_var_final = df_var_final[['#CHROM', 'POS', 'REF', 'prediction']]
        ## randomly keep one mutation when a position carries several
        df_var_final = df_var_final.sample(frac=1).drop_duplicates(['#CHROM', 'POS']).sort_index()
        df_var_final.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref"}, inplace=True)
        ## strip the 'chr' prefix; round-trips through pandas on the host
        df_var_final['chr'] = [i.split('chr')[-1] for i in df_var_final.to_pandas()['chr']]
        df_var_final.to_csv('healthy_male_patient_' + str(p) + '_variants.csv', index=False)
    """
}
|
||||
|
||||
process generate_f_healthy_cudf {

    // Simulate background ("healthy") variants for female patients by drawing
    // alleles from gnomAD v4.1 XX allele frequencies.
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    maxForks 1
    echo true

    input:
    tuple path(healthy_dir)

    output:
    path "*_variants.csv", emit: f_healthy

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf
    import numpy as np

    # gnomAD site table; expected columns include #CHROM, POS, REF, ALT, AF_XX
    # (assumed from usage below - confirm against the staged file)
    df_var = cudf.read_csv('${healthy_dir}/gnomad.genomes.v4.1.sites.female.txt')

    ## alternate-allele frequency for XX samples; major AF is its complement
    ## (use the Series, not a one-column DataFrame, for the assignment)
    df_var['minor_AF'] = cudf.to_numeric(df_var['AF_XX'])
    df_var['major_AF'] = 1 - df_var['minor_AF']

    ## allele choices and their sampling probabilities, row-aligned
    ## (direct column selection, consistent with the disease processes)
    arr_var = df_var[['ALT', 'REF']].to_numpy()
    arr_prob = df_var[['minor_AF', 'major_AF']].to_numpy()

    ## female patients = total minus the male fraction
    n_patients = ${params.n_pat} - int(${params.n_pat} * ${params.percent_male})
    ## weighted random choice per row, one draw per variant per patient
    for p in range(n_patients):
        draws = (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)
        df_var['prediction'] = arr_var[np.arange(arr_var.shape[0]), draws]
        ## reference calls are already covered by the reference genome
        df_var_final = df_var.loc[df_var['prediction'] != df_var['REF']]
        df_var_final = df_var_final[['#CHROM', 'POS', 'REF', 'prediction']]
        ## randomly keep one mutation when a position carries several
        df_var_final = df_var_final.sample(frac=1).drop_duplicates(['#CHROM', 'POS']).sort_index()
        df_var_final.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref"}, inplace=True)
        ## strip the 'chr' prefix; round-trips through pandas on the host
        df_var_final['chr'] = [i.split('chr')[-1] for i in df_var_final.to_pandas()['chr']]
        df_var_final.to_csv('healthy_female_patient_' + str(p) + '_variants.csv', index=False)
    """
}
|
||||
|
||||
process make_vcfs {

    // Convert one per-patient variant CSV into a sorted, hg38-lifted VCF.
    // Pipeline: awk filter -> VCF template -> gatk SortVcf -> gatk LiftoverVcf.
    // NOTE(review): the awk \$16 == \$18 filter is flagged below by the authors
    // as likely stale after a CSV format change - verify against the current
    // 4-column (chr,pos,ref,prediction) CSVs before trusting the filter.
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/variants_vcfs", mode: 'copy'
    echo true
    maxForks 1

    input:
    path patient_vars

    output:
    path "${patient_vars.baseName}.vcf", emit: patient_vcfs
    //path "${patient_vars.baseName}.vcf.gz", emit: patient_gz
    //path "${patient_vars.baseName}.vcf.gz.tbi", emit: patient_index
    //path "${patient_vars.baseName}_filt.csv", emit: filtered_csv

    script:
    """
    ## only print disease-associated variants not found in the ref/major allele position as they will already be covered by the ref genome
    #I don't think this is correct \$16 == \$18 cuz we changed csv file format
    #it only removes first line from csv
    ## (on a 4-column CSV, \$16 and \$18 are both empty so every row passes;
    ##  the net effect today is just the header strip from tail -n +2)
    awk '\$16 == \$18 { print \$0 }' ${patient_vars} | tail -n +2 > ${patient_vars.baseName}_filt.csv

    ## copy the variant template to the workdir
    ## (template lives on the /mnt bind-mount - see containerOptions above)
    cp /mnt/Avatar/dd/synthea/supporting-data/vcf/vcf_template.vcf .

    ## print the filtered variants to vcf format
    ## first append the VCF column header, then one record per CSV row with
    ## fixed QUAL=100, FILTER=PASS and a heterozygous 0/1 genotype
    printf "#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tNA00001" >> vcf_template.vcf
    printf "\\n" >> vcf_template.vcf
    awk -F',' '{print "chr"\$1,\$2,".",\$3,\$4,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf
    #awk -F',' '{print "chr"\$13,\$14,".",\$15,\$16,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf
    #awk -F',' '{print \$13,\$14,".",\$15,\$16,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf

    # sort the genome karyotypically (in chromosomal order) and index
    #files are sorted already just for security
    java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar SortVcf -I vcf_template.vcf -O ${patient_vars.baseName}_sorted.vcf
    #Liftover form 19 to 38 standard
    ## lift hg19 coordinates to hg38; variants that fail liftover go to
    ## rejected_variants.vcf (deleted below, so failures are silent)
    java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar LiftoverVcf -I ${patient_vars.baseName}_sorted.vcf -O ${patient_vars.baseName}.vcf -C /mnt/Avatar/dd/synthea/supporting-data/ucsc-liftover/hg19ToHg38.over.chain.gz -REJECT rejected_variants.vcf -R /mnt/Avatar/dd/synthea/supporting-data/genome/hg38.fa

    #change this part to save on memory usage
    #bgzip < ${patient_vars.baseName}.vcf > ${patient_vars.baseName}.vcf.gz
    #gatk IndexFeatureFile -I ${patient_vars.baseName}.vcf.gz
    #delete unnecessary files
    ## only the lifted VCF declared in output: is kept
    rm vcf_template.vcf
    rm ${patient_vars.baseName}_filt.csv
    rm ${patient_vars.baseName}_sorted.vcf
    rm rejected_variants.vcf
    rm *vcf.idx
    """
}
|
||||
Reference in New Issue
Block a user