Initial commit: digital-patients pipeline (clean, no large files)
Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
This commit is contained in:
515
main_synthea.nf.bk.2
Normal file
515
main_synthea.nf.bk.2
Normal file
@@ -0,0 +1,515 @@
|
||||
// Opt in to Nextflow DSL2 (named processes wired together in workflow blocks).
nextflow.enable.dsl=2
process generate_patients {

    // Run Synthea inside its container to synthesise a patient cohort, then
    // stage the CSV exports back into the task work directory so Nextflow
    // can collect them.
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/patients", mode: 'copy'
    // echo true
    // errorStrategy 'ignore'

    output:
    // patients.csv + conditions.csv from the Synthea CSV exporter
    tuple path('*/patients.csv'), path('*/conditions.csv'), emit: patient_tuple
    // plus everything else Synthea wrote, staged as-is
    path "*"

    script:
    """
    ## remember the task work dir before changing into the Synthea install dir
    task_dir=\$PWD
    cd /synthea
    ./run_synthea -p ${params.n_pat} -a ${params.age} ${params.state} ${params.city} --exporter.csv.export true

    ## copy every exported file back so the output globs above can match
    cd /synthea/output
    cp -r * \$task_dir
    """
}
|
||||
|
||||
process get_disease_stats_no_patients {

    // For each disease named in params.disease, fuzzy-match it to a UKBB
    // phenotype and stage sex-specific GWAS summary-stat tables as
    // <phenotype_code>_male.tsv and <phenotype_code>_female.tsv,
    // falling back to both_sexes / the other sex when a file is missing.
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    echo true

    output:
    tuple path("*_female.tsv"), path("*_male.tsv")

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import os.path
    import subprocess

    import pandas as pd
    from fuzzywuzzy import process

    STORE = '${params.imputed_store}'

    def stage_stats(ukbb_id, preference, out_suffix):
        '''Copy + decompress the first available summary-stat file.

        preference is an ordered list of UKBB file stems to try (e.g.
        ['male', 'both_sexes', 'female']); the first one that exists is
        decompressed and renamed to <ukbb_id><out_suffix>.
        Returns True on success, False when nothing could be staged.
        '''
        for stem in preference:
            src = STORE + '/' + ukbb_id + '.gwas.imputed_v3.' + stem + '.tsv.bgz'
            if not os.path.isfile(src):
                continue
            local = os.path.basename(src)
            try:
                subprocess.run(['cp', src, '.'], check=True)
                # bgzip -d takes only the file to decompress; the stray '.'
                # argument passed previously was an error.
                subprocess.run(['bgzip', '-d', local], check=True)
                subprocess.run(['mv',
                                ukbb_id + '.gwas.imputed_v3.' + stem + '.tsv',
                                ukbb_id + out_suffix], check=True)
            except (OSError, subprocess.CalledProcessError) as exc:
                print('failed to stage ' + src + ': ' + str(exc))
                return False
            return True
        print('imputed variants not present')
        return False

    # phenotype metadata: maps free-text disease names to UKBB phenotype codes
    df_meta = pd.read_csv('${params.pheno_store}', sep=',', header=None,
                          names=['phenotype_code', 'phenotype_description',
                                 'sex', 'file'],
                          engine='python')

    # params.disease may arrive as a bracketed, comma-separated list
    targets = '${params.disease}'.replace('[', '').replace(']', '').replace(' ', '').split(',')
    for target in targets:
        # fuzzy-match the requested disease against the UKBB descriptions
        ukbb_cond = process.extractOne(target, df_meta['phenotype_description'],
                                       score_cutoff=80)
        if ukbb_cond is None:
            # extractOne returns None below the cutoff; previously this
            # crashed inside a bare except
            print('no phenotype match for ' + target)
            continue
        ukbb_id = df_meta.at[ukbb_cond[2], 'phenotype_code']

        # male stats, falling back to both_sexes then female if missing
        stage_stats(ukbb_id, ['male', 'both_sexes', 'female'], '_male.tsv')
        # female stats, falling back to both_sexes then male if missing
        stage_stats(ukbb_id, ['female', 'both_sexes', 'male'], '_female.tsv')
    """
}
|
||||
|
||||
|
||||
process get_disease_stats_cudf {

    // Pass the patient/condition CSVs through while staging sex-specific
    // UKBB GWAS summary stats for params.disease (see
    // get_disease_stats_no_patients for the fallback order).
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    echo true

    input:
    tuple path(pat_csv), path(cond_csv)

    output:
    tuple path(pat_csv), path(cond_csv), path("*_female.tsv"), path("*_male.tsv")

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import os.path
    import subprocess

    import cudf
    from fuzzywuzzy import process

    STORE = '${params.imputed_store}'

    def stage_stats(ukbb_id, preference, out_suffix):
        '''Copy + decompress the first available summary-stat file.

        preference is an ordered list of UKBB file stems to try; the first
        one that exists is decompressed and renamed to <ukbb_id><out_suffix>.
        '''
        for stem in preference:
            src = STORE + '/' + ukbb_id + '.gwas.imputed_v3.' + stem + '.tsv.bgz'
            if not os.path.isfile(src):
                continue
            local = os.path.basename(src)
            try:
                subprocess.run(['cp', src, '.'], check=True)
                # bgzip -d takes only the file to decompress; the stray '.'
                # argument passed previously was an error.
                subprocess.run(['bgzip', '-d', local], check=True)
                subprocess.run(['mv',
                                ukbb_id + '.gwas.imputed_v3.' + stem + '.tsv',
                                ukbb_id + out_suffix], check=True)
            except (OSError, subprocess.CalledProcessError) as exc:
                print('failed to stage ' + src + ': ' + str(exc))
                return False
            return True
        print('imputed variants not present')
        return False

    ################## IMPORT SYNTHETIC PATIENTS ##############################
    # Read both staged CSVs so a malformed input fails here rather than
    # downstream; the frames themselves are only passed through.
    df_cond = cudf.read_csv('${cond_csv}')
    df_pat = cudf.read_csv('${pat_csv}')

    # phenotype metadata: maps free-text disease names to UKBB phenotype codes
    df_meta = cudf.read_csv('${params.pheno_store}', sep=',', header=None,
                            names=['phenotype_code', 'phenotype_description',
                                   'sex', 'file'])
    print(df_meta)

    # fuzzywuzzy needs host (pandas) data, and label lookup is simplest there
    meta_host = df_meta.to_pandas()

    # lookup ukbb condition from metadata using fuzzy matching
    ukbb_cond = process.extractOne('${params.disease}',
                                   meta_host['phenotype_description'],
                                   score_cutoff=80)
    if not ukbb_cond:
        raise ValueError(f"No match found for disease '${params.disease}' with score >= 80")
    print(ukbb_cond)
    ukbb_id = meta_host.at[ukbb_cond[2], 'phenotype_code']

    # male stats, falling back to both_sexes then female if missing
    stage_stats(ukbb_id, ['male', 'both_sexes', 'female'], '_male.tsv')
    # female stats, falling back to both_sexes then male if missing
    stage_stats(ukbb_id, ['female', 'both_sexes', 'male'], '_female.tsv')
    """
}
|
||||
|
||||
|
||||
process generate_m_variants_cudf {

    // Simulate per-patient disease variants for the MALE fraction of the
    // cohort from UKBB summary stats: one weighted allele draw per variant
    // per patient, keeping only non-reference calls.
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    echo true
    maxForks 1

    input:
    tuple path(dis_var_f), path(dis_var_m)
    output:
    path "*_variants.csv", emit: m_var

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import os
    import subprocess

    import cudf
    import numpy as np

    def process_variants(target_file):
        '''Write <stem>_patient_<p>_variants.csv files sampled from target_file.'''
        # gzip a copy so cudf can read the table as a compressed file
        os.makedirs('gzip', exist_ok=True)
        subprocess.run(['cp', target_file, 'gzip/'], check=True)
        subprocess.run(['gzip', 'gzip/' + target_file], check=True)
        print(f"Processing {target_file}.gz")

        df_var = cudf.read_csv('gzip/' + target_file + '.gz', sep='\\t', header=0)

        ################## GENERATE VARIANTS USING UKBB IMPUTED DATA ##########
        ## major allele frequency is the complement of the minor AF
        ## (use the Series, not a one-column DataFrame, for the assignment)
        df_var['minor_AF'] = cudf.to_numeric(df_var['minor_AF'])
        df_var['major_AF'] = 1 - df_var['minor_AF']

        ## the variant id encodes chr:pos:ref:alt
        df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True)
        df_var['chr'] = df_var['chr'].astype('str')
        df_var['pos'] = df_var['pos'].astype('int')

        ## filter out low confidence variants
        df_var_hc = df_var.loc[df_var['low_confidence_variant'] == False]
        df_var_all = df_var_hc.drop(['low_confidence_variant'], axis=1)

        ## determine the major allele: boolean-mask row assignment is done on
        ## a host (pandas) copy, then moved back to device memory
        host = df_var_all.to_pandas()
        host.loc[host['minor_allele'] == host['alt'], 'major_allele'] = host['ref']
        host.loc[host['minor_allele'] == host['ref'], 'major_allele'] = host['alt']
        df_var_all = cudf.from_pandas(host)

        ## allele choices and their sampling probabilities, row-aligned
        arr_var = df_var_all[['minor_allele', 'major_allele']].to_numpy()
        arr_prob = df_var_all[['minor_AF', 'major_AF']].to_numpy()

        n_patients = int(${params.n_pat} * ${params.percent_male})

        ## weighted random choice per row: first column of the cumulative-AF
        ## matrix the uniform draw falls below picks the allele
        for p in range(n_patients):
            draws = (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)
            df_var_all['prediction'] = arr_var[np.arange(arr_var.shape[0]), draws]
            ## reference calls are already covered by the reference genome
            df_var_final = df_var_all.loc[df_var_all['prediction'] != df_var_all['ref']]
            df_var_final = df_var_final[['chr', 'pos', 'ref', 'prediction']]
            ## randomly keep one mutation when a position carries several
            df_var_final = df_var_final.sample(frac=1).drop_duplicates(['chr', 'pos']).sort_index()
            output_file = f"{target_file.split('.')[0]}_patient_{p}_variants.csv"
            df_var_final.to_csv(output_file, index=False)

    # Main execution: only the MALE stats file(s) are processed here
    for target in '${dis_var_m}'.split():
        if not os.path.exists(target):
            raise FileNotFoundError(f"Input file {target} does not exist")
        process_variants(target)
    """
}
|
||||
|
||||
|
||||
process generate_f_variants_cudf {

    // Simulate per-patient disease variants for the FEMALE fraction of the
    // cohort from UKBB summary stats: one weighted allele draw per variant
    // per patient, keeping only non-reference calls.
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    maxForks 1
    echo true

    input:
    tuple path(dis_var_f), path(dis_var_m)
    output:
    path "*_variants.csv", emit: f_var

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import os
    import subprocess

    import cudf
    import numpy as np

    def process_variants(target_file):
        '''Write <stem>_patient_<p>_variants.csv files sampled from target_file.'''
        # gzip a copy so cudf can read the table as a compressed file
        os.makedirs('gzip', exist_ok=True)
        subprocess.run(['cp', target_file, 'gzip/'], check=True)
        subprocess.run(['gzip', 'gzip/' + target_file], check=True)
        print(f"Processing {target_file}.gz")

        df_var = cudf.read_csv('gzip/' + target_file + '.gz', sep='\\t', header=0)

        ################## GENERATE VARIANTS USING UKBB IMPUTED DATA ##########
        ## major allele frequency is the complement of the minor AF
        ## (use the Series, not a one-column DataFrame, for the assignment)
        df_var['minor_AF'] = cudf.to_numeric(df_var['minor_AF'])
        df_var['major_AF'] = 1 - df_var['minor_AF']

        ## the variant id encodes chr:pos:ref:alt
        df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True)
        df_var['chr'] = df_var['chr'].astype('str')
        df_var['pos'] = df_var['pos'].astype('int')

        ## filter out low confidence variants
        df_var_hc = df_var.loc[df_var['low_confidence_variant'] == False]
        df_var_all = df_var_hc.drop(['low_confidence_variant'], axis=1)

        ## determine the major allele: boolean-mask row assignment is done on
        ## a host (pandas) copy, then moved back to device memory
        host = df_var_all.to_pandas()
        host.loc[host['minor_allele'] == host['alt'], 'major_allele'] = host['ref']
        host.loc[host['minor_allele'] == host['ref'], 'major_allele'] = host['alt']
        df_var_all = cudf.from_pandas(host)

        ## allele choices and their sampling probabilities, row-aligned
        ## (direct column selection, consistent with the male process)
        arr_var = df_var_all[['minor_allele', 'major_allele']].to_numpy()
        arr_prob = df_var_all[['minor_AF', 'major_AF']].to_numpy()

        ## female patients = total minus the male fraction
        n_patients = ${params.n_pat} - int(${params.n_pat} * ${params.percent_male})

        ## weighted random choice per row: first column of the cumulative-AF
        ## matrix the uniform draw falls below picks the allele
        for p in range(n_patients):
            draws = (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)
            df_var_all['prediction'] = arr_var[np.arange(arr_var.shape[0]), draws]
            ## reference calls are already covered by the reference genome
            df_var_final = df_var_all.loc[df_var_all['prediction'] != df_var_all['ref']]
            df_var_final = df_var_final[['chr', 'pos', 'ref', 'prediction']]
            ## randomly keep one mutation when a position carries several
            df_var_final = df_var_final.sample(frac=1).drop_duplicates(['chr', 'pos']).sort_index()
            output_file = f"{target_file.split('.')[0]}_patient_{p}_variants.csv"
            df_var_final.to_csv(output_file, index=False)

    # Main execution: only the FEMALE stats file(s) are processed here
    for target in '${dis_var_f}'.split():
        if not os.path.exists(target):
            raise FileNotFoundError(f"Input file {target} does not exist")
        process_variants(target)
    """
}
|
||||
|
||||
process generate_m_healthy_cudf {

    // Simulate background ("healthy") variants for male patients by drawing
    // alleles from gnomAD v4.1 XY allele frequencies.
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    maxForks 1
    echo true

    input:
    tuple path(healthy_dir)

    output:
    path "*_variants.csv", emit: m_healthy

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf
    import numpy as np

    # gnomAD site table; expected columns include #CHROM, POS, REF, ALT, AF_XY
    # (assumed from usage below - confirm against the staged file)
    df_var = cudf.read_csv('${healthy_dir}/gnomad.genomes.v4.1.sites.male.txt')

    ## alternate-allele frequency for XY samples; major AF is its complement
    ## (use the Series, not a one-column DataFrame, for the assignment)
    df_var['minor_AF'] = cudf.to_numeric(df_var['AF_XY'])
    df_var['major_AF'] = 1 - df_var['minor_AF']

    ## allele choices and their sampling probabilities, row-aligned
    ## (direct column selection, consistent with the disease processes)
    arr_var = df_var[['ALT', 'REF']].to_numpy()
    arr_prob = df_var[['minor_AF', 'major_AF']].to_numpy()

    n_patients = int(${params.n_pat} * ${params.percent_male})
    ## weighted random choice per row, one draw per variant per patient
    for p in range(n_patients):
        draws = (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)
        df_var['prediction'] = arr_var[np.arange(arr_var.shape[0]), draws]
        ## reference calls are already covered by the reference genome
        df_var_final = df_var.loc[df_var['prediction'] != df_var['REF']]
        df_var_final = df_var_final[['#CHROM', 'POS', 'REF', 'prediction']]
        ## randomly keep one mutation when a position carries several
        df_var_final = df_var_final.sample(frac=1).drop_duplicates(['#CHROM', 'POS']).sort_index()
        df_var_final.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref"}, inplace=True)
        ## strip the 'chr' prefix; round-trips through pandas on the host
        df_var_final['chr'] = [i.split('chr')[-1] for i in df_var_final.to_pandas()['chr']]
        df_var_final.to_csv('healthy_male_patient_' + str(p) + '_variants.csv', index=False)
    """
}
|
||||
|
||||
process generate_f_healthy_cudf {

    // Simulate background ("healthy") variants for female patients by drawing
    // alleles from gnomAD v4.1 XX allele frequencies.
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    maxForks 1
    echo true

    input:
    tuple path(healthy_dir)

    output:
    path "*_variants.csv", emit: f_healthy

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf
    import numpy as np

    # gnomAD site table; expected columns include #CHROM, POS, REF, ALT, AF_XX
    # (assumed from usage below - confirm against the staged file)
    df_var = cudf.read_csv('${healthy_dir}/gnomad.genomes.v4.1.sites.female.txt')

    ## alternate-allele frequency for XX samples; major AF is its complement
    ## (use the Series, not a one-column DataFrame, for the assignment)
    df_var['minor_AF'] = cudf.to_numeric(df_var['AF_XX'])
    df_var['major_AF'] = 1 - df_var['minor_AF']

    ## allele choices and their sampling probabilities, row-aligned
    ## (direct column selection, consistent with the disease processes)
    arr_var = df_var[['ALT', 'REF']].to_numpy()
    arr_prob = df_var[['minor_AF', 'major_AF']].to_numpy()

    ## female patients = total minus the male fraction
    n_patients = ${params.n_pat} - int(${params.n_pat} * ${params.percent_male})
    ## weighted random choice per row, one draw per variant per patient
    for p in range(n_patients):
        draws = (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)
        df_var['prediction'] = arr_var[np.arange(arr_var.shape[0]), draws]
        ## reference calls are already covered by the reference genome
        df_var_final = df_var.loc[df_var['prediction'] != df_var['REF']]
        df_var_final = df_var_final[['#CHROM', 'POS', 'REF', 'prediction']]
        ## randomly keep one mutation when a position carries several
        df_var_final = df_var_final.sample(frac=1).drop_duplicates(['#CHROM', 'POS']).sort_index()
        df_var_final.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref"}, inplace=True)
        ## strip the 'chr' prefix; round-trips through pandas on the host
        df_var_final['chr'] = [i.split('chr')[-1] for i in df_var_final.to_pandas()['chr']]
        df_var_final.to_csv('healthy_female_patient_' + str(p) + '_variants.csv', index=False)
    """
}
|
||||
|
||||
process make_vcfs {

    // Convert one per-patient variant CSV into a sorted, hg38-lifted VCF.
    // Pipeline: awk filter -> VCF template -> gatk SortVcf -> gatk LiftoverVcf.
    // NOTE(review): the awk \$16 == \$18 filter is flagged below by the authors
    // as likely stale after a CSV format change - verify against the current
    // 4-column (chr,pos,ref,prediction) CSVs before trusting the filter.
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/variants_vcfs", mode: 'copy'
    echo true
    maxForks 1

    input:
    path patient_vars

    output:
    path "${patient_vars.baseName}.vcf", emit: patient_vcfs
    //path "${patient_vars.baseName}.vcf.gz", emit: patient_gz
    //path "${patient_vars.baseName}.vcf.gz.tbi", emit: patient_index
    //path "${patient_vars.baseName}_filt.csv", emit: filtered_csv

    script:
    """
    ## only print disease-associated variants not found in the ref/major allele position as they will already be covered by the ref genome
    #I don't think this is correct \$16 == \$18 cuz we changed csv file format
    #it only removes first line from csv
    ## (on a 4-column CSV, \$16 and \$18 are both empty so every row passes;
    ##  the net effect today is just the header strip from tail -n +2)
    awk '\$16 == \$18 { print \$0 }' ${patient_vars} | tail -n +2 > ${patient_vars.baseName}_filt.csv

    ## copy the variant template to the workdir
    ## (template lives on the /mnt bind-mount - see containerOptions above)
    cp /mnt/Avatar/dd/synthea/supporting-data/vcf/vcf_template.vcf .

    ## print the filtered variants to vcf format
    ## first append the VCF column header, then one record per CSV row with
    ## fixed QUAL=100, FILTER=PASS and a heterozygous 0/1 genotype
    printf "#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tNA00001" >> vcf_template.vcf
    printf "\\n" >> vcf_template.vcf
    awk -F',' '{print "chr"\$1,\$2,".",\$3,\$4,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf
    #awk -F',' '{print "chr"\$13,\$14,".",\$15,\$16,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf
    #awk -F',' '{print \$13,\$14,".",\$15,\$16,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf

    # sort the genome karyotypically (in chromosomal order) and index
    #files are sorted already just for security
    java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar SortVcf -I vcf_template.vcf -O ${patient_vars.baseName}_sorted.vcf
    #Liftover form 19 to 38 standard
    ## lift hg19 coordinates to hg38; variants that fail liftover go to
    ## rejected_variants.vcf (deleted below, so failures are silent)
    java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar LiftoverVcf -I ${patient_vars.baseName}_sorted.vcf -O ${patient_vars.baseName}.vcf -C /mnt/Avatar/dd/synthea/supporting-data/ucsc-liftover/hg19ToHg38.over.chain.gz -REJECT rejected_variants.vcf -R /mnt/Avatar/dd/synthea/supporting-data/genome/hg38.fa

    #change this part to save on memory usage
    #bgzip < ${patient_vars.baseName}.vcf > ${patient_vars.baseName}.vcf.gz
    #gatk IndexFeatureFile -I ${patient_vars.baseName}.vcf.gz
    #delete unnecessary files
    ## only the lifted VCF declared in output: is kept
    rm vcf_template.vcf
    rm ${patient_vars.baseName}_filt.csv
    rm ${patient_vars.baseName}_sorted.vcf
    rm rejected_variants.vcf
    rm *vcf.idx
    """
}
|
||||
Reference in New Issue
Block a user