nextflow.enable.dsl=2

/*
 * Synthetic-patient GWAS pipeline.
 *
 * Stages:
 *   1. generate_patients            - run Synthea to create synthetic patient CSVs
 *   2. get_disease_stats_*          - stage UKBB imputed-variant summary files per disease/sex
 *   3. generate_{m,f}_variants_cudf - sample per-patient disease variants (GPU, cuDF)
 *   4. generate_{m,f}_healthy_cudf  - sample per-patient background variants from gnomAD
 *   5. make_vcfs                    - convert per-patient variant CSVs to sorted, hg38-lifted VCFs
 */

// Runs Synthea inside the container and copies its CSV exports back into the task workdir.
process generate_patients {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/patients", mode: 'copy'
    echo true
    // errorStrategy 'ignore'

    output:
    tuple path('*/patients.csv'), path('*/conditions.csv'), emit: patient_tuple
    path "*"

    script:
    """
    workdir=`pwd`
    cd /synthea
    ./run_synthea -p ${params.n_pat} -a ${params.age} ${params.state} ${params.city} --exporter.csv.export true
    cd /synthea/output
    cp -r * \$workdir
    """
}

// Stages UKBB imputed GWAS summary files for each configured disease, without
// consulting generated patients. Produces <id>_male.tsv and <id>_female.tsv,
// falling back to the both_sexes / opposite-sex export when a sex-specific
// file is missing.
process get_disease_stats_no_patients {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    echo true

    output:
    tuple path("*_female.tsv"), path("*_male.tsv")

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import pandas as pd
    import subprocess
    import os.path
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process

    def fetch_imputed(ukbb_id, preference, suffix):
        # Copy and decompress the first available imputed-variant export for this
        # phenotype, trying each sex tag in 'preference' order, and rename the
        # result to <id>_<suffix>.tsv. Returns True when a file was staged.
        for sex in preference:
            src = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv.bgz'
            if os.path.isfile(src):
                local = ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv.bgz'
                subprocess.run(['cp', src, '.'])
                # bug fix: bgzip -d takes only the file to decompress; the old
                # trailing '.' argument made bgzip fail on the current directory
                subprocess.run(['bgzip', '-d', local])
                subprocess.run(['mv', ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv',
                                ukbb_id + '_' + suffix + '.tsv'])
                return True
        print('imputed variants not present')
        return False

    # phenotype metadata: code, description, sex, file
    df_meta = pd.read_csv('${params.pheno_store}', sep=',', delim_whitespace=False,
                          header=None,
                          names=['phenotype_code', 'phenotype_description', 'sex', 'file'],
                          engine='python')

    # params.disease arrives as a bracketed, comma-separated list
    for target in '${params.disease}'.replace("[", "").replace("]", "").replace(" ", "").split(','):
        # look up the UKBB condition from metadata via fuzzy matching
        ukbb_cond = process.extractOne(target, df_meta['phenotype_description'], score_cutoff=80)
        if ukbb_cond is None:
            # extractOne returns None when no candidate clears score_cutoff
            print('no phenotype match for', target)
            continue
        ukbb_id = df_meta.at[ukbb_cond[2], 'phenotype_code']

        # male stats: prefer male export, then both_sexes, then female
        try:
            fetch_imputed(ukbb_id, ['male', 'both_sexes', 'female'], 'male')
        except Exception as exc:
            print('no files exist:', exc)

        # female stats: prefer female export, then both_sexes, then male
        try:
            fetch_imputed(ukbb_id, ['female', 'both_sexes', 'male'], 'female')
        except Exception as exc:
            print('no files exist:', exc)
    """
}

// Same staging as above, but driven by a generated patient/condition pair and
// using cuDF for the metadata lookup. Passes the patient CSVs through so the
// downstream tuple keeps patients, conditions and both stats files together.
process get_disease_stats_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    echo true

    input:
    tuple path(pat_csv), path(cond_csv)

    output:
    tuple path(pat_csv), path(cond_csv), path("*_female.tsv"), path("*_male.tsv")

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    #import pandas as pd
    import cudf as pd
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process
    import numpy as np
    import subprocess
    import os.path

    def fetch_imputed(ukbb_id, preference, suffix):
        # Copy and decompress the first available imputed-variant export,
        # renamed to <id>_<suffix>.tsv; see get_disease_stats_no_patients.
        for sex in preference:
            src = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv.bgz'
            if os.path.isfile(src):
                local = ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv.bgz'
                subprocess.run(['cp', src, '.'])
                # bug fix: no trailing '.' -- bgzip -d takes only the file name
                subprocess.run(['bgzip', '-d', local])
                subprocess.run(['mv', ukbb_id + '.gwas.imputed_v3.' + sex + '.tsv',
                                ukbb_id + '_' + suffix + '.tsv'])
                return True
        print('imputed variants not present')
        return False

    ################## IMPORT SYNTHETIC PATIENTS ##############################
    ## conditions, used to find the right imputed variant file
    df_cond = pd.read_csv('${cond_csv}', delim_whitespace=False)
    ## patients
    df_pat = pd.read_csv('${pat_csv}', delim_whitespace=False)

    # split by reported gender; a cohort may legitimately lack one of these
    # (bug fix: the old bare string expressions were never printed)
    try:
        df_pat_m = df_pat.loc[df_pat['GENDER'] == 'M']
    except Exception:
        print('No men in dataset')
    try:
        df_pat_f = df_pat.loc[df_pat['GENDER'] == 'F']
    except Exception:
        print('No women in dataset')
    try:
        df_pat_bs = df_pat.loc[df_pat['GENDER'] == '']
    except Exception:
        print('No intersex/undefined sex in dataset')

    df_meta = pd.read_csv('${params.pheno_store}', sep=',', delim_whitespace=False,
                          header=None,
                          names=['phenotype_code', 'phenotype_description', 'sex', 'file'])
    print(df_meta)

    ## look up the UKBB condition from metadata via fuzzy matching
    ## (extractOne needs host-memory strings, hence .to_pandas())
    ukbb_cond = process.extractOne('${params.disease}',
                                   df_meta['phenotype_description'].to_pandas(),
                                   score_cutoff=80)
    print(ukbb_cond)
    if ukbb_cond is None:
        # no candidate cleared score_cutoff; nothing to stage
        print('no phenotype match for ${params.disease}')
    else:
        ukbb_id = df_meta.at[ukbb_cond[2], 'phenotype_code']

        # male stats, with both_sexes / female fallback
        try:
            fetch_imputed(ukbb_id, ['male', 'both_sexes', 'female'], 'male')
        except Exception as exc:
            print('no files exist:', exc)

        # female stats, with both_sexes / male fallback
        try:
            fetch_imputed(ukbb_id, ['female', 'both_sexes', 'male'], 'female')
        except Exception as exc:
            print('no files exist:', exc)
    """
}

// Samples disease-associated variants for each synthetic male patient by
// weighted random choice over the UKBB allele frequencies. One CSV per patient.
process generate_m_variants_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/variants", mode: 'copy'
    //errorStrategy 'ignore'
    echo true
    // memory '15.0 GB'
    maxForks 1

    input:
    tuple path(dis_var_f), path(dis_var_m)

    output:
    path "*_variants.csv", emit: m_var

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf as pd
    import numpy as np
    import os
    import subprocess

    for target in '${dis_var_m}'.split(" "):
        try:
            # bug fix: plain 'mkdir gzip' failed on every iteration after the
            # first; makedirs(exist_ok=True) is idempotent
            os.makedirs('gzip', exist_ok=True)
            subprocess.run(['cp', target, 'gzip/'])
            # recompress the staged copy so cuDF can read it as gzip
            subprocess.run(['gzip', 'gzip/' + target])
            print(target + '.gz')
            df_var = pd.read_csv('gzip/' + target + '.gz', sep='\\t', header=0)
        except Exception as exc:
            print('the male variant file does not exist:', exc)
            continue

        ########## GENERATE VARIANTS USING UKBB IMPUTED DATA ##################
        ## derive major-allele frequency from the minor-allele frequency
        df_var['minor_AF'] = pd.to_numeric(df_var['minor_AF'])
        df_var['major_AF'] = 1 - df_var[['minor_AF']]

        ## split 'chr:pos:ref:alt' variant id into parts
        df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True)
        df_var['chr'] = df_var['chr'].astype('str')
        df_var['pos'] = df_var['pos'].astype('int')

        ## drop low-confidence variants
        df_var_hc = df_var.loc[df_var['low_confidence_variant'] == False]
        df_var_all = df_var_hc.drop(['low_confidence_variant'], axis=1)

        ## determine the major allele on the host (boolean-mask column
        ## assignment pattern not available on device)
        df_var_all_host = df_var_all.to_pandas()
        df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['alt'], 'major_allele'] = df_var_all_host['ref']
        df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['ref'], 'major_allele'] = df_var_all_host['alt']
        df_var_all = pd.from_pandas(df_var_all_host)

        ## allele choices and their probabilities, row-aligned
        arr_var = df_var_all[['minor_allele', 'major_allele']].to_numpy()
        arr_prob = df_var_all[['minor_AF', 'major_AF']].to_numpy()
        n_patients = int(${params.n_pat} * ${params.percent_male})

        ## per patient: weighted random choice of allele per variant row,
        ## then keep only rows where the prediction differs from the reference
        for p in range(n_patients):
            df_var_all['prediction'] = arr_var[
                np.arange(arr_var.shape[0]),
                (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)]
            df_var_final = df_var_all.loc[df_var_all['prediction'] != df_var_all['ref']]
            df_var_final = df_var_final[['chr', 'pos', 'ref', 'prediction']]
            df_var_final.to_csv(target.split('.')[0] + '_patient_' + str(p) + '_variants.csv', index=False)
    """
}

// Female counterpart of generate_m_variants_cudf: remaining patients after the
// male share are sampled against the female stats file.
process generate_f_variants_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/variants", mode: 'copy'
    maxForks 1
    //errorStrategy 'ignore'
    echo true

    input:
    tuple path(dis_var_f), path(dis_var_m)

    output:
    path "*_variants.csv", emit: f_var

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf as pd
    import numpy as np
    import os
    import subprocess

    for target in '${dis_var_f}'.split(" "):
        try:
            # idempotent staging dir (bug fix: old 'mkdir' failed after iter 1)
            os.makedirs('gzip', exist_ok=True)
            subprocess.run(['cp', target, 'gzip/'])
            subprocess.run(['gzip', 'gzip/' + target])
            print(target + '.gz')
            df_var = pd.read_csv('gzip/' + target + '.gz', sep='\\t', header=0)
        except Exception as exc:
            print('the female variant file does not exist:', exc)
            continue

        ########## GENERATE VARIANTS USING UKBB IMPUTED DATA ##################
        ## derive major-allele frequency from the minor-allele frequency
        df_var['minor_AF'] = pd.to_numeric(df_var['minor_AF'])
        df_var['major_AF'] = 1 - df_var[['minor_AF']]

        ## split 'chr:pos:ref:alt' variant id into parts
        df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True)
        df_var['chr'] = df_var['chr'].astype('str')
        df_var['pos'] = df_var['pos'].astype('int')

        ## drop low-confidence variants
        df_var_hc = df_var.loc[df_var['low_confidence_variant'] == False]
        df_var_all = df_var_hc.drop(['low_confidence_variant'], axis=1)

        ## determine the major allele on the host
        df_var_all_host = df_var_all.to_pandas()
        df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['alt'], 'major_allele'] = df_var_all_host['ref']
        df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['ref'], 'major_allele'] = df_var_all_host['alt']
        df_var_all = pd.from_pandas(df_var_all_host)

        ## allele choices and their probabilities (consistent with the male
        ## process: direct column selection instead of a DataFrame round-trip)
        arr_var = df_var_all[['minor_allele', 'major_allele']].to_numpy()
        arr_prob = df_var_all[['minor_AF', 'major_AF']].to_numpy()
        n_patients = ${params.n_pat} - int(${params.n_pat} * ${params.percent_male})

        ## weighted random allele per variant, one output CSV per patient
        for p in range(n_patients):
            df_var_all['prediction'] = arr_var[
                np.arange(arr_var.shape[0]),
                (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)]
            df_var_final = df_var_all.loc[df_var_all['prediction'] != df_var_all['ref']]
            df_var_final = df_var_final[['chr', 'pos', 'ref', 'prediction']]
            df_var_final.to_csv(target.split('.')[0] + '_patient_' + str(p) + '_variants.csv', index=False)
    """
}

// Samples background (healthy) variants for male patients from the gnomAD
// male-site table, using AF_XY as the minor-allele frequency.
process generate_m_healthy_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    maxForks 1
    echo true

    input:
    tuple path(healthy_dir)

    output:
    path "*_variants.csv", emit: m_healthy

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf as pd
    import numpy as np

    df_var = pd.read_csv('${healthy_dir}/gnomad.genomes.v4.1.sites.male.txt')
    ## AF_XY is the male allele frequency; treat it as the minor AF
    df_var['minor_AF'] = pd.to_numeric(df_var['AF_XY'])
    df_var['major_AF'] = 1 - df_var[['minor_AF']]

    ## allele choices (ALT = minor, REF = major) and their probabilities
    arr_var = df_var[['ALT', 'REF']].to_numpy()
    arr_prob = df_var[['minor_AF', 'major_AF']].to_numpy()
    n_patients = int(${params.n_pat} * ${params.percent_male})

    for p in range(n_patients):
        ## weighted random allele per site
        df_var['prediction'] = arr_var[
            np.arange(arr_var.shape[0]),
            (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)]
        df_var_final = df_var.loc[df_var['prediction'] != df_var['REF']]
        df_var_final = df_var_final[['#CHROM', 'POS', 'REF', 'prediction']]
        ## randomly keep one mutation when a position carries several
        df_var_final = df_var_final.sample(frac=1).drop_duplicates(['#CHROM', 'POS']).sort_index()
        df_var_final.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref"}, inplace=True)
        df_var_final.to_csv('healthy_male_patient_' + str(p) + '_variants.csv', index=False)
    """
}

// Female counterpart: gnomAD female sites, AF_XX, remaining patient count.
process generate_f_healthy_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    maxForks 1
    echo true

    input:
    tuple path(healthy_dir)

    output:
    path "*_variants.csv", emit: f_healthy

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf as pd
    import numpy as np

    df_var = pd.read_csv('${healthy_dir}/gnomad.genomes.v4.1.sites.female.txt')
    ## AF_XX is the female allele frequency; treat it as the minor AF
    df_var['minor_AF'] = pd.to_numeric(df_var['AF_XX'])
    df_var['major_AF'] = 1 - df_var[['minor_AF']]

    ## allele choices (ALT = minor, REF = major) and their probabilities
    arr_var = df_var[['ALT', 'REF']].to_numpy()
    arr_prob = df_var[['minor_AF', 'major_AF']].to_numpy()
    n_patients = ${params.n_pat} - int(${params.n_pat} * ${params.percent_male})

    for p in range(n_patients):
        ## weighted random allele per site
        df_var['prediction'] = arr_var[
            np.arange(arr_var.shape[0]),
            (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)]
        df_var_final = df_var.loc[df_var['prediction'] != df_var['REF']]
        df_var_final = df_var_final[['#CHROM', 'POS', 'REF', 'prediction']]
        ## randomly keep one mutation when a position carries several
        df_var_final = df_var_final.sample(frac=1).drop_duplicates(['#CHROM', 'POS']).sort_index()
        df_var_final.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref"}, inplace=True)
        df_var_final.to_csv('healthy_female_patient_' + str(p) + '_variants.csv', index=False)
    """
}

// Converts one per-patient variant CSV (chr,pos,ref,prediction) into a sorted,
// hg19->hg38 lifted VCF using the project template and GATK.
process make_vcfs {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/variants_vcfs", mode: 'copy'
    echo true
    maxForks 1

    input:
    path patient_vars

    output:
    path "${patient_vars.baseName}.vcf", emit: patient_vcfs
    //path "${patient_vars.baseName}.vcf.gz", emit: patient_gz
    //path "${patient_vars.baseName}.vcf.gz.tbi", emit: patient_index

    script:
    """
    ## strip the CSV header; the old awk '\$16 == \$18' filter compared two
    ## empty fields on a 4-column CSV, so it matched every row and only the
    ## header was dropped -- tail does the same thing explicitly
    tail -n +2 ${patient_vars} > ${patient_vars.baseName}_filt.csv

    ## copy the VCF header template to the workdir
    cp /mnt/Avatar/dd/synthea/supporting-data/vcf/vcf_template.vcf .

    ## append the column header line and the variants in VCF format
    printf "#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tNA00001" >> vcf_template.vcf
    printf "\\n" >> vcf_template.vcf
    awk -F',' '{print "chr"\$1,\$2,".",\$3,\$4,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf

    ## sort karyotypically (rows are pre-sorted; this is a safety net)
    java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar SortVcf -I vcf_template.vcf -O ${patient_vars.baseName}_sorted.vcf

    ## lift over hg19 -> hg38
    java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar LiftoverVcf -I ${patient_vars.baseName}_sorted.vcf -O ${patient_vars.baseName}.vcf -C /mnt/Avatar/dd/synthea/supporting-data/ucsc-liftover/hg19ToHg38.over.chain.gz -REJECT rejected_variants.vcf -R /mnt/Avatar/dd/synthea/supporting-data/genome/hg38.fa

    #change this part to save on memory usage
    #bgzip < ${patient_vars.baseName}.vcf > ${patient_vars.baseName}.vcf.gz
    #gatk IndexFeatureFile -I ${patient_vars.baseName}.vcf.gz

    ## delete intermediates; -f so a missing .idx does not fail the task
    rm vcf_template.vcf ${patient_vars.baseName}_filt.csv ${patient_vars.baseName}_sorted.vcf rejected_variants.vcf
    rm -f *vcf.idx
    """
}