nextflow.enable.dsl=2

/*
 * VCF2PROT
 *
 * Builds a per-patient FASTA of mutated transcript sequences.
 *
 * Steps performed by the script block:
 *   1. Copy the header lines ('#'-prefixed) of the input VCF into work1.vcf.
 *   2. Unpickle the filtered variant records, de-duplicate them, strip the
 *      'chr' text, sort by the first two columns and append them to work1.vcf.
 *   3. Strip the 'chr' prefix from the ##contig header lines (work_2.vcf).
 *   4. Drop lines that mention non-canonical contigs (work.vcf).
 *   5. Annotate consequences with `bcftools csq` (annotated.vcf).
 *   6. Run vcf2prot on the annotated VCF and rename the produced FASTA to
 *      <patient_name>_transcript_id_mutations.fasta.
 *
 * Inputs:
 *   vcf           - original VCF; its basename (minus '.vcf') names the output
 *                   and its header is reused.
 *   vcf_filtered  - pickle file; iterated as a sequence whose items carry a
 *                   list of VCF rows at index 1.
 *
 * Output:
 *   *.fasta       - published to ${params.outdir}/vcf2prot.
 *
 * NOTE(review): reference paths under /home/omic are hard-coded and must exist
 * inside the container.
 */
process VCF2PROT {
    memory 2.GB
    container "${params.container_vcf2prot}"
    containerOptions "${params.containerOptions}"
    // echo true
    publishDir "${params.outdir}/vcf2prot", mode: 'copy'
    stageInMode 'copy'
    maxForks 1

    input:
    path vcf
    path vcf_filtered

    output:
    path "*.fasta"

    script:
    """
    #!/bin/bash
    # The explicit shebang replaces Nextflow's default 'bash -ue' launcher, so
    # re-enable strict mode here: any failing step must fail the task.
    set -euo pipefail

    workdir=\$(pwd)
    patient_name=\$(basename "$vcf" .vcf)

    # Start the working VCF with the header of the original file.
    grep '^#' "$vcf" > work1.vcf

    # Append the filtered records. The Python source must keep the same base
    # indentation as the surrounding shell lines so it ends up at column 0.
    /opt/conda/envs/vcf2prot/bin/python3 -c "
    import pickle
    import pandas as pd

    # NOTE(review): pickle.load can execute arbitrary code; only feed this
    # process pickles produced by trusted upstream pipeline steps.
    with open('$vcf_filtered', 'rb') as fp:
        vcf_file = pickle.load(fp)

    vcf_filtered = []
    for i in vcf_file:
        vcf_filtered.extend(i[1])
    vcf_filtered = pd.DataFrame(vcf_filtered)

    # Drop duplicates, otherwise an error is returned; duplicates are a
    # consequence of overlapping genes on the genome.
    vcf_filtered = vcf_filtered.drop_duplicates()
    # NOTE(review): this removes the text 'chr' from every string cell, not
    # only from the chromosome column - confirm that is intended.
    vcf_filtered = vcf_filtered.replace({'chr':''}, regex=True)
    vcf_filtered = vcf_filtered.sort_values(by=[0,1])

    # Write one tab-separated record per row. Joining the cell values directly
    # keeps fields containing spaces, brackets or apostrophes intact.
    with open('work1.vcf', 'a') as f:
        for row in vcf_filtered.itertuples(index=False, name=None):
            f.write('\\t'.join(str(v) for v in row) + '\\n')
    "

    # Strip the 'chr' prefix from the contig header lines so they agree with
    # the records written above.
    # NOTE(review): this command was truncated in the source; it has been
    # reconstructed from the surrounding steps - confirm against the original.
    awk '{gsub(/^##contig=<ID=chr/, "##contig=<ID="); print}' work1.vcf > \$workdir/work_2.vcf

    ### remove non canonical chromosomes
    grep -v '[0-9]_\\|Y_\\|X_\\|Un_' \$workdir/work_2.vcf > \$workdir/work.vcf

    ### annotate consequences
    bcftools csq -f /home/omic/Homo_sapiens.GRCh38.dna.toplevel.fa -g /home/omic/Homo_sapiens.GRCh38.112.gff3 \$workdir/work.vcf --phase a --ncsq 120 -O v -o \$workdir/annotated.vcf

    /home/omic/vcf2prot/bins/Linux/vcf2prot -f \$workdir/annotated.vcf -r /home/omic/vcf2prot/MANE_transcipts_reference.fasta -v -g st -o \$workdir

    mv *.fasta "\${patient_name}_transcript_id_mutations.fasta"
    """
}