Files
digital-patients/main_vcf2prot.nf
Olamide Isreal 9e6a16c19b Initial commit: digital-patients pipeline (clean, no large files)
Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
2026-03-26 15:15:23 +01:00

66 lines
1.8 KiB
Plaintext

// Select Nextflow DSL2 syntax for this script.
nextflow.enable.dsl = 2
// VCF2PROT
//
// Builds a per-patient FASTA of mutated transcript sequences.
//
// Inputs:
//   vcf           - VCF file; only its header lines (those starting with '#')
//                   are reused, and its base name (minus '.vcf') names the output.
//   vcf_filtered  - pickle file. The inline Python iterates the unpickled object
//                   and collects element [1] of every item as a list of rows
//                   (presumably filtered VCF records - confirm against the
//                   upstream process that writes it).
//
// Output:
//   <patient_name>_transcript_id_mutations.fasta (matched by the *.fasta glob).
//
// Steps performed by the script:
//   1. Copy the header of the input VCF into work1.vcf.
//   2. Append the de-duplicated, 'chr'-stripped, sorted rows from the pickle.
//   3. Strip the 'chr' prefix from the ##contig header IDs.
//   4. Drop every line matching the alt/unplaced-contig name patterns.
//   5. Annotate consequences with bcftools csq.
//   6. Run vcf2prot on the annotated VCF and rename the resulting FASTA.
//
// Reference genome, GFF3 and vcf2prot paths are hard-coded and are expected to
// exist inside the container.
process VCF2PROT {
    memory 2.GB
    container "${params.container_vcf2prot}"
    containerOptions "${params.containerOptions}"
    // echo true
    publishDir "${params.outdir}/vcf2prot", mode: 'copy'
    stageInMode 'copy'
    maxForks 1

    input:
    path vcf
    path vcf_filtered

    output:
    path "*.fasta"

    script:
    """
    #!/bin/bash
    # An explicit shebang replaces the default Nextflow shell invocation (bash -ue),
    # so strict mode is switched on by hand: any failing step aborts the task
    # instead of letting a truncated VCF flow into the published FASTA.
    set -euo pipefail
    workdir=\$(pwd)
    patient_name=\$(basename "$vcf" .vcf)
    # Step 1: keep only the header lines of the original VCF.
    grep '^#' "$vcf" > work1.vcf
    # Step 2: append the filtered records stored in the pickle.
    /opt/conda/envs/vcf2prot/bin/python3 -c "
    import pickle
    import pandas as pd
    # NOTE(review): pickle.load can execute arbitrary code - only feed this step
    # pickles produced by the pipeline itself.
    with open('$vcf_filtered', 'rb') as fp:
        vcf_file = pickle.load(fp)
    vcf_filtered = []
    for i in vcf_file:
        vcf_filtered.extend(i[1])
    vcf_filtered = pd.DataFrame(vcf_filtered)
    # Drop duplicated rows: overlapping genes on the genome yield the same
    # record more than once, which makes the downstream tools fail.
    vcf_filtered = vcf_filtered.drop_duplicates()
    vcf_filtered = vcf_filtered.replace({'chr': ''}, regex=True)
    vcf_filtered = vcf_filtered.sort_values(by=[0, 1])
    with open('work1.vcf', 'a') as f:
        # Join the fields explicitly with tabs so that values containing
        # brackets, quotes or spaces are written unchanged.
        for row in vcf_filtered.itertuples(index=False, name=None):
            f.write('\\t'.join(str(value) for value in row) + '\\n')
    "
    # Step 3: strip the chr prefix from the contig IDs declared in the header.
    awk '{gsub(/^##contig=<ID=chr/,"##contig=<ID="); print}' work1.vcf > \$workdir/work_2.vcf
    # Step 4: remove non-canonical chromosomes (alt / random / unplaced contig names).
    grep -v '[0-9]_\\|Y_\\|X_\\|Un_' \$workdir/work_2.vcf > \$workdir/work.vcf
    # Step 5: annotate variant consequences.
    bcftools csq -f /home/omic/Homo_sapiens.GRCh38.dna.toplevel.fa -g /home/omic/Homo_sapiens.GRCh38.112.gff3 \$workdir/work.vcf --phase a --ncsq 120 -O v -o \$workdir/annotated.vcf
    # Step 6: translate the annotated variants into protein sequences.
    /home/omic/vcf2prot/bins/Linux/vcf2prot -f \$workdir/annotated.vcf -r /home/omic/vcf2prot/MANE_transcipts_reference.fasta -v -g st -o \$workdir
    # Rename the FASTA written by vcf2prot after the patient (expects exactly one).
    mv *.fasta \${patient_name}_transcript_id_mutations.fasta
    """
}