Initial commit: digital-patients pipeline (clean, no large files)
Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
This commit is contained in:
65
main_vcf2prot.nf
Normal file
65
main_vcf2prot.nf
Normal file
@@ -0,0 +1,65 @@
|
||||
nextflow.enable.dsl=2
|
||||
|
||||
/*
 * VCF2PROT
 *
 * Rebuilds a VCF from the header of the original file plus a set of filtered
 * records, annotates it with `bcftools csq`, and runs `vcf2prot` on the
 * annotated file to produce FASTA output.
 *
 * Inputs:
 *   vcf          - original VCF; only its header lines (those starting with
 *                  '#') are reused. Its basename (minus ".vcf") names the
 *                  output file.
 *   vcf_filtered - pickle file. The script iterates over the unpickled object
 *                  and concatenates element [1] of every item into one table
 *                  of records (column 0 and column 1 are used as sort keys;
 *                  presumably CHROM and POS - confirm against the producer).
 *
 * Output:
 *   <basename>_transcript_id_mutations.fasta, published to
 *   ${params.outdir}/vcf2prot.
 *
 * Notes:
 *   - The reference genome, GFF3 annotation, vcf2prot binary and transcript
 *     FASTA are read from fixed paths under /home/omic inside the container.
 *   - Because the script carries its own shebang, Nextflow's default
 *     "/bin/bash -ue" is not applied; `set -euo pipefail` is therefore set
 *     explicitly so that a failing intermediate step fails the task.
 */
process VCF2PROT {
    memory 2.GB

    container "${params.container_vcf2prot}"
    containerOptions "${params.containerOptions}"
    // echo true
    publishDir "${params.outdir}/vcf2prot", mode: 'copy'
    stageInMode 'copy'
    maxForks 1

    input:
    path vcf
    path vcf_filtered

    output:
    path "*.fasta"

    script:
    """
    #!/bin/bash
    # Abort on the first failing command, on unset variables and on pipe errors.
    set -euo pipefail

    workdir=\$(pwd)
    patient_name=\$(basename "$vcf" .vcf)

    # Start the working VCF with the header of the original file.
    grep '^#' "$vcf" > work1.vcf

    /opt/conda/envs/vcf2prot/bin/python3 -c "
    import pickle
    import numpy as np
    import pandas as pd

    # NOTE(review): pickle.load is only safe on trusted input; this file is
    # expected to be produced by an earlier step of the same pipeline.
    with open('$vcf_filtered', 'rb') as fp:
        vcf_file = pickle.load(fp)

    # Flatten the per-item record lists (element 1 of every item) into one list.
    vcf_filtered = []
    for i in vcf_file:
        vcf_filtered.extend(i[1])

    vcf_filtered = pd.DataFrame(vcf_filtered)

    # Drop duplicate records: they are a consequence of overlapping genes on
    # the genome and cause an error further down if kept.
    vcf_filtered = vcf_filtered.drop_duplicates()

    # Remove the chr prefix (applied as a regex to every string column).
    vcf_filtered = vcf_filtered.replace({'chr':''}, regex=True)

    vcf_filtered = vcf_filtered.sort_values(by=[0,1])

    # Append the records tab-separated. Fields are joined directly instead of
    # post-processing the numpy repr, so brackets and quotes inside a field
    # (for example breakend ALT alleles) are written unchanged.
    with open('work1.vcf', 'a') as f:
        for row in np.array(vcf_filtered):
            f.write('\\t'.join(str(value) for value in row) + '\\n')
    "

    # Strip the chr prefix from the contig header lines as well.
    awk '{gsub(/^##contig=<ID=chr/,"##contig=<ID="); print}' work1.vcf > \$workdir/work_2.vcf

    # Remove non-canonical chromosomes: drops every line that contains a
    # digit, Y, X or Un immediately followed by an underscore.
    grep -v '[0-9]_\\|Y_\\|X_\\|Un_' \$workdir/work_2.vcf > \$workdir/work.vcf

    # Annotate consequences, then translate the annotated VCF with vcf2prot.
    bcftools csq -f /home/omic/Homo_sapiens.GRCh38.dna.toplevel.fa -g /home/omic/Homo_sapiens.GRCh38.112.gff3 \$workdir/work.vcf --phase a --ncsq 120 -O v -o \$workdir/annotated.vcf
    /home/omic/vcf2prot/bins/Linux/vcf2prot -f \$workdir/annotated.vcf -r /home/omic/vcf2prot/MANE_transcipts_reference.fasta -v -g st -o \$workdir

    # NOTE(review): assumes vcf2prot leaves exactly one FASTA file in the work
    # directory; with several matches this mv fails.
    mv *.fasta \${patient_name}_transcript_id_mutations.fasta
    """
}
|
||||
Reference in New Issue
Block a user