// NOTE: Large reference/model files are excluded from the repo — they are to be
// staged to S3 or baked into the Docker images.
nextflow.enable.dsl=2
|
|
|
|
// Filter a patient VCF down to the variants that overlap MANE transcript bodies.
// Emits one pickle per patient (kept under a .vcf extension so the output glob
// matches) containing entries of the form
//   [[symbol, strand, region_start, region_end], [matching VCF rows]]
// sorted by region length (shortest first), for PREDICT_EXPRESSION.
process FILTER_VCF {
    memory 4.GB

    container "${params.container_borzoi}"
    containerOptions "${params.containerOptions}"
    debug true
    // maxForks 1

    input:
    path MANE           // MANE transcript summary (TSV): chrom, chr_start, chr_end, chr_strand, symbol, ...
    path vcf_filtered   // patient VCF named <patient>_variants.vcf

    output:
    // NOTE: despite the .vcf extension this file is a Python pickle; it is
    // consumed downstream via pickle.load in PREDICT_EXPRESSION.
    path "*.vcf" , emit: expression_output

    script:
    """
#!/opt/conda/envs/borzoi/bin/python
#Filter VCF to extract only mutations in coding sequences plus 1000bp upstream regulatory elements
import numpy as np
import pandas as pd
import pickle

#If ncbiRefSeq data needs to be filtered in a different manner, adapt the commented part below
#This part is needed if we want to change ncbiRefSeq filtering
#column_names = [
#    "bin", "name", "chrom", "strand", "txStart", "txEnd",
#    "cdsStart", "cdsEnd", "exonCount", "exonStarts", "exonEnds",
#    "score", "name2", "cdsStartStat", "cdsEndStat", "exonFrames"
#]

#ncbiRefSeq = pd.read_csv("{ncbiRefSeq}", sep='\t', names=column_names) #dollar sign is missing

##Get only protein coding genes (ncbiRefSeq['cdsStartStat'] != 'none')
#ncbiRefSeq = ncbiRefSeq[ncbiRefSeq['cdsStartStat'] != 'none']

##Remove duplicates
#ncbiRefSeq = ncbiRefSeq.drop_duplicates()

##Filter out isoforms for longest DNA sequence. Filter by protein name and chromosome (different versions of some chromosomes)
#ncbiRefSeq_filtered = []
#for i in np.unique(ncbiRefSeq['name2']):
#    prot = ncbiRefSeq[ncbiRefSeq['name2'] == i]
#    for j in np.unique(prot['chrom']):
#        prot_chr = prot[prot['chrom'] == j]
#        d1 = [prot_chr['txEnd'].astype(int).max(), prot_chr['txStart'].astype(int).min()] #get lowest starting position and maximum stop position
#        max_len = np.argmax(prot_chr['txEnd'].astype(int) - prot_chr['txStart'].astype(int))
#        position = prot_chr.iloc[max_len]
#        position['txEnd'] = str(d1[0]) #end for isoform can be loaded from exonEnds
#        position['txStart'] = str(d1[1]) #start for isoform can be loaded from exonStarts
#        ncbiRefSeq_filtered.append(position)
#
#ncbiRefSeq = pd.DataFrame(ncbiRefSeq_filtered).reset_index().drop('index', axis = 1)
##protein coding genes only 23314, all genes 48420

#Load the MANE-filtered transcript table
MANE_data = pd.read_csv("${MANE}", sep = '\t')

#Load vcf
with open("${vcf_filtered}") as f:
    lines = f.readlines()

#Drop the '##' meta-information header lines; keep the '#CHROM' column-header line
vcf = []
for i in lines:
    if i[:2] != '##':
        #rstrip('\n') (rather than i[:-1]) so a final line without a trailing
        #newline does not lose its last character
        vcf.append(i.rstrip('\n').split('\t'))

vcf = pd.DataFrame(vcf[1:], columns=vcf[0])

#Normalize dtypes and align the MANE table's column names with the VCF's
vcf['POS'] = vcf['POS'].astype(int)
vcf['#CHROM'] = vcf['#CHROM'].astype('category')
MANE_data['chrom'] = MANE_data['chrom'].astype('category')
MANE_data_renamed = MANE_data.rename(columns={'chrom': '#CHROM', 'chr_start': 'Start', 'chr_end': 'End', 'chr_strand': 'strand'})
relevant_chromosomes = vcf['#CHROM'].unique()
MANE_data_renamed = MANE_data_renamed[MANE_data_renamed['#CHROM'].isin(relevant_chromosomes)]

dataset = []

#Filter VCF: for every transcript, collect the variants that fall inside its span
for chrom in relevant_chromosomes:
    chrom_MANE = MANE_data_renamed[MANE_data_renamed['#CHROM'] == chrom]
    # Find the rows in `vcf` that match the conditions for the current chromosome
    for name, strand, start, end in chrom_MANE[['symbol', 'strand', 'Start', 'End']].itertuples(index=False):
        condition = (vcf['#CHROM'] == chrom) & (vcf['POS'] >= start) & (vcf['POS'] <= end)
        if np.sum(condition) != 0:
            #Extend the stored region by 1000bp on the upstream (regulatory) side.
            #NOTE(review): the filter condition above uses the unextended start/end,
            #so variants lying only in the 1000bp upstream window are not captured —
            #confirm this is intended.
            if strand == '+':
                start_ = start - 1000
                end_ = end
            else:
                end_ = end + 1000
                start_ = start
            dataset.append([[name, strand, start_, end_], np.array(vcf[condition]).tolist()])

#sort entries by region length (shortest first)
len_ = []
for vcf_entry in dataset:
    len_.append(vcf_entry[0][3] - vcf_entry[0][2])
dataset = [dataset[index] for index in np.argsort(len_)]

#save the filtered dataset as a pickle
#derive the patient name from the input file name
patient_name = "${vcf_filtered}".split('/')[-1].split('_variants.vcf')[0]
with open(f"{patient_name}_rna_coding_vcf_with_mutations.vcf", "wb") as fp:
    pickle.dump(dataset, fp)
    """
}
|
|
|
|
// Predict per-tissue expression (TPM) with Borzoi for every mutated transcript in
// the pickle produced by FILTER_VCF, back-fill unmutated genes from precomputed
// reference-genome predictions, and save <patient>_TPM.csv (indexed by ENSG).
process PREDICT_EXPRESSION {
    accelerator 1
    memory 4.GB

    container "${params.container_borzoi}"
    label 'gpu_process'
    // containerOptions "${params.containerOptions_borzoi}"
    // debug true
    maxForks 1 // TODO: COMMENT THIS IN KUBERNETES

    input:
    path vcf_filtered   // pickle from FILTER_VCF (<patient>_rna_coding_vcf_with_mutations.vcf)
    path MANE           // MANE transcript summary (TSV)
    //path ncbiRefSeq_bigger
    //path ncbiRefSeq_subset

    output:
    path "*_TPM.csv", emit: expression_output

    script:
    """
#!/opt/conda/envs/borzoi/bin/python
#Predict expression based on mutation
import json
import os
import time
import warnings
import pickle
from itertools import compress

import h5py
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import pandas as pd
import pysam
import pyfaidx
import tensorflow as tf

from baskerville import seqnn
from baskerville import gene as bgene
from baskerville import dna

import sys
sys.path.append( '/home/omic/borzoi' )
from examples.borzoi_helpers import *

#Load VCF and mutation files
#ncbiRefSeq_bigger = pd.read_csv("{ncbiRefSeq_bigger}")
#with open("{ncbiRefSeq_subset}", "rb") as fp:
#    ncbiRefSeq_subset = pickle.load(fp)
#prot_bigger: transcripts too large for one 524288bp window; prot_subset: window clusters
#(both produced by CREATE_PROTEIN_CLUSTER and baked into the image)
prot_bigger = pd.read_csv("/home/omic/borzoi/prot_bigger.csv")
with open("/home/omic/borzoi/prot_subset.pickle", "rb") as fp:
    prot_subset = pickle.load(fp)
with open("${vcf_filtered}", "rb") as fp:
    vcf_file = pickle.load(fp)
MANE_data = pd.read_csv("${MANE}", sep = '\t')

batch_size = 4
#Define batch size and stop if batch size is too small for big transcripts
#min_batch_size = max(((np.array(ncbiRefSeq_bigger['txEnd']).astype('int')-np.array(ncbiRefSeq_bigger['txStart']).astype('int'))/524288).astype('int')+1)
#if min_batch_size > batch_size:
#    print('batch size is ',batch_size,'and min_batch_size is ',min_batch_size)
#    sys.exit('Batch size has to be bigger or same as number of big transcripts split into chunks')

#Model configuration
params_file = '/home/omic/borzoi/examples/params_pred.json'
targets_file = '/home/omic/borzoi/examples/targets_gtex.txt' #Subset of targets_human.txt
n_folds = 1 #4 #To use only one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'.
rc = True #Average across reverse-complement prediction

#Read model parameters
with open(params_file) as params_open :
    params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']

#Read targets
targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
target_index = targets_df.index

#Create local index of strand_pair (relative to sliced targets)
if rc :
    strand_pair = targets_df.strand_pair
    target_slice_dict = {ix : i for i, ix in enumerate(target_index.values.tolist())}
    slice_pair = np.array([
        target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist()
    ], dtype='int32')

#Initialize model ensemble
models = []
for fold_ix in range(n_folds) :
    model_file = "/home/omic/borzoi/saved_models/f" + str(fold_ix) + "/model0_best.h5"
    seqnn_model = seqnn.SeqNN(params_model)
    seqnn_model.restore(model_file, 0)
    seqnn_model.build_slice(target_index)
    if rc :
        seqnn_model.strand_pair.append(slice_pair)
    seqnn_model.build_ensemble(rc, [0]) #changed '0' to [0]
    models.append(seqnn_model)
fasta_open = pysam.Fastafile('/home/omic/borzoi/hg38.fa')

#Apply mutation(s) to a one-hot WT sequence.
#poses are 1-based genomic positions; start is the genomic start of the window.
def create_mut(sequence_one_hot, poses, alts, start):
    #FIX: copy the parameter, not the module-level 'sequence_one_hot_wt' global
    #(the old code only worked because every caller's argument had that name)
    sequence_one_hot_mut = np.copy(sequence_one_hot)
    for pos, alt in zip(poses, alts) :
        alt_ix = -1
        if alt == 'A' :
            alt_ix = 0
        elif alt == 'C' :
            alt_ix = 1
        elif alt == 'G' :
            alt_ix = 2
        elif alt == 'T' :
            alt_ix = 3
        #NOTE(review): a non-ACGT alt leaves alt_ix == -1 and silently writes the
        #last channel (T); indels/multi-base alts are not handled — confirm inputs
        #are SNVs only
        sequence_one_hot_mut[pos-start-1] = 0.
        sequence_one_hot_mut[pos-start-1, alt_ix] = 1.
    return sequence_one_hot_mut

#Run every model fold on a batch and stack predictions along a new fold axis
def predict_tracks(models, sequence_one_hot):
    predicted_tracks = []
    for fold_ix in range(len(models)):
        yh = models[fold_ix](sequence_one_hot)[:, None, ...].astype("float16")
        predicted_tracks.append(yh)
    predicted_tracks = np.concatenate(predicted_tracks, axis=1)
    return predicted_tracks

#Aggregate Borzoi track predictions into one TPM value per gene per tissue
#by summing the 32bp bins that overlap each gene's exons.
def CalTPM(borzoi_data, start, cluster, prot_data, targets_df, plot = False):
    TPM_list = []
    #loop over all proteins in the cluster
    for i in range(len(cluster)):
        #get exon starts/ends mapped into 32bp-bin coordinates
        #(start + 32*16 accounts for the sequence cropped by the model)
        ex_st = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonStarts'])[0].split(',')]
        ex_en = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonEnds'])[0].split(',')]
        #exon bool mask
        exon_mask = np.zeros(borzoi_data.shape[-2])
        for s,n in zip(ex_st,ex_en):
            exon_mask = exon_mask + ((np.arange(borzoi_data.shape[-2]) >= s) & (np.arange(borzoi_data.shape[-2]) <= n))
        #protein TPM per person per tissue
        TPM_per_tissue_replicates = np.sum(borzoi_data[:,exon_mask == 1], axis = 1)
        #Plot proteins with exon marks if needed
        if plot == True:
            #Will plot only the first adipose_tissue track borzoi_data[0,:,x]; change x for a different tissue
            plt.plot(borzoi_data[0,:,0])
            plt.vlines(x = ex_st, ymin=0, ymax=3.5, colors='red', ls='--', lw=2, label='vline_multiple - full height')
            plt.vlines(x = ex_en, ymin=0, ymax=3.5, colors='blue', ls='--', lw=2, label='vline_multiple - full height')
            plt.xlim(ex_st[0]-100, ex_en[-1]+100)
            plt.show()
        #Average over the replicate tracks of each tissue
        TPM_per_tissue = [np.mean(i) for i in np.split(TPM_per_tissue_replicates[0], np.unique(targets_df['description'], return_index=True)[1][1:])]
        TPM_list.append(TPM_per_tissue)
    #create DataFrame: rows = genes in cluster, columns = tissue descriptions
    TPM_dataframe = pd.DataFrame(TPM_list,cluster,np.unique(targets_df['description'], return_index=True)[0])
    return(TPM_dataframe)

#Protein clusters as list of arrays of gene symbols
protein_clusters = [np.array(i[2]) for i in prot_subset]
#Get all proteins from VCF
proteins_with_mutations = [i[0][0] for i in vcf_file]

proteins_with_mutations_working = proteins_with_mutations

TPM = []
#run until the expression of all proteins is predicted
while len(proteins_with_mutations_working) > 0:
    TPM_dfs = []
    sequences_one_hot_muts = []
    st = []
    cl = []
    #append proteins to the batch while it is smaller than batch_size; big
    #transcripts are run immediately on their own (not appended)
    while len(sequences_one_hot_muts) < batch_size and len(proteins_with_mutations_working) > 0:
        #get work protein
        protein = proteins_with_mutations_working[0]
        #print(protein)
        #get its cluster
        mask = [protein in i for i in protein_clusters]
        cluster = list(compress(protein_clusters, mask))
        #run borzoi for big proteins (transcripts wider than one 524288bp window)
        if protein in np.array(prot_bigger['symbol']):
            sequences_one_hot_muts_big = []
            proteins_with_mutations_working = proteins_with_mutations_working[1:]
            protein_data = prot_bigger[prot_bigger['symbol'] == protein]
            prot_start = np.array(protein_data['chr_start']).astype('int')[0] - (16*32) - np.array(protein_data['chr_strand'] == '+')[0] * 1000
            prot_end = np.array(protein_data['chr_end']).astype('int')[0] + (16*32) + np.array(protein_data['chr_strand'] == '-')[0] * 1000
            mutations_ = np.array(list(compress(vcf_file, np.array(proteins_with_mutations) == protein))[0][1])
            #only use one reference chromosome, in case mutations are mapped to multiple chr (consequence of liftover)
            mutations_ = mutations_[[i[0]== mutations_[0,0] for i in mutations_]]
            chrom = mutations_[0,0]
            all_poses = mutations_[:,1].astype('int')
            all_alts = mutations_[:,4]
            star = prot_start
            st_big = star
            #tile the transcript with overlapping 524288bp windows
            while star < prot_end:
                end = star + 524288
                poses = all_poses[(all_poses > star) & (all_poses < end)]
                alts = all_alts[(all_poses > star) & (all_poses < end)]
                sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
                sequence_one_hot_mut = create_mut(sequence_one_hot_wt, poses, alts, star)
                sequences_one_hot_muts_big.append(sequence_one_hot_mut)
                star = end - (32*32) #overlap consecutive windows by the cropped margin
            sequences_one_hot_muts_big = np.array(sequences_one_hot_muts_big)
            #if the number of window splits is bigger than batch size, predict in chunks
            #print(sequences_one_hot_muts_big.shape)
            if sequences_one_hot_muts_big.shape[0] > batch_size:
                borzoi_pred_list = []
                for seq_slice in np.array_split(sequences_one_hot_muts_big, np.ceil(sequences_one_hot_muts_big.shape[0]/batch_size)):
                    borzoi_pred_list.append(predict_tracks(models, seq_slice))
                y_mut = np.concatenate(borzoi_pred_list)
            else:
                y_mut = predict_tracks(models, sequences_one_hot_muts_big)
            #stitch windows back into one long track axis
            #89 = number of GTEx tracks in targets_gtex.txt — TODO confirm
            y_mut = np.reshape(y_mut, [1,1,-1,89])
            TPM.append(CalTPM(y_mut[0], st_big, [protein], MANE_data, targets_df))
            #np.save('expression_predictions_%s.npy' %protein, y_mut)
        else:
            #append to the batch of small clusters to run together
            #get start and end of the cluster window
            star, end = (list(compress(prot_subset, mask))[0][:2])
            #get mutated proteins in the cluster
            mask = [i in cluster[0] for i in proteins_with_mutations_working]
            proteins_in_cluster = list(compress(proteins_with_mutations_working, mask))
            #remove cluster proteins from the work list
            proteins_with_mutations_working = list(compress(proteins_with_mutations_working, ~np.array(mask)))
            #print(proteins_in_cluster)
            mutations_ = [list(compress(vcf_file, [np.array(proteins_with_mutations) == i][0]))[0][1] for i in proteins_in_cluster]
            mutations_ = np.concatenate([np.array(i) for i in mutations_],0)
            #only use one reference chromosome, in case mutations are mapped to multiple chr (consequence of liftover)
            mutations_ = mutations_[[i[0]== mutations_[0,0] for i in mutations_]]
            chrom = mutations_[0,0]
            poses = mutations_[:,1].astype('int')
            alts = mutations_[:,4]
            sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
            sequence_one_hot_mut = create_mut(sequence_one_hot_wt, poses, alts, star)
            sequences_one_hot_muts.append(sequence_one_hot_mut)
            st.append(star)
            cl.append(cluster)
            ### Test wt
            #sequences_one_hot_muts.append(sequence_one_hot_wt)
            ###
    sequences_one_hot_muts = np.array(sequences_one_hot_muts)
    #run borzoi for the batched small clusters (the batch can be empty when the
    #last work item was a big transcript)
    if sequences_one_hot_muts.shape != (0,):
        y_mut = predict_tracks(models, sequences_one_hot_muts)
        for i in range(len(y_mut)):
            TPM_dfs.append(CalTPM(y_mut[i], st[i], cl[i][0], MANE_data, targets_df))
        TPM_dfs = pd.concat(TPM_dfs)
        TPM.append(TPM_dfs)
        #np.save('expression_predictions_%s.npy' %protein, y_mut)
TPM = pd.concat(TPM)

#add RNA expression for genes with no mutations
#load reference-genome expression predictions
tpm_no_mut = pd.read_csv('/home/omic/borzoi/TPM_NO_MUTATIONS.csv',index_col=0)
#concat missing expression
TPM = pd.concat([TPM, tpm_no_mut.loc[list(set(tpm_no_mut.index) - set(TPM.index))]])
#change symbol to ENSG
TPM.index.names = ['RNA']
MANE_data['ENSG'] = [i.split('.')[0] for i in MANE_data['Ensembl_Gene']]
mane_map = MANE_data[['symbol','ENSG']]
TPM = mane_map.merge(TPM, left_on='symbol', right_on='RNA').dropna().drop_duplicates(subset=['symbol'])
#drop duplicates
TPM = TPM.drop_duplicates()
TPM = TPM.iloc[:,2:].set_index(TPM.iloc[:,1])
#save TPM
#derive the patient name from the input file name
patient_name = "${vcf_filtered}".split('/')[-1].split('_rna_coding_vcf_with_mutations.vcf')[0]
TPM.to_csv(f'{patient_name}_TPM.csv')

gpu_stats = tf.config.experimental.get_memory_info('GPU:0')
print(f"Current VRAM usage: {gpu_stats['current'] / 1e9:.2f} GB")
print(f"Peak VRAM usage: {gpu_stats['peak'] / 1e9:.2f} GB")
    """
}
|
|
|
|
|
|
// Group MANE transcripts into 524288bp windows ("clusters") that Borzoi can score
// in a single pass. Transcripts too large for one window are written separately
// to prot_bigger.csv. Re-run only when the list of predicted RNAs changes.
process CREATE_PROTEIN_CLUSTER {
    container "${params.container_borzoi}"
    containerOptions "${params.containerOptions}"
    debug true

    input:
    path MANE   // MANE transcript summary (TSV): chrom, chr_start, chr_end, chr_strand, symbol, ...

    output:
    path "prot_bigger.csv"      // transcripts wider than a single 524288bp window
    path "prot_subset.pickle"   // list of [window_start, window_end, symbols-in-window]

    script:
    """
#!/opt/conda/envs/borzoi/bin/python
# Use this if new RNA transcripts need to be added to the list of predicted RNAs
import numpy as np
import pandas as pd
import pickle

data = pd.read_csv("${MANE}", sep='\t')

data_subset = []
data_bigger = []
for ch in np.unique(data['chrom']):
    data_working = data[data['chrom']==ch]
    arg = np.argsort(np.array(data_working['chr_start']).astype('int'))
    data_working = data_working.iloc[arg]
    #protein start position with regulatory elements: minus 1000 from start for + strand, plus 1000 to end for - strand
    #add (16*32) as buffer because borzoi will crop it
    prot_start = np.array(data_working['chr_start']).astype('int') - (16*32) - np.array(data_working['chr_strand'] == '+') * 1000
    prot_end = np.array(data_working['chr_end']).astype('int') + (16*32) + np.array(data_working['chr_strand'] == '-') * 1000
    input_start = prot_start
    input_end = input_start + 524288
    #greedily anchor a window at the left-most remaining transcript and absorb
    #every transcript that fits entirely inside it
    while len(input_start) > 0:
        st = input_start[0]
        en = input_end[0]
        mask = (prot_start >= st) & (prot_end <= en)
        data_mask = data_working[mask]
        data_subset.append([st, en, data_mask['symbol']])
        #ncbiRefSeq_subset.append(ncbiRefSeq_working[mask])
        #the anchoring transcript itself does not fit into one window:
        #record it as "bigger" and force its removal from the work set
        if mask[0] == False:
            data_bigger.append(data_working.iloc[0])
            mask[0] = True
        data_working = data_working[~mask]
        arg = np.argsort(np.array(data_working['chr_start']).astype('int'))
        data_working = data_working.iloc[arg]
        prot_start = np.array(data_working['chr_start']).astype('int') - (16*32) - np.array(data_working['chr_strand'] == '+') * 1000
        prot_end = np.array(data_working['chr_end']).astype('int') + (16*32) + np.array(data_working['chr_strand'] == '-') * 1000
        input_start = prot_start
        input_end = input_start + 524288

prot_bigger = pd.DataFrame(data_bigger)

prot_bigger.to_csv('prot_bigger.csv')
with open("prot_subset.pickle", "wb") as fp:
    pickle.dump(data_subset, fp)
# Removed GPU memory reporting copy-pasted from PREDICT_EXPRESSION: tensorflow is
# never imported in this script (it raised NameError after the outputs were
# written, failing the task) and this process requests no accelerator.
    """
}
|