Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
209 lines
9.6 KiB
Plaintext
209 lines
9.6 KiB
Plaintext
nextflow.enable.dsl=2
|
|
|
|
process PREDICT_EXPRESSION_NO_MUTATIONS {
|
|
container "${params.container_borzoi}"
|
|
containerOptions "${params.containerOptions}"
|
|
debug true
|
|
maxForks 1
|
|
|
|
input:
|
|
path MANE
|
|
|
|
output:
|
|
path "TPM_NO_MUTATIONS.csv", emit: expression_output
|
|
|
|
script:
|
|
"""
|
|
#!/opt/conda/envs/borzoi/bin/python
|
|
#Predict expression of reference genom
|
|
import json
|
|
import os
|
|
import time
|
|
import warnings
|
|
import pickle
|
|
from itertools import compress
|
|
|
|
import h5py
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.patches as patches
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pysam
|
|
import pyfaidx
|
|
import tensorflow as tf
|
|
|
|
from baskerville import seqnn
|
|
from baskerville import gene as bgene
|
|
from baskerville import dna
|
|
|
|
import sys
|
|
sys.path.append( '/home/omic/borzoi' )
|
|
from examples.borzoi_helpers import *
|
|
|
|
#Reference protein dna sequence from MANE dataset
|
|
prot_bigger = pd.read_csv("/home/omic/borzoi/prot_bigger.csv")
|
|
with open("/home/omic/borzoi/prot_subset.pickle", "rb") as fp:
|
|
prot_subset = pickle.load(fp)
|
|
MANE_data = pd.read_csv("${MANE}", sep = '\t')
|
|
|
|
batch_size = 4
|
|
|
|
#Model configuration
|
|
params_file = '/home/omic/borzoi/examples/params_pred.json'
|
|
targets_file = '/home/omic/borzoi/examples/targets_gtex.txt' #Subset of targets_human.txt
|
|
n_folds = 1 #4 #To use only one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'.
|
|
rc = True #Average across reverse-complement prediction
|
|
|
|
#Read model parameters
|
|
with open(params_file) as params_open :
|
|
params = json.load(params_open)
|
|
params_model = params['model']
|
|
params_train = params['train']
|
|
|
|
#Read targets
|
|
targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
|
|
target_index = targets_df.index
|
|
|
|
#Create local index of strand_pair (relative to sliced targets)
|
|
if rc :
|
|
strand_pair = targets_df.strand_pair
|
|
target_slice_dict = {ix : i for i, ix in enumerate(target_index.values.tolist())}
|
|
slice_pair = np.array([
|
|
target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist()
|
|
], dtype='int32')
|
|
|
|
#Initialize model ensemble
|
|
models = []
|
|
for fold_ix in range(n_folds) :
|
|
model_file = "/home/omic/borzoi/saved_models/f" + str(fold_ix) + "/model0_best.h5"
|
|
seqnn_model = seqnn.SeqNN(params_model)
|
|
seqnn_model.restore(model_file, 0)
|
|
seqnn_model.build_slice(target_index)
|
|
if rc :
|
|
seqnn_model.strand_pair.append(slice_pair)
|
|
seqnn_model.build_ensemble(rc, [0]) #changed '0' to [0]
|
|
models.append(seqnn_model)
|
|
fasta_open = pysam.Fastafile('/home/omic/borzoi/hg38.fa')
|
|
|
|
#Make predictions/ run model
|
|
def predict_tracks(models, sequence_one_hot):
|
|
predicted_tracks = []
|
|
for fold_ix in range(len(models)):
|
|
yh = models[fold_ix](sequence_one_hot)[:, None, ...].astype("float16")
|
|
predicted_tracks.append(yh)
|
|
predicted_tracks = np.concatenate(predicted_tracks, axis=1)
|
|
return predicted_tracks
|
|
|
|
#calculate TPM from borzoi
|
|
def CalTPM(borzoi_data, start, cluster, prot_data, targets_df, plot = False):
|
|
TPM_list = []
|
|
#loop over all protrin in cluster
|
|
for i in range(len(cluster)):
|
|
#get exon start and end
|
|
ex_st = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonStarts'])[0].split(',')]
|
|
ex_en = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonEnds'])[0].split(',')]
|
|
#exon bool mask
|
|
exon_mask = np.zeros(borzoi_data.shape[-2])
|
|
for s,n in zip(ex_st,ex_en):
|
|
exon_mask = exon_mask + ((np.arange(borzoi_data.shape[-2]) >= s) & (np.arange(borzoi_data.shape[-2]) <= n))
|
|
#protrin TPM per person per tissue
|
|
TPM_per_tissue_replicates = np.sum(borzoi_data[:,exon_mask== 1], axis = 1)
|
|
#Plot proteins with exon marks if needed
|
|
if plot == True:
|
|
#Will plot only first adipose_tissue borzoi_data[0,:,x] change x for different tissue
|
|
plt.plot(borzoi_data[0,:,0])
|
|
plt.vlines(x = ex_st, ymin=0, ymax=3.5, colors='red', ls='--', lw=2, label='vline_multiple - full height')
|
|
plt.vlines(x = ex_en, ymin=0, ymax=3.5, colors='blue', ls='--', lw=2, label='vline_multiple - full height')
|
|
plt.xlim(ex_st[0]-100, ex_en[-1]+100)
|
|
plt.show()
|
|
#Get average for tissue replicates
|
|
TPM_per_tissue = [np.mean(i) for i in np.split(TPM_per_tissue_replicates[0], np.unique(targets_df['description'], return_index=True)[1][1:])]
|
|
TPM_list.append(TPM_per_tissue)
|
|
#cretae Datafreame
|
|
TPM_dataframe = pd.DataFrame(TPM_list,cluster,np.unique(targets_df['description'], return_index=True)[0])
|
|
return(TPM_dataframe)
|
|
|
|
#Protrin cluster list of list
|
|
protein_clusters = [np.array(i[2]) for i in prot_subset]
|
|
|
|
#all proroteins present in dataset
|
|
all_reference_proteins = list(pd.concat([pd.concat([i[-1] for i in prot_subset]), prot_bigger['symbol']]))
|
|
#use variable names from mutation
|
|
proteins_with_mutations = all_reference_proteins
|
|
proteins_with_mutations_working = proteins_with_mutations
|
|
|
|
TPM = []
|
|
#run until the expression of all proteins is predicted
|
|
while len(proteins_with_mutations_working) > 0:
|
|
TPM_dfs = []
|
|
sequences_one_hot_muts = []
|
|
st = []
|
|
cl = []
|
|
#append proteins to a list until equal to batch size if protein is smaller, if it's big just run borzoi for it (don't append)
|
|
while len(sequences_one_hot_muts) < batch_size and len(proteins_with_mutations_working) > 0:
|
|
#get work protein
|
|
protein = proteins_with_mutations_working[0]
|
|
#print(protein)
|
|
#get cluster
|
|
mask = [protein in i for i in protein_clusters]
|
|
cluster = list(compress(protein_clusters, mask))
|
|
#run borzoi for big proteins
|
|
if protein in np.array(prot_bigger['symbol']):
|
|
sequences_one_hot_muts_big = []
|
|
proteins_with_mutations_working = proteins_with_mutations_working[1:]
|
|
protein_data = prot_bigger[prot_bigger['symbol'] == protein]
|
|
prot_start = np.array(protein_data['chr_start']).astype('int')[0] - (16*32) - np.array(protein_data['chr_strand'] == '+')[0] * 1000
|
|
prot_end = np.array(protein_data['chr_end']).astype('int')[0] + (16*32) + np.array(protein_data['chr_strand'] == '-')[0] * 1000
|
|
chrom = protein_data.iloc[0]['chrom'].split('_')[0]
|
|
star = prot_start
|
|
st_big = star
|
|
while star < prot_end:
|
|
end = star + 524288
|
|
sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
|
|
sequences_one_hot_muts_big.append(sequence_one_hot_wt)
|
|
star = end - (32*32)
|
|
sequences_one_hot_muts_big = np.array(sequences_one_hot_muts_big)
|
|
#if number of protein splits is begger than batch size
|
|
#print(sequences_one_hot_muts_big.shape)
|
|
if sequences_one_hot_muts_big.shape[0] > batch_size:
|
|
borzoi_pred_list = []
|
|
for seq_slice in np.array_split(sequences_one_hot_muts_big, np.ceil(sequences_one_hot_muts_big.shape[0]/batch_size)):
|
|
borzoi_pred_list.append(predict_tracks(models, seq_slice))
|
|
y_mut = np.concatenate(borzoi_pred_list)
|
|
else:
|
|
y_mut = predict_tracks(models, sequences_one_hot_muts_big)
|
|
y_mut = np.reshape(y_mut, [1,1,-1,89])
|
|
TPM.append(CalTPM(y_mut[0], st_big, [protein], MANE_data, targets_df))
|
|
#np.save('expression_predictions_%s.npy' %protein, y_mut)
|
|
else:
|
|
#append to a list of proteins to run
|
|
#get star and end of the cluste
|
|
star, end = (list(compress(prot_subset, mask))[0][:2])
|
|
#get mutated proteins in the cluster
|
|
mask = [i in cluster[0] for i in proteins_with_mutations_working]
|
|
proteins_in_cluster = list(compress(proteins_with_mutations_working, mask))
|
|
#remove cluster proteins from the ptoein list
|
|
proteins_with_mutations_working = list(compress(proteins_with_mutations_working, ~np.array(mask)))
|
|
chrom = MANE_data[MANE_data['symbol'] == proteins_in_cluster[0]].iloc[0]['chrom'].split('_')[0]
|
|
sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
|
|
sequences_one_hot_muts.append(sequence_one_hot_wt)
|
|
st.append(star)
|
|
cl.append(cluster)
|
|
### Test wt
|
|
#sequences_one_hot_muts.append(sequence_one_hot_wt)
|
|
###
|
|
sequences_one_hot_muts = np.array(sequences_one_hot_muts)
|
|
#run borzoi for smaller proteins, if list is empty isn't empty(can be empty for last step)
|
|
if sequences_one_hot_muts.shape != (0,):
|
|
y_mut = predict_tracks(models, sequences_one_hot_muts)
|
|
for i in range(len(y_mut)):
|
|
TPM_dfs.append(CalTPM(y_mut[i], st[i], cl[i][0], MANE_data, targets_df))
|
|
TPM_dfs = pd.concat(TPM_dfs)
|
|
TPM.append(TPM_dfs)
|
|
#np.save('expression_predictions_%s.npy' %protein, y_mut)
|
|
TPM = pd.concat(TPM)
|
|
TPM.to_csv('TPM_NO_MUTATIONS.csv')
|
|
"""
|
|
}
|
|
|