Initial commit: digital-patients pipeline (clean, no large files)
Large reference/model files are excluded from the repository; they will be staged to S3 or baked into Docker images.
This commit is contained in:
208
main_no_mutations.nf
Normal file
208
main_no_mutations.nf
Normal file
@@ -0,0 +1,208 @@
|
||||
nextflow.enable.dsl=2
|
||||
|
||||
process PREDICT_EXPRESSION_NO_MUTATIONS {
|
||||
container "${params.container_borzoi}"
|
||||
containerOptions "${params.containerOptions}"
|
||||
debug true
|
||||
maxForks 1
|
||||
|
||||
input:
|
||||
path MANE
|
||||
|
||||
output:
|
||||
path "TPM_NO_MUTATIONS.csv", emit: expression_output
|
||||
|
||||
script:
|
||||
"""
|
||||
#!/opt/conda/envs/borzoi/bin/python
|
||||
#Predict expression of reference genom
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import warnings
|
||||
import pickle
|
||||
from itertools import compress
|
||||
|
||||
import h5py
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.patches as patches
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pysam
|
||||
import pyfaidx
|
||||
import tensorflow as tf
|
||||
|
||||
from baskerville import seqnn
|
||||
from baskerville import gene as bgene
|
||||
from baskerville import dna
|
||||
|
||||
import sys
|
||||
sys.path.append( '/home/omic/borzoi' )
|
||||
from examples.borzoi_helpers import *
|
||||
|
||||
#Reference protein dna sequence from MANE dataset
|
||||
prot_bigger = pd.read_csv("/home/omic/borzoi/prot_bigger.csv")
|
||||
with open("/home/omic/borzoi/prot_subset.pickle", "rb") as fp:
|
||||
prot_subset = pickle.load(fp)
|
||||
MANE_data = pd.read_csv("${MANE}", sep = '\t')
|
||||
|
||||
batch_size = 4
|
||||
|
||||
#Model configuration
|
||||
params_file = '/home/omic/borzoi/examples/params_pred.json'
|
||||
targets_file = '/home/omic/borzoi/examples/targets_gtex.txt' #Subset of targets_human.txt
|
||||
n_folds = 1 #4 #To use only one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'.
|
||||
rc = True #Average across reverse-complement prediction
|
||||
|
||||
#Read model parameters
|
||||
with open(params_file) as params_open :
|
||||
params = json.load(params_open)
|
||||
params_model = params['model']
|
||||
params_train = params['train']
|
||||
|
||||
#Read targets
|
||||
targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
|
||||
target_index = targets_df.index
|
||||
|
||||
#Create local index of strand_pair (relative to sliced targets)
|
||||
if rc :
|
||||
strand_pair = targets_df.strand_pair
|
||||
target_slice_dict = {ix : i for i, ix in enumerate(target_index.values.tolist())}
|
||||
slice_pair = np.array([
|
||||
target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist()
|
||||
], dtype='int32')
|
||||
|
||||
#Initialize model ensemble
|
||||
models = []
|
||||
for fold_ix in range(n_folds) :
|
||||
model_file = "/home/omic/borzoi/saved_models/f" + str(fold_ix) + "/model0_best.h5"
|
||||
seqnn_model = seqnn.SeqNN(params_model)
|
||||
seqnn_model.restore(model_file, 0)
|
||||
seqnn_model.build_slice(target_index)
|
||||
if rc :
|
||||
seqnn_model.strand_pair.append(slice_pair)
|
||||
seqnn_model.build_ensemble(rc, [0]) #changed '0' to [0]
|
||||
models.append(seqnn_model)
|
||||
fasta_open = pysam.Fastafile('/home/omic/borzoi/hg38.fa')
|
||||
|
||||
#Make predictions/ run model
|
||||
def predict_tracks(models, sequence_one_hot):
|
||||
predicted_tracks = []
|
||||
for fold_ix in range(len(models)):
|
||||
yh = models[fold_ix](sequence_one_hot)[:, None, ...].astype("float16")
|
||||
predicted_tracks.append(yh)
|
||||
predicted_tracks = np.concatenate(predicted_tracks, axis=1)
|
||||
return predicted_tracks
|
||||
|
||||
#calculate TPM from borzoi
|
||||
def CalTPM(borzoi_data, start, cluster, prot_data, targets_df, plot = False):
|
||||
TPM_list = []
|
||||
#loop over all protrin in cluster
|
||||
for i in range(len(cluster)):
|
||||
#get exon start and end
|
||||
ex_st = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonStarts'])[0].split(',')]
|
||||
ex_en = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonEnds'])[0].split(',')]
|
||||
#exon bool mask
|
||||
exon_mask = np.zeros(borzoi_data.shape[-2])
|
||||
for s,n in zip(ex_st,ex_en):
|
||||
exon_mask = exon_mask + ((np.arange(borzoi_data.shape[-2]) >= s) & (np.arange(borzoi_data.shape[-2]) <= n))
|
||||
#protrin TPM per person per tissue
|
||||
TPM_per_tissue_replicates = np.sum(borzoi_data[:,exon_mask== 1], axis = 1)
|
||||
#Plot proteins with exon marks if needed
|
||||
if plot == True:
|
||||
#Will plot only first adipose_tissue borzoi_data[0,:,x] change x for different tissue
|
||||
plt.plot(borzoi_data[0,:,0])
|
||||
plt.vlines(x = ex_st, ymin=0, ymax=3.5, colors='red', ls='--', lw=2, label='vline_multiple - full height')
|
||||
plt.vlines(x = ex_en, ymin=0, ymax=3.5, colors='blue', ls='--', lw=2, label='vline_multiple - full height')
|
||||
plt.xlim(ex_st[0]-100, ex_en[-1]+100)
|
||||
plt.show()
|
||||
#Get average for tissue replicates
|
||||
TPM_per_tissue = [np.mean(i) for i in np.split(TPM_per_tissue_replicates[0], np.unique(targets_df['description'], return_index=True)[1][1:])]
|
||||
TPM_list.append(TPM_per_tissue)
|
||||
#cretae Datafreame
|
||||
TPM_dataframe = pd.DataFrame(TPM_list,cluster,np.unique(targets_df['description'], return_index=True)[0])
|
||||
return(TPM_dataframe)
|
||||
|
||||
#Protrin cluster list of list
|
||||
protein_clusters = [np.array(i[2]) for i in prot_subset]
|
||||
|
||||
#all proroteins present in dataset
|
||||
all_reference_proteins = list(pd.concat([pd.concat([i[-1] for i in prot_subset]), prot_bigger['symbol']]))
|
||||
#use variable names from mutation
|
||||
proteins_with_mutations = all_reference_proteins
|
||||
proteins_with_mutations_working = proteins_with_mutations
|
||||
|
||||
TPM = []
|
||||
#run until the expression of all proteins is predicted
|
||||
while len(proteins_with_mutations_working) > 0:
|
||||
TPM_dfs = []
|
||||
sequences_one_hot_muts = []
|
||||
st = []
|
||||
cl = []
|
||||
#append proteins to a list until equal to batch size if protein is smaller, if it's big just run borzoi for it (don't append)
|
||||
while len(sequences_one_hot_muts) < batch_size and len(proteins_with_mutations_working) > 0:
|
||||
#get work protein
|
||||
protein = proteins_with_mutations_working[0]
|
||||
#print(protein)
|
||||
#get cluster
|
||||
mask = [protein in i for i in protein_clusters]
|
||||
cluster = list(compress(protein_clusters, mask))
|
||||
#run borzoi for big proteins
|
||||
if protein in np.array(prot_bigger['symbol']):
|
||||
sequences_one_hot_muts_big = []
|
||||
proteins_with_mutations_working = proteins_with_mutations_working[1:]
|
||||
protein_data = prot_bigger[prot_bigger['symbol'] == protein]
|
||||
prot_start = np.array(protein_data['chr_start']).astype('int')[0] - (16*32) - np.array(protein_data['chr_strand'] == '+')[0] * 1000
|
||||
prot_end = np.array(protein_data['chr_end']).astype('int')[0] + (16*32) + np.array(protein_data['chr_strand'] == '-')[0] * 1000
|
||||
chrom = protein_data.iloc[0]['chrom'].split('_')[0]
|
||||
star = prot_start
|
||||
st_big = star
|
||||
while star < prot_end:
|
||||
end = star + 524288
|
||||
sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
|
||||
sequences_one_hot_muts_big.append(sequence_one_hot_wt)
|
||||
star = end - (32*32)
|
||||
sequences_one_hot_muts_big = np.array(sequences_one_hot_muts_big)
|
||||
#if number of protein splits is begger than batch size
|
||||
#print(sequences_one_hot_muts_big.shape)
|
||||
if sequences_one_hot_muts_big.shape[0] > batch_size:
|
||||
borzoi_pred_list = []
|
||||
for seq_slice in np.array_split(sequences_one_hot_muts_big, np.ceil(sequences_one_hot_muts_big.shape[0]/batch_size)):
|
||||
borzoi_pred_list.append(predict_tracks(models, seq_slice))
|
||||
y_mut = np.concatenate(borzoi_pred_list)
|
||||
else:
|
||||
y_mut = predict_tracks(models, sequences_one_hot_muts_big)
|
||||
y_mut = np.reshape(y_mut, [1,1,-1,89])
|
||||
TPM.append(CalTPM(y_mut[0], st_big, [protein], MANE_data, targets_df))
|
||||
#np.save('expression_predictions_%s.npy' %protein, y_mut)
|
||||
else:
|
||||
#append to a list of proteins to run
|
||||
#get star and end of the cluste
|
||||
star, end = (list(compress(prot_subset, mask))[0][:2])
|
||||
#get mutated proteins in the cluster
|
||||
mask = [i in cluster[0] for i in proteins_with_mutations_working]
|
||||
proteins_in_cluster = list(compress(proteins_with_mutations_working, mask))
|
||||
#remove cluster proteins from the ptoein list
|
||||
proteins_with_mutations_working = list(compress(proteins_with_mutations_working, ~np.array(mask)))
|
||||
chrom = MANE_data[MANE_data['symbol'] == proteins_in_cluster[0]].iloc[0]['chrom'].split('_')[0]
|
||||
sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
|
||||
sequences_one_hot_muts.append(sequence_one_hot_wt)
|
||||
st.append(star)
|
||||
cl.append(cluster)
|
||||
### Test wt
|
||||
#sequences_one_hot_muts.append(sequence_one_hot_wt)
|
||||
###
|
||||
sequences_one_hot_muts = np.array(sequences_one_hot_muts)
|
||||
#run borzoi for smaller proteins, if list is empty isn't empty(can be empty for last step)
|
||||
if sequences_one_hot_muts.shape != (0,):
|
||||
y_mut = predict_tracks(models, sequences_one_hot_muts)
|
||||
for i in range(len(y_mut)):
|
||||
TPM_dfs.append(CalTPM(y_mut[i], st[i], cl[i][0], MANE_data, targets_df))
|
||||
TPM_dfs = pd.concat(TPM_dfs)
|
||||
TPM.append(TPM_dfs)
|
||||
#np.save('expression_predictions_%s.npy' %protein, y_mut)
|
||||
TPM = pd.concat(TPM)
|
||||
TPM.to_csv('TPM_NO_MUTATIONS.csv')
|
||||
"""
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user