nextflow.enable.dsl=2 process PREDICT_EXPRESSION_NO_MUTATIONS { container "${params.container_borzoi}" containerOptions "${params.containerOptions}" debug true maxForks 1 input: path MANE output: path "TPM_NO_MUTATIONS.csv", emit: expression_output script: """ #!/opt/conda/envs/borzoi/bin/python #Predict expression of reference genom import json import os import time import warnings import pickle from itertools import compress import h5py import matplotlib.pyplot as plt import matplotlib.patches as patches import numpy as np import pandas as pd import pysam import pyfaidx import tensorflow as tf from baskerville import seqnn from baskerville import gene as bgene from baskerville import dna import sys sys.path.append( '/home/omic/borzoi' ) from examples.borzoi_helpers import * #Reference protein dna sequence from MANE dataset prot_bigger = pd.read_csv("/home/omic/borzoi/prot_bigger.csv") with open("/home/omic/borzoi/prot_subset.pickle", "rb") as fp: prot_subset = pickle.load(fp) MANE_data = pd.read_csv("${MANE}", sep = '\t') batch_size = 4 #Model configuration params_file = '/home/omic/borzoi/examples/params_pred.json' targets_file = '/home/omic/borzoi/examples/targets_gtex.txt' #Subset of targets_human.txt n_folds = 1 #4 #To use only one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'. rc = True #Average across reverse-complement prediction #Read model parameters with open(params_file) as params_open : params = json.load(params_open) params_model = params['model'] params_train = params['train'] #Read targets targets_df = pd.read_csv(targets_file, index_col=0, sep='\t') target_index = targets_df.index #Create local index of strand_pair (relative to sliced targets) if rc : strand_pair = targets_df.strand_pair target_slice_dict = {ix : i for i, ix in enumerate(target_index.values.tolist())} slice_pair = np.array([ target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist() ], dtype='int32') #Initialize model ensemble models = [] for fold_ix in range(n_folds) : model_file = "/home/omic/borzoi/saved_models/f" + str(fold_ix) + "/model0_best.h5" seqnn_model = seqnn.SeqNN(params_model) seqnn_model.restore(model_file, 0) seqnn_model.build_slice(target_index) if rc : seqnn_model.strand_pair.append(slice_pair) seqnn_model.build_ensemble(rc, [0]) #changed '0' to [0] models.append(seqnn_model) fasta_open = pysam.Fastafile('/home/omic/borzoi/hg38.fa') #Make predictions/ run model def predict_tracks(models, sequence_one_hot): predicted_tracks = [] for fold_ix in range(len(models)): yh = models[fold_ix](sequence_one_hot)[:, None, ...].astype("float16") predicted_tracks.append(yh) predicted_tracks = np.concatenate(predicted_tracks, axis=1) return predicted_tracks #calculate TPM from borzoi def CalTPM(borzoi_data, start, cluster, prot_data, targets_df, plot = False): TPM_list = [] #loop over all protrin in cluster for i in range(len(cluster)): #get exon start and end ex_st = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonStarts'])[0].split(',')] ex_en = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonEnds'])[0].split(',')] #exon bool mask exon_mask = np.zeros(borzoi_data.shape[-2]) for s,n in zip(ex_st,ex_en): exon_mask = exon_mask + ((np.arange(borzoi_data.shape[-2]) >= s) & (np.arange(borzoi_data.shape[-2]) <= n)) #protrin TPM per person per tissue TPM_per_tissue_replicates = np.sum(borzoi_data[:,exon_mask== 1], axis = 1) #Plot proteins with exon marks if needed if plot == True: #Will plot only first adipose_tissue borzoi_data[0,:,x] change x for different tissue plt.plot(borzoi_data[0,:,0]) plt.vlines(x = ex_st, ymin=0, ymax=3.5, colors='red', ls='--', lw=2, label='vline_multiple - full height') plt.vlines(x = ex_en, ymin=0, ymax=3.5, colors='blue', ls='--', lw=2, label='vline_multiple - full height') plt.xlim(ex_st[0]-100, ex_en[-1]+100) plt.show() #Get average for tissue replicates TPM_per_tissue = [np.mean(i) for i in np.split(TPM_per_tissue_replicates[0], np.unique(targets_df['description'], return_index=True)[1][1:])] TPM_list.append(TPM_per_tissue) #cretae Datafreame TPM_dataframe = pd.DataFrame(TPM_list,cluster,np.unique(targets_df['description'], return_index=True)[0]) return(TPM_dataframe) #Protrin cluster list of list protein_clusters = [np.array(i[2]) for i in prot_subset] #all proroteins present in dataset all_reference_proteins = list(pd.concat([pd.concat([i[-1] for i in prot_subset]), prot_bigger['symbol']])) #use variable names from mutation proteins_with_mutations = all_reference_proteins proteins_with_mutations_working = proteins_with_mutations TPM = [] #run until the expression of all proteins is predicted while len(proteins_with_mutations_working) > 0: TPM_dfs = [] sequences_one_hot_muts = [] st = [] cl = [] #append proteins to a list until equal to batch size if protein is smaller, if it's big just run borzoi for it (don't append) while len(sequences_one_hot_muts) < batch_size and len(proteins_with_mutations_working) > 0: #get work protein protein = proteins_with_mutations_working[0] #print(protein) #get cluster mask = [protein in i for i in protein_clusters] cluster = list(compress(protein_clusters, mask)) #run borzoi for big proteins if protein in np.array(prot_bigger['symbol']): sequences_one_hot_muts_big = [] proteins_with_mutations_working = proteins_with_mutations_working[1:] protein_data = prot_bigger[prot_bigger['symbol'] == protein] prot_start = np.array(protein_data['chr_start']).astype('int')[0] - (16*32) - np.array(protein_data['chr_strand'] == '+')[0] * 1000 prot_end = np.array(protein_data['chr_end']).astype('int')[0] + (16*32) + np.array(protein_data['chr_strand'] == '-')[0] * 1000 chrom = protein_data.iloc[0]['chrom'].split('_')[0] star = prot_start st_big = star while star < prot_end: end = star + 524288 sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288) sequences_one_hot_muts_big.append(sequence_one_hot_wt) star = end - (32*32) sequences_one_hot_muts_big = np.array(sequences_one_hot_muts_big) #if number of protein splits is begger than batch size #print(sequences_one_hot_muts_big.shape) if sequences_one_hot_muts_big.shape[0] > batch_size: borzoi_pred_list = [] for seq_slice in np.array_split(sequences_one_hot_muts_big, np.ceil(sequences_one_hot_muts_big.shape[0]/batch_size)): borzoi_pred_list.append(predict_tracks(models, seq_slice)) y_mut = np.concatenate(borzoi_pred_list) else: y_mut = predict_tracks(models, sequences_one_hot_muts_big) y_mut = np.reshape(y_mut, [1,1,-1,89]) TPM.append(CalTPM(y_mut[0], st_big, [protein], MANE_data, targets_df)) #np.save('expression_predictions_%s.npy' %protein, y_mut) else: #append to a list of proteins to run #get star and end of the cluste star, end = (list(compress(prot_subset, mask))[0][:2]) #get mutated proteins in the cluster mask = [i in cluster[0] for i in proteins_with_mutations_working] proteins_in_cluster = list(compress(proteins_with_mutations_working, mask)) #remove cluster proteins from the ptoein list proteins_with_mutations_working = list(compress(proteins_with_mutations_working, ~np.array(mask))) chrom = MANE_data[MANE_data['symbol'] == proteins_in_cluster[0]].iloc[0]['chrom'].split('_')[0] sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288) sequences_one_hot_muts.append(sequence_one_hot_wt) st.append(star) cl.append(cluster) ### Test wt #sequences_one_hot_muts.append(sequence_one_hot_wt) ### sequences_one_hot_muts = np.array(sequences_one_hot_muts) #run borzoi for smaller proteins, if list is empty isn't empty(can be empty for last step) if sequences_one_hot_muts.shape != (0,): y_mut = predict_tracks(models, sequences_one_hot_muts) for i in range(len(y_mut)): TPM_dfs.append(CalTPM(y_mut[i], st[i], cl[i][0], MANE_data, targets_df)) TPM_dfs = pd.concat(TPM_dfs) TPM.append(TPM_dfs) #np.save('expression_predictions_%s.npy' %protein, y_mut) TPM = pd.concat(TPM) TPM.to_csv('TPM_NO_MUTATIONS.csv') """ }