Files
digital-patients/main_no_mutations.nf
Olamide Isreal 9e6a16c19b Initial commit: digital-patients pipeline (clean, no large files)
Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
2026-03-26 15:15:23 +01:00

209 lines
9.6 KiB
Plaintext

nextflow.enable.dsl=2
process PREDICT_EXPRESSION_NO_MUTATIONS {
container "${params.container_borzoi}"
containerOptions "${params.containerOptions}"
debug true
maxForks 1
input:
path MANE
output:
path "TPM_NO_MUTATIONS.csv", emit: expression_output
script:
"""
#!/opt/conda/envs/borzoi/bin/python
#Predict expression of reference genom
import json
import os
import time
import warnings
import pickle
from itertools import compress
import h5py
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import pandas as pd
import pysam
import pyfaidx
import tensorflow as tf
from baskerville import seqnn
from baskerville import gene as bgene
from baskerville import dna
import sys
sys.path.append( '/home/omic/borzoi' )
from examples.borzoi_helpers import *
#Reference protein dna sequence from MANE dataset
prot_bigger = pd.read_csv("/home/omic/borzoi/prot_bigger.csv")
with open("/home/omic/borzoi/prot_subset.pickle", "rb") as fp:
prot_subset = pickle.load(fp)
MANE_data = pd.read_csv("${MANE}", sep = '\t')
batch_size = 4
#Model configuration
params_file = '/home/omic/borzoi/examples/params_pred.json'
targets_file = '/home/omic/borzoi/examples/targets_gtex.txt' #Subset of targets_human.txt
n_folds = 1 #4 #To use only one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'.
rc = True #Average across reverse-complement prediction
#Read model parameters
with open(params_file) as params_open :
params = json.load(params_open)
params_model = params['model']
params_train = params['train']
#Read targets
targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
target_index = targets_df.index
#Create local index of strand_pair (relative to sliced targets)
if rc :
strand_pair = targets_df.strand_pair
target_slice_dict = {ix : i for i, ix in enumerate(target_index.values.tolist())}
slice_pair = np.array([
target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist()
], dtype='int32')
#Initialize model ensemble
models = []
for fold_ix in range(n_folds) :
model_file = "/home/omic/borzoi/saved_models/f" + str(fold_ix) + "/model0_best.h5"
seqnn_model = seqnn.SeqNN(params_model)
seqnn_model.restore(model_file, 0)
seqnn_model.build_slice(target_index)
if rc :
seqnn_model.strand_pair.append(slice_pair)
seqnn_model.build_ensemble(rc, [0]) #changed '0' to [0]
models.append(seqnn_model)
fasta_open = pysam.Fastafile('/home/omic/borzoi/hg38.fa')
#Make predictions/ run model
def predict_tracks(models, sequence_one_hot):
predicted_tracks = []
for fold_ix in range(len(models)):
yh = models[fold_ix](sequence_one_hot)[:, None, ...].astype("float16")
predicted_tracks.append(yh)
predicted_tracks = np.concatenate(predicted_tracks, axis=1)
return predicted_tracks
#calculate TPM from borzoi
def CalTPM(borzoi_data, start, cluster, prot_data, targets_df, plot = False):
TPM_list = []
#loop over all protrin in cluster
for i in range(len(cluster)):
#get exon start and end
ex_st = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonStarts'])[0].split(',')]
ex_en = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonEnds'])[0].split(',')]
#exon bool mask
exon_mask = np.zeros(borzoi_data.shape[-2])
for s,n in zip(ex_st,ex_en):
exon_mask = exon_mask + ((np.arange(borzoi_data.shape[-2]) >= s) & (np.arange(borzoi_data.shape[-2]) <= n))
#protrin TPM per person per tissue
TPM_per_tissue_replicates = np.sum(borzoi_data[:,exon_mask== 1], axis = 1)
#Plot proteins with exon marks if needed
if plot == True:
#Will plot only first adipose_tissue borzoi_data[0,:,x] change x for different tissue
plt.plot(borzoi_data[0,:,0])
plt.vlines(x = ex_st, ymin=0, ymax=3.5, colors='red', ls='--', lw=2, label='vline_multiple - full height')
plt.vlines(x = ex_en, ymin=0, ymax=3.5, colors='blue', ls='--', lw=2, label='vline_multiple - full height')
plt.xlim(ex_st[0]-100, ex_en[-1]+100)
plt.show()
#Get average for tissue replicates
TPM_per_tissue = [np.mean(i) for i in np.split(TPM_per_tissue_replicates[0], np.unique(targets_df['description'], return_index=True)[1][1:])]
TPM_list.append(TPM_per_tissue)
#cretae Datafreame
TPM_dataframe = pd.DataFrame(TPM_list,cluster,np.unique(targets_df['description'], return_index=True)[0])
return(TPM_dataframe)
#Protrin cluster list of list
protein_clusters = [np.array(i[2]) for i in prot_subset]
#all proroteins present in dataset
all_reference_proteins = list(pd.concat([pd.concat([i[-1] for i in prot_subset]), prot_bigger['symbol']]))
#use variable names from mutation
proteins_with_mutations = all_reference_proteins
proteins_with_mutations_working = proteins_with_mutations
TPM = []
#run until the expression of all proteins is predicted
while len(proteins_with_mutations_working) > 0:
TPM_dfs = []
sequences_one_hot_muts = []
st = []
cl = []
#append proteins to a list until equal to batch size if protein is smaller, if it's big just run borzoi for it (don't append)
while len(sequences_one_hot_muts) < batch_size and len(proteins_with_mutations_working) > 0:
#get work protein
protein = proteins_with_mutations_working[0]
#print(protein)
#get cluster
mask = [protein in i for i in protein_clusters]
cluster = list(compress(protein_clusters, mask))
#run borzoi for big proteins
if protein in np.array(prot_bigger['symbol']):
sequences_one_hot_muts_big = []
proteins_with_mutations_working = proteins_with_mutations_working[1:]
protein_data = prot_bigger[prot_bigger['symbol'] == protein]
prot_start = np.array(protein_data['chr_start']).astype('int')[0] - (16*32) - np.array(protein_data['chr_strand'] == '+')[0] * 1000
prot_end = np.array(protein_data['chr_end']).astype('int')[0] + (16*32) + np.array(protein_data['chr_strand'] == '-')[0] * 1000
chrom = protein_data.iloc[0]['chrom'].split('_')[0]
star = prot_start
st_big = star
while star < prot_end:
end = star + 524288
sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
sequences_one_hot_muts_big.append(sequence_one_hot_wt)
star = end - (32*32)
sequences_one_hot_muts_big = np.array(sequences_one_hot_muts_big)
#if number of protein splits is begger than batch size
#print(sequences_one_hot_muts_big.shape)
if sequences_one_hot_muts_big.shape[0] > batch_size:
borzoi_pred_list = []
for seq_slice in np.array_split(sequences_one_hot_muts_big, np.ceil(sequences_one_hot_muts_big.shape[0]/batch_size)):
borzoi_pred_list.append(predict_tracks(models, seq_slice))
y_mut = np.concatenate(borzoi_pred_list)
else:
y_mut = predict_tracks(models, sequences_one_hot_muts_big)
y_mut = np.reshape(y_mut, [1,1,-1,89])
TPM.append(CalTPM(y_mut[0], st_big, [protein], MANE_data, targets_df))
#np.save('expression_predictions_%s.npy' %protein, y_mut)
else:
#append to a list of proteins to run
#get star and end of the cluste
star, end = (list(compress(prot_subset, mask))[0][:2])
#get mutated proteins in the cluster
mask = [i in cluster[0] for i in proteins_with_mutations_working]
proteins_in_cluster = list(compress(proteins_with_mutations_working, mask))
#remove cluster proteins from the ptoein list
proteins_with_mutations_working = list(compress(proteins_with_mutations_working, ~np.array(mask)))
chrom = MANE_data[MANE_data['symbol'] == proteins_in_cluster[0]].iloc[0]['chrom'].split('_')[0]
sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
sequences_one_hot_muts.append(sequence_one_hot_wt)
st.append(star)
cl.append(cluster)
### Test wt
#sequences_one_hot_muts.append(sequence_one_hot_wt)
###
sequences_one_hot_muts = np.array(sequences_one_hot_muts)
#run borzoi for smaller proteins, if list is empty isn't empty(can be empty for last step)
if sequences_one_hot_muts.shape != (0,):
y_mut = predict_tracks(models, sequences_one_hot_muts)
for i in range(len(y_mut)):
TPM_dfs.append(CalTPM(y_mut[i], st[i], cl[i][0], MANE_data, targets_df))
TPM_dfs = pd.concat(TPM_dfs)
TPM.append(TPM_dfs)
#np.save('expression_predictions_%s.npy' %protein, y_mut)
TPM = pd.concat(TPM)
TPM.to_csv('TPM_NO_MUTATIONS.csv')
"""
}