Initial commit: digital-patients pipeline (clean, no large files)
Large reference/model files are excluded from the repository; they will be staged to S3 or baked into Docker images.
This commit is contained in:
208
main_no_mutations.nf
Normal file
208
main_no_mutations.nf
Normal file
@@ -0,0 +1,208 @@
|
||||
nextflow.enable.dsl=2
|
||||
|
||||
process PREDICT_EXPRESSION_NO_MUTATIONS {
|
||||
container "${params.container_borzoi}"
|
||||
containerOptions "${params.containerOptions}"
|
||||
debug true
|
||||
maxForks 1
|
||||
|
||||
input:
|
||||
path MANE
|
||||
|
||||
output:
|
||||
path "TPM_NO_MUTATIONS.csv", emit: expression_output
|
||||
|
||||
script:
|
||||
"""
|
||||
#!/opt/conda/envs/borzoi/bin/python
|
||||
#Predict expression of reference genom
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import warnings
|
||||
import pickle
|
||||
from itertools import compress
|
||||
|
||||
import h5py
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.patches as patches
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pysam
|
||||
import pyfaidx
|
||||
import tensorflow as tf
|
||||
|
||||
from baskerville import seqnn
|
||||
from baskerville import gene as bgene
|
||||
from baskerville import dna
|
||||
|
||||
import sys
|
||||
sys.path.append( '/home/omic/borzoi' )
|
||||
from examples.borzoi_helpers import *
|
||||
|
||||
#Reference protein dna sequence from MANE dataset
|
||||
prot_bigger = pd.read_csv("/home/omic/borzoi/prot_bigger.csv")
|
||||
with open("/home/omic/borzoi/prot_subset.pickle", "rb") as fp:
|
||||
prot_subset = pickle.load(fp)
|
||||
MANE_data = pd.read_csv("${MANE}", sep = '\t')
|
||||
|
||||
batch_size = 4
|
||||
|
||||
#Model configuration
|
||||
params_file = '/home/omic/borzoi/examples/params_pred.json'
|
||||
targets_file = '/home/omic/borzoi/examples/targets_gtex.txt' #Subset of targets_human.txt
|
||||
n_folds = 1 #4 #To use only one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'.
|
||||
rc = True #Average across reverse-complement prediction
|
||||
|
||||
#Read model parameters
|
||||
with open(params_file) as params_open :
|
||||
params = json.load(params_open)
|
||||
params_model = params['model']
|
||||
params_train = params['train']
|
||||
|
||||
#Read targets
|
||||
targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
|
||||
target_index = targets_df.index
|
||||
|
||||
#Create local index of strand_pair (relative to sliced targets)
|
||||
if rc :
|
||||
strand_pair = targets_df.strand_pair
|
||||
target_slice_dict = {ix : i for i, ix in enumerate(target_index.values.tolist())}
|
||||
slice_pair = np.array([
|
||||
target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist()
|
||||
], dtype='int32')
|
||||
|
||||
#Initialize model ensemble
|
||||
models = []
|
||||
for fold_ix in range(n_folds) :
|
||||
model_file = "/home/omic/borzoi/saved_models/f" + str(fold_ix) + "/model0_best.h5"
|
||||
seqnn_model = seqnn.SeqNN(params_model)
|
||||
seqnn_model.restore(model_file, 0)
|
||||
seqnn_model.build_slice(target_index)
|
||||
if rc :
|
||||
seqnn_model.strand_pair.append(slice_pair)
|
||||
seqnn_model.build_ensemble(rc, [0]) #changed '0' to [0]
|
||||
models.append(seqnn_model)
|
||||
fasta_open = pysam.Fastafile('/home/omic/borzoi/hg38.fa')
|
||||
|
||||
#Make predictions/ run model
|
||||
def predict_tracks(models, sequence_one_hot):
|
||||
predicted_tracks = []
|
||||
for fold_ix in range(len(models)):
|
||||
yh = models[fold_ix](sequence_one_hot)[:, None, ...].astype("float16")
|
||||
predicted_tracks.append(yh)
|
||||
predicted_tracks = np.concatenate(predicted_tracks, axis=1)
|
||||
return predicted_tracks
|
||||
|
||||
#calculate TPM from borzoi
|
||||
def CalTPM(borzoi_data, start, cluster, prot_data, targets_df, plot = False):
|
||||
TPM_list = []
|
||||
#loop over all protrin in cluster
|
||||
for i in range(len(cluster)):
|
||||
#get exon start and end
|
||||
ex_st = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonStarts'])[0].split(',')]
|
||||
ex_en = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonEnds'])[0].split(',')]
|
||||
#exon bool mask
|
||||
exon_mask = np.zeros(borzoi_data.shape[-2])
|
||||
for s,n in zip(ex_st,ex_en):
|
||||
exon_mask = exon_mask + ((np.arange(borzoi_data.shape[-2]) >= s) & (np.arange(borzoi_data.shape[-2]) <= n))
|
||||
#protrin TPM per person per tissue
|
||||
TPM_per_tissue_replicates = np.sum(borzoi_data[:,exon_mask== 1], axis = 1)
|
||||
#Plot proteins with exon marks if needed
|
||||
if plot == True:
|
||||
#Will plot only first adipose_tissue borzoi_data[0,:,x] change x for different tissue
|
||||
plt.plot(borzoi_data[0,:,0])
|
||||
plt.vlines(x = ex_st, ymin=0, ymax=3.5, colors='red', ls='--', lw=2, label='vline_multiple - full height')
|
||||
plt.vlines(x = ex_en, ymin=0, ymax=3.5, colors='blue', ls='--', lw=2, label='vline_multiple - full height')
|
||||
plt.xlim(ex_st[0]-100, ex_en[-1]+100)
|
||||
plt.show()
|
||||
#Get average for tissue replicates
|
||||
TPM_per_tissue = [np.mean(i) for i in np.split(TPM_per_tissue_replicates[0], np.unique(targets_df['description'], return_index=True)[1][1:])]
|
||||
TPM_list.append(TPM_per_tissue)
|
||||
#cretae Datafreame
|
||||
TPM_dataframe = pd.DataFrame(TPM_list,cluster,np.unique(targets_df['description'], return_index=True)[0])
|
||||
return(TPM_dataframe)
|
||||
|
||||
#Protrin cluster list of list
|
||||
protein_clusters = [np.array(i[2]) for i in prot_subset]
|
||||
|
||||
#all proroteins present in dataset
|
||||
all_reference_proteins = list(pd.concat([pd.concat([i[-1] for i in prot_subset]), prot_bigger['symbol']]))
|
||||
#use variable names from mutation
|
||||
proteins_with_mutations = all_reference_proteins
|
||||
proteins_with_mutations_working = proteins_with_mutations
|
||||
|
||||
TPM = []
|
||||
#run until the expression of all proteins is predicted
|
||||
while len(proteins_with_mutations_working) > 0:
|
||||
TPM_dfs = []
|
||||
sequences_one_hot_muts = []
|
||||
st = []
|
||||
cl = []
|
||||
#append proteins to a list until equal to batch size if protein is smaller, if it's big just run borzoi for it (don't append)
|
||||
while len(sequences_one_hot_muts) < batch_size and len(proteins_with_mutations_working) > 0:
|
||||
#get work protein
|
||||
protein = proteins_with_mutations_working[0]
|
||||
#print(protein)
|
||||
#get cluster
|
||||
mask = [protein in i for i in protein_clusters]
|
||||
cluster = list(compress(protein_clusters, mask))
|
||||
#run borzoi for big proteins
|
||||
if protein in np.array(prot_bigger['symbol']):
|
||||
sequences_one_hot_muts_big = []
|
||||
proteins_with_mutations_working = proteins_with_mutations_working[1:]
|
||||
protein_data = prot_bigger[prot_bigger['symbol'] == protein]
|
||||
prot_start = np.array(protein_data['chr_start']).astype('int')[0] - (16*32) - np.array(protein_data['chr_strand'] == '+')[0] * 1000
|
||||
prot_end = np.array(protein_data['chr_end']).astype('int')[0] + (16*32) + np.array(protein_data['chr_strand'] == '-')[0] * 1000
|
||||
chrom = protein_data.iloc[0]['chrom'].split('_')[0]
|
||||
star = prot_start
|
||||
st_big = star
|
||||
while star < prot_end:
|
||||
end = star + 524288
|
||||
sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
|
||||
sequences_one_hot_muts_big.append(sequence_one_hot_wt)
|
||||
star = end - (32*32)
|
||||
sequences_one_hot_muts_big = np.array(sequences_one_hot_muts_big)
|
||||
#if number of protein splits is begger than batch size
|
||||
#print(sequences_one_hot_muts_big.shape)
|
||||
if sequences_one_hot_muts_big.shape[0] > batch_size:
|
||||
borzoi_pred_list = []
|
||||
for seq_slice in np.array_split(sequences_one_hot_muts_big, np.ceil(sequences_one_hot_muts_big.shape[0]/batch_size)):
|
||||
borzoi_pred_list.append(predict_tracks(models, seq_slice))
|
||||
y_mut = np.concatenate(borzoi_pred_list)
|
||||
else:
|
||||
y_mut = predict_tracks(models, sequences_one_hot_muts_big)
|
||||
y_mut = np.reshape(y_mut, [1,1,-1,89])
|
||||
TPM.append(CalTPM(y_mut[0], st_big, [protein], MANE_data, targets_df))
|
||||
#np.save('expression_predictions_%s.npy' %protein, y_mut)
|
||||
else:
|
||||
#append to a list of proteins to run
|
||||
#get star and end of the cluste
|
||||
star, end = (list(compress(prot_subset, mask))[0][:2])
|
||||
#get mutated proteins in the cluster
|
||||
mask = [i in cluster[0] for i in proteins_with_mutations_working]
|
||||
proteins_in_cluster = list(compress(proteins_with_mutations_working, mask))
|
||||
#remove cluster proteins from the ptoein list
|
||||
proteins_with_mutations_working = list(compress(proteins_with_mutations_working, ~np.array(mask)))
|
||||
chrom = MANE_data[MANE_data['symbol'] == proteins_in_cluster[0]].iloc[0]['chrom'].split('_')[0]
|
||||
sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
|
||||
sequences_one_hot_muts.append(sequence_one_hot_wt)
|
||||
st.append(star)
|
||||
cl.append(cluster)
|
||||
### Test wt
|
||||
#sequences_one_hot_muts.append(sequence_one_hot_wt)
|
||||
###
|
||||
sequences_one_hot_muts = np.array(sequences_one_hot_muts)
|
||||
#run borzoi for smaller proteins, if list is empty isn't empty(can be empty for last step)
|
||||
if sequences_one_hot_muts.shape != (0,):
|
||||
y_mut = predict_tracks(models, sequences_one_hot_muts)
|
||||
for i in range(len(y_mut)):
|
||||
TPM_dfs.append(CalTPM(y_mut[i], st[i], cl[i][0], MANE_data, targets_df))
|
||||
TPM_dfs = pd.concat(TPM_dfs)
|
||||
TPM.append(TPM_dfs)
|
||||
#np.save('expression_predictions_%s.npy' %protein, y_mut)
|
||||
TPM = pd.concat(TPM)
|
||||
TPM.to_csv('TPM_NO_MUTATIONS.csv')
|
||||
"""
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user