digital-patients/main_no_mutations.nf

nextflow.enable.dsl=2

process PREDICT_EXPRESSION_NO_MUTATIONS {
    container "${params.container_borzoi}"
    containerOptions "${params.containerOptions}"
    debug true
    maxForks 1

    input:
    path MANE

    output:
	path "TPM_NO_MUTATIONS.csv", emit: expression_output

    script:
    """
    #!/opt/conda/envs/borzoi/bin/python
    #Predict expression of reference genom
    import json
    import os
    import time
    import warnings
    import pickle
    from itertools import compress

    import h5py
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    import numpy as np
    import pandas as pd
    import pysam
    import pyfaidx
    import tensorflow as tf

    from baskerville import seqnn
    from baskerville import gene as bgene
    from baskerville import dna

    import sys
    sys.path.append( '/home/omic/borzoi' )
    from examples.borzoi_helpers import *

    #Reference protein dna sequence from MANE dataset
    prot_bigger = pd.read_csv("/home/omic/borzoi/prot_bigger.csv")
    with open("/home/omic/borzoi/prot_subset.pickle", "rb") as fp:
        prot_subset = pickle.load(fp)
    MANE_data = pd.read_csv("${MANE}", sep = '\t')

    batch_size = 4

    #Model configuration
    params_file = '/home/omic/borzoi/examples/params_pred.json'
    targets_file = '/home/omic/borzoi/examples/targets_gtex.txt' #Subset of targets_human.txt
    n_folds = 1 #4        #To use only one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'.
    rc = True         #Average across reverse-complement prediction

    #Read model parameters
    with open(params_file) as params_open :
        params = json.load(params_open)
        params_model = params['model']
        params_train = params['train']

    #Read targets
    targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
    target_index = targets_df.index

    #Create local index of strand_pair (relative to sliced targets)
    if rc :
        strand_pair = targets_df.strand_pair
        target_slice_dict = {ix : i for i, ix in enumerate(target_index.values.tolist())}
        slice_pair = np.array([
            target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist()
        ], dtype='int32')

    #Initialize model ensemble
    models = []
    for fold_ix in range(n_folds) :
        model_file = "/home/omic/borzoi/saved_models/f" + str(fold_ix) + "/model0_best.h5"
        seqnn_model = seqnn.SeqNN(params_model)
        seqnn_model.restore(model_file, 0)
        seqnn_model.build_slice(target_index)
        if rc :
            seqnn_model.strand_pair.append(slice_pair)
        seqnn_model.build_ensemble(rc, [0]) #changed '0' to [0]
        models.append(seqnn_model)
    fasta_open = pysam.Fastafile('/home/omic/borzoi/hg38.fa')

    #Make predictions/ run model
    def predict_tracks(models, sequence_one_hot):
        predicted_tracks = []
        for fold_ix in range(len(models)):
            yh = models[fold_ix](sequence_one_hot)[:, None, ...].astype("float16")
            predicted_tracks.append(yh)
        predicted_tracks = np.concatenate(predicted_tracks, axis=1)
        return predicted_tracks

    #calculate TPM from borzoi
    def CalTPM(borzoi_data, start, cluster, prot_data, targets_df, plot = False):
        TPM_list = []
        #loop over all protrin in cluster
        for i in range(len(cluster)):
            #get exon start and end
            ex_st = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonStarts'])[0].split(',')]
            ex_en = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonEnds'])[0].split(',')]
            #exon bool mask
            exon_mask = np.zeros(borzoi_data.shape[-2])
            for s,n in zip(ex_st,ex_en):
                exon_mask = exon_mask + ((np.arange(borzoi_data.shape[-2]) >= s) & (np.arange(borzoi_data.shape[-2]) <= n))
            #protrin TPM per person per tissue
            TPM_per_tissue_replicates = np.sum(borzoi_data[:,exon_mask== 1], axis = 1)
            #Plot proteins with exon marks if needed
            if plot == True:
                #Will plot only first adipose_tissue borzoi_data[0,:,x] change x for different tissue
                plt.plot(borzoi_data[0,:,0])
                plt.vlines(x = ex_st, ymin=0, ymax=3.5, colors='red', ls='--', lw=2, label='vline_multiple - full height')
                plt.vlines(x = ex_en, ymin=0, ymax=3.5, colors='blue', ls='--', lw=2, label='vline_multiple - full height')
                plt.xlim(ex_st[0]-100, ex_en[-1]+100)
                plt.show()
            #Get average for tissue replicates
            TPM_per_tissue = [np.mean(i) for i in np.split(TPM_per_tissue_replicates[0], np.unique(targets_df['description'], return_index=True)[1][1:])]
            TPM_list.append(TPM_per_tissue)
        #cretae Datafreame
        TPM_dataframe = pd.DataFrame(TPM_list,cluster,np.unique(targets_df['description'], return_index=True)[0])
        return(TPM_dataframe)

    #Protrin cluster list of list
    protein_clusters = [np.array(i[2]) for i in prot_subset]

    #all proroteins present in dataset
    all_reference_proteins = list(pd.concat([pd.concat([i[-1] for i in prot_subset]), prot_bigger['symbol']]))
    #use variable names from mutation
    proteins_with_mutations = all_reference_proteins
    proteins_with_mutations_working = proteins_with_mutations

    TPM = []
    #run until the expression of all proteins is predicted
    while len(proteins_with_mutations_working) > 0:
        TPM_dfs = []
        sequences_one_hot_muts = []
        st = []
        cl = []
        #append proteins to a list until equal to batch size if protein is smaller, if it's big just run borzoi for it (don't append)
        while len(sequences_one_hot_muts) < batch_size and len(proteins_with_mutations_working) > 0:
            #get work protein
            protein = proteins_with_mutations_working[0]
            #print(protein)
            #get cluster
            mask = [protein in i for i in protein_clusters]
            cluster = list(compress(protein_clusters, mask))
            #run borzoi for big proteins
            if protein in np.array(prot_bigger['symbol']):
                sequences_one_hot_muts_big = []
                proteins_with_mutations_working = proteins_with_mutations_working[1:]
                protein_data = prot_bigger[prot_bigger['symbol'] == protein]
                prot_start = np.array(protein_data['chr_start']).astype('int')[0] - (16*32) - np.array(protein_data['chr_strand'] == '+')[0] * 1000
                prot_end = np.array(protein_data['chr_end']).astype('int')[0] + (16*32) + np.array(protein_data['chr_strand'] == '-')[0] * 1000
                chrom = protein_data.iloc[0]['chrom'].split('_')[0]
                star = prot_start
                st_big = star
                while star < prot_end:
                    end = star + 524288
                    sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
                    sequences_one_hot_muts_big.append(sequence_one_hot_wt)
                    star = end - (32*32)
                sequences_one_hot_muts_big = np.array(sequences_one_hot_muts_big)
                #if number of protein splits is begger than batch size
                #print(sequences_one_hot_muts_big.shape)
                if sequences_one_hot_muts_big.shape[0] > batch_size:
                    borzoi_pred_list = []
                    for seq_slice in np.array_split(sequences_one_hot_muts_big, np.ceil(sequences_one_hot_muts_big.shape[0]/batch_size)):
                        borzoi_pred_list.append(predict_tracks(models, seq_slice))
                    y_mut = np.concatenate(borzoi_pred_list)
                else:
                    y_mut = predict_tracks(models, sequences_one_hot_muts_big)
                y_mut = np.reshape(y_mut, [1,1,-1,89])
                TPM.append(CalTPM(y_mut[0], st_big, [protein], MANE_data, targets_df))
                #np.save('expression_predictions_%s.npy' %protein, y_mut)
            else:
                #append to a list of proteins to run
                #get star and end of the cluste
                star, end = (list(compress(prot_subset, mask))[0][:2])
                #get mutated proteins in the cluster
                mask = [i in cluster[0] for i in proteins_with_mutations_working]
                proteins_in_cluster = list(compress(proteins_with_mutations_working, mask))
                #remove cluster proteins from the ptoein list
                proteins_with_mutations_working = list(compress(proteins_with_mutations_working, ~np.array(mask)))
                chrom = MANE_data[MANE_data['symbol'] == proteins_in_cluster[0]].iloc[0]['chrom'].split('_')[0]
                sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
                sequences_one_hot_muts.append(sequence_one_hot_wt)
                st.append(star)
                cl.append(cluster)
                ### Test wt
                #sequences_one_hot_muts.append(sequence_one_hot_wt)
                ###
        sequences_one_hot_muts = np.array(sequences_one_hot_muts)
        #run borzoi for smaller proteins, if list is empty isn't empty(can be empty for last step)
        if sequences_one_hot_muts.shape != (0,):
            y_mut = predict_tracks(models, sequences_one_hot_muts)
            for i in range(len(y_mut)):
                TPM_dfs.append(CalTPM(y_mut[i], st[i], cl[i][0], MANE_data, targets_df))
            TPM_dfs = pd.concat(TPM_dfs)
            TPM.append(TPM_dfs)
            #np.save('expression_predictions_%s.npy' %protein, y_mut)
    TPM = pd.concat(TPM)
    TPM.to_csv('TPM_NO_MUTATIONS.csv')
    """
}