// NOTE: Large reference/model files are excluded from the repo — they are to be
// staged to S3 or baked into the Docker images.
nextflow.enable.dsl=2
|
|
|
|
// Filter a patient VCF down to the variants that overlap MANE transcript bodies.
// Emits one pickle per patient (kept under a .vcf extension so the output glob
// matches) containing entries of the form
//   [[symbol, strand, region_start, region_end], [matching VCF rows]]
// sorted by region length (shortest first), for PREDICT_EXPRESSION.
process FILTER_VCF {
    memory 4.GB

    container "${params.container_borzoi}"
    containerOptions "${params.containerOptions}"
    debug true
    // maxForks 1

    input:
    path MANE           // MANE transcript summary (TSV): chrom, chr_start, chr_end, chr_strand, symbol, ...
    path vcf_filtered   // patient VCF named <patient>_variants.vcf

    output:
    // NOTE: despite the .vcf extension this file is a Python pickle; it is
    // consumed downstream via pickle.load in PREDICT_EXPRESSION.
    path "*.vcf" , emit: expression_output

    script:
    """
#!/opt/conda/envs/borzoi/bin/python
#Filter VCF to extract only mutations in coding sequences plus 1000bp upstream regulatory elements
import numpy as np
import pandas as pd
import pickle

#If ncbiRefSeq data needs to be filtered in a different manner, adapt the commented part below
#This part is needed if we want to change ncbiRefSeq filtering
#column_names = [
#    "bin", "name", "chrom", "strand", "txStart", "txEnd",
#    "cdsStart", "cdsEnd", "exonCount", "exonStarts", "exonEnds",
#    "score", "name2", "cdsStartStat", "cdsEndStat", "exonFrames"
#]

#ncbiRefSeq = pd.read_csv("{ncbiRefSeq}", sep='\t', names=column_names) #dollar sign is missing

##Get only protein coding genes (ncbiRefSeq['cdsStartStat'] != 'none')
#ncbiRefSeq = ncbiRefSeq[ncbiRefSeq['cdsStartStat'] != 'none']

##Remove duplicates
#ncbiRefSeq = ncbiRefSeq.drop_duplicates()

##Filter out isoforms for longest DNA sequence. Filter by protein name and chromosome (different versions of some chromosomes)
#ncbiRefSeq_filtered = []
#for i in np.unique(ncbiRefSeq['name2']):
#    prot = ncbiRefSeq[ncbiRefSeq['name2'] == i]
#    for j in np.unique(prot['chrom']):
#        prot_chr = prot[prot['chrom'] == j]
#        d1 = [prot_chr['txEnd'].astype(int).max(), prot_chr['txStart'].astype(int).min()] #get lowest starting position and maximum stop position
#        max_len = np.argmax(prot_chr['txEnd'].astype(int) - prot_chr['txStart'].astype(int))
#        position = prot_chr.iloc[max_len]
#        position['txEnd'] = str(d1[0]) #end for isoform can be loaded from exonEnds
#        position['txStart'] = str(d1[1]) #start for isoform can be loaded from exonStarts
#        ncbiRefSeq_filtered.append(position)
#
#ncbiRefSeq = pd.DataFrame(ncbiRefSeq_filtered).reset_index().drop('index', axis = 1)
##protein coding genes only 23314, all genes 48420

#Load the MANE-filtered transcript table
MANE_data = pd.read_csv("${MANE}", sep = '\t')

#Load vcf
with open("${vcf_filtered}") as f:
    lines = f.readlines()

#Drop the '##' meta-information header lines; keep the '#CHROM' column-header line
vcf = []
for i in lines:
    if i[:2] != '##':
        #rstrip('\n') (rather than i[:-1]) so a final line without a trailing
        #newline does not lose its last character
        vcf.append(i.rstrip('\n').split('\t'))

vcf = pd.DataFrame(vcf[1:], columns=vcf[0])

#Normalize dtypes and align the MANE table's column names with the VCF's
vcf['POS'] = vcf['POS'].astype(int)
vcf['#CHROM'] = vcf['#CHROM'].astype('category')
MANE_data['chrom'] = MANE_data['chrom'].astype('category')
MANE_data_renamed = MANE_data.rename(columns={'chrom': '#CHROM', 'chr_start': 'Start', 'chr_end': 'End', 'chr_strand': 'strand'})
relevant_chromosomes = vcf['#CHROM'].unique()
MANE_data_renamed = MANE_data_renamed[MANE_data_renamed['#CHROM'].isin(relevant_chromosomes)]

dataset = []

#Filter VCF: for every transcript, collect the variants that fall inside its span
for chrom in relevant_chromosomes:
    chrom_MANE = MANE_data_renamed[MANE_data_renamed['#CHROM'] == chrom]
    # Find the rows in `vcf` that match the conditions for the current chromosome
    for name, strand, start, end in chrom_MANE[['symbol', 'strand', 'Start', 'End']].itertuples(index=False):
        condition = (vcf['#CHROM'] == chrom) & (vcf['POS'] >= start) & (vcf['POS'] <= end)
        if np.sum(condition) != 0:
            #Extend the stored region by 1000bp on the upstream (regulatory) side.
            #NOTE(review): the filter condition above uses the unextended start/end,
            #so variants lying only in the 1000bp upstream window are not captured —
            #confirm this is intended.
            if strand == '+':
                start_ = start - 1000
                end_ = end
            else:
                end_ = end + 1000
                start_ = start
            dataset.append([[name, strand, start_, end_], np.array(vcf[condition]).tolist()])

#sort entries by region length (shortest first)
len_ = []
for vcf_entry in dataset:
    len_.append(vcf_entry[0][3] - vcf_entry[0][2])
dataset = [dataset[index] for index in np.argsort(len_)]

#save the filtered dataset as a pickle
#derive the patient name from the input file name
patient_name = "${vcf_filtered}".split('/')[-1].split('_variants.vcf')[0]
with open(f"{patient_name}_rna_coding_vcf_with_mutations.vcf", "wb") as fp:
    pickle.dump(dataset, fp)
    """
}
|
|
|
|
// Predict per-tissue expression (TPM) with Borzoi for every mutated transcript in
// the pickle produced by FILTER_VCF, back-fill unmutated genes from precomputed
// reference-genome predictions, and save <patient>_TPM.csv (indexed by ENSG).
process PREDICT_EXPRESSION {
    accelerator 1
    memory 4.GB

    container "${params.container_borzoi}"
    label 'gpu_process'
    // containerOptions "${params.containerOptions_borzoi}"
    // debug true
    maxForks 1 // TODO: COMMENT THIS IN KUBERNETES

    input:
    path vcf_filtered   // pickle from FILTER_VCF (<patient>_rna_coding_vcf_with_mutations.vcf)
    path MANE           // MANE transcript summary (TSV)
    //path ncbiRefSeq_bigger
    //path ncbiRefSeq_subset

    output:
    path "*_TPM.csv", emit: expression_output

    script:
    """
#!/opt/conda/envs/borzoi/bin/python
#Predict expression based on mutation
import json
import os
import time
import warnings
import pickle
from itertools import compress

import h5py
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import pandas as pd
import pysam
import pyfaidx
import tensorflow as tf

from baskerville import seqnn
from baskerville import gene as bgene
from baskerville import dna

import sys
sys.path.append( '/home/omic/borzoi' )
from examples.borzoi_helpers import *

#Load VCF and mutation files
#ncbiRefSeq_bigger = pd.read_csv("{ncbiRefSeq_bigger}")
#with open("{ncbiRefSeq_subset}", "rb") as fp:
#    ncbiRefSeq_subset = pickle.load(fp)
#prot_bigger: transcripts too large for one 524288bp window; prot_subset: window clusters
#(both produced by CREATE_PROTEIN_CLUSTER and baked into the image)
prot_bigger = pd.read_csv("/home/omic/borzoi/prot_bigger.csv")
with open("/home/omic/borzoi/prot_subset.pickle", "rb") as fp:
    prot_subset = pickle.load(fp)
with open("${vcf_filtered}", "rb") as fp:
    vcf_file = pickle.load(fp)
MANE_data = pd.read_csv("${MANE}", sep = '\t')

batch_size = 4
#Define batch size and stop if batch size is too small for big transcripts
#min_batch_size = max(((np.array(ncbiRefSeq_bigger['txEnd']).astype('int')-np.array(ncbiRefSeq_bigger['txStart']).astype('int'))/524288).astype('int')+1)
#if min_batch_size > batch_size:
#    print('batch size is ',batch_size,'and min_batch_size is ',min_batch_size)
#    sys.exit('Batch size has to be bigger or same as number of big transcripts split into chunks')

#Model configuration
params_file = '/home/omic/borzoi/examples/params_pred.json'
targets_file = '/home/omic/borzoi/examples/targets_gtex.txt' #Subset of targets_human.txt
n_folds = 1 #4 #To use only one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'.
rc = True #Average across reverse-complement prediction

#Read model parameters
with open(params_file) as params_open :
    params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']

#Read targets
targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
target_index = targets_df.index

#Create local index of strand_pair (relative to sliced targets)
if rc :
    strand_pair = targets_df.strand_pair
    target_slice_dict = {ix : i for i, ix in enumerate(target_index.values.tolist())}
    slice_pair = np.array([
        target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist()
    ], dtype='int32')

#Initialize model ensemble
models = []
for fold_ix in range(n_folds) :
    model_file = "/home/omic/borzoi/saved_models/f" + str(fold_ix) + "/model0_best.h5"
    seqnn_model = seqnn.SeqNN(params_model)
    seqnn_model.restore(model_file, 0)
    seqnn_model.build_slice(target_index)
    if rc :
        seqnn_model.strand_pair.append(slice_pair)
    seqnn_model.build_ensemble(rc, [0]) #changed '0' to [0]
    models.append(seqnn_model)
fasta_open = pysam.Fastafile('/home/omic/borzoi/hg38.fa')

#Apply mutation(s) to a one-hot WT sequence.
#poses are 1-based genomic positions; start is the genomic start of the window.
def create_mut(sequence_one_hot, poses, alts, start):
    #FIX: copy the parameter, not the module-level 'sequence_one_hot_wt' global
    #(the old code only worked because every caller's argument had that name)
    sequence_one_hot_mut = np.copy(sequence_one_hot)
    for pos, alt in zip(poses, alts) :
        alt_ix = -1
        if alt == 'A' :
            alt_ix = 0
        elif alt == 'C' :
            alt_ix = 1
        elif alt == 'G' :
            alt_ix = 2
        elif alt == 'T' :
            alt_ix = 3
        #NOTE(review): a non-ACGT alt leaves alt_ix == -1 and silently writes the
        #last channel (T); indels/multi-base alts are not handled — confirm inputs
        #are SNVs only
        sequence_one_hot_mut[pos-start-1] = 0.
        sequence_one_hot_mut[pos-start-1, alt_ix] = 1.
    return sequence_one_hot_mut

#Run every model fold on a batch and stack predictions along a new fold axis
def predict_tracks(models, sequence_one_hot):
    predicted_tracks = []
    for fold_ix in range(len(models)):
        yh = models[fold_ix](sequence_one_hot)[:, None, ...].astype("float16")
        predicted_tracks.append(yh)
    predicted_tracks = np.concatenate(predicted_tracks, axis=1)
    return predicted_tracks

#Aggregate Borzoi track predictions into one TPM value per gene per tissue
#by summing the 32bp bins that overlap each gene's exons.
def CalTPM(borzoi_data, start, cluster, prot_data, targets_df, plot = False):
    TPM_list = []
    #loop over all proteins in the cluster
    for i in range(len(cluster)):
        #get exon starts/ends mapped into 32bp-bin coordinates
        #(start + 32*16 accounts for the sequence cropped by the model)
        ex_st = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonStarts'])[0].split(',')]
        ex_en = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonEnds'])[0].split(',')]
        #exon bool mask
        exon_mask = np.zeros(borzoi_data.shape[-2])
        for s,n in zip(ex_st,ex_en):
            exon_mask = exon_mask + ((np.arange(borzoi_data.shape[-2]) >= s) & (np.arange(borzoi_data.shape[-2]) <= n))
        #protein TPM per person per tissue
        TPM_per_tissue_replicates = np.sum(borzoi_data[:,exon_mask == 1], axis = 1)
        #Plot proteins with exon marks if needed
        if plot == True:
            #Will plot only the first adipose_tissue track borzoi_data[0,:,x]; change x for a different tissue
            plt.plot(borzoi_data[0,:,0])
            plt.vlines(x = ex_st, ymin=0, ymax=3.5, colors='red', ls='--', lw=2, label='vline_multiple - full height')
            plt.vlines(x = ex_en, ymin=0, ymax=3.5, colors='blue', ls='--', lw=2, label='vline_multiple - full height')
            plt.xlim(ex_st[0]-100, ex_en[-1]+100)
            plt.show()
        #Average over the replicate tracks of each tissue
        TPM_per_tissue = [np.mean(i) for i in np.split(TPM_per_tissue_replicates[0], np.unique(targets_df['description'], return_index=True)[1][1:])]
        TPM_list.append(TPM_per_tissue)
    #create DataFrame: rows = genes in cluster, columns = tissue descriptions
    TPM_dataframe = pd.DataFrame(TPM_list,cluster,np.unique(targets_df['description'], return_index=True)[0])
    return(TPM_dataframe)

#Protein clusters as list of arrays of gene symbols
protein_clusters = [np.array(i[2]) for i in prot_subset]
#Get all proteins from VCF
proteins_with_mutations = [i[0][0] for i in vcf_file]

proteins_with_mutations_working = proteins_with_mutations

TPM = []
#run until the expression of all proteins is predicted
while len(proteins_with_mutations_working) > 0:
    TPM_dfs = []
    sequences_one_hot_muts = []
    st = []
    cl = []
    #append proteins to the batch while it is smaller than batch_size; big
    #transcripts are run immediately on their own (not appended)
    while len(sequences_one_hot_muts) < batch_size and len(proteins_with_mutations_working) > 0:
        #get work protein
        protein = proteins_with_mutations_working[0]
        #print(protein)
        #get its cluster
        mask = [protein in i for i in protein_clusters]
        cluster = list(compress(protein_clusters, mask))
        #run borzoi for big proteins (transcripts wider than one 524288bp window)
        if protein in np.array(prot_bigger['symbol']):
            sequences_one_hot_muts_big = []
            proteins_with_mutations_working = proteins_with_mutations_working[1:]
            protein_data = prot_bigger[prot_bigger['symbol'] == protein]
            prot_start = np.array(protein_data['chr_start']).astype('int')[0] - (16*32) - np.array(protein_data['chr_strand'] == '+')[0] * 1000
            prot_end = np.array(protein_data['chr_end']).astype('int')[0] + (16*32) + np.array(protein_data['chr_strand'] == '-')[0] * 1000
            mutations_ = np.array(list(compress(vcf_file, np.array(proteins_with_mutations) == protein))[0][1])
            #only use one reference chromosome, in case mutations are mapped to multiple chr (consequence of liftover)
            mutations_ = mutations_[[i[0]== mutations_[0,0] for i in mutations_]]
            chrom = mutations_[0,0]
            all_poses = mutations_[:,1].astype('int')
            all_alts = mutations_[:,4]
            star = prot_start
            st_big = star
            #tile the transcript with overlapping 524288bp windows
            while star < prot_end:
                end = star + 524288
                poses = all_poses[(all_poses > star) & (all_poses < end)]
                alts = all_alts[(all_poses > star) & (all_poses < end)]
                sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
                sequence_one_hot_mut = create_mut(sequence_one_hot_wt, poses, alts, star)
                sequences_one_hot_muts_big.append(sequence_one_hot_mut)
                star = end - (32*32) #overlap consecutive windows by the cropped margin
            sequences_one_hot_muts_big = np.array(sequences_one_hot_muts_big)
            #if the number of window splits is bigger than batch size, predict in chunks
            #print(sequences_one_hot_muts_big.shape)
            if sequences_one_hot_muts_big.shape[0] > batch_size:
                borzoi_pred_list = []
                for seq_slice in np.array_split(sequences_one_hot_muts_big, np.ceil(sequences_one_hot_muts_big.shape[0]/batch_size)):
                    borzoi_pred_list.append(predict_tracks(models, seq_slice))
                y_mut = np.concatenate(borzoi_pred_list)
            else:
                y_mut = predict_tracks(models, sequences_one_hot_muts_big)
            #stitch windows back into one long track axis
            #89 = number of GTEx tracks in targets_gtex.txt — TODO confirm
            y_mut = np.reshape(y_mut, [1,1,-1,89])
            TPM.append(CalTPM(y_mut[0], st_big, [protein], MANE_data, targets_df))
            #np.save('expression_predictions_%s.npy' %protein, y_mut)
        else:
            #append to the batch of small clusters to run together
            #get start and end of the cluster window
            star, end = (list(compress(prot_subset, mask))[0][:2])
            #get mutated proteins in the cluster
            mask = [i in cluster[0] for i in proteins_with_mutations_working]
            proteins_in_cluster = list(compress(proteins_with_mutations_working, mask))
            #remove cluster proteins from the work list
            proteins_with_mutations_working = list(compress(proteins_with_mutations_working, ~np.array(mask)))
            #print(proteins_in_cluster)
            mutations_ = [list(compress(vcf_file, [np.array(proteins_with_mutations) == i][0]))[0][1] for i in proteins_in_cluster]
            mutations_ = np.concatenate([np.array(i) for i in mutations_],0)
            #only use one reference chromosome, in case mutations are mapped to multiple chr (consequence of liftover)
            mutations_ = mutations_[[i[0]== mutations_[0,0] for i in mutations_]]
            chrom = mutations_[0,0]
            poses = mutations_[:,1].astype('int')
            alts = mutations_[:,4]
            sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
            sequence_one_hot_mut = create_mut(sequence_one_hot_wt, poses, alts, star)
            sequences_one_hot_muts.append(sequence_one_hot_mut)
            st.append(star)
            cl.append(cluster)
            ### Test wt
            #sequences_one_hot_muts.append(sequence_one_hot_wt)
            ###
    sequences_one_hot_muts = np.array(sequences_one_hot_muts)
    #run borzoi for the batched small clusters (the batch can be empty when the
    #last work item was a big transcript)
    if sequences_one_hot_muts.shape != (0,):
        y_mut = predict_tracks(models, sequences_one_hot_muts)
        for i in range(len(y_mut)):
            TPM_dfs.append(CalTPM(y_mut[i], st[i], cl[i][0], MANE_data, targets_df))
        TPM_dfs = pd.concat(TPM_dfs)
        TPM.append(TPM_dfs)
        #np.save('expression_predictions_%s.npy' %protein, y_mut)
TPM = pd.concat(TPM)

#add RNA expression for genes with no mutations
#load reference-genome expression predictions
tpm_no_mut = pd.read_csv('/home/omic/borzoi/TPM_NO_MUTATIONS.csv',index_col=0)
#concat missing expression
TPM = pd.concat([TPM, tpm_no_mut.loc[list(set(tpm_no_mut.index) - set(TPM.index))]])
#change symbol to ENSG
TPM.index.names = ['RNA']
MANE_data['ENSG'] = [i.split('.')[0] for i in MANE_data['Ensembl_Gene']]
mane_map = MANE_data[['symbol','ENSG']]
TPM = mane_map.merge(TPM, left_on='symbol', right_on='RNA').dropna().drop_duplicates(subset=['symbol'])
#drop duplicates
TPM = TPM.drop_duplicates()
TPM = TPM.iloc[:,2:].set_index(TPM.iloc[:,1])
#save TPM
#derive the patient name from the input file name
patient_name = "${vcf_filtered}".split('/')[-1].split('_rna_coding_vcf_with_mutations.vcf')[0]
TPM.to_csv(f'{patient_name}_TPM.csv')

gpu_stats = tf.config.experimental.get_memory_info('GPU:0')
print(f"Current VRAM usage: {gpu_stats['current'] / 1e9:.2f} GB")
print(f"Peak VRAM usage: {gpu_stats['peak'] / 1e9:.2f} GB")
    """
}
|
|
|
|
|
|
// Group MANE transcripts into 524288bp windows ("clusters") that Borzoi can score
// in a single pass. Transcripts too large for one window are written separately
// to prot_bigger.csv. Re-run only when the list of predicted RNAs changes.
process CREATE_PROTEIN_CLUSTER {
    container "${params.container_borzoi}"
    containerOptions "${params.containerOptions}"
    debug true

    input:
    path MANE   // MANE transcript summary (TSV): chrom, chr_start, chr_end, chr_strand, symbol, ...

    output:
    path "prot_bigger.csv"      // transcripts wider than a single 524288bp window
    path "prot_subset.pickle"   // list of [window_start, window_end, symbols-in-window]

    script:
    """
#!/opt/conda/envs/borzoi/bin/python
# Use this if new RNA transcripts need to be added to the list of predicted RNAs
import numpy as np
import pandas as pd
import pickle

data = pd.read_csv("${MANE}", sep='\t')

data_subset = []
data_bigger = []
for ch in np.unique(data['chrom']):
    data_working = data[data['chrom']==ch]
    arg = np.argsort(np.array(data_working['chr_start']).astype('int'))
    data_working = data_working.iloc[arg]
    #protein start position with regulatory elements: minus 1000 from start for + strand, plus 1000 to end for - strand
    #add (16*32) as buffer because borzoi will crop it
    prot_start = np.array(data_working['chr_start']).astype('int') - (16*32) - np.array(data_working['chr_strand'] == '+') * 1000
    prot_end = np.array(data_working['chr_end']).astype('int') + (16*32) + np.array(data_working['chr_strand'] == '-') * 1000
    input_start = prot_start
    input_end = input_start + 524288
    #greedily anchor a window at the left-most remaining transcript and absorb
    #every transcript that fits entirely inside it
    while len(input_start) > 0:
        st = input_start[0]
        en = input_end[0]
        mask = (prot_start >= st) & (prot_end <= en)
        data_mask = data_working[mask]
        data_subset.append([st, en, data_mask['symbol']])
        #ncbiRefSeq_subset.append(ncbiRefSeq_working[mask])
        #the anchoring transcript itself does not fit into one window:
        #record it as "bigger" and force its removal from the work set
        if mask[0] == False:
            data_bigger.append(data_working.iloc[0])
            mask[0] = True
        data_working = data_working[~mask]
        arg = np.argsort(np.array(data_working['chr_start']).astype('int'))
        data_working = data_working.iloc[arg]
        prot_start = np.array(data_working['chr_start']).astype('int') - (16*32) - np.array(data_working['chr_strand'] == '+') * 1000
        prot_end = np.array(data_working['chr_end']).astype('int') + (16*32) + np.array(data_working['chr_strand'] == '-') * 1000
        input_start = prot_start
        input_end = input_start + 524288

prot_bigger = pd.DataFrame(data_bigger)

prot_bigger.to_csv('prot_bigger.csv')
with open("prot_subset.pickle", "wb") as fp:
    pickle.dump(data_subset, fp)
# Removed GPU memory reporting copy-pasted from PREDICT_EXPRESSION: tensorflow is
# never imported in this script (it raised NameError after the outputs were
# written, failing the task) and this process requests no accelerator.
    """
}
|