Initial commit: digital-patients pipeline (clean, no large files)

Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
This commit is contained in:
2026-03-26 15:15:23 +01:00
commit 9e6a16c19b
45 changed files with 7207 additions and 0 deletions

448
main_borzoi.nf Normal file
View File

@@ -0,0 +1,448 @@
nextflow.enable.dsl=2
// FILTER_VCF: keep only patient variants that fall inside MANE transcripts,
// widen each transcript interval by 1000 bp on the upstream (regulatory) side
// depending on strand, and pickle the result for PREDICT_EXPRESSION.
// NOTE(review): the emitted "*.vcf" file is actually a pickled Python list,
// not VCF text - the extension is kept so downstream name-matching works.
process FILTER_VCF {
    memory 4.GB
    container "${params.container_borzoi}"
    containerOptions "${params.containerOptions}"
    debug true
    // maxForks 1
    input:
    path MANE           // MANE summary table (TSV: chrom, chr_start, chr_end, chr_strand, symbol, ...)
    path vcf_filtered   // patient VCF named <patient>_variants.vcf
    output:
    path "*.vcf" , emit: expression_output
    script:
    """
    #!/opt/conda/envs/borzoi/bin/python
    #Filter VCF to extract only mutations in coding sequences plus 1000 upstream regulatory elements
    import numpy as np
    import pandas as pd
    import pickle
    #If ncbiRefSeq data need to be filtered in a different manner change commented part
    #This part is needed if we want to change ncbiRefSeq filtering
    #column_names = [
    #    "bin", "name", "chrom", "strand", "txStart", "txEnd",
    #    "cdsStart", "cdsEnd", "exonCount", "exonStarts", "exonEnds",
    #    "score", "name2", "cdsStartStat", "cdsEndStat", "exonFrames"
    #]
    #ncbiRefSeq = pd.read_csv("{ncbiRefSeq}", sep='\t', names=column_names) #dollar sign is missing
    ##Get only protein coding genes (ncbiRefSeq['cdsStartStat'] != 'none')
    #ncbiRefSeq = ncbiRefSeq[ncbiRefSeq['cdsStartStat'] != 'none']
    ##Remove duplicates
    #ncbiRefSeq = ncbiRefSeq.drop_duplicates()
    ##Filter out isoforms for longest DNA sequence. Filter by protein name and chromosome (different versions of some chromosomes)
    #ncbiRefSeq_filtered = []
    #for i in np.unique(ncbiRefSeq['name2']):
    #    prot = ncbiRefSeq[ncbiRefSeq['name2'] == i]
    #    for j in np.unique(prot['chrom']):
    #        prot_chr = prot[prot['chrom'] == j]
    #        d1 = [prot_chr['txEnd'].astype(int).max(), prot_chr['txStart'].astype(int).min()] #get lowest starting position and maximum stop position
    #        max_len = np.argmax(prot_chr['txEnd'].astype(int) - prot_chr['txStart'].astype(int))
    #        position = prot_chr.iloc[max_len]
    #        position['txEnd'] = str(d1[0]) #end for isoform can be loaded from exonEnds
    #        position['txStart'] = str(d1[1]) #start for isoform can be loaded from exonStarts
    #        ncbiRefSeq_filtered.append(position)
    #
    #ncbiRefSeq = pd.DataFrame(ncbiRefSeq_filtered).reset_index().drop('index', axis = 1)
    ##protein coding genes only 23314, all genes 48420
    #Load ncbiRefSeq filtered
    MANE_data = pd.read_csv("${MANE}", sep = '\t')
    #Load vcf
    with open("${vcf_filtered}") as f:
        lines = f.readlines()
    #Remove header/start of vcf file: drop '##' meta lines, keep the '#CHROM'
    #column-header line (becomes the DataFrame columns below)
    vcf = []
    for i in lines:
        if i[:2] != '##':
            vcf.append(i[:-1].split('\t'))
    vcf = pd.DataFrame(vcf[1:], columns=vcf[0])
    # Assuming 'vcf' and 'ncbiRefSeq' are already loaded DataFrames.
    vcf['POS'] = vcf['POS'].astype(int)
    # category dtype speeds up the per-chromosome equality filters below
    vcf['#CHROM'] = vcf['#CHROM'].astype('category')
    MANE_data['chrom'] = MANE_data['chrom'].astype('category')
    MANE_data_renamed = MANE_data.rename(columns={'chrom': '#CHROM', 'chr_start': 'Start', 'chr_end': 'End', 'chr_strand': 'strand'})
    relevant_chromosomes = vcf['#CHROM'].unique()
    MANE_data_renamed = MANE_data_renamed[MANE_data_renamed['#CHROM'].isin(relevant_chromosomes)]
    dataset = []
    #Filter VCF: one entry per transcript that has at least one variant inside it
    for chrom in relevant_chromosomes:
        chrom_MANE = MANE_data_renamed[MANE_data_renamed['#CHROM'] == chrom]
        # Find the indices in `vcf` that match the conditions for the current chromosome
        for name, strand, start, end in chrom_MANE[['symbol', 'strand', 'Start', 'End']].itertuples(index=False):
            condition = (vcf['#CHROM'] == chrom) & (vcf['POS'] >= start) & (vcf['POS'] <= end)
            if np.sum(condition) != 0:
                # extend the interval 1000 bp on the upstream side (before the
                # TSS), which depends on strand orientation
                if strand == '+':
                    start_ = start - 1000
                    end_ = end
                else:
                    end_ = end + 1000
                    start_ = start
                # entry layout: [[symbol, strand, start, end], [matching vcf rows]]
                dataset.append([[name, strand, start_, end_], np.array(vcf[condition]).tolist()])
    #sort vcf entries by transcript length (shortest first)
    len_ = []
    for vcf_entery in dataset:
        len_.append(vcf_entery[0][3] - vcf_entery[0][2])
    dataset = [dataset[index] for index in np.argsort(len_)]
    #save vcf (as a pickle - see process-level note about the .vcf extension)
    #get name from the input file: <patient>_variants.vcf -> <patient>
    patient_name = "${vcf_filtered}".split('/')[-1].split('_variants.vcf')[0]
    with open(f"{patient_name}_rna_coding_vcf_with_mutations.vcf", "wb") as fp:
        pickle.dump(dataset, fp)
    """
}
// PREDICT_EXPRESSION: run the Borzoi model ensemble on mutated 524288 bp
// genomic windows to predict per-tissue RNA expression (TPM) for every mutated
// transcript in the patient pickle, back-fill reference-genome expression for
// transcripts without mutations, and emit <patient>_TPM.csv.
process PREDICT_EXPRESSION {
    accelerator 1
    memory 4.GB
    container "${params.container_borzoi}"
    label 'gpu_process'
    // containerOptions "${params.containerOptions_borzoi}"
    // debug true
    maxForks 1 // TODO: COMMENT THIS IN KUBERNETES
    input:
    path vcf_filtered   // pickled output of FILTER_VCF (<patient>_rna_coding_vcf_with_mutations.vcf)
    path MANE           // MANE summary table (TSV)
    //path ncbiRefSeq_bigger
    //path ncbiRefSeq_subset
    output:
    path "*_TPM.csv", emit: expression_output
    script:
    """
    #!/opt/conda/envs/borzoi/bin/python
    #Predict expression based on mutation
    import json
    import os
    import time
    import warnings
    import pickle
    from itertools import compress
    import h5py
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    import numpy as np
    import pandas as pd
    import pysam
    import pyfaidx
    import tensorflow as tf
    from baskerville import seqnn
    from baskerville import gene as bgene
    from baskerville import dna
    import sys
    sys.path.append( '/home/omic/borzoi' )
    from examples.borzoi_helpers import *
    #Load VCF and mutation files
    #ncbiRefSeq_bigger = pd.read_csv("{ncbiRefSeq_bigger}")
    #with open("{ncbiRefSeq_subset}", "rb") as fp:
    #    ncbiRefSeq_subset = pickle.load(fp)
    # NOTE(review): the window/cluster tables are read from paths baked into the
    # container image rather than from CREATE_PROTEIN_CLUSTER outputs - confirm
    # the baked files are kept in sync with that process.
    prot_bigger = pd.read_csv("/home/omic/borzoi/prot_bigger.csv")
    with open("/home/omic/borzoi/prot_subset.pickle", "rb") as fp:
        prot_subset = pickle.load(fp)
    # per-transcript entries: [[symbol, strand, start, end], [vcf rows]]
    with open("${vcf_filtered}", "rb") as fp:
        vcf_file = pickle.load(fp)
    MANE_data = pd.read_csv("${MANE}", sep = '\t')
    batch_size = 4
    #Define batch size and stop if batch size is too small for big transcripts
    #min_batch_size = max(((np.array(ncbiRefSeq_bigger['txEnd']).astype('int')-np.array(ncbiRefSeq_bigger['txStart']).astype('int'))/524288).astype('int')+1)
    #if min_batch_size > batch_size:
    #    print('batch size is ',batch_size,'and min_batch_size is ',min_batch_size)
    #    sys.exit('Batch size has to be bigger or same as number of big transcripts split into chunks')
    #Model configuration
    params_file = '/home/omic/borzoi/examples/params_pred.json'
    targets_file = '/home/omic/borzoi/examples/targets_gtex.txt' #Subset of targets_human.txt
    n_folds = 1 #4 #To use only one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'.
    rc = True #Average across reverse-complement prediction
    #Read model parameters
    with open(params_file) as params_open :
        params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']
    #Read targets
    targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
    target_index = targets_df.index
    #Create local index of strand_pair (relative to sliced targets)
    if rc :
        strand_pair = targets_df.strand_pair
        target_slice_dict = {ix : i for i, ix in enumerate(target_index.values.tolist())}
        slice_pair = np.array([
            target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist()
        ], dtype='int32')
    #Initialize model ensemble (one SeqNN per fold, restored from saved weights)
    models = []
    for fold_ix in range(n_folds) :
        model_file = "/home/omic/borzoi/saved_models/f" + str(fold_ix) + "/model0_best.h5"
        seqnn_model = seqnn.SeqNN(params_model)
        seqnn_model.restore(model_file, 0)
        seqnn_model.build_slice(target_index)
        if rc :
            seqnn_model.strand_pair.append(slice_pair)
        seqnn_model.build_ensemble(rc, [0]) #changed '0' to [0]
        models.append(seqnn_model)
    fasta_open = pysam.Fastafile('/home/omic/borzoi/hg38.fa')
    #Create mutations(s) from WT
    def create_mut(sequence_one_hot, poses, alts, start):
        # Apply ALT base substitutions (1-based VCF positions) to a one-hot
        # encoded window that begins at genomic coordinate `start`.
        # NOTE(review): this copies the module-level sequence_one_hot_wt instead
        # of the sequence_one_hot parameter; behavior is unchanged only because
        # every caller passes that same global - should be np.copy(sequence_one_hot).
        sequence_one_hot_mut = np.copy(sequence_one_hot_wt)
        for pos, alt in zip(poses, alts) :
            alt_ix = -1
            if alt == 'A' :
                alt_ix = 0
            elif alt == 'C' :
                alt_ix = 1
            elif alt == 'G' :
                alt_ix = 2
            elif alt == 'T' :
                alt_ix = 3
            # NOTE(review): a non-ACGT ALT leaves alt_ix == -1 and silently sets
            # the last channel - confirm inputs are SNVs only.
            # zero the reference one-hot row, then set the ALT base
            sequence_one_hot_mut[pos-start-1] = 0.
            sequence_one_hot_mut[pos-start-1, alt_ix] = 1.
        return sequence_one_hot_mut
    #Make predictions/ run model
    def predict_tracks(models, sequence_one_hot):
        # Run every ensemble fold on the batch and stack predictions on axis 1.
        predicted_tracks = []
        for fold_ix in range(len(models)):
            yh = models[fold_ix](sequence_one_hot)[:, None, ...].astype("float16")
            predicted_tracks.append(yh)
        predicted_tracks = np.concatenate(predicted_tracks, axis=1)
        return predicted_tracks
    #calculate TPM from borzoi coverage tracks by summing over exon bins
    def CalTPM(borzoi_data, start, cluster, prot_data, targets_df, plot = False):
        TPM_list = []
        #loop over all proteins in cluster
        for i in range(len(cluster)):
            #get exon start and end, converted to 32 bp model bins relative to
            #the cropped window (Borzoi crops 16 bins of 32 bp on each side)
            # NOTE(review): the comprehension variable `i` shadows the outer loop
            # index; this still works because `cluster[i]` sits in the iterable
            # part, which Python evaluates in the enclosing scope - fragile.
            ex_st = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonStarts'])[0].split(',')]
            ex_en = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonEnds'])[0].split(',')]
            #exon bool mask
            exon_mask = np.zeros(borzoi_data.shape[-2])
            for s,n in zip(ex_st,ex_en):
                exon_mask = exon_mask + ((np.arange(borzoi_data.shape[-2]) >= s) & (np.arange(borzoi_data.shape[-2]) <= n))
            #protein TPM per person per tissue
            TPM_per_tissue_replicates = np.sum(borzoi_data[:,exon_mask== 1], axis = 1)
            #Plot proteins with exon marks if needed
            if plot == True:
                #Will plot only first adipose_tissue borzoi_data[0,:,x] change x for different tissue
                plt.plot(borzoi_data[0,:,0])
                plt.vlines(x = ex_st, ymin=0, ymax=3.5, colors='red', ls='--', lw=2, label='vline_multiple - full height')
                plt.vlines(x = ex_en, ymin=0, ymax=3.5, colors='blue', ls='--', lw=2, label='vline_multiple - full height')
                plt.xlim(ex_st[0]-100, ex_en[-1]+100)
                plt.show()
            #Get average for tissue replicates (tracks grouped by description)
            TPM_per_tissue = [np.mean(i) for i in np.split(TPM_per_tissue_replicates[0], np.unique(targets_df['description'], return_index=True)[1][1:])]
            TPM_list.append(TPM_per_tissue)
        #create DataFrame: rows = symbols in cluster, columns = tissues
        TPM_dataframe = pd.DataFrame(TPM_list,cluster,np.unique(targets_df['description'], return_index=True)[0])
        return(TPM_dataframe)
    #Protein cluster list of lists (symbols sharing one 524288 bp window)
    protein_clusters = [np.array(i[2]) for i in prot_subset]
    #Get all proteins from VCF
    proteins_with_mutations = [i[0][0] for i in vcf_file]
    proteins_with_mutations_working = proteins_with_mutations
    TPM = []
    #run until the expression of all proteins is predicted
    while len(proteins_with_mutations_working) > 0:
        TPM_dfs = []
        sequences_one_hot_muts = []
        st = []  # window start per batched sequence
        cl = []  # protein cluster per batched sequence
        #append proteins to a list until equal to batch size if protein is smaller, if it's big just run borzoi for it (don't append)
        while len(sequences_one_hot_muts) < batch_size and len(proteins_with_mutations_working) > 0:
            #get work protein
            protein = proteins_with_mutations_working[0]
            #print(protein)
            #get cluster
            mask = [protein in i for i in protein_clusters]
            cluster = list(compress(protein_clusters, mask))
            #run borzoi for big proteins (longer than one model window)
            if protein in np.array(prot_bigger['symbol']):
                sequences_one_hot_muts_big = []
                proteins_with_mutations_working = proteins_with_mutations_working[1:]
                protein_data = prot_bigger[prot_bigger['symbol'] == protein]
                #pad by the 16*32 bp crop buffer plus 1000 bp upstream regulatory sequence (strand-dependent side)
                prot_start = np.array(protein_data['chr_start']).astype('int')[0] - (16*32) - np.array(protein_data['chr_strand'] == '+')[0] * 1000
                prot_end = np.array(protein_data['chr_end']).astype('int')[0] + (16*32) + np.array(protein_data['chr_strand'] == '-')[0] * 1000
                mutations_ = np.array(list(compress(vcf_file, np.array(proteins_with_mutations) == protein))[0][1])
                #only use one reference chromosome, in case mutations are mapped to multiple chr/ consequence of liftover
                mutations_ = mutations_[[i[0]== mutations_[0,0] for i in mutations_]]
                chrom = mutations_[0,0]
                all_poses = mutations_[:,1].astype('int')
                all_alts = mutations_[:,4]
                star = prot_start
                st_big = star
                #slide 524288 bp windows over the transcript with 32*32 bp overlap
                while star < prot_end:
                    end = star + 524288
                    poses = all_poses[(all_poses > star) & (all_poses < end)]
                    alts = all_alts[(all_poses > star) & (all_poses < end)]
                    sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
                    sequence_one_hot_mut = create_mut(sequence_one_hot_wt, poses, alts, star)
                    sequences_one_hot_muts_big.append(sequence_one_hot_mut)
                    star = end - (32*32)
                sequences_one_hot_muts_big = np.array(sequences_one_hot_muts_big)
                #if number of protein splits is bigger than batch size
                #print(sequences_one_hot_muts_big.shape)
                if sequences_one_hot_muts_big.shape[0] > batch_size:
                    borzoi_pred_list = []
                    for seq_slice in np.array_split(sequences_one_hot_muts_big, np.ceil(sequences_one_hot_muts_big.shape[0]/batch_size)):
                        borzoi_pred_list.append(predict_tracks(models, seq_slice))
                    y_mut = np.concatenate(borzoi_pred_list)
                else:
                    y_mut = predict_tracks(models, sequences_one_hot_muts_big)
                #stitch window chunks back into one long track (89 GTEx targets)
                y_mut = np.reshape(y_mut, [1,1,-1,89])
                TPM.append(CalTPM(y_mut[0], st_big, [protein], MANE_data, targets_df))
                #np.save('expression_predictions_%s.npy' %protein, y_mut)
            else:
                #append to a list of proteins to run
                #get start and end of the cluster
                star, end = (list(compress(prot_subset, mask))[0][:2])
                #get mutated proteins in the cluster
                mask = [i in cluster[0] for i in proteins_with_mutations_working]
                proteins_in_cluster = list(compress(proteins_with_mutations_working, mask))
                #remove cluster proteins from the protein list
                proteins_with_mutations_working = list(compress(proteins_with_mutations_working, ~np.array(mask)))
                #print(proteins_in_cluster)
                mutations_ = [list(compress(vcf_file, [np.array(proteins_with_mutations) == i][0]))[0][1] for i in proteins_in_cluster]
                mutations_ = np.concatenate([np.array(i) for i in mutations_],0)
                #only use one reference chromosome, in case mutations are mapped to multiple chr/ consequence of liftover
                mutations_ = mutations_[[i[0]== mutations_[0,0] for i in mutations_]]
                chrom = mutations_[0,0]
                poses = mutations_[:,1].astype('int')
                alts = mutations_[:,4]
                sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
                sequence_one_hot_mut = create_mut(sequence_one_hot_wt, poses, alts, star)
                sequences_one_hot_muts.append(sequence_one_hot_mut)
                st.append(star)
                cl.append(cluster)
                ### Test wt
                #sequences_one_hot_muts.append(sequence_one_hot_wt)
                ###
        sequences_one_hot_muts = np.array(sequences_one_hot_muts)
        #run borzoi for smaller proteins, if the list isn't empty (can be empty for last step)
        if sequences_one_hot_muts.shape != (0,):
            y_mut = predict_tracks(models, sequences_one_hot_muts)
            for i in range(len(y_mut)):
                TPM_dfs.append(CalTPM(y_mut[i], st[i], cl[i][0], MANE_data, targets_df))
            TPM_dfs = pd.concat(TPM_dfs)
            TPM.append(TPM_dfs)
        #np.save('expression_predictions_%s.npy' %protein, y_mut)
    TPM = pd.concat(TPM)
    #add RNA expression for positions with no mutations
    #load reference genome expression
    tpm_no_mut = pd.read_csv('/home/omic/borzoi/TPM_NO_MUTATIONS.csv',index_col=0)
    #concat missing expression
    TPM = pd.concat([TPM, tpm_no_mut.loc[list(set(tpm_no_mut.index) - set(TPM.index))]])
    #change symbol to ENSG (strip the version suffix from Ensembl_Gene)
    TPM.index.names = ['RNA']
    MANE_data['ENSG'] = [i.split('.')[0] for i in MANE_data['Ensembl_Gene']]
    mane_map = MANE_data[['symbol','ENSG']]
    TPM = mane_map.merge(TPM, left_on='symbol', right_on='RNA').dropna().drop_duplicates(subset=['symbol'])
    #drop duplicates
    TPM =TPM.drop_duplicates()
    TPM = TPM.iloc[:,2:].set_index(TPM.iloc[:,1])
    #save TPM
    #get name: <patient>_rna_coding_vcf_with_mutations.vcf -> <patient>
    patient_name = "${vcf_filtered}".split('/')[-1].split('_rna_coding_vcf_with_mutations.vcf')[0]
    TPM.to_csv(f'{patient_name}_TPM.csv')
    #report GPU memory usage (tf is imported in this process)
    gpu_stats = tf.config.experimental.get_memory_info('GPU:0')
    print(f"Current VRAM usage: {gpu_stats['current'] / 1e9:.2f} GB")
    print(f"Peak VRAM usage: {gpu_stats['peak'] / 1e9:.2f} GB")
    """
}
// CREATE_PROTEIN_CLUSTER: greedily tile each chromosome with 524288 bp Borzoi
// input windows and partition MANE transcripts into:
//   - prot_subset.pickle: [window_start, window_end, symbols] for transcripts
//     that fit entirely inside one window (predicted together as a cluster)
//   - prot_bigger.csv: transcripts longer than one window (handled separately
//     in overlapping chunks by PREDICT_EXPRESSION)
// Each transcript interval is padded with the 16*32 bp model crop buffer on
// both sides plus 1000 bp of upstream regulatory sequence (strand-dependent).
process CREATE_PROTEIN_CLUSTER {
    container "${params.container_borzoi}"
    containerOptions "${params.containerOptions}"
    debug true
    input:
    path MANE           // MANE summary table (TSV: chrom, chr_start, chr_end, chr_strand, symbol, ...)
    output:
    path "prot_bigger.csv"
    path "prot_subset.pickle"
    script:
    """
    #!/opt/conda/envs/borzoi/bin/python
    # Use this if new RNAs transcripts need to be added to list of predicted RNAs
    import numpy as np
    import pandas as pd
    import pickle
    data = pd.read_csv("${MANE}", sep='\t')
    data_subset = []   # clusters: [window_start, window_end, symbols in window]
    data_bigger = []   # transcripts that do not fit in a single window
    for ch in np.unique(data['chrom']):
        data_working = data[data['chrom']==ch]
        # sort transcripts by genomic start so windows are assigned left to right
        arg = np.argsort(np.array(data_working['chr_start']).astype('int'))
        data_working = data_working.iloc[arg]
        # padded interval: crop buffer (16*32 bp) both sides, plus 1000 bp of
        # upstream regulatory sequence on the strand-dependent side
        prot_start = np.array(data_working['chr_start']).astype('int') - (16*32) - np.array(data_working['chr_strand'] == '+') * 1000
        prot_end = np.array(data_working['chr_end']).astype('int') + (16*32) + np.array(data_working['chr_strand'] == '-') * 1000
        input_start = prot_start
        input_end = input_start + 524288
        # repeatedly open a window at the leftmost remaining transcript and
        # absorb every transcript fully contained in it
        while len(input_start) > 0:
            st = input_start[0]
            en = input_end[0]
            mask = (prot_start >= st) & (prot_end <= en)
            data_mask = data_working[mask]
            data_subset.append([st, en, data_mask['symbol']])
            #ncbiRefSeq_subset.append(ncbiRefSeq_working[mask])
            if not mask[0]:
                # the leftmost transcript itself does not fit in one window:
                # record it for chunked prediction and drop it from the pool
                data_bigger.append(data_working.iloc[0])
                mask[0] = True
            data_working = data_working[~mask]
            arg = np.argsort(np.array(data_working['chr_start']).astype('int'))
            data_working = data_working.iloc[arg]
            prot_start = np.array(data_working['chr_start']).astype('int') - (16*32) - np.array(data_working['chr_strand'] == '+') * 1000
            prot_end = np.array(data_working['chr_end']).astype('int') + (16*32) + np.array(data_working['chr_strand'] == '-') * 1000
            input_start = prot_start
            input_end = input_start + 524288
    prot_bigger = pd.DataFrame(data_bigger)
    prot_bigger.to_csv('prot_bigger.csv')
    with open("prot_subset.pickle", "wb") as fp:
        pickle.dump(data_subset, fp)
    """
}