nextflow.enable.dsl=2

// FILTER_VCF: keep only variants that fall inside MANE gene bodies, attach each
// variant list to its gene (with the window extended 1000 bp toward the
// regulatory region, strand-dependent), sort genes by window length and pickle
// the result for PREDICT_EXPRESSION.
// NOTE(review): the emitted file is a Python pickle despite the ".vcf"
// extension/glob — downstream PREDICT_EXPRESSION loads it with pickle.load,
// so the naming is kept for compatibility.
process FILTER_VCF {
    memory 4.GB
    container "${params.container_borzoi}"
    containerOptions "${params.containerOptions}"
    debug true
    // maxForks 1

    input:
    path MANE
    path vcf_filtered

    output:
    path "*.vcf", emit: expression_output

    script:
    """
    #!/opt/conda/envs/borzoi/bin/python
    # Filter VCF to extract only mutations in coding sequences plus 1000 bp upstream regulatory elements.
    import numpy as np
    import pandas as pd
    import pickle

    # If ncbiRefSeq data needs to be filtered in a different manner, change the commented part.
    # This part is needed if we want to change ncbiRefSeq filtering.
    #column_names = [
    #    "bin", "name", "chrom", "strand", "txStart", "txEnd",
    #    "cdsStart", "cdsEnd", "exonCount", "exonStarts", "exonEnds",
    #    "score", "name2", "cdsStartStat", "cdsEndStat", "exonFrames"
    #]
    #ncbiRefSeq = pd.read_csv("{ncbiRefSeq}", sep='\t', names=column_names)  # dollar sign is missing
    ## Get only protein coding genes (ncbiRefSeq['cdsStartStat'] != 'none')
    #ncbiRefSeq = ncbiRefSeq[ncbiRefSeq['cdsStartStat'] != 'none']
    ## Remove duplicates
    #ncbiRefSeq = ncbiRefSeq.drop_duplicates()
    ## Filter out isoforms for longest DNA sequence. Filter by protein name and
    ## chromosome (different versions of some chromosomes).
    #ncbiRefSeq_filtered = []
    #for i in np.unique(ncbiRefSeq['name2']):
    #    prot = ncbiRefSeq[ncbiRefSeq['name2'] == i]
    #    for j in np.unique(prot['chrom']):
    #        prot_chr = prot[prot['chrom'] == j]
    #        d1 = [prot_chr['txEnd'].astype(int).max(), prot_chr['txStart'].astype(int).min()]  # get lowest starting position and maximum stop position
    #        max_len = np.argmax(prot_chr['txEnd'].astype(int) - prot_chr['txStart'].astype(int))
    #        position = prot_chr.iloc[max_len]
    #        position['txEnd'] = str(d1[0])    # end for isoform can be loaded from exonEnds
    #        position['txStart'] = str(d1[1])  # start for isoform can be loaded from exonStarts
    #        ncbiRefSeq_filtered.append(position)
    #
    #ncbiRefSeq = pd.DataFrame(ncbiRefSeq_filtered).reset_index().drop('index', axis = 1)
    ## protein coding genes only 23314, all genes 48420

    # Load the (pre-filtered) MANE annotation table.
    MANE_data = pd.read_csv("${MANE}", sep = '\t')

    # Load VCF as plain text.
    with open("${vcf_filtered}") as f:
        lines = f.readlines()

    # Drop the '##' meta-header lines; the '#CHROM' column line becomes the header row.
    vcf = []
    for i in lines:
        if i[:2] != '##':
            vcf.append(i[:-1].split('\t'))
    vcf = pd.DataFrame(vcf[1:], columns=vcf[0])

    vcf['POS'] = vcf['POS'].astype(int)
    # Categoricals speed up the repeated equality filters below.
    vcf['#CHROM'] = vcf['#CHROM'].astype('category')
    MANE_data['chrom'] = MANE_data['chrom'].astype('category')
    MANE_data_renamed = MANE_data.rename(columns={'chrom': '#CHROM', 'chr_start': 'Start', 'chr_end': 'End', 'chr_strand': 'strand'})
    relevant_chromosomes = vcf['#CHROM'].unique()
    MANE_data_renamed = MANE_data_renamed[MANE_data_renamed['#CHROM'].isin(relevant_chromosomes)]

    dataset = []
    # Filter the VCF gene by gene.
    for chrom in relevant_chromosomes:
        chrom_MANE = MANE_data_renamed[MANE_data_renamed['#CHROM'] == chrom]
        # Find the rows in `vcf` that fall inside each gene body on this chromosome.
        for name, strand, start, end in chrom_MANE[['symbol', 'strand', 'Start', 'End']].itertuples(index=False):
            condition = (vcf['#CHROM'] == chrom) & (vcf['POS'] >= start) & (vcf['POS'] <= end)
            if np.sum(condition) != 0:
                # Extend the recorded window by 1000 bp toward the promoter side
                # (upstream of TSS): before start on '+', after end on '-'.
                if strand == '+':
                    start_ = start - 1000
                    end_ = end
                else:
                    end_ = end + 1000
                    start_ = start
                dataset.append([[name, strand, start_, end_], np.array(vcf[condition]).tolist()])

    # Sort genes by window length (ascending) so downstream batching sees short ones first.
    window_lengths = []
    for entry in dataset:
        window_lengths.append(entry[0][3] - entry[0][2])
    dataset = [dataset[index] for index in np.argsort(window_lengths)]

    # Save as pickle (extension kept as .vcf for the downstream process).
    patient_name = "${vcf_filtered}".split('/')[-1].split('_variants.vcf')[0]
    with open(f"{patient_name}_rna_coding_vcf_with_mutations.vcf", "wb") as fp:
        pickle.dump(dataset, fp)
    """
}

// PREDICT_EXPRESSION: run the Borzoi model ensemble on mutated sequences and
// produce a per-patient TPM table across GTEx tissue tracks.
process PREDICT_EXPRESSION {
    accelerator 1
    memory 4.GB
    container "${params.container_borzoi}"
    label 'gpu_process'
    // containerOptions "${params.containerOptions_borzoi}"
    // debug true
    maxForks 1 // TODO: COMMENT THIS IN KUBERNETES

    input:
    path vcf_filtered
    path MANE
    //path ncbiRefSeq_bigger
    //path ncbiRefSeq_subset

    output:
    path "*_TPM.csv", emit: expression_output

    script:
    """
    #!/opt/conda/envs/borzoi/bin/python
    # Predict expression based on mutations.
    import json
    import os
    import time
    import warnings
    import pickle
    from itertools import compress

    import h5py
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    import numpy as np
    import pandas as pd
    import pysam
    import pyfaidx
    import tensorflow as tf

    from baskerville import seqnn
    from baskerville import gene as bgene
    from baskerville import dna

    import sys
    sys.path.append('/home/omic/borzoi')
    # Provides process_sequence() among others.
    from examples.borzoi_helpers import *

    # Load VCF and mutation files.
    #ncbiRefSeq_bigger = pd.read_csv("{ncbiRefSeq_bigger}")
    #with open("{ncbiRefSeq_subset}", "rb") as fp:
    #    ncbiRefSeq_subset = pickle.load(fp)
    prot_bigger = pd.read_csv("/home/omic/borzoi/prot_bigger.csv")
    with open("/home/omic/borzoi/prot_subset.pickle", "rb") as fp:
        prot_subset = pickle.load(fp)
    with open("${vcf_filtered}", "rb") as fp:
        vcf_file = pickle.load(fp)
    MANE_data = pd.read_csv("${MANE}", sep = '\t')

    batch_size = 4
    # Define batch size and stop if batch size is too small for big transcripts.
    #min_batch_size = max(((np.array(ncbiRefSeq_bigger['txEnd']).astype('int')-np.array(ncbiRefSeq_bigger['txStart']).astype('int'))/524288).astype('int')+1)
    #if min_batch_size > batch_size:
    #    print('batch size is ', batch_size, 'and min_batch_size is ', min_batch_size)
    #    sys.exit('Batch size has to be bigger or same as number of big trnascripts split into chunks')

    # Model configuration.
    params_file = '/home/omic/borzoi/examples/params_pred.json'
    targets_file = '/home/omic/borzoi/examples/targets_gtex.txt'  # Subset of targets_human.txt
    n_folds = 1   # 4; to use only one model fold, set 'n_folds = 1'; to use all four folds, set 'n_folds = 4'.
    rc = True     # Average across reverse-complement prediction.

    # Read model parameters.
    with open(params_file) as params_open:
        params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']

    # Read targets.
    targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
    target_index = targets_df.index

    # Create local index of strand_pair (relative to sliced targets).
    if rc:
        strand_pair = targets_df.strand_pair
        target_slice_dict = {ix: i for i, ix in enumerate(target_index.values.tolist())}
        slice_pair = np.array([
            target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist()
        ], dtype='int32')

    # Initialize model ensemble.
    models = []
    for fold_ix in range(n_folds):
        model_file = "/home/omic/borzoi/saved_models/f" + str(fold_ix) + "/model0_best.h5"
        seqnn_model = seqnn.SeqNN(params_model)
        seqnn_model.restore(model_file, 0)
        seqnn_model.build_slice(target_index)
        if rc:
            seqnn_model.strand_pair.append(slice_pair)
        seqnn_model.build_ensemble(rc, [0])  # changed '0' to [0]
        models.append(seqnn_model)

    fasta_open = pysam.Fastafile('/home/omic/borzoi/hg38.fa')

    def create_mut(sequence_one_hot, poses, alts, start):
        \"\"\"Return a copy of the one-hot WT sequence with the given SNV alts applied.

        poses are 1-based genomic positions; start is the window's genomic start,
        so pos-start-1 is the 0-based offset inside the window. Unknown alt bases
        leave alt_ix = -1 (last column set) — presumably alts are always A/C/G/T.
        \"\"\"
        # FIX: copy the parameter, not the module-level global sequence_one_hot_wt
        # (the original relied on the global coincidentally holding the argument).
        sequence_one_hot_mut = np.copy(sequence_one_hot)
        for pos, alt in zip(poses, alts):
            alt_ix = -1
            if alt == 'A':
                alt_ix = 0
            elif alt == 'C':
                alt_ix = 1
            elif alt == 'G':
                alt_ix = 2
            elif alt == 'T':
                alt_ix = 3
            sequence_one_hot_mut[pos-start-1] = 0.
            sequence_one_hot_mut[pos-start-1, alt_ix] = 1.
        return sequence_one_hot_mut

    def predict_tracks(models, sequence_one_hot):
        \"\"\"Run every model fold on a batch; returns predictions stacked on a fold axis.\"\"\"
        predicted_tracks = []
        for fold_ix in range(len(models)):
            yh = models[fold_ix](sequence_one_hot)[:, None, ...].astype("float16")
            predicted_tracks.append(yh)
        predicted_tracks = np.concatenate(predicted_tracks, axis=1)
        return predicted_tracks

    def CalTPM(borzoi_data, start, cluster, prot_data, targets_df, plot=False):
        \"\"\"Sum Borzoi track coverage over each gene's exons and average tissue replicates.

        Returns a DataFrame indexed by gene symbol, one column per unique tissue
        description in targets_df. start is the genomic start of the input window;
        (32*16) trims the model's cropped edge and //32 converts bp to bins.
        \"\"\"
        TPM_list = []
        # Loop over all proteins in the cluster.
        for idx in range(len(cluster)):
            # Exon starts/ends converted to bin coordinates within the prediction window.
            ex_st = [(int(p)-(start + (32*16)))//32 for p in np.array(prot_data[prot_data['symbol'] == cluster[idx]]['exonStarts'])[0].split(',')]
            ex_en = [(int(p)-(start + (32*16)))//32 for p in np.array(prot_data[prot_data['symbol'] == cluster[idx]]['exonEnds'])[0].split(',')]
            # Exon bool mask over prediction bins.
            exon_mask = np.zeros(borzoi_data.shape[-2])
            for s, n in zip(ex_st, ex_en):
                exon_mask = exon_mask + ((np.arange(borzoi_data.shape[-2]) >= s) & (np.arange(borzoi_data.shape[-2]) <= n))
            # Protein TPM per person per tissue.
            TPM_per_tissue_replicates = np.sum(borzoi_data[:, exon_mask == 1], axis=1)
            # Plot proteins with exon marks if needed.
            if plot == True:
                # Will plot only first adipose_tissue; borzoi_data[0,:,x] — change x for a different tissue.
                plt.plot(borzoi_data[0, :, 0])
                plt.vlines(x = ex_st, ymin=0, ymax=3.5, colors='red', ls='--', lw=2, label='vline_multiple - full height')
                plt.vlines(x = ex_en, ymin=0, ymax=3.5, colors='blue', ls='--', lw=2, label='vline_multiple - full height')
                plt.xlim(ex_st[0]-100, ex_en[-1]+100)
                plt.show()
            # Average replicate tracks that share a tissue description (fold 0 only).
            TPM_per_tissue = [np.mean(rep) for rep in np.split(TPM_per_tissue_replicates[0], np.unique(targets_df['description'], return_index=True)[1][1:])]
            TPM_list.append(TPM_per_tissue)
        TPM_dataframe = pd.DataFrame(TPM_list, cluster, np.unique(targets_df['description'], return_index=True)[0])
        return(TPM_dataframe)

    # Protein clusters: list of symbol arrays, one per 524 kb window.
    protein_clusters = [np.array(i[2]) for i in prot_subset]
    # All gene symbols present in the patient's filtered VCF.
    proteins_with_mutations = [i[0][0] for i in vcf_file]
    proteins_with_mutations_working = proteins_with_mutations
    TPM = []
    # Run until the expression of all proteins is predicted.
    while len(proteins_with_mutations_working) > 0:
        TPM_dfs = []
        sequences_one_hot_muts = []
        st = []
        cl = []
        # Append proteins to a list until equal to batch size if the protein is
        # small; if it is big, just run Borzoi for it immediately (don't append).
        while len(sequences_one_hot_muts) < batch_size and len(proteins_with_mutations_working) > 0:
            protein = proteins_with_mutations_working[0]
            #print(protein)
            # Find the cluster(s) containing this protein.
            mask = [protein in i for i in protein_clusters]
            cluster = list(compress(protein_clusters, mask))
            if protein in np.array(prot_bigger['symbol']):
                # Big protein: tile its span with overlapping 524 kb windows and predict now.
                sequences_one_hot_muts_big = []
                proteins_with_mutations_working = proteins_with_mutations_working[1:]
                protein_data = prot_bigger[prot_bigger['symbol'] == protein]
                # (16*32) edge buffer; 1000 bp regulatory extension on the promoter side.
                prot_start = np.array(protein_data['chr_start']).astype('int')[0] - (16*32) - np.array(protein_data['chr_strand'] == '+')[0] * 1000
                prot_end = np.array(protein_data['chr_end']).astype('int')[0] + (16*32) + np.array(protein_data['chr_strand'] == '-')[0] * 1000
                mutations_ = np.array(list(compress(vcf_file, np.array(proteins_with_mutations) == protein))[0][1])
                # Only use one reference chromosome, in case mutations are mapped
                # to multiple chromosomes (consequence of liftover).
                mutations_ = mutations_[[i[0] == mutations_[0,0] for i in mutations_]]
                chrom = mutations_[0,0]
                all_poses = mutations_[:,1].astype('int')
                all_alts = mutations_[:,4]
                star = prot_start
                st_big = star
                while star < prot_end:
                    end = star + 524288
                    poses = all_poses[(all_poses > star) & (all_poses < end)]
                    alts = all_alts[(all_poses > star) & (all_poses < end)]
                    sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
                    sequence_one_hot_mut = create_mut(sequence_one_hot_wt, poses, alts, star)
                    sequences_one_hot_muts_big.append(sequence_one_hot_mut)
                    # Step with a (32*32) bp overlap between consecutive windows.
                    star = end - (32*32)
                sequences_one_hot_muts_big = np.array(sequences_one_hot_muts_big)
                # If the number of protein splits is bigger than batch size, predict in chunks.
                #print(sequences_one_hot_muts_big.shape)
                if sequences_one_hot_muts_big.shape[0] > batch_size:
                    borzoi_pred_list = []
                    for seq_slice in np.array_split(sequences_one_hot_muts_big, np.ceil(sequences_one_hot_muts_big.shape[0]/batch_size)):
                        borzoi_pred_list.append(predict_tracks(models, seq_slice))
                    y_mut = np.concatenate(borzoi_pred_list)
                else:
                    y_mut = predict_tracks(models, sequences_one_hot_muts_big)
                # Stitch the tiled windows back into one track: 89 = number of GTEx targets.
                y_mut = np.reshape(y_mut, [1,1,-1,89])
                TPM.append(CalTPM(y_mut[0], st_big, [protein], MANE_data, targets_df))
                #np.save('expression_predictions_%s.npy' %protein, y_mut)
            else:
                # Small protein: queue its whole cluster as one window for batched prediction.
                star, end = (list(compress(prot_subset, mask))[0][:2])
                # Get the mutated proteins in the cluster.
                mask = [i in cluster[0] for i in proteins_with_mutations_working]
                proteins_in_cluster = list(compress(proteins_with_mutations_working, mask))
                # Remove cluster proteins from the protein list.
                proteins_with_mutations_working = list(compress(proteins_with_mutations_working, ~np.array(mask)))
                #print(proteins_in_cluster)
                mutations_ = [list(compress(vcf_file, [np.array(proteins_with_mutations) == i][0]))[0][1] for i in proteins_in_cluster]
                mutations_ = np.concatenate([np.array(i) for i in mutations_], 0)
                # Only use one reference chromosome, in case mutations are mapped
                # to multiple chromosomes (consequence of liftover).
                mutations_ = mutations_[[i[0] == mutations_[0,0] for i in mutations_]]
                chrom = mutations_[0,0]
                poses = mutations_[:,1].astype('int')
                alts = mutations_[:,4]
                sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
                sequence_one_hot_mut = create_mut(sequence_one_hot_wt, poses, alts, star)
                sequences_one_hot_muts.append(sequence_one_hot_mut)
                st.append(star)
                cl.append(cluster)
                ### Test wt
                #sequences_one_hot_muts.append(sequence_one_hot_wt)
                ###
        sequences_one_hot_muts = np.array(sequences_one_hot_muts)
        # Run Borzoi for the queued smaller proteins, if the list isn't empty
        # (it can be empty on the last step).
        if sequences_one_hot_muts.shape != (0,):
            y_mut = predict_tracks(models, sequences_one_hot_muts)
            for i in range(len(y_mut)):
                TPM_dfs.append(CalTPM(y_mut[i], st[i], cl[i][0], MANE_data, targets_df))
            TPM_dfs = pd.concat(TPM_dfs)
            TPM.append(TPM_dfs)
            #np.save('expression_predictions_%s.npy' %protein, y_mut)

    TPM = pd.concat(TPM)

    # Add RNA expression for positions with no mutations:
    # load the reference-genome expression and concat the missing genes.
    tpm_no_mut = pd.read_csv('/home/omic/borzoi/TPM_NO_MUTATIONS.csv', index_col=0)
    TPM = pd.concat([TPM, tpm_no_mut.loc[list(set(tpm_no_mut.index) - set(TPM.index))]])

    # Change symbol to ENSG identifiers.
    TPM.index.names = ['RNA']
    MANE_data['ENSG'] = [i.split('.')[0] for i in MANE_data['Ensembl_Gene']]
    mane_map = MANE_data[['symbol','ENSG']]
    # Merge on the 'RNA' index level of TPM.
    TPM = mane_map.merge(TPM, left_on='symbol', right_on='RNA').dropna().drop_duplicates(subset=['symbol'])
    # Drop duplicates and re-index by ENSG.
    TPM = TPM.drop_duplicates()
    TPM = TPM.iloc[:,2:].set_index(TPM.iloc[:,1])

    # Save TPM table named after the patient.
    patient_name = "${vcf_filtered}".split('/')[-1].split('_rna_coding_vcf_with_mutations.vcf')[0]
    TPM.to_csv(f'{patient_name}_TPM.csv')

    gpu_stats = tf.config.experimental.get_memory_info('GPU:0')
    print(f"Current VRAM usage: {gpu_stats['current'] / 1e9:.2f} GB")
    print(f"Peak VRAM usage: {gpu_stats['peak'] / 1e9:.2f} GB")
    """
}

// CREATE_PROTEIN_CLUSTER: greedily pack MANE genes into 524 kb windows
// (prot_subset.pickle) and collect genes too large to fit in a single window
// (prot_bigger.csv). Run this when new RNA transcripts need to be added to the
// list of predicted RNAs.
process CREATE_PROTEIN_CLUSTER {
    container "${params.container_borzoi}"
    containerOptions "${params.containerOptions}"
    debug true

    input:
    path MANE

    output:
    path "prot_bigger.csv"
    path "prot_subset.pickle"

    script:
    """
    #!/opt/conda/envs/borzoi/bin/python
    # Use this if new RNA transcripts need to be added to the list of predicted RNAs.
    import numpy as np
    import pandas as pd
    import pickle

    data = pd.read_csv("${MANE}", sep='\t')
    data_subset = []
    data_bigger = []
    for ch in np.unique(data['chrom']):
        data_working = data[data['chrom']==ch]
        arg = np.argsort(np.array(data_working['chr_start']).astype('int'))
        data_working = data_working.iloc[arg]
        # Protein start position with regulatory elements: minus 1000 from start
        # for '+' strand, plus 1000 from end for '-' strand.
        # Add (16*32) as buffer because Borzoi will cut them.
        prot_start = np.array(data_working['chr_start']).astype('int') - (16*32) - np.array(data_working['chr_strand'] == '+') * 1000
        prot_end = np.array(data_working['chr_end']).astype('int') + (16*32) + np.array(data_working['chr_strand'] == '-') * 1000
        input_start = prot_start
        input_end = input_start + 524288
        while len(input_start) > 0:
            # Anchor a window at the left-most remaining gene; every gene fully
            # inside the window joins its cluster.
            st = input_start[0]
            en = input_end[0]
            mask = (prot_start >= st) & (prot_end <= en)
            data_mask = data_working[mask]
            data_subset.append([st, en, data_mask['symbol']])
            #ncbiRefSeq_subset.append(ncbiRefSeq_working[mask])
            if mask[0] == False:
                # The anchor gene itself doesn't fit in one window: record it as
                # "bigger" and remove it so the loop can make progress.
                data_bigger.append(data_working.iloc[0])
                mask[0] = True
            data_working = data_working[~mask]
            arg = np.argsort(np.array(data_working['chr_start']).astype('int'))
            data_working = data_working.iloc[arg]
            prot_start = np.array(data_working['chr_start']).astype('int') - (16*32) - np.array(data_working['chr_strand'] == '+') * 1000
            prot_end = np.array(data_working['chr_end']).astype('int') + (16*32) + np.array(data_working['chr_strand'] == '-') * 1000
            input_start = prot_start
            input_end = input_start + 524288

    prot_bigger = pd.DataFrame(data_bigger)
    prot_bigger.to_csv('prot_bigger.csv')
    with open("prot_subset.pickle", "wb") as fp:
        pickle.dump(data_subset, fp)
    # FIX: removed trailing tf.config.experimental.get_memory_info('GPU:0') prints
    # copy-pasted from PREDICT_EXPRESSION — tensorflow is never imported in this
    # CPU-only script, so they raised NameError after the outputs were written.
    """
}