Initial commit: digital-patients pipeline (clean, no large files)

Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
This commit is contained in:
2026-03-26 15:15:23 +01:00
commit 9e6a16c19b
45 changed files with 7207 additions and 0 deletions

448
main_borzoi.nf Normal file
View File

@@ -0,0 +1,448 @@
nextflow.enable.dsl=2
// FILTER_VCF: keep only patient variants that fall inside MANE transcripts,
// widen each transcript interval by 1000 bp on the upstream (regulatory) side
// depending on strand, and pickle the result for PREDICT_EXPRESSION.
// NOTE(review): the emitted "*.vcf" file is actually a pickled Python list,
// not VCF text - the extension is kept so downstream name-matching works.
process FILTER_VCF {
    memory 4.GB
    container "${params.container_borzoi}"
    containerOptions "${params.containerOptions}"
    debug true
    // maxForks 1
    input:
    path MANE           // MANE summary table (TSV: chrom, chr_start, chr_end, chr_strand, symbol, ...)
    path vcf_filtered   // patient VCF named <patient>_variants.vcf
    output:
    path "*.vcf" , emit: expression_output
    script:
    """
    #!/opt/conda/envs/borzoi/bin/python
    #Filter VCF to extract only mutations in coding sequences plus 1000 upstream regulatory elements
    import numpy as np
    import pandas as pd
    import pickle
    #If ncbiRefSeq data need to be filtered in a different manner change commented part
    #This part is needed if we want to change ncbiRefSeq filtering
    #column_names = [
    #    "bin", "name", "chrom", "strand", "txStart", "txEnd",
    #    "cdsStart", "cdsEnd", "exonCount", "exonStarts", "exonEnds",
    #    "score", "name2", "cdsStartStat", "cdsEndStat", "exonFrames"
    #]
    #ncbiRefSeq = pd.read_csv("{ncbiRefSeq}", sep='\t', names=column_names) #dollar sign is missing
    ##Get only protein coding genes (ncbiRefSeq['cdsStartStat'] != 'none')
    #ncbiRefSeq = ncbiRefSeq[ncbiRefSeq['cdsStartStat'] != 'none']
    ##Remove duplicates
    #ncbiRefSeq = ncbiRefSeq.drop_duplicates()
    ##Filter out isoforms for longest DNA sequence. Filter by protein name and chromosome (different versions of some chromosomes)
    #ncbiRefSeq_filtered = []
    #for i in np.unique(ncbiRefSeq['name2']):
    #    prot = ncbiRefSeq[ncbiRefSeq['name2'] == i]
    #    for j in np.unique(prot['chrom']):
    #        prot_chr = prot[prot['chrom'] == j]
    #        d1 = [prot_chr['txEnd'].astype(int).max(), prot_chr['txStart'].astype(int).min()] #get lowest starting position and maximum stop position
    #        max_len = np.argmax(prot_chr['txEnd'].astype(int) - prot_chr['txStart'].astype(int))
    #        position = prot_chr.iloc[max_len]
    #        position['txEnd'] = str(d1[0]) #end for isoform can be loaded from exonEnds
    #        position['txStart'] = str(d1[1]) #start for isoform can be loaded from exonStarts
    #        ncbiRefSeq_filtered.append(position)
    #
    #ncbiRefSeq = pd.DataFrame(ncbiRefSeq_filtered).reset_index().drop('index', axis = 1)
    ##protein coding genes only 23314, all genes 48420
    #Load ncbiRefSeq filtered
    MANE_data = pd.read_csv("${MANE}", sep = '\t')
    #Load vcf
    with open("${vcf_filtered}") as f:
        lines = f.readlines()
    #Remove header/start of vcf file: drop '##' meta lines, keep the '#CHROM'
    #column-header line (becomes the DataFrame columns below)
    vcf = []
    for i in lines:
        if i[:2] != '##':
            vcf.append(i[:-1].split('\t'))
    vcf = pd.DataFrame(vcf[1:], columns=vcf[0])
    # Assuming 'vcf' and 'ncbiRefSeq' are already loaded DataFrames.
    vcf['POS'] = vcf['POS'].astype(int)
    # category dtype speeds up the per-chromosome equality filters below
    vcf['#CHROM'] = vcf['#CHROM'].astype('category')
    MANE_data['chrom'] = MANE_data['chrom'].astype('category')
    MANE_data_renamed = MANE_data.rename(columns={'chrom': '#CHROM', 'chr_start': 'Start', 'chr_end': 'End', 'chr_strand': 'strand'})
    relevant_chromosomes = vcf['#CHROM'].unique()
    MANE_data_renamed = MANE_data_renamed[MANE_data_renamed['#CHROM'].isin(relevant_chromosomes)]
    dataset = []
    #Filter VCF: one entry per transcript that has at least one variant inside it
    for chrom in relevant_chromosomes:
        chrom_MANE = MANE_data_renamed[MANE_data_renamed['#CHROM'] == chrom]
        # Find the indices in `vcf` that match the conditions for the current chromosome
        for name, strand, start, end in chrom_MANE[['symbol', 'strand', 'Start', 'End']].itertuples(index=False):
            condition = (vcf['#CHROM'] == chrom) & (vcf['POS'] >= start) & (vcf['POS'] <= end)
            if np.sum(condition) != 0:
                # extend the interval 1000 bp on the upstream side (before the
                # TSS), which depends on strand orientation
                if strand == '+':
                    start_ = start - 1000
                    end_ = end
                else:
                    end_ = end + 1000
                    start_ = start
                # entry layout: [[symbol, strand, start, end], [matching vcf rows]]
                dataset.append([[name, strand, start_, end_], np.array(vcf[condition]).tolist()])
    #sort vcf entries by transcript length (shortest first)
    len_ = []
    for vcf_entery in dataset:
        len_.append(vcf_entery[0][3] - vcf_entery[0][2])
    dataset = [dataset[index] for index in np.argsort(len_)]
    #save vcf (as a pickle - see process-level note about the .vcf extension)
    #get name from the input file: <patient>_variants.vcf -> <patient>
    patient_name = "${vcf_filtered}".split('/')[-1].split('_variants.vcf')[0]
    with open(f"{patient_name}_rna_coding_vcf_with_mutations.vcf", "wb") as fp:
        pickle.dump(dataset, fp)
    """
}
// PREDICT_EXPRESSION: run the Borzoi model ensemble on mutated 524288 bp
// genomic windows to predict per-tissue RNA expression (TPM) for every mutated
// transcript in the patient pickle, back-fill reference-genome expression for
// transcripts without mutations, and emit <patient>_TPM.csv.
process PREDICT_EXPRESSION {
    accelerator 1
    memory 4.GB
    container "${params.container_borzoi}"
    label 'gpu_process'
    // containerOptions "${params.containerOptions_borzoi}"
    // debug true
    maxForks 1 // TODO: COMMENT THIS IN KUBERNETES
    input:
    path vcf_filtered   // pickled output of FILTER_VCF (<patient>_rna_coding_vcf_with_mutations.vcf)
    path MANE           // MANE summary table (TSV)
    //path ncbiRefSeq_bigger
    //path ncbiRefSeq_subset
    output:
    path "*_TPM.csv", emit: expression_output
    script:
    """
    #!/opt/conda/envs/borzoi/bin/python
    #Predict expression based on mutation
    import json
    import os
    import time
    import warnings
    import pickle
    from itertools import compress
    import h5py
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    import numpy as np
    import pandas as pd
    import pysam
    import pyfaidx
    import tensorflow as tf
    from baskerville import seqnn
    from baskerville import gene as bgene
    from baskerville import dna
    import sys
    sys.path.append( '/home/omic/borzoi' )
    from examples.borzoi_helpers import *
    #Load VCF and mutation files
    #ncbiRefSeq_bigger = pd.read_csv("{ncbiRefSeq_bigger}")
    #with open("{ncbiRefSeq_subset}", "rb") as fp:
    #    ncbiRefSeq_subset = pickle.load(fp)
    # NOTE(review): the window/cluster tables are read from paths baked into the
    # container image rather than from CREATE_PROTEIN_CLUSTER outputs - confirm
    # the baked files are kept in sync with that process.
    prot_bigger = pd.read_csv("/home/omic/borzoi/prot_bigger.csv")
    with open("/home/omic/borzoi/prot_subset.pickle", "rb") as fp:
        prot_subset = pickle.load(fp)
    # per-transcript entries: [[symbol, strand, start, end], [vcf rows]]
    with open("${vcf_filtered}", "rb") as fp:
        vcf_file = pickle.load(fp)
    MANE_data = pd.read_csv("${MANE}", sep = '\t')
    batch_size = 4
    #Define batch size and stop if batch size is too small for big transcripts
    #min_batch_size = max(((np.array(ncbiRefSeq_bigger['txEnd']).astype('int')-np.array(ncbiRefSeq_bigger['txStart']).astype('int'))/524288).astype('int')+1)
    #if min_batch_size > batch_size:
    #    print('batch size is ',batch_size,'and min_batch_size is ',min_batch_size)
    #    sys.exit('Batch size has to be bigger or same as number of big transcripts split into chunks')
    #Model configuration
    params_file = '/home/omic/borzoi/examples/params_pred.json'
    targets_file = '/home/omic/borzoi/examples/targets_gtex.txt' #Subset of targets_human.txt
    n_folds = 1 #4 #To use only one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'.
    rc = True #Average across reverse-complement prediction
    #Read model parameters
    with open(params_file) as params_open :
        params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']
    #Read targets
    targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
    target_index = targets_df.index
    #Create local index of strand_pair (relative to sliced targets)
    if rc :
        strand_pair = targets_df.strand_pair
        target_slice_dict = {ix : i for i, ix in enumerate(target_index.values.tolist())}
        slice_pair = np.array([
            target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist()
        ], dtype='int32')
    #Initialize model ensemble (one SeqNN per fold, restored from saved weights)
    models = []
    for fold_ix in range(n_folds) :
        model_file = "/home/omic/borzoi/saved_models/f" + str(fold_ix) + "/model0_best.h5"
        seqnn_model = seqnn.SeqNN(params_model)
        seqnn_model.restore(model_file, 0)
        seqnn_model.build_slice(target_index)
        if rc :
            seqnn_model.strand_pair.append(slice_pair)
        seqnn_model.build_ensemble(rc, [0]) #changed '0' to [0]
        models.append(seqnn_model)
    fasta_open = pysam.Fastafile('/home/omic/borzoi/hg38.fa')
    #Create mutations(s) from WT
    def create_mut(sequence_one_hot, poses, alts, start):
        # Apply ALT base substitutions (1-based VCF positions) to a one-hot
        # encoded window that begins at genomic coordinate `start`.
        # NOTE(review): this copies the module-level sequence_one_hot_wt instead
        # of the sequence_one_hot parameter; behavior is unchanged only because
        # every caller passes that same global - should be np.copy(sequence_one_hot).
        sequence_one_hot_mut = np.copy(sequence_one_hot_wt)
        for pos, alt in zip(poses, alts) :
            alt_ix = -1
            if alt == 'A' :
                alt_ix = 0
            elif alt == 'C' :
                alt_ix = 1
            elif alt == 'G' :
                alt_ix = 2
            elif alt == 'T' :
                alt_ix = 3
            # NOTE(review): a non-ACGT ALT leaves alt_ix == -1 and silently sets
            # the last channel - confirm inputs are SNVs only.
            # zero the reference one-hot row, then set the ALT base
            sequence_one_hot_mut[pos-start-1] = 0.
            sequence_one_hot_mut[pos-start-1, alt_ix] = 1.
        return sequence_one_hot_mut
    #Make predictions/ run model
    def predict_tracks(models, sequence_one_hot):
        # Run every ensemble fold on the batch and stack predictions on axis 1.
        predicted_tracks = []
        for fold_ix in range(len(models)):
            yh = models[fold_ix](sequence_one_hot)[:, None, ...].astype("float16")
            predicted_tracks.append(yh)
        predicted_tracks = np.concatenate(predicted_tracks, axis=1)
        return predicted_tracks
    #calculate TPM from borzoi coverage tracks by summing over exon bins
    def CalTPM(borzoi_data, start, cluster, prot_data, targets_df, plot = False):
        TPM_list = []
        #loop over all proteins in cluster
        for i in range(len(cluster)):
            #get exon start and end, converted to 32 bp model bins relative to
            #the cropped window (Borzoi crops 16 bins of 32 bp on each side)
            # NOTE(review): the comprehension variable `i` shadows the outer loop
            # index; this still works because `cluster[i]` sits in the iterable
            # part, which Python evaluates in the enclosing scope - fragile.
            ex_st = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonStarts'])[0].split(',')]
            ex_en = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonEnds'])[0].split(',')]
            #exon bool mask
            exon_mask = np.zeros(borzoi_data.shape[-2])
            for s,n in zip(ex_st,ex_en):
                exon_mask = exon_mask + ((np.arange(borzoi_data.shape[-2]) >= s) & (np.arange(borzoi_data.shape[-2]) <= n))
            #protein TPM per person per tissue
            TPM_per_tissue_replicates = np.sum(borzoi_data[:,exon_mask== 1], axis = 1)
            #Plot proteins with exon marks if needed
            if plot == True:
                #Will plot only first adipose_tissue borzoi_data[0,:,x] change x for different tissue
                plt.plot(borzoi_data[0,:,0])
                plt.vlines(x = ex_st, ymin=0, ymax=3.5, colors='red', ls='--', lw=2, label='vline_multiple - full height')
                plt.vlines(x = ex_en, ymin=0, ymax=3.5, colors='blue', ls='--', lw=2, label='vline_multiple - full height')
                plt.xlim(ex_st[0]-100, ex_en[-1]+100)
                plt.show()
            #Get average for tissue replicates (tracks grouped by description)
            TPM_per_tissue = [np.mean(i) for i in np.split(TPM_per_tissue_replicates[0], np.unique(targets_df['description'], return_index=True)[1][1:])]
            TPM_list.append(TPM_per_tissue)
        #create DataFrame: rows = symbols in cluster, columns = tissues
        TPM_dataframe = pd.DataFrame(TPM_list,cluster,np.unique(targets_df['description'], return_index=True)[0])
        return(TPM_dataframe)
    #Protein cluster list of lists (symbols sharing one 524288 bp window)
    protein_clusters = [np.array(i[2]) for i in prot_subset]
    #Get all proteins from VCF
    proteins_with_mutations = [i[0][0] for i in vcf_file]
    proteins_with_mutations_working = proteins_with_mutations
    TPM = []
    #run until the expression of all proteins is predicted
    while len(proteins_with_mutations_working) > 0:
        TPM_dfs = []
        sequences_one_hot_muts = []
        st = []  # window start per batched sequence
        cl = []  # protein cluster per batched sequence
        #append proteins to a list until equal to batch size if protein is smaller, if it's big just run borzoi for it (don't append)
        while len(sequences_one_hot_muts) < batch_size and len(proteins_with_mutations_working) > 0:
            #get work protein
            protein = proteins_with_mutations_working[0]
            #print(protein)
            #get cluster
            mask = [protein in i for i in protein_clusters]
            cluster = list(compress(protein_clusters, mask))
            #run borzoi for big proteins (longer than one model window)
            if protein in np.array(prot_bigger['symbol']):
                sequences_one_hot_muts_big = []
                proteins_with_mutations_working = proteins_with_mutations_working[1:]
                protein_data = prot_bigger[prot_bigger['symbol'] == protein]
                #pad by the 16*32 bp crop buffer plus 1000 bp upstream regulatory sequence (strand-dependent side)
                prot_start = np.array(protein_data['chr_start']).astype('int')[0] - (16*32) - np.array(protein_data['chr_strand'] == '+')[0] * 1000
                prot_end = np.array(protein_data['chr_end']).astype('int')[0] + (16*32) + np.array(protein_data['chr_strand'] == '-')[0] * 1000
                mutations_ = np.array(list(compress(vcf_file, np.array(proteins_with_mutations) == protein))[0][1])
                #only use one reference chromosome, in case mutations are mapped to multiple chr/ consequence of liftover
                mutations_ = mutations_[[i[0]== mutations_[0,0] for i in mutations_]]
                chrom = mutations_[0,0]
                all_poses = mutations_[:,1].astype('int')
                all_alts = mutations_[:,4]
                star = prot_start
                st_big = star
                #slide 524288 bp windows over the transcript with 32*32 bp overlap
                while star < prot_end:
                    end = star + 524288
                    poses = all_poses[(all_poses > star) & (all_poses < end)]
                    alts = all_alts[(all_poses > star) & (all_poses < end)]
                    sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
                    sequence_one_hot_mut = create_mut(sequence_one_hot_wt, poses, alts, star)
                    sequences_one_hot_muts_big.append(sequence_one_hot_mut)
                    star = end - (32*32)
                sequences_one_hot_muts_big = np.array(sequences_one_hot_muts_big)
                #if number of protein splits is bigger than batch size
                #print(sequences_one_hot_muts_big.shape)
                if sequences_one_hot_muts_big.shape[0] > batch_size:
                    borzoi_pred_list = []
                    for seq_slice in np.array_split(sequences_one_hot_muts_big, np.ceil(sequences_one_hot_muts_big.shape[0]/batch_size)):
                        borzoi_pred_list.append(predict_tracks(models, seq_slice))
                    y_mut = np.concatenate(borzoi_pred_list)
                else:
                    y_mut = predict_tracks(models, sequences_one_hot_muts_big)
                #stitch window chunks back into one long track (89 GTEx targets)
                y_mut = np.reshape(y_mut, [1,1,-1,89])
                TPM.append(CalTPM(y_mut[0], st_big, [protein], MANE_data, targets_df))
                #np.save('expression_predictions_%s.npy' %protein, y_mut)
            else:
                #append to a list of proteins to run
                #get start and end of the cluster
                star, end = (list(compress(prot_subset, mask))[0][:2])
                #get mutated proteins in the cluster
                mask = [i in cluster[0] for i in proteins_with_mutations_working]
                proteins_in_cluster = list(compress(proteins_with_mutations_working, mask))
                #remove cluster proteins from the protein list
                proteins_with_mutations_working = list(compress(proteins_with_mutations_working, ~np.array(mask)))
                #print(proteins_in_cluster)
                mutations_ = [list(compress(vcf_file, [np.array(proteins_with_mutations) == i][0]))[0][1] for i in proteins_in_cluster]
                mutations_ = np.concatenate([np.array(i) for i in mutations_],0)
                #only use one reference chromosome, in case mutations are mapped to multiple chr/ consequence of liftover
                mutations_ = mutations_[[i[0]== mutations_[0,0] for i in mutations_]]
                chrom = mutations_[0,0]
                poses = mutations_[:,1].astype('int')
                alts = mutations_[:,4]
                sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
                sequence_one_hot_mut = create_mut(sequence_one_hot_wt, poses, alts, star)
                sequences_one_hot_muts.append(sequence_one_hot_mut)
                st.append(star)
                cl.append(cluster)
                ### Test wt
                #sequences_one_hot_muts.append(sequence_one_hot_wt)
                ###
        sequences_one_hot_muts = np.array(sequences_one_hot_muts)
        #run borzoi for smaller proteins, if the list isn't empty (can be empty for last step)
        if sequences_one_hot_muts.shape != (0,):
            y_mut = predict_tracks(models, sequences_one_hot_muts)
            for i in range(len(y_mut)):
                TPM_dfs.append(CalTPM(y_mut[i], st[i], cl[i][0], MANE_data, targets_df))
            TPM_dfs = pd.concat(TPM_dfs)
            TPM.append(TPM_dfs)
        #np.save('expression_predictions_%s.npy' %protein, y_mut)
    TPM = pd.concat(TPM)
    #add RNA expression for positions with no mutations
    #load reference genome expression
    tpm_no_mut = pd.read_csv('/home/omic/borzoi/TPM_NO_MUTATIONS.csv',index_col=0)
    #concat missing expression
    TPM = pd.concat([TPM, tpm_no_mut.loc[list(set(tpm_no_mut.index) - set(TPM.index))]])
    #change symbol to ENSG (strip the version suffix from Ensembl_Gene)
    TPM.index.names = ['RNA']
    MANE_data['ENSG'] = [i.split('.')[0] for i in MANE_data['Ensembl_Gene']]
    mane_map = MANE_data[['symbol','ENSG']]
    TPM = mane_map.merge(TPM, left_on='symbol', right_on='RNA').dropna().drop_duplicates(subset=['symbol'])
    #drop duplicates
    TPM =TPM.drop_duplicates()
    TPM = TPM.iloc[:,2:].set_index(TPM.iloc[:,1])
    #save TPM
    #get name: <patient>_rna_coding_vcf_with_mutations.vcf -> <patient>
    patient_name = "${vcf_filtered}".split('/')[-1].split('_rna_coding_vcf_with_mutations.vcf')[0]
    TPM.to_csv(f'{patient_name}_TPM.csv')
    #report GPU memory usage (tf is imported in this process)
    gpu_stats = tf.config.experimental.get_memory_info('GPU:0')
    print(f"Current VRAM usage: {gpu_stats['current'] / 1e9:.2f} GB")
    print(f"Peak VRAM usage: {gpu_stats['peak'] / 1e9:.2f} GB")
    """
}
// CREATE_PROTEIN_CLUSTER: greedily tile each chromosome with 524288 bp Borzoi
// input windows and partition MANE transcripts into:
//   - prot_subset.pickle: [window_start, window_end, symbols] for transcripts
//     that fit entirely inside one window (predicted together as a cluster)
//   - prot_bigger.csv: transcripts longer than one window (handled separately
//     in overlapping chunks by PREDICT_EXPRESSION)
// Each transcript interval is padded with the 16*32 bp model crop buffer on
// both sides plus 1000 bp of upstream regulatory sequence (strand-dependent).
process CREATE_PROTEIN_CLUSTER {
    container "${params.container_borzoi}"
    containerOptions "${params.containerOptions}"
    debug true
    input:
    path MANE           // MANE summary table (TSV: chrom, chr_start, chr_end, chr_strand, symbol, ...)
    output:
    path "prot_bigger.csv"
    path "prot_subset.pickle"
    script:
    """
    #!/opt/conda/envs/borzoi/bin/python
    # Use this if new RNAs transcripts need to be added to list of predicted RNAs
    import numpy as np
    import pandas as pd
    import pickle
    data = pd.read_csv("${MANE}", sep='\t')
    data_subset = []   # clusters: [window_start, window_end, symbols in window]
    data_bigger = []   # transcripts that do not fit in a single window
    for ch in np.unique(data['chrom']):
        data_working = data[data['chrom']==ch]
        # sort transcripts by genomic start so windows are assigned left to right
        arg = np.argsort(np.array(data_working['chr_start']).astype('int'))
        data_working = data_working.iloc[arg]
        # padded interval: crop buffer (16*32 bp) both sides, plus 1000 bp of
        # upstream regulatory sequence on the strand-dependent side
        prot_start = np.array(data_working['chr_start']).astype('int') - (16*32) - np.array(data_working['chr_strand'] == '+') * 1000
        prot_end = np.array(data_working['chr_end']).astype('int') + (16*32) + np.array(data_working['chr_strand'] == '-') * 1000
        input_start = prot_start
        input_end = input_start + 524288
        # repeatedly open a window at the leftmost remaining transcript and
        # absorb every transcript fully contained in it
        while len(input_start) > 0:
            st = input_start[0]
            en = input_end[0]
            mask = (prot_start >= st) & (prot_end <= en)
            data_mask = data_working[mask]
            data_subset.append([st, en, data_mask['symbol']])
            #ncbiRefSeq_subset.append(ncbiRefSeq_working[mask])
            if not mask[0]:
                # the leftmost transcript itself does not fit in one window:
                # record it for chunked prediction and drop it from the pool
                data_bigger.append(data_working.iloc[0])
                mask[0] = True
            data_working = data_working[~mask]
            arg = np.argsort(np.array(data_working['chr_start']).astype('int'))
            data_working = data_working.iloc[arg]
            prot_start = np.array(data_working['chr_start']).astype('int') - (16*32) - np.array(data_working['chr_strand'] == '+') * 1000
            prot_end = np.array(data_working['chr_end']).astype('int') + (16*32) + np.array(data_working['chr_strand'] == '-') * 1000
            input_start = prot_start
            input_end = input_start + 524288
    prot_bigger = pd.DataFrame(data_bigger)
    prot_bigger.to_csv('prot_bigger.csv')
    with open("prot_subset.pickle", "wb") as fp:
        pickle.dump(data_subset, fp)
    """
}