Initial commit: digital-patients pipeline (clean, no large files)
Large reference and model files are excluded from the repository; they are to be staged to S3 or baked into Docker images.
This commit is contained in:
448
main_borzoi.nf
Normal file
448
main_borzoi.nf
Normal file
@@ -0,0 +1,448 @@
|
||||
nextflow.enable.dsl=2
|
||||
|
||||
process FILTER_VCF {
    // Restrict a patient VCF to variants that fall inside MANE transcript
    // bodies, pad each hit transcript with 1 kb of upstream regulatory
    // sequence (strand-aware), and pickle the per-transcript variant lists.
    //
    // input:  MANE         - tab-separated MANE transcript summary table
    //         vcf_filtered - patient VCF named <patient>_variants.vcf
    // output: <patient>_rna_coding_vcf_with_mutations.vcf - a *pickle*
    //         (the .vcf suffix is kept only so the output glob matches;
    //         downstream PREDICT_EXPRESSION loads it with pickle.load)
    memory 4.GB

    container "${params.container_borzoi}"
    containerOptions "${params.containerOptions}"
    debug true
    // maxForks 1

    input:
    path MANE
    path vcf_filtered

    output:
    path "*.vcf" , emit: expression_output

    script:
    """
#!/opt/conda/envs/borzoi/bin/python
#Filter VCF to extract only mutations in coding sequences plus 1000 bp of upstream regulatory elements
import numpy as np
import pandas as pd
import pickle

#Load the MANE transcript table (tab-separated)
MANE_data = pd.read_csv("${MANE}", sep = '\t')

#Load the VCF, skipping the '##' meta-information header lines.
#rstrip('\\n') (rather than slicing the last char) is robust to a final
#line that has no trailing newline.
vcf = []
with open("${vcf_filtered}") as f:
    for line in f:
        if line[:2] != '##':
            vcf.append(line.rstrip('\\n').split('\t'))

#The first retained line is the '#CHROM ...' column header
vcf = pd.DataFrame(vcf[1:], columns=vcf[0])

vcf['POS'] = vcf['POS'].astype(int)
vcf['#CHROM'] = vcf['#CHROM'].astype('category')
MANE_data['chrom'] = MANE_data['chrom'].astype('category')
MANE_data_renamed = MANE_data.rename(columns={'chrom': '#CHROM', 'chr_start': 'Start', 'chr_end': 'End', 'chr_strand': 'strand'})
#Only chromosomes that actually occur in this patient's VCF are relevant
relevant_chromosomes = vcf['#CHROM'].unique()
MANE_data_renamed = MANE_data_renamed[MANE_data_renamed['#CHROM'].isin(relevant_chromosomes)]

dataset = []

#For every transcript overlapping at least one variant, store
#[[symbol, strand, padded_start, padded_end], [matching vcf rows]]
for chrom in relevant_chromosomes:
    chrom_MANE = MANE_data_renamed[MANE_data_renamed['#CHROM'] == chrom]
    for name, strand, start, end in chrom_MANE[['symbol', 'strand', 'Start', 'End']].itertuples(index=False):
        condition = (vcf['#CHROM'] == chrom) & (vcf['POS'] >= start) & (vcf['POS'] <= end)
        if np.sum(condition) != 0:
            #Pad 1000 bp upstream of the transcription start site; the
            #upstream side depends on the strand.
            if strand == '+':
                start_ = start - 1000
                end_ = end
            else:
                end_ = end + 1000
                start_ = start
            dataset.append([[name, strand, start_, end_], np.array(vcf[condition]).tolist()])

#Sort entries by padded transcript length (ascending), so downstream
#batching groups similarly sized transcripts together
len_ = [vcf_entry[0][3] - vcf_entry[0][2] for vcf_entry in dataset]
dataset = [dataset[index] for index in np.argsort(len_)]

#Save as a pickle; derive the patient name from the input file name
patient_name = "${vcf_filtered}".split('/')[-1].split('_variants.vcf')[0]
with open(f"{patient_name}_rna_coding_vcf_with_mutations.vcf", "wb") as fp:
    pickle.dump(dataset, fp)
    """
}
|
||||
|
||||
process PREDICT_EXPRESSION {
    // Run the Borzoi model on patient-mutated sequences and emit predicted
    // per-tissue expression (TPM) for every mutated transcript, merged with
    // reference expression for the unmutated transcripts.
    //
    // input:  vcf_filtered - pickle produced by FILTER_VCF
    //                        (<patient>_rna_coding_vcf_with_mutations.vcf)
    //         MANE         - tab-separated MANE transcript summary table
    // output: <patient>_TPM.csv, indexed by ENSG id, one column per tissue
    accelerator 1
    memory 4.GB

    container "${params.container_borzoi}"
    label 'gpu_process'
    // containerOptions "${params.containerOptions_borzoi}"
    // debug true
    maxForks 1 // TODO: COMMENT THIS IN KUBERNETES

    input:
    path vcf_filtered
    path MANE

    output:
    path "*_TPM.csv", emit: expression_output

    script:
    """
#!/opt/conda/envs/borzoi/bin/python
#Predict expression based on mutations
import json
import os
import time
import warnings
import pickle
from itertools import compress

import h5py
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import pandas as pd
import pysam
import pyfaidx
import tensorflow as tf

from baskerville import seqnn
from baskerville import gene as bgene
from baskerville import dna

import sys
sys.path.append( '/home/omic/borzoi' )
from examples.borzoi_helpers import *

#Load precomputed transcript clusters and the patient's mutation pickle.
#prot_bigger: transcripts larger than one model window (CREATE_PROTEIN_CLUSTER)
#prot_subset: [window_start, window_end, symbols] clusters per model window
prot_bigger = pd.read_csv("/home/omic/borzoi/prot_bigger.csv")
with open("/home/omic/borzoi/prot_subset.pickle", "rb") as fp:
    prot_subset = pickle.load(fp)
with open("${vcf_filtered}", "rb") as fp:
    vcf_file = pickle.load(fp)
MANE_data = pd.read_csv("${MANE}", sep = '\t')

#Number of 524 kb sequences predicted per model call
batch_size = 4

#Model configuration
params_file = '/home/omic/borzoi/examples/params_pred.json'
targets_file = '/home/omic/borzoi/examples/targets_gtex.txt' #Subset of targets_human.txt
n_folds = 1 #To use only one model fold, set 'n_folds = 1'. To use all four folds, set 'n_folds = 4'.
rc = True #Average across reverse-complement prediction

#Read model parameters
with open(params_file) as params_open :
    params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']

#Read targets
targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
target_index = targets_df.index

#Create local index of strand_pair (relative to sliced targets)
if rc :
    strand_pair = targets_df.strand_pair
    target_slice_dict = {ix : i for i, ix in enumerate(target_index.values.tolist())}
    slice_pair = np.array([
        target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist()
    ], dtype='int32')

#Initialize model ensemble (one SeqNN per fold)
models = []
for fold_ix in range(n_folds) :
    model_file = "/home/omic/borzoi/saved_models/f" + str(fold_ix) + "/model0_best.h5"
    seqnn_model = seqnn.SeqNN(params_model)
    seqnn_model.restore(model_file, 0)
    seqnn_model.build_slice(target_index)
    if rc :
        seqnn_model.strand_pair.append(slice_pair)
    seqnn_model.build_ensemble(rc, [0]) #changed '0' to [0]
    models.append(seqnn_model)

fasta_open = pysam.Fastafile('/home/omic/borzoi/hg38.fa')

#Apply point mutations to a one-hot WT sequence.
#pos is a 1-based genomic coordinate; start is the window start, hence pos-start-1.
def create_mut(sequence_one_hot, poses, alts, start):
    #BUGFIX: copy the parameter, not the global 'sequence_one_hot_wt'.
    #The original only worked because every caller passed that same global.
    sequence_one_hot_mut = np.copy(sequence_one_hot)
    for pos, alt in zip(poses, alts) :
        alt_ix = -1
        if alt == 'A' :
            alt_ix = 0
        elif alt == 'C' :
            alt_ix = 1
        elif alt == 'G' :
            alt_ix = 2
        elif alt == 'T' :
            alt_ix = 3
        #NOTE(review): a non-ACGT alt leaves alt_ix == -1, which writes the
        #last channel (same as 'T') - confirm alts are always SNV bases
        sequence_one_hot_mut[pos-start-1] = 0.
        sequence_one_hot_mut[pos-start-1, alt_ix] = 1.
    return sequence_one_hot_mut

#Run each model fold on a batch of sequences; concatenate over folds
def predict_tracks(models, sequence_one_hot):
    predicted_tracks = []
    for fold_ix in range(len(models)):
        yh = models[fold_ix](sequence_one_hot)[:, None, ...].astype("float16")
        predicted_tracks.append(yh)
    predicted_tracks = np.concatenate(predicted_tracks, axis=1)
    return predicted_tracks

#Aggregate Borzoi track bins over each transcript's exons into one TPM
#value per tissue (averaging tissue replicates).
#borzoi_data: predictions for one window; start: genomic window start;
#cluster: transcript symbols in the window; prot_data: MANE table.
def CalTPM(borzoi_data, start, cluster, prot_data, targets_df, plot = False):
    TPM_list = []
    #loop over all proteins in the cluster
    for i in range(len(cluster)):
        #exon starts/ends converted to 32 bp bin indices; (32*16) is the
        #sequence cropped by Borzoi at each end of the window
        ex_st = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonStarts'])[0].split(',')]
        ex_en = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonEnds'])[0].split(',')]
        #boolean mask over bins: 1 inside any exon
        exon_mask = np.zeros(borzoi_data.shape[-2])
        for s,n in zip(ex_st,ex_en):
            exon_mask = exon_mask + ((np.arange(borzoi_data.shape[-2]) >= s) & (np.arange(borzoi_data.shape[-2]) <= n))
        #per-tissue-replicate coverage summed over exonic bins
        TPM_per_tissue_replicates = np.sum(borzoi_data[:,exon_mask== 1], axis = 1)
        #Optionally plot the first track with exon boundary marks
        if plot == True:
            #Only plots borzoi_data[0,:,x] with x=0; change x for a different tissue
            plt.plot(borzoi_data[0,:,0])
            plt.vlines(x = ex_st, ymin=0, ymax=3.5, colors='red', ls='--', lw=2, label='vline_multiple - full height')
            plt.vlines(x = ex_en, ymin=0, ymax=3.5, colors='blue', ls='--', lw=2, label='vline_multiple - full height')
            plt.xlim(ex_st[0]-100, ex_en[-1]+100)
            plt.show()
        #Average replicate tracks that share a tissue description
        TPM_per_tissue = [np.mean(i) for i in np.split(TPM_per_tissue_replicates[0], np.unique(targets_df['description'], return_index=True)[1][1:])]
        TPM_list.append(TPM_per_tissue)
    #rows: transcript symbols, columns: tissue descriptions
    TPM_dataframe = pd.DataFrame(TPM_list,cluster,np.unique(targets_df['description'], return_index=True)[0])
    return(TPM_dataframe)

#Transcript symbol clusters (one array of symbols per model window)
protein_clusters = [np.array(i[2]) for i in prot_subset]
#All mutated transcript symbols from this patient's VCF pickle
proteins_with_mutations = [i[0][0] for i in vcf_file]

#Work list; entries are removed as their expression is predicted
proteins_with_mutations_working = proteins_with_mutations

TPM = []
#Run until the expression of all mutated proteins is predicted
while len(proteins_with_mutations_working) > 0:
    TPM_dfs = []
    sequences_one_hot_muts = []
    st = []
    cl = []
    #Fill a batch with window-sized transcripts; oversized transcripts are
    #predicted immediately in chunks instead of being batched
    while len(sequences_one_hot_muts) < batch_size and len(proteins_with_mutations_working) > 0:
        #next work protein
        protein = proteins_with_mutations_working[0]
        #find its window cluster
        mask = [protein in i for i in protein_clusters]
        cluster = list(compress(protein_clusters, mask))
        #transcripts larger than one 524 kb window: tile the transcript
        if protein in np.array(prot_bigger['symbol']):
            sequences_one_hot_muts_big = []
            proteins_with_mutations_working = proteins_with_mutations_working[1:]
            protein_data = prot_bigger[prot_bigger['symbol'] == protein]
            #pad with the (16*32) bp Borzoi crop buffer and 1 kb of
            #regulatory sequence on the strand-dependent upstream side
            prot_start = np.array(protein_data['chr_start']).astype('int')[0] - (16*32) - np.array(protein_data['chr_strand'] == '+')[0] * 1000
            prot_end = np.array(protein_data['chr_end']).astype('int')[0] + (16*32) + np.array(protein_data['chr_strand'] == '-')[0] * 1000
            mutations_ = np.array(list(compress(vcf_file, np.array(proteins_with_mutations) == protein))[0][1])
            #only use one reference chromosome, in case mutations were mapped
            #to multiple chromosomes as a consequence of liftover
            mutations_ = mutations_[[i[0]== mutations_[0,0] for i in mutations_]]
            chrom = mutations_[0,0]
            all_poses = mutations_[:,1].astype('int')
            all_alts = mutations_[:,4]
            star = prot_start
            st_big = star
            #tile 524288 bp windows over the transcript with (32*32) bp overlap
            while star < prot_end:
                end = star + 524288
                poses = all_poses[(all_poses > star) & (all_poses < end)]
                alts = all_alts[(all_poses > star) & (all_poses < end)]
                sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
                sequence_one_hot_mut = create_mut(sequence_one_hot_wt, poses, alts, star)
                sequences_one_hot_muts_big.append(sequence_one_hot_mut)
                star = end - (32*32)
            sequences_one_hot_muts_big = np.array(sequences_one_hot_muts_big)
            #split into batch-sized slices if the tiling exceeds batch_size
            if sequences_one_hot_muts_big.shape[0] > batch_size:
                borzoi_pred_list = []
                for seq_slice in np.array_split(sequences_one_hot_muts_big, np.ceil(sequences_one_hot_muts_big.shape[0]/batch_size)):
                    borzoi_pred_list.append(predict_tracks(models, seq_slice))
                y_mut = np.concatenate(borzoi_pred_list)
            else:
                y_mut = predict_tracks(models, sequences_one_hot_muts_big)
            #stitch the tiled windows into one long track (89 targets)
            y_mut = np.reshape(y_mut, [1,1,-1,89])
            TPM.append(CalTPM(y_mut[0], st_big, [protein], MANE_data, targets_df))
        else:
            #window start/end of this protein's cluster
            star, end = (list(compress(prot_subset, mask))[0][:2])
            #all still-pending mutated proteins in the same cluster
            mask = [i in cluster[0] for i in proteins_with_mutations_working]
            proteins_in_cluster = list(compress(proteins_with_mutations_working, mask))
            #remove cluster proteins from the work list
            proteins_with_mutations_working = list(compress(proteins_with_mutations_working, ~np.array(mask)))
            mutations_ = [list(compress(vcf_file, [np.array(proteins_with_mutations) == i][0]))[0][1] for i in proteins_in_cluster]
            mutations_ = np.concatenate([np.array(i) for i in mutations_],0)
            #only use one reference chromosome, in case mutations were mapped
            #to multiple chromosomes as a consequence of liftover
            mutations_ = mutations_[[i[0]== mutations_[0,0] for i in mutations_]]
            chrom = mutations_[0,0]
            poses = mutations_[:,1].astype('int')
            alts = mutations_[:,4]
            sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
            sequence_one_hot_mut = create_mut(sequence_one_hot_wt, poses, alts, star)
            sequences_one_hot_muts.append(sequence_one_hot_mut)
            st.append(star)
            cl.append(cluster)
    sequences_one_hot_muts = np.array(sequences_one_hot_muts)
    #run borzoi for the batched window-sized proteins, unless the batch is
    #empty (possible when the last work item was an oversized transcript)
    if sequences_one_hot_muts.shape != (0,):
        y_mut = predict_tracks(models, sequences_one_hot_muts)
        for i in range(len(y_mut)):
            TPM_dfs.append(CalTPM(y_mut[i], st[i], cl[i][0], MANE_data, targets_df))
        TPM_dfs = pd.concat(TPM_dfs)
        TPM.append(TPM_dfs)
TPM = pd.concat(TPM)

#Add reference-genome expression for transcripts with no mutations
tpm_no_mut = pd.read_csv('/home/omic/borzoi/TPM_NO_MUTATIONS.csv',index_col=0)
TPM = pd.concat([TPM, tpm_no_mut.loc[list(set(tpm_no_mut.index) - set(TPM.index))]])

#Re-index from gene symbol to versionless ENSG id
TPM.index.names = ['RNA']
MANE_data['ENSG'] = [i.split('.')[0] for i in MANE_data['Ensembl_Gene']]
mane_map = MANE_data[['symbol','ENSG']]
TPM = mane_map.merge(TPM, left_on='symbol', right_on='RNA').dropna().drop_duplicates(subset=['symbol'])
#drop duplicates
TPM = TPM.drop_duplicates()
TPM = TPM.iloc[:,2:].set_index(TPM.iloc[:,1])

#Save TPM, named after the patient
patient_name = "${vcf_filtered}".split('/')[-1].split('_rna_coding_vcf_with_mutations.vcf')[0]
TPM.to_csv(f'{patient_name}_TPM.csv')

gpu_stats = tf.config.experimental.get_memory_info('GPU:0')
print(f"Current VRAM usage: {gpu_stats['current'] / 1e9:.2f} GB")
print(f"Peak VRAM usage: {gpu_stats['peak'] / 1e9:.2f} GB")
    """
}
|
||||
|
||||
|
||||
process CREATE_PROTEIN_CLUSTER {
    // Greedily pack MANE transcripts into 524288 bp Borzoi model windows.
    // Transcripts that fit together in one window form a cluster
    // (prot_subset.pickle); transcripts longer than one window are listed
    // separately (prot_bigger.csv) for tiled prediction downstream.
    //
    // input:  MANE - tab-separated MANE transcript summary table
    // output: prot_bigger.csv, prot_subset.pickle
    container "${params.container_borzoi}"
    containerOptions "${params.containerOptions}"
    debug true

    input:
    path MANE

    output:
    path "prot_bigger.csv"
    path "prot_subset.pickle"

    script:
    """
#!/opt/conda/envs/borzoi/bin/python
# Use this if new RNA transcripts need to be added to the list of predicted RNAs
import numpy as np
import pandas as pd
import pickle

data = pd.read_csv("${MANE}", sep='\t')

data_subset = []
data_bigger = []
for ch in np.unique(data['chrom']):
    #transcripts of this chromosome, sorted by start coordinate
    data_working = data[data['chrom']==ch]
    arg = np.argsort(np.array(data_working['chr_start']).astype('int'))
    data_working = data_working.iloc[arg]
    #Transcript span padded with 1000 bp of regulatory sequence on the
    #strand-dependent upstream side, plus a (16*32) bp buffer at both ends
    #because Borzoi crops that much from its predictions
    prot_start = np.array(data_working['chr_start']).astype('int') - (16*32) - np.array(data_working['chr_strand'] == '+') * 1000
    prot_end = np.array(data_working['chr_end']).astype('int') + (16*32) + np.array(data_working['chr_strand'] == '-') * 1000
    input_start = prot_start
    input_end = input_start + 524288
    while len(input_start) > 0:
        #candidate window anchored at the leftmost remaining transcript
        st = input_start[0]
        en = input_end[0]
        #every transcript that fits entirely inside this window
        mask = (prot_start >= st) & (prot_end <= en)
        data_mask = data_working[mask]
        data_subset.append([st, en, data_mask['symbol']])
        if not mask[0]:
            #anchor transcript is longer than one model window:
            #record it for tiled prediction and still consume it
            data_bigger.append(data_working.iloc[0])
            mask[0] = True
        #drop the consumed transcripts and recompute spans for the rest
        data_working = data_working[~mask]
        arg = np.argsort(np.array(data_working['chr_start']).astype('int'))
        data_working = data_working.iloc[arg]
        prot_start = np.array(data_working['chr_start']).astype('int') - (16*32) - np.array(data_working['chr_strand'] == '+') * 1000
        prot_end = np.array(data_working['chr_end']).astype('int') + (16*32) + np.array(data_working['chr_strand'] == '-') * 1000
        input_start = prot_start
        input_end = input_start + 524288

prot_bigger = pd.DataFrame(data_bigger)

prot_bigger.to_csv('prot_bigger.csv')
with open("prot_subset.pickle", "wb") as fp:
    pickle.dump(data_subset, fp)

#BUGFIX: removed trailing tf.config.experimental.get_memory_info('GPU:0')
#reporting copied from PREDICT_EXPRESSION - 'tf' is never imported in this
#CPU-only script, so it raised a NameError on every run.
    """
}
|
||||
Reference in New Issue
Block a user