Add BioEmu Nextflow pipeline implementation

This commit is contained in:
2025-03-04 09:38:55 -08:00
commit 2cfbf64e92
12 changed files with 565 additions and 0 deletions

89
main.nf Normal file
View File

@@ -0,0 +1,89 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2

// ---------------------------------------------------------------------------
// Pipeline parameters.
// Each assignment below is only a default: a value given on the command line
// (--name value) or in a params file takes precedence over it.
// ---------------------------------------------------------------------------

// FASTA files to process; the workflow runs once per entry.
params.fasta_list = [
    "/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta",
    "/mnt/OmicNAS/private/old/olamide/bioemu/input/trp_cage.fasta"
]

// Root directory that results are published under (one sub-directory per protein).
params.outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"

// Directory handed to the sampler as --cache_embeds_dir.
params.cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"

// Helper-script directory shipped next to this pipeline script.
// `projectDir` replaces the deprecated `baseDir` implicit variable (same value).
// NOTE(review): no process in this file reads this parameter; the free-energy
// step calls its script from a fixed path under /opt/bioemu/scripts instead.
params.scripts_dir = "${projectDir}/scripts"

// Forwarded to the sampler as --num_samples and --batch_size_100 respectively.
params.num_samples = 10
params.batch_size_100 = 10

// Forwarded to calculate_gibbs.py as --temperature (presumably Kelvin - confirm)
// and --n_clusters.
params.temperature = 300
params.n_clusters = 5
/*
 * BIOEMU
 *
 * Runs `python3 -m bioemu.sample` for one protein inside the `bioemu:latest`
 * container and publishes the results to ${params.outdir}/<protein_id>.
 *
 * Input : tuple (protein_id, fasta)
 *           protein_id - label used for the publish directory and passed downstream
 *           fasta      - FASTA file; every non-header line is concatenated into
 *                        one sequence, so the file is expected to hold one record
 * Output: `structures` - tuple (protein_id, topology.pdb, samples.xtc)
 *         sequence.fasta, batch_*.npz - published when the sampler writes them
 *         run.log      - combined stdout/stderr of the sampler
 */
process BIOEMU {
    container 'bioemu:latest'
    containerOptions '--rm --gpus all -v /mnt:/mnt -v /tmp:/tmp'
    publishDir "${params.outdir}/${protein_id}", mode: 'copy'

    input:
    tuple val(protein_id), path(fasta)

    output:
    tuple val(protein_id), path("topology.pdb"), path("samples.xtc"), emit: structures
    path "sequence.fasta", optional: true
    path "batch_*.npz", optional: true
    path "run.log"

    script:
    """
    # Make sure cache directory exists
    mkdir -p "${params.cache_dir}"

    # Extract the sequence from the FASTA file: drop header lines, then strip
    # all whitespace (newlines, Windows carriage returns, stray blanks).
    SEQUENCE=\$(grep -v ">" "${fasta}" | tr -d '[:space:]')
    if [ -z "\${SEQUENCE}" ]; then
        echo "ERROR: no sequence found in ${fasta}" >&2
        exit 1
    fi

    # Without pipefail the exit status of the pipeline below would be that of
    # 'tee', hiding a sampler failure behind a "missing output file" error.
    set -o pipefail

    # Run BioEmu with the extracted sequence
    python3 -m bioemu.sample \\
        --sequence "\${SEQUENCE}" \\
        --num_samples ${params.num_samples} \\
        --batch_size_100 ${params.batch_size_100} \\
        --output_dir . \\
        --cache_embeds_dir "${params.cache_dir}" 2>&1 | tee run.log
    """
}
// CALCULATE_FREE_ENERGY
// Runs calculate_gibbs.py on the topology and trajectory of one protein and
// publishes the results to ${params.outdir}/<protein_id>/analysis.
process CALCULATE_FREE_ENERGY {
// Same image as the sampling step; /mnt is bind-mounted into the container.
container 'bioemu:latest'
containerOptions '--rm --gpus all -v /mnt:/mnt'
publishDir "${params.outdir}/${protein_id}/analysis", mode: 'copy'
// One tuple per protein: its id, the topology file and the samples file.
input:
tuple val(protein_id), path(topology), path(samples)
// free_energy.csv is required and emitted on the `free_energy` channel;
// energy_plot.png is collected only if the script produced it.
output:
tuple val(protein_id), path("free_energy.csv"), emit: free_energy
path "energy_plot.png", optional: true
// The script is called by absolute path, so it is presumably shipped inside
// the container image - confirm. params.scripts_dir is not used here.
script:
"""
# Calculate free energy from sampled structures
python3 /opt/bioemu/scripts/calculate_gibbs.py \\
--samples ${samples} \\
--topology ${topology} \\
--temperature ${params.temperature} \\
--n_clusters ${params.n_clusters} \\
--output free_energy.csv \\
--plot energy_plot.png
"""
}
/*
 * Entry workflow: sample structures for every FASTA file in params.fasta_list,
 * then run the free-energy analysis on each set of sampled structures.
 */
workflow {
    // params.fasta_list is a list by default, but a value supplied on the
    // command line arrives as a single string; accept a comma-separated
    // string there so `--fasta_list a.fasta,b.fasta` works too.
    def fasta_inputs = params.fasta_list
    if (fasta_inputs instanceof CharSequence) {
        fasta_inputs = fasta_inputs.toString().tokenize(',').collect { entry -> entry.trim() }
    }

    // Convert fasta_list to a channel of [protein_id, fasta_file] tuples.
    // protein_id is the file name without its extension.
    Channel.fromList(fasta_inputs)
        .map { fasta_path ->
            // checkIfExists makes a wrong path fail here with a clear message
            // instead of later, while the task inputs are being staged.
            // The variable is deliberately not called `file`, which would
            // shadow the file() function it is built from.
            def fasta_file = file(fasta_path, checkIfExists: true)
            return [fasta_file.baseName, fasta_file]
        }
        .set { fasta_ch }

    // Run BioEmu for each protein sequence
    BIOEMU(fasta_ch)

    // Calculate Gibbs free energy for each protein
    CALCULATE_FREE_ENERGY(BIOEMU.out.structures)
}