Implement interface-adaptive binding energy calculator

This commit is contained in:
2025-03-06 12:51:36 -08:00
parent 2cfbf64e92
commit 83525663c9
4 changed files with 654 additions and 60 deletions

158
main.nf
View File

@@ -1,89 +1,129 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
// Pipeline parameters (input FASTA paths, output/cache locations, sampling and analysis settings).
// NOTE(review): this listing appears to be a rendered diff whose +/- markers were stripped, so
// removed and added parameter lines are interleaved below. For example, params.num_samples is
// assigned twice (10, then 20) and both batch_size_100 and batch_size are defined.
// Confirm against the real main.nf before relying on any single value.
// Multiple FASTA files to process
params.fasta_list = [
"/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta",
"/mnt/OmicNAS/private/old/olamide/bioemu/input/trp_cage.fasta"
]
// Define parameters
params.complex_name = "hgh_mab1" // Default complex name
params.protein1_fasta = "/mnt/OmicNAS/private/old/olamide/bioemu/input/complexes/hgh.fasta"
params.protein2_fasta = "/mnt/OmicNAS/private/old/olamide/bioemu/input/complexes/mab1.fasta"
params.exp_dG = -13.1 // kcal/mol from experimental database
params.outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"
params.cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"
// NOTE(review): scripts_dir is not referenced anywhere else in the visible listing; the
// binding-energy script is invoked through a hard-coded absolute path instead.
params.scripts_dir = "${baseDir}/scripts"
params.num_samples = 10
params.batch_size_100 = 10
// Parameters for structure generation and analysis
params.num_samples = 20
params.batch_size = 5
params.temperature = 300
params.n_clusters = 5
// Structure-generation process: extracts the sequence from one FASTA file, runs
// `python3 -m bioemu.sample` on it, and (in the newer lines) renames topology.pdb / samples.xtc
// with a protein_id prefix.
//
// Input : tuple (protein_id, fasta)
// Output: tuple (protein_id, topology PDB, samples XTC)
//
// NOTE(review): this span appears to interleave the removed BIOEMU process with the added
// GENERATE_STRUCTURE process (two `process` openers, one closing brace; containerOptions,
// publishDir, the output tuple and the bioemu.sample command each appear twice). It is not
// runnable as shown; confirm which lines belong to the current version.
process BIOEMU {
process GENERATE_STRUCTURE {
container 'bioemu:latest'
containerOptions '--rm --gpus all -v /mnt:/mnt -v /tmp:/tmp'
publishDir "${params.outdir}/${protein_id}", mode: 'copy'
containerOptions '--rm --gpus all -v /mnt:/mnt'
// Results are published under <outdir>/<complex_name>/<protein_id>.
publishDir "${params.outdir}/${params.complex_name}/${protein_id}", mode: 'copy'
input:
tuple val(protein_id), path(fasta)
output:
tuple val(protein_id), path("topology.pdb"), path("samples.xtc"), emit: structures
path "sequence.fasta", optional: true
path "batch_*.npz", optional: true
path "run.log"
// This tuple has no `emit:` name, unlike the `structures` tuple above.
tuple val(protein_id), path("${protein_id}_topology.pdb"), path("${protein_id}_samples.xtc")
// The script concatenates every non-header FASTA line into SEQUENCE; a multi-record FASTA
// would therefore be merged into one sequence.
// The second bioemu.sample invocation passes params.batch_size to the --batch_size_100 flag.
script:
"""
# Make sure cache directory exists
mkdir -p ${params.cache_dir}
# Extract the sequence from the FASTA file
# Extract sequence from FASTA
SEQUENCE=\$(grep -v ">" ${fasta} | tr -d '\\n')
# Run BioEmu with the extracted sequence
python3 -m bioemu.sample \
--sequence "\${SEQUENCE}" \
--num_samples ${params.num_samples} \
--batch_size_100 ${params.batch_size_100} \
--output_dir . \
--cache_embeds_dir ${params.cache_dir} 2>&1 | tee run.log
# Run BioEmu
python3 -m bioemu.sample \\
--sequence "\${SEQUENCE}" \\
--num_samples ${params.num_samples} \\
--batch_size_100 ${params.batch_size} \\
--output_dir . \\
--cache_embeds_dir ${params.cache_dir}
# Rename output files
mv topology.pdb ${protein_id}_topology.pdb
mv samples.xtc ${protein_id}_samples.xtc
"""
}
// Binding-energy process: runs calculate_binding.py on the topology/trajectory files of two
// proteins, then writes a plain-text report comparing the predicted value against
// params.exp_dG.
//
// Inputs : protein1_topology, protein1_samples, protein2_topology, protein2_samples (paths)
// Outputs: binding_energy.csv, binding_energy_report.txt, energy_comparison.png
//
// NOTE(review): this span appears to interleave the removed CALCULATE_FREE_ENERGY process
// with the added CALCULATE_BINDING process (two `process` openers, duplicate containerOptions /
// publishDir / input / output lines, and both calculate_gibbs.py and calculate_binding.py
// command fragments). It is not runnable as shown; confirm which lines are current.
process CALCULATE_FREE_ENERGY {
process CALCULATE_BINDING {
container 'bioemu:latest'
containerOptions '--rm --gpus all -v /mnt:/mnt'
publishDir "${params.outdir}/${protein_id}/analysis", mode: 'copy'
// /data is mounted in addition to /mnt; presumably because the script invoked below lives
// under /data/olamide/fresh-bioemu2/scripts. Verify the mount is required.
containerOptions '--rm --gpus all -v /mnt:/mnt -v /data:/data'
publishDir "${params.outdir}/${params.complex_name}/analysis", mode: 'copy'
input:
tuple val(protein_id), path(topology), path(samples)
path protein1_topology
path protein1_samples
path protein2_topology
path protein2_samples
output:
tuple val(protein_id), path("free_energy.csv"), emit: free_energy
path "energy_plot.png", optional: true
path "binding_energy.csv"
path "binding_energy_report.txt"
path "energy_comparison.png"
// Notes on the script below:
// - The analysis script is referenced by a hard-coded absolute path rather than through
//   params.scripts_dir.
// - PREDICTED_DG is the second comma-separated field of the last line returned by
//   `grep -A1 "binding_free_energy"` on binding_energy.csv. This assumes a specific CSV
//   layout written by calculate_binding.py. TODO confirm against that script.
// - REL_ERROR divides by params.exp_dG, so an experimental value of 0 would make the inline
//   python3 call fail; a non-numeric PREDICTED_DG would make float() fail as well.
script:
"""
# Calculate free energy from sampled structures
python3 /opt/bioemu/scripts/calculate_gibbs.py \\
--samples ${samples} \\
--topology ${topology} \\
# Run binding energy calculation with the existing script
python3 /data/olamide/fresh-bioemu2/scripts/calculate_binding.py \\
--protein1_topology ${protein1_topology} \\
--protein1_samples ${protein1_samples} \\
--protein2_topology ${protein2_topology} \\
--protein2_samples ${protein2_samples} \\
--temperature ${params.temperature} \\
--n_clusters ${params.n_clusters} \\
--output free_energy.csv \\
--plot energy_plot.png
--output binding_energy.csv \\
--plot energy_comparison.png
# Generate report
echo "# Binding Free Energy Analysis: ${params.complex_name}" > binding_energy_report.txt
echo "======================================================" >> binding_energy_report.txt
echo "## Experimental Value (Database)" >> binding_energy_report.txt
echo "ΔG = ${params.exp_dG} kcal/mol" >> binding_energy_report.txt
echo "" >> binding_energy_report.txt
# Extract predicted value
PREDICTED_DG=\$(grep -A1 "binding_free_energy" binding_energy.csv | tail -n1 | cut -d',' -f2)
echo "## BioEmu Prediction" >> binding_energy_report.txt
echo "ΔG = \${PREDICTED_DG} kcal/mol" >> binding_energy_report.txt
echo "" >> binding_energy_report.txt
# Calculate comparison metrics
echo "## Comparison" >> binding_energy_report.txt
ABS_DIFF=\$(python3 -c "print('%.2f' % abs(float('\${PREDICTED_DG}') - (${params.exp_dG})))")
REL_ERROR=\$(python3 -c "print('%.2f' % (((float('\${PREDICTED_DG}') - (${params.exp_dG}))/(${params.exp_dG}))*100))")
echo "Absolute Difference: \${ABS_DIFF} kcal/mol" >> binding_energy_report.txt
echo "Relative Error: \${REL_ERROR}%" >> binding_energy_report.txt
"""
}
// Entry workflow: generates structures for the two proteins of the complex and feeds both
// sets of topology/trajectory files into CALCULATE_BINDING.
//
// NOTE(review): this span appears to interleave the removed wiring (fasta_ch -> BIOEMU ->
// CALCULATE_FREE_ENERGY, driven by params.fasta_list) with the added wiring (protein_ch ->
// GENERATE_STRUCTURE -> CALCULATE_BINDING). Confirm which lines belong to the current version.
workflow {
// Convert fasta_list to a channel of [protein_id, fasta_file] tuples
Channel.fromList(params.fasta_list)
.map { fasta_path ->
def file = file(fasta_path)
return [file.baseName, file]
}
.set { fasta_ch }
// Run BioEmu for each protein sequence
BIOEMU(fasta_ch)
// Calculate Gibbs free energy for each protein
CALCULATE_FREE_ENERGY(BIOEMU.out.structures)
// Create channel for proteins
// The ids "protein1"/"protein2" are fixed labels; they are used below to select each
// protein's outputs and also become the file-name prefixes inside GENERATE_STRUCTURE.
protein_ch = Channel.fromList([
tuple("protein1", file(params.protein1_fasta)),
tuple("protein2", file(params.protein2_fasta))
])
// Generate structures
GENERATE_STRUCTURE(protein_ch)
// Extract structure files for each protein
// Each pipeline keeps only the (topology, samples) pair whose id matches, then takes the
// first such item.
protein1_files = GENERATE_STRUCTURE.out
.filter { it[0] == "protein1" }
.map { it -> tuple(it[1], it[2]) }
.first()
protein2_files = GENERATE_STRUCTURE.out
.filter { it[0] == "protein2" }
.map { it -> tuple(it[1], it[2]) }
.first()
// Calculate binding energy (direct script reference)
// NOTE(review): protein1_files / protein2_files are the results of .first(), i.e. channels
// rather than plain lists. Confirm that indexing them with [0] / [1] really yields the
// topology and samples files; mapping each element into its own channel may be needed.
CALCULATE_BINDING(
protein1_files[0], // protein1_topology.pdb
protein1_files[1], // protein1_samples.xtc
protein2_files[0], // protein2_topology.pdb
protein2_files[1] // protein2_samples.xtc
)
}