Implement interface-adaptive binding energy calculator

2025-03-06 12:51:36 -08:00
parent 2cfbf64e92
commit 83525663c9
4 changed files with 654 additions and 60 deletions
--- a/main.nf
+++ b/main.nf
@@ -1,89 +1,129 @@
 #!/usr/bin/env nextflow
-
 nextflow.enable.dsl=2

-// Multiple FASTA files to process
-params.fasta_list = [
-    "/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta",
-    "/mnt/OmicNAS/private/old/olamide/bioemu/input/trp_cage.fasta"
-]
+// Pipeline inputs: the two binding partners and the reference value the prediction is compared against
+params.complex_name = "hgh_mab1"  // Default complex name; also used as the output sub-directory and in the report title
+params.protein1_fasta = "/mnt/OmicNAS/private/old/olamide/bioemu/input/complexes/hgh.fasta"
+params.protein2_fasta = "/mnt/OmicNAS/private/old/olamide/bioemu/input/complexes/mab1.fasta"
+params.exp_dG = -13.1  // kcal/mol from experimental database; baseline for the report's absolute difference and relative error
 params.outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"
 params.cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"
-params.scripts_dir = "${baseDir}/scripts"
-params.num_samples = 10
-params.batch_size_100 = 10
+
+// Parameters for structure generation and analysis
+params.num_samples = 20
+params.batch_size = 5  // forwarded to bioemu.sample as --batch_size_100
 params.temperature = 300
 params.n_clusters = 5

-process BIOEMU {
+process GENERATE_STRUCTURE {  // Runs bioemu.sample on one FASTA and emits id-prefixed topology/trajectory files
    container 'bioemu:latest'
-    containerOptions '--rm --gpus all -v /mnt:/mnt -v /tmp:/tmp'
-    publishDir "${params.outdir}/${protein_id}", mode: 'copy'
-    
+    containerOptions '--rm --gpus all -v /mnt:/mnt'
+    publishDir "${params.outdir}/${params.complex_name}/${protein_id}", mode: 'copy'  // one sub-directory per protein under the complex
+
    input:
        tuple val(protein_id), path(fasta)
-    
+
    output:
-        tuple val(protein_id), path("topology.pdb"), path("samples.xtc"), emit: structures
-        path "sequence.fasta", optional: true
-        path "batch_*.npz", optional: true
-        path "run.log"
-    
+        tuple val(protein_id), path("${protein_id}_topology.pdb"), path("${protein_id}_samples.xtc")  // id is kept so the workflow can filter per protein
+
    script:
    """
-    # Make sure cache directory exists
-    mkdir -p ${params.cache_dir}
-    
-    # Extract the sequence from the FASTA file
+    # Extract sequence from FASTA
    SEQUENCE=\$(grep -v ">" ${fasta} | tr -d '\\n')
-    
-    # Run BioEmu with the extracted sequence
-    python3 -m bioemu.sample \
-        --sequence "\${SEQUENCE}" \
-        --num_samples ${params.num_samples} \
-        --batch_size_100 ${params.batch_size_100} \
-        --output_dir . \
-        --cache_embeds_dir ${params.cache_dir} 2>&1 | tee run.log
+
+    # Run BioEmu
+    python3 -m bioemu.sample \\
+        --sequence "\${SEQUENCE}" \\
+        --num_samples ${params.num_samples} \\
+        --batch_size_100 ${params.batch_size} \\
+        --output_dir . \\
+        --cache_embeds_dir ${params.cache_dir}
+
+    # Rename output files
+    mv topology.pdb ${protein_id}_topology.pdb
+    mv samples.xtc ${protein_id}_samples.xtc
    """
 }

-process CALCULATE_FREE_ENERGY {
+process CALCULATE_BINDING {  // Runs calculate_binding.py on both proteins' samples and reports its ΔG against params.exp_dG
    container 'bioemu:latest'
-    containerOptions '--rm --gpus all -v /mnt:/mnt'
-    publishDir "${params.outdir}/${protein_id}/analysis", mode: 'copy'
-    
+    containerOptions '--rm --gpus all -v /mnt:/mnt -v /data:/data'  // /data is mounted because the analysis script is read from there
+    publishDir "${params.outdir}/${params.complex_name}/analysis", mode: 'copy'
+
    input:
-        tuple val(protein_id), path(topology), path(samples)
-    
+        path protein1_topology
+        path protein1_samples
+        path protein2_topology
+        path protein2_samples
+
    output:
-        tuple val(protein_id), path("free_energy.csv"), emit: free_energy
-        path "energy_plot.png", optional: true
-        
+        path "binding_energy.csv"
+        path "binding_energy_report.txt"
+        path "energy_comparison.png"
+
    script:
    """
-    # Calculate free energy from sampled structures
-    python3 /opt/bioemu/scripts/calculate_gibbs.py \\
-        --samples ${samples} \\
-        --topology ${topology} \\
+    # Run the binding energy calculation; it is expected to write binding_energy.csv and energy_comparison.png
+    python3 /data/olamide/fresh-bioemu2/scripts/calculate_binding.py \\
+        --protein1_topology ${protein1_topology} \\
+        --protein1_samples ${protein1_samples} \\
+        --protein2_topology ${protein2_topology} \\
+        --protein2_samples ${protein2_samples} \\
        --temperature ${params.temperature} \\
        --n_clusters ${params.n_clusters} \\
-        --output free_energy.csv \\
-        --plot energy_plot.png
+        --output binding_energy.csv \\
+        --plot energy_comparison.png
+
+    # Generate report
+    echo "# Binding Free Energy Analysis: ${params.complex_name}" > binding_energy_report.txt
+    echo "======================================================" >> binding_energy_report.txt
+    echo "## Experimental Value (Database)" >> binding_energy_report.txt
+    echo "ΔG = ${params.exp_dG} kcal/mol" >> binding_energy_report.txt
+    echo "" >> binding_energy_report.txt
+
+    # Extract predicted value; assumes the row after the "binding_free_energy" header holds ΔG in column 2 (TODO confirm CSV layout)
+    PREDICTED_DG=\$(grep -A1 "binding_free_energy" binding_energy.csv | tail -n1 | cut -d',' -f2)
+    [ -n "\${PREDICTED_DG}" ] || { echo "ERROR: binding_free_energy value not found in binding_energy.csv" >&2; exit 1; }
+    echo "## BioEmu Prediction" >> binding_energy_report.txt
+    echo "ΔG = \${PREDICTED_DG} kcal/mol" >> binding_energy_report.txt
+    echo "" >> binding_energy_report.txt
+
+    # Calculate comparison metrics (predicted minus experimental; relative error is a percentage of the experimental value)
+    echo "## Comparison" >> binding_energy_report.txt
+    ABS_DIFF=\$(python3 -c "print('%.2f' % abs(float('\${PREDICTED_DG}') - (${params.exp_dG})))")
+    REL_ERROR=\$(python3 -c "print('%.2f' % (((float('\${PREDICTED_DG}') - (${params.exp_dG}))/(${params.exp_dG}))*100))")
+
+    echo "Absolute Difference: \${ABS_DIFF} kcal/mol" >> binding_energy_report.txt
+    echo "Relative Error: \${REL_ERROR}%" >> binding_energy_report.txt
    """
 }

 workflow {
-    // Convert fasta_list to a channel of [protein_id, fasta_file] tuples
-    Channel.fromList(params.fasta_list)
-           .map { fasta_path -> 
-               def file = file(fasta_path)
-               return [file.baseName, file]
-           }
-           .set { fasta_ch }
-    
-    // Run BioEmu for each protein sequence
-    BIOEMU(fasta_ch)
-    
-    // Calculate Gibbs free energy for each protein
-    CALCULATE_FREE_ENERGY(BIOEMU.out.structures)
+    // Create channel for proteins
+    protein_ch = Channel.fromList([
+        tuple("protein1", file(params.protein1_fasta)),
+        tuple("protein2", file(params.protein2_fasta))
+    ])
+
+    // Generate structures
+    GENERATE_STRUCTURE(protein_ch)
+
+    // Select each protein's [id, topology, samples] tuple; first() yields a
+    // value channel, so the single result can feed more than one input below.
+    protein1_files = GENERATE_STRUCTURE.out
+                      .filter { it[0] == "protein1" }
+                      .first()
+
+    protein2_files = GENERATE_STRUCTURE.out
+                      .filter { it[0] == "protein2" }
+                      .first()
+
+    // Project each file out with map rather than indexing the channel object
+    // with [n], which is not a documented channel operation.
+    CALCULATE_BINDING(
+        protein1_files.map { it[1] },  // protein1_topology.pdb
+        protein1_files.map { it[2] },  // protein1_samples.xtc
+        protein2_files.map { it[1] },  // protein2_topology.pdb
+        protein2_files.map { it[2] }   // protein2_samples.xtc
+    )
 }