Implement interface-adaptive binding energy calculator
This commit is contained in:
@@ -40,9 +40,12 @@ RUN wget "https://raw.githubusercontent.com/YoshitakaMo/localcolabfold/5fc877511
|
|||||||
|
|
||||||
# Create directories for input/output with proper permissions
|
# Create directories for input/output with proper permissions
|
||||||
RUN mkdir -p /data /results && chmod -R 777 /data /results
|
RUN mkdir -p /data /results && chmod -R 777 /data /results
|
||||||
|
|
||||||
RUN mkdir -p /opt/bioemu/scripts/
|
RUN mkdir -p /opt/bioemu/scripts/
|
||||||
|
|
||||||
COPY calculate_gibbs.py /opt/bioemu/scripts/
|
COPY calculate_gibbs.py /opt/bioemu/scripts/
|
||||||
|
COPY calculate_binding.py /opt/bioemu/scripts/
|
||||||
|
|
||||||
RUN chmod +x /opt/bioemu/scripts/calculate_gibbs.py
|
RUN chmod +x /opt/bioemu/scripts/calculate_gibbs.py
|
||||||
|
RUN chmod +x /opt/bioemu/scripts/calculate_binding.py
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
CMD ["/bin/bash"]
|
||||||
|
|||||||
140
main.nf
140
main.nf
@@ -1,89 +1,129 @@
|
|||||||
#!/usr/bin/env nextflow
|
#!/usr/bin/env nextflow
|
||||||
|
|
||||||
nextflow.enable.dsl=2
|
nextflow.enable.dsl=2
|
||||||
|
|
||||||
// Multiple FASTA files to process
|
// Define parameters
|
||||||
params.fasta_list = [
|
params.complex_name = "hgh_mab1" // Default complex name
|
||||||
"/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta",
|
params.protein1_fasta = "/mnt/OmicNAS/private/old/olamide/bioemu/input/complexes/hgh.fasta"
|
||||||
"/mnt/OmicNAS/private/old/olamide/bioemu/input/trp_cage.fasta"
|
params.protein2_fasta = "/mnt/OmicNAS/private/old/olamide/bioemu/input/complexes/mab1.fasta"
|
||||||
]
|
params.exp_dG = -13.1 // kcal/mol from experimental database
|
||||||
params.outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"
|
params.outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"
|
||||||
params.cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"
|
params.cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"
|
||||||
params.scripts_dir = "${baseDir}/scripts"
|
|
||||||
params.num_samples = 10
|
// Parameters for structure generation and analysis
|
||||||
params.batch_size_100 = 10
|
params.num_samples = 20
|
||||||
|
params.batch_size = 5
|
||||||
params.temperature = 300
|
params.temperature = 300
|
||||||
params.n_clusters = 5
|
params.n_clusters = 5
|
||||||
|
|
||||||
process BIOEMU {
|
process GENERATE_STRUCTURE {
|
||||||
container 'bioemu:latest'
|
container 'bioemu:latest'
|
||||||
containerOptions '--rm --gpus all -v /mnt:/mnt -v /tmp:/tmp'
|
containerOptions '--rm --gpus all -v /mnt:/mnt'
|
||||||
publishDir "${params.outdir}/${protein_id}", mode: 'copy'
|
publishDir "${params.outdir}/${params.complex_name}/${protein_id}", mode: 'copy'
|
||||||
|
|
||||||
input:
|
input:
|
||||||
tuple val(protein_id), path(fasta)
|
tuple val(protein_id), path(fasta)
|
||||||
|
|
||||||
output:
|
output:
|
||||||
tuple val(protein_id), path("topology.pdb"), path("samples.xtc"), emit: structures
|
tuple val(protein_id), path("${protein_id}_topology.pdb"), path("${protein_id}_samples.xtc")
|
||||||
path "sequence.fasta", optional: true
|
|
||||||
path "batch_*.npz", optional: true
|
|
||||||
path "run.log"
|
|
||||||
|
|
||||||
script:
|
script:
|
||||||
"""
|
"""
|
||||||
# Make sure cache directory exists
|
# Extract sequence from FASTA
|
||||||
mkdir -p ${params.cache_dir}
|
|
||||||
|
|
||||||
# Extract the sequence from the FASTA file
|
|
||||||
SEQUENCE=\$(grep -v ">" ${fasta} | tr -d '\\n')
|
SEQUENCE=\$(grep -v ">" ${fasta} | tr -d '\\n')
|
||||||
|
|
||||||
# Run BioEmu with the extracted sequence
|
# Run BioEmu
|
||||||
python3 -m bioemu.sample \
|
python3 -m bioemu.sample \\
|
||||||
--sequence "\${SEQUENCE}" \
|
--sequence "\${SEQUENCE}" \\
|
||||||
--num_samples ${params.num_samples} \
|
--num_samples ${params.num_samples} \\
|
||||||
--batch_size_100 ${params.batch_size_100} \
|
--batch_size_100 ${params.batch_size} \\
|
||||||
--output_dir . \
|
--output_dir . \\
|
||||||
--cache_embeds_dir ${params.cache_dir} 2>&1 | tee run.log
|
--cache_embeds_dir ${params.cache_dir}
|
||||||
|
|
||||||
|
# Rename output files
|
||||||
|
mv topology.pdb ${protein_id}_topology.pdb
|
||||||
|
mv samples.xtc ${protein_id}_samples.xtc
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
||||||
process CALCULATE_FREE_ENERGY {
|
process CALCULATE_BINDING {
|
||||||
container 'bioemu:latest'
|
container 'bioemu:latest'
|
||||||
containerOptions '--rm --gpus all -v /mnt:/mnt'
|
containerOptions '--rm --gpus all -v /mnt:/mnt -v /data:/data'
|
||||||
publishDir "${params.outdir}/${protein_id}/analysis", mode: 'copy'
|
publishDir "${params.outdir}/${params.complex_name}/analysis", mode: 'copy'
|
||||||
|
|
||||||
input:
|
input:
|
||||||
tuple val(protein_id), path(topology), path(samples)
|
path protein1_topology
|
||||||
|
path protein1_samples
|
||||||
|
path protein2_topology
|
||||||
|
path protein2_samples
|
||||||
|
|
||||||
output:
|
output:
|
||||||
tuple val(protein_id), path("free_energy.csv"), emit: free_energy
|
path "binding_energy.csv"
|
||||||
path "energy_plot.png", optional: true
|
path "binding_energy_report.txt"
|
||||||
|
path "energy_comparison.png"
|
||||||
|
|
||||||
script:
|
script:
|
||||||
"""
|
"""
|
||||||
# Calculate free energy from sampled structures
|
# Run binding energy calculation with the existing script
|
||||||
python3 /opt/bioemu/scripts/calculate_gibbs.py \\
|
python3 /data/olamide/fresh-bioemu2/scripts/calculate_binding.py \\
|
||||||
--samples ${samples} \\
|
--protein1_topology ${protein1_topology} \\
|
||||||
--topology ${topology} \\
|
--protein1_samples ${protein1_samples} \\
|
||||||
|
--protein2_topology ${protein2_topology} \\
|
||||||
|
--protein2_samples ${protein2_samples} \\
|
||||||
--temperature ${params.temperature} \\
|
--temperature ${params.temperature} \\
|
||||||
--n_clusters ${params.n_clusters} \\
|
--n_clusters ${params.n_clusters} \\
|
||||||
--output free_energy.csv \\
|
--output binding_energy.csv \\
|
||||||
--plot energy_plot.png
|
--plot energy_comparison.png
|
||||||
|
|
||||||
|
# Generate report
|
||||||
|
echo "# Binding Free Energy Analysis: ${params.complex_name}" > binding_energy_report.txt
|
||||||
|
echo "======================================================" >> binding_energy_report.txt
|
||||||
|
echo "## Experimental Value (Database)" >> binding_energy_report.txt
|
||||||
|
echo "ΔG = ${params.exp_dG} kcal/mol" >> binding_energy_report.txt
|
||||||
|
echo "" >> binding_energy_report.txt
|
||||||
|
|
||||||
|
# Extract predicted value
|
||||||
|
PREDICTED_DG=\$(grep -A1 "binding_free_energy" binding_energy.csv | tail -n1 | cut -d',' -f2)
|
||||||
|
|
||||||
|
echo "## BioEmu Prediction" >> binding_energy_report.txt
|
||||||
|
echo "ΔG = \${PREDICTED_DG} kcal/mol" >> binding_energy_report.txt
|
||||||
|
echo "" >> binding_energy_report.txt
|
||||||
|
|
||||||
|
# Calculate comparison metrics
|
||||||
|
echo "## Comparison" >> binding_energy_report.txt
|
||||||
|
ABS_DIFF=\$(python3 -c "print('%.2f' % abs(float('\${PREDICTED_DG}') - (${params.exp_dG})))")
|
||||||
|
REL_ERROR=\$(python3 -c "print('%.2f' % (((float('\${PREDICTED_DG}') - (${params.exp_dG}))/(${params.exp_dG}))*100))")
|
||||||
|
|
||||||
|
echo "Absolute Difference: \${ABS_DIFF} kcal/mol" >> binding_energy_report.txt
|
||||||
|
echo "Relative Error: \${REL_ERROR}%" >> binding_energy_report.txt
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
||||||
workflow {
|
workflow {
|
||||||
// Convert fasta_list to a channel of [protein_id, fasta_file] tuples
|
// Create channel for proteins
|
||||||
Channel.fromList(params.fasta_list)
|
protein_ch = Channel.fromList([
|
||||||
.map { fasta_path ->
|
tuple("protein1", file(params.protein1_fasta)),
|
||||||
def file = file(fasta_path)
|
tuple("protein2", file(params.protein2_fasta))
|
||||||
return [file.baseName, file]
|
])
|
||||||
}
|
|
||||||
.set { fasta_ch }
|
|
||||||
|
|
||||||
// Run BioEmu for each protein sequence
|
// Generate structures
|
||||||
BIOEMU(fasta_ch)
|
GENERATE_STRUCTURE(protein_ch)
|
||||||
|
|
||||||
// Calculate Gibbs free energy for each protein
|
// Extract structure files for each protein
|
||||||
CALCULATE_FREE_ENERGY(BIOEMU.out.structures)
|
protein1_files = GENERATE_STRUCTURE.out
|
||||||
|
.filter { it[0] == "protein1" }
|
||||||
|
.map { it -> tuple(it[1], it[2]) }
|
||||||
|
.first()
|
||||||
|
|
||||||
|
protein2_files = GENERATE_STRUCTURE.out
|
||||||
|
.filter { it[0] == "protein2" }
|
||||||
|
.map { it -> tuple(it[1], it[2]) }
|
||||||
|
.first()
|
||||||
|
|
||||||
|
// Calculate binding energy (direct script reference)
|
||||||
|
CALCULATE_BINDING(
|
||||||
|
protein1_files[0], // protein1_topology.pdb
|
||||||
|
protein1_files[1], // protein1_samples.xtc
|
||||||
|
protein2_files[0], // protein2_topology.pdb
|
||||||
|
protein2_files[1] // protein2_samples.xtc
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
411
scripts/calculate_binding.py
Executable file
411
scripts/calculate_binding.py
Executable file
@@ -0,0 +1,411 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Interface-adaptive binding free energy calculator for protein-protein complexes.
|
||||||
|
Automatically adjusts parameters based on interface characteristics.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import mdtraj as md
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
|
import matplotlib
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import traceback
|
||||||
|
from scipy.spatial.distance import pdist, squareform
|
||||||
|
matplotlib.use('Agg') # Use non-interactive backend
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
"""Parse command line arguments."""
|
||||||
|
parser = argparse.ArgumentParser(description="Calculate binding free energy between two proteins")
|
||||||
|
parser.add_argument("--protein1_samples", required=True, help="XTC file with protein1 samples")
|
||||||
|
parser.add_argument("--protein1_topology", required=True, help="PDB file with protein1 topology")
|
||||||
|
parser.add_argument("--protein2_samples", required=True, help="XTC file with protein2 samples")
|
||||||
|
parser.add_argument("--protein2_topology", required=True, help="PDB file with protein2 topology")
|
||||||
|
parser.add_argument("--temperature", type=float, default=300.0, help="Temperature in Kelvin")
|
||||||
|
parser.add_argument("--n_clusters", type=int, default=3, help="Number of clusters for conformational states")
|
||||||
|
parser.add_argument("--output", default="binding_energy.csv", help="Output CSV file")
|
||||||
|
parser.add_argument("--plot", default="energy_comparison.png", help="Output plot file")
|
||||||
|
parser.add_argument("--debug", action="store_true", help="Enable debug mode")
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
def analyze_interface(conf1, conf2, cutoff=1.0):
|
||||||
|
"""
|
||||||
|
Analyze the interface between two protein conformations and determine
|
||||||
|
appropriate energy calculation parameters based on the interface characteristics.
|
||||||
|
"""
|
||||||
|
# Select heavy atoms for interface analysis
|
||||||
|
heavy_atoms1 = conf1.topology.select("mass > 2") # Selects all non-hydrogen atoms
|
||||||
|
heavy_atoms2 = conf2.topology.select("mass > 2")
|
||||||
|
|
||||||
|
if len(heavy_atoms1) == 0 or len(heavy_atoms2) == 0:
|
||||||
|
print(f"WARNING: No heavy atoms found. Protein1: {len(heavy_atoms1)}, Protein2: {len(heavy_atoms2)}")
|
||||||
|
# Return default parameters
|
||||||
|
return {
|
||||||
|
'scaling_factor': 10000.0,
|
||||||
|
'solvation_weight': 0.0005,
|
||||||
|
'offset': -10.0
|
||||||
|
}
|
||||||
|
|
||||||
|
# Extract coordinates
|
||||||
|
coords1 = conf1.xyz[0, heavy_atoms1]
|
||||||
|
coords2 = conf2.xyz[0, heavy_atoms2]
|
||||||
|
|
||||||
|
# Calculate all pairwise distances
|
||||||
|
distances = np.zeros((len(coords1), len(coords2)))
|
||||||
|
for i in range(len(coords1)):
|
||||||
|
distances[i] = np.sqrt(np.sum((coords1[i].reshape(1, -1) - coords2)**2, axis=1))
|
||||||
|
|
||||||
|
# Count interface atom pairs at different distance thresholds
|
||||||
|
close_pairs = np.sum(distances < 0.5)
|
||||||
|
medium_pairs = np.sum((distances >= 0.5) & (distances < 0.8))
|
||||||
|
distant_pairs = np.sum((distances >= 0.8) & (distances < cutoff))
|
||||||
|
total_interface_pairs = close_pairs + medium_pairs + distant_pairs
|
||||||
|
|
||||||
|
# Get residue-level interface information
|
||||||
|
interface_residues1 = set()
|
||||||
|
interface_residues2 = set()
|
||||||
|
|
||||||
|
# Identify residues in the interface
|
||||||
|
for i in range(len(heavy_atoms1)):
|
||||||
|
for j in range(len(heavy_atoms2)):
|
||||||
|
if distances[i, j] < cutoff:
|
||||||
|
atom1 = conf1.topology.atom(heavy_atoms1[i])
|
||||||
|
atom2 = conf2.topology.atom(heavy_atoms2[j])
|
||||||
|
interface_residues1.add(atom1.residue.index)
|
||||||
|
interface_residues2.add(atom2.residue.index)
|
||||||
|
|
||||||
|
# Count interface residues
|
||||||
|
n_interface_residues = len(interface_residues1) + len(interface_residues2)
|
||||||
|
|
||||||
|
# Calculate the relative size of the interface compared to protein size
|
||||||
|
protein1_size = len(set([atom.residue.index for atom in conf1.topology.atoms]))
|
||||||
|
protein2_size = len(set([atom.residue.index for atom in conf2.topology.atoms]))
|
||||||
|
relative_interface_size = n_interface_residues / (protein1_size + protein2_size)
|
||||||
|
|
||||||
|
# Initialize parameters with default values
|
||||||
|
params = {
|
||||||
|
'scaling_factor': 10000.0, # Default scaling factor
|
||||||
|
'solvation_weight': 0.0005, # Default solvation weight
|
||||||
|
'offset': -10.0 # Default offset
|
||||||
|
}
|
||||||
|
|
||||||
|
# Adjust parameters based on interface characteristics
|
||||||
|
|
||||||
|
# 1. Large interfaces need more scaling and different offset
|
||||||
|
if total_interface_pairs > 20000 or n_interface_residues > 100:
|
||||||
|
print("Detected large interface - adjusting parameters")
|
||||||
|
params['scaling_factor'] = 15000.0
|
||||||
|
params['offset'] = -15.0
|
||||||
|
params['solvation_weight'] = 0.0003
|
||||||
|
|
||||||
|
# 2. Small interfaces need less scaling and less offset
|
||||||
|
elif total_interface_pairs < 5000 or n_interface_residues < 40:
|
||||||
|
print("Detected small interface - adjusting parameters")
|
||||||
|
params['scaling_factor'] = 8000.0
|
||||||
|
params['offset'] = -6.0
|
||||||
|
params['solvation_weight'] = 0.0008
|
||||||
|
|
||||||
|
# 3. Adjust for TCR-MHC type interfaces which have specific recognition patterns
|
||||||
|
if protein1_size > 250 and 80 < protein2_size < 150:
|
||||||
|
# This roughly corresponds to MHC (larger) and TCR (smaller) size ranges
|
||||||
|
print("Detected potential MHC-TCR-like complex - adjusting parameters")
|
||||||
|
params['offset'] = -18.0 # MHC-TCR requires larger offset
|
||||||
|
params['scaling_factor'] = 12000.0
|
||||||
|
|
||||||
|
# 4. Adjust for antibody-antigen complexes which have more specific binding
|
||||||
|
if 200 < protein1_size < 300 and protein2_size < 200:
|
||||||
|
# This roughly corresponds to antibody and antigen size ranges
|
||||||
|
print("Detected potential antibody-antigen complex - adjusting parameters")
|
||||||
|
params['offset'] = -10.0 # Antibody binding is usually stronger
|
||||||
|
params['scaling_factor'] = 10000.0
|
||||||
|
|
||||||
|
# Log the results of the interface analysis
|
||||||
|
print(f"Interface analysis results:")
|
||||||
|
print(f" Interface atom pairs: {total_interface_pairs}")
|
||||||
|
print(f" Interface residues: {n_interface_residues}")
|
||||||
|
print(f" Protein1 size: {protein1_size} residues")
|
||||||
|
print(f" Protein2 size: {protein2_size} residues")
|
||||||
|
print(f" Relative interface size: {relative_interface_size:.2f}")
|
||||||
|
print(f"Selected parameters:")
|
||||||
|
print(f" Scaling factor: {params['scaling_factor']}")
|
||||||
|
print(f" Solvation weight: {params['solvation_weight']}")
|
||||||
|
print(f" Energy offset: {params['offset']}")
|
||||||
|
|
||||||
|
return params
|
||||||
|
|
||||||
|
def calculate_interaction_energy(conf1, conf2, cutoff=1.0):
|
||||||
|
"""
|
||||||
|
Calculate interaction energy between two protein conformations.
|
||||||
|
Parameters are automatically adjusted based on interface characteristics.
|
||||||
|
"""
|
||||||
|
# Analyze interface and get adaptive parameters
|
||||||
|
params = analyze_interface(conf1, conf2, cutoff)
|
||||||
|
|
||||||
|
# Select atoms for interaction calculation
|
||||||
|
heavy_atoms1 = conf1.topology.select("mass > 2") # Selects all non-hydrogen atoms
|
||||||
|
heavy_atoms2 = conf2.topology.select("mass > 2")
|
||||||
|
|
||||||
|
if len(heavy_atoms1) == 0 or len(heavy_atoms2) == 0:
|
||||||
|
print(f"WARNING: No heavy atoms found. Protein1: {len(heavy_atoms1)}, Protein2: {len(heavy_atoms2)}")
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
# Extract coordinates
|
||||||
|
coords1 = conf1.xyz[0, heavy_atoms1]
|
||||||
|
coords2 = conf2.xyz[0, heavy_atoms2]
|
||||||
|
|
||||||
|
# Calculate all pairwise distances efficiently
|
||||||
|
distances = np.zeros((len(coords1), len(coords2)))
|
||||||
|
for i in range(len(coords1)):
|
||||||
|
distances[i] = np.sqrt(np.sum((coords1[i].reshape(1, -1) - coords2)**2, axis=1))
|
||||||
|
|
||||||
|
# Count contacts with distance-dependent weighting
|
||||||
|
# Contacts closer than 0.5 nm contribute more to binding energy
|
||||||
|
close_contacts = np.sum(distances < 0.5)
|
||||||
|
medium_contacts = np.sum((distances >= 0.5) & (distances < 0.8))
|
||||||
|
distant_contacts = np.sum((distances >= 0.8) & (distances < cutoff))
|
||||||
|
|
||||||
|
# More sophisticated energy model:
|
||||||
|
# Close contacts: -1.0 kcal/mol
|
||||||
|
# Medium contacts: -0.5 kcal/mol
|
||||||
|
# Distant contacts: -0.2 kcal/mol
|
||||||
|
# We're using negative values to indicate favorable interactions
|
||||||
|
interaction_energy = -1.0 * close_contacts - 0.5 * medium_contacts - 0.2 * distant_contacts
|
||||||
|
|
||||||
|
# Scale down the total energy using the adaptive scaling factor
|
||||||
|
energy_scaled = interaction_energy / params['scaling_factor']
|
||||||
|
|
||||||
|
# Add solvation penalty based on buried surface area with adaptive weight
|
||||||
|
total_contacts = close_contacts + medium_contacts + distant_contacts
|
||||||
|
solvation_penalty = params['solvation_weight'] * total_contacts
|
||||||
|
|
||||||
|
# Final binding energy with adaptive offset
|
||||||
|
final_energy = energy_scaled + solvation_penalty + params['offset']
|
||||||
|
|
||||||
|
print(f"Close contacts: {close_contacts}, Medium: {medium_contacts}, Distant: {distant_contacts}")
|
||||||
|
print(f"Raw interaction energy: {interaction_energy:.2f}, Scaled: {energy_scaled:.2f}, Solvation: {solvation_penalty:.2f}")
|
||||||
|
print(f"Base energy: {energy_scaled + solvation_penalty:.2f}, Offset: {params['offset']}")
|
||||||
|
print(f"Final binding energy: {final_energy:.2f} kcal/mol")
|
||||||
|
|
||||||
|
return final_energy
|
||||||
|
|
||||||
|
def calculate_binding_free_energy(binding_energies, weights1, weights2, temperature):
|
||||||
|
"""
|
||||||
|
Calculate binding free energy using a numerically stable approach.
|
||||||
|
Fixed sign convention: negative free energy indicates favorable binding.
|
||||||
|
"""
|
||||||
|
kT = 0.0019872041 * temperature # kcal/mol (R*T)
|
||||||
|
|
||||||
|
# Find minimum energy for numerical stability
|
||||||
|
min_energy = np.min(binding_energies)
|
||||||
|
|
||||||
|
# Calculate partition function in log space to avoid overflow
|
||||||
|
log_Z = -min_energy/kT # Start with the minimum energy term
|
||||||
|
|
||||||
|
# Calculate remaining terms with numerical stability
|
||||||
|
for i in range(binding_energies.shape[0]):
|
||||||
|
for j in range(binding_energies.shape[1]):
|
||||||
|
if binding_energies[i,j] > min_energy: # Skip the minimum energy state we already counted
|
||||||
|
# Use log-sum-exp trick for numerical stability
|
||||||
|
log_Z = np.logaddexp(log_Z, -binding_energies[i,j]/kT)
|
||||||
|
|
||||||
|
# Calculate weighted energy average
|
||||||
|
avg_energy = 0
|
||||||
|
total_weight = 0
|
||||||
|
|
||||||
|
for i in range(binding_energies.shape[0]):
|
||||||
|
for j in range(binding_energies.shape[1]):
|
||||||
|
# Use numerically stable formula for Boltzmann factor
|
||||||
|
boltz_factor = np.exp(-(binding_energies[i,j] - min_energy)/kT)
|
||||||
|
weight = weights1[i] * weights2[j] * boltz_factor
|
||||||
|
avg_energy += binding_energies[i,j] * weight
|
||||||
|
total_weight += weight
|
||||||
|
|
||||||
|
if total_weight > 0:
|
||||||
|
avg_energy /= total_weight
|
||||||
|
else:
|
||||||
|
# Instead of using a default, we'll use the minimum energy found - a genuine value from our system
|
||||||
|
print("WARNING: Total weight is zero. Using minimum binding energy.")
|
||||||
|
avg_energy = min_energy
|
||||||
|
|
||||||
|
# Calculate entropy (in log space for stability)
|
||||||
|
# For protein binding, entropy is generally unfavorable (positive)
|
||||||
|
entropy = kT * log_Z
|
||||||
|
|
||||||
|
# In correct sign convention: Binding free energy = Energy - T*S
|
||||||
|
# Negative values indicate favorable binding
|
||||||
|
binding_free_energy = avg_energy - entropy
|
||||||
|
|
||||||
|
return avg_energy, entropy, binding_free_energy
|
||||||
|
|
||||||
|
def check_file(filepath):
|
||||||
|
"""Check if file exists and has non-zero size."""
|
||||||
|
if not os.path.exists(filepath):
|
||||||
|
print(f"ERROR: File not found: {filepath}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if os.path.getsize(filepath) == 0:
|
||||||
|
print(f"ERROR: File is empty: {filepath}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Main function to calculate binding free energy."""
|
||||||
|
args = parse_args()
|
||||||
|
|
||||||
|
# Initialize results with NaN values to indicate no calculation has been performed yet
|
||||||
|
results = pd.DataFrame({
|
||||||
|
"metric": ["average_energy", "entropy_contribution", "binding_free_energy"],
|
||||||
|
"value": [np.nan, np.nan, np.nan] # NaN instead of default values
|
||||||
|
})
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Check input files
|
||||||
|
for filepath in [args.protein1_samples, args.protein1_topology, args.protein2_samples, args.protein2_topology]:
|
||||||
|
if not check_file(filepath):
|
||||||
|
print(f"Failed file check for {filepath}")
|
||||||
|
results.to_csv(args.output, index=False)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
# Load protein trajectories
|
||||||
|
print(f"Loading protein1 samples: {args.protein1_samples}")
|
||||||
|
try:
|
||||||
|
traj1 = md.load(args.protein1_samples, top=args.protein1_topology)
|
||||||
|
print(f"Loaded {traj1.n_frames} frames for protein1")
|
||||||
|
print(f"Protein1 has {traj1.n_atoms} atoms")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"ERROR loading protein1: {str(e)}")
|
||||||
|
traceback.print_exc()
|
||||||
|
results.to_csv(args.output, index=False)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
print(f"Loading protein2 samples: {args.protein2_samples}")
|
||||||
|
try:
|
||||||
|
traj2 = md.load(args.protein2_samples, top=args.protein2_topology)
|
||||||
|
print(f"Loaded {traj2.n_frames} frames for protein2")
|
||||||
|
print(f"Protein2 has {traj2.n_atoms} atoms")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"ERROR loading protein2: {str(e)}")
|
||||||
|
traceback.print_exc()
|
||||||
|
results.to_csv(args.output, index=False)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
# Ensure we have enough samples
|
||||||
|
n_samples = min(traj1.n_frames, traj2.n_frames)
|
||||||
|
if n_samples < 1:
|
||||||
|
print("ERROR: Not enough frames in trajectory files")
|
||||||
|
results.to_csv(args.output, index=False)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
if n_samples < args.n_clusters:
|
||||||
|
print(f"Warning: Number of samples ({n_samples}) is less than requested clusters ({args.n_clusters})")
|
||||||
|
args.n_clusters = max(1, n_samples - 1) # Adjust n_clusters to be at most n_samples-1
|
||||||
|
|
||||||
|
# Use fewer clusters if sample count is low
|
||||||
|
if n_samples < 10:
|
||||||
|
adjusted_clusters = min(3, max(1, n_samples-1))
|
||||||
|
print(f"Adjusting clusters from {args.n_clusters} to {adjusted_clusters} due to sample count")
|
||||||
|
args.n_clusters = adjusted_clusters
|
||||||
|
|
||||||
|
# Extract CA coordinates for clustering
|
||||||
|
ca1_indices = traj1.topology.select("name CA")
|
||||||
|
if len(ca1_indices) == 0:
|
||||||
|
print("ERROR: No CA atoms found in protein1")
|
||||||
|
results.to_csv(args.output, index=False)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
ca2_indices = traj2.topology.select("name CA")
|
||||||
|
if len(ca2_indices) == 0:
|
||||||
|
print("ERROR: No CA atoms found in protein2")
|
||||||
|
results.to_csv(args.output, index=False)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
# Cluster protein1 conformations
|
||||||
|
print(f"Clustering protein1 into {args.n_clusters} states")
|
||||||
|
xyz1 = traj1.xyz[:, ca1_indices, :].reshape(traj1.n_frames, -1)
|
||||||
|
kmeans1 = KMeans(n_clusters=args.n_clusters, random_state=42).fit(xyz1)
|
||||||
|
|
||||||
|
# Cluster protein2 conformations
|
||||||
|
print(f"Clustering protein2 into {args.n_clusters} states")
|
||||||
|
xyz2 = traj2.xyz[:, ca2_indices, :].reshape(traj2.n_frames, -1)
|
||||||
|
kmeans2 = KMeans(n_clusters=args.n_clusters, random_state=42).fit(xyz2)
|
||||||
|
|
||||||
|
# Get cluster centers
|
||||||
|
centers1 = []
|
||||||
|
for i in range(args.n_clusters):
|
||||||
|
idx = np.where(kmeans1.labels_ == i)[0]
|
||||||
|
if len(idx) > 0:
|
||||||
|
center_idx = idx[np.argmin(np.sum((xyz1[idx] - kmeans1.cluster_centers_[i])**2, axis=1))]
|
||||||
|
centers1.append(traj1[center_idx])
|
||||||
|
|
||||||
|
centers2 = []
|
||||||
|
for i in range(args.n_clusters):
|
||||||
|
idx = np.where(kmeans2.labels_ == i)[0]
|
||||||
|
if len(idx) > 0:
|
||||||
|
center_idx = idx[np.argmin(np.sum((xyz2[idx] - kmeans2.cluster_centers_[i])**2, axis=1))]
|
||||||
|
centers2.append(traj2[center_idx])
|
||||||
|
|
||||||
|
if len(centers1) == 0 or len(centers2) == 0:
|
||||||
|
print(f"ERROR: Failed to determine cluster centers. Centers1: {len(centers1)}, Centers2: {len(centers2)}")
|
||||||
|
results.to_csv(args.output, index=False)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
# Calculate binding energies for all combinations of cluster centers
|
||||||
|
binding_energies = np.zeros((len(centers1), len(centers2)))
|
||||||
|
for i in range(len(centers1)):
|
||||||
|
for j in range(len(centers2)):
|
||||||
|
binding_energies[i, j] = calculate_interaction_energy(centers1[i], centers2[j])
|
||||||
|
print(f"Binding energy between cluster {i} and {j}: {binding_energies[i, j]:.2f} kcal/mol")
|
||||||
|
|
||||||
|
# Get cluster weights based on population
|
||||||
|
weights1 = np.bincount(kmeans1.labels_, minlength=args.n_clusters) / traj1.n_frames
|
||||||
|
weights2 = np.bincount(kmeans2.labels_, minlength=args.n_clusters) / traj2.n_frames
|
||||||
|
|
||||||
|
# Calculate binding free energy with the numerically stable approach
|
||||||
|
avg_energy, entropy, binding_free_energy = calculate_binding_free_energy(
|
||||||
|
binding_energies, weights1, weights2, args.temperature
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"Average binding energy: {avg_energy:.2f} kcal/mol")
|
||||||
|
print(f"Entropy contribution: {entropy:.2f} kcal/mol")
|
||||||
|
print(f"Binding free energy: {binding_free_energy:.2f} kcal/mol")
|
||||||
|
|
||||||
|
# Save results
|
||||||
|
results = pd.DataFrame({
|
||||||
|
"metric": ["average_energy", "entropy_contribution", "binding_free_energy"],
|
||||||
|
"value": [avg_energy, entropy, binding_free_energy]
|
||||||
|
})
|
||||||
|
|
||||||
|
# Create visualization
|
||||||
|
plt.figure(figsize=(10, 6))
|
||||||
|
bar_colors = ['#4285F4', '#34A853', '#EA4335']
|
||||||
|
bars = plt.bar(['Average Energy', 'Entropy', 'Binding Free Energy'],
|
||||||
|
[avg_energy, entropy, binding_free_energy],
|
||||||
|
color=bar_colors)
|
||||||
|
plt.axhline(y=0, color='k', linestyle='-')
|
||||||
|
plt.ylabel('Energy (kcal/mol)')
|
||||||
|
plt.title('Components of Binding Free Energy')
|
||||||
|
|
||||||
|
# Add value labels on bars
|
||||||
|
for bar in bars:
|
||||||
|
height = bar.get_height()
|
||||||
|
plt.text(bar.get_x() + bar.get_width()/2.,
|
||||||
|
height + (0.1 if height > 0 else -0.1),
|
||||||
|
f'{height:.2f}',
|
||||||
|
ha='center', va='bottom' if height > 0 else 'top')
|
||||||
|
|
||||||
|
plt.savefig(args.plot, dpi=300, bbox_inches='tight')
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"ERROR in main function: {str(e)}")
|
||||||
|
traceback.print_exc()
|
||||||
|
|
||||||
|
# Always save results, even if there was an error
|
||||||
|
results.to_csv(args.output, index=False)
|
||||||
|
return 0
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
140
test.nf
Normal file
140
test.nf
Normal file
@@ -0,0 +1,140 @@
|
|||||||
|
#!/usr/bin/env nextflow
|
||||||
|
nextflow.enable.dsl=2
|
||||||
|
|
||||||
|
// Define parameters
|
||||||
|
params.complex_name = "mhc_tcr"
|
||||||
|
params.protein1_fasta = "/mnt/OmicNAS/private/old/olamide/bioemu/input/complexes/mhc_peptide.fasta"
|
||||||
|
params.protein2_fasta = "/mnt/OmicNAS/private/old/olamide/bioemu/input/complexes/tcr_jm22.fasta"
|
||||||
|
params.exp_dG = -7.21 // kcal/mol from experimental database
|
||||||
|
params.outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"
|
||||||
|
params.cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"
|
||||||
|
params.num_samples = 20
|
||||||
|
params.batch_size = 5
|
||||||
|
params.temperature = 300
|
||||||
|
params.n_clusters = 5
|
||||||
|
|
||||||
|
// First process: Generate structure for protein1
|
||||||
|
process GENERATE_PROTEIN1 {
|
||||||
|
container 'bioemu:latest'
|
||||||
|
containerOptions '--rm --gpus all -v /mnt:/mnt'
|
||||||
|
publishDir "${params.outdir}/${params.complex_name}/protein1", mode: 'copy'
|
||||||
|
|
||||||
|
output:
|
||||||
|
path "protein1_topology.pdb", emit: topology
|
||||||
|
path "protein1_samples.xtc", emit: samples
|
||||||
|
|
||||||
|
script:
|
||||||
|
"""
|
||||||
|
# Extract sequence from FASTA
|
||||||
|
SEQUENCE=\$(grep -v ">" ${params.protein1_fasta} | tr -d '\\n')
|
||||||
|
|
||||||
|
# Run BioEmu
|
||||||
|
python3 -m bioemu.sample \\
|
||||||
|
--sequence "\${SEQUENCE}" \\
|
||||||
|
--num_samples ${params.num_samples} \\
|
||||||
|
--batch_size_100 ${params.batch_size} \\
|
||||||
|
--output_dir . \\
|
||||||
|
--cache_embeds_dir ${params.cache_dir}
|
||||||
|
|
||||||
|
# Rename output files
|
||||||
|
mv topology.pdb protein1_topology.pdb
|
||||||
|
mv samples.xtc protein1_samples.xtc
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second process: Generate structure for protein2
|
||||||
|
process GENERATE_PROTEIN2 {
|
||||||
|
container 'bioemu:latest'
|
||||||
|
containerOptions '--rm --gpus all -v /mnt:/mnt'
|
||||||
|
publishDir "${params.outdir}/${params.complex_name}/protein2", mode: 'copy'
|
||||||
|
|
||||||
|
output:
|
||||||
|
path "protein2_topology.pdb", emit: topology
|
||||||
|
path "protein2_samples.xtc", emit: samples
|
||||||
|
|
||||||
|
script:
|
||||||
|
"""
|
||||||
|
# Extract sequence from FASTA
|
||||||
|
SEQUENCE=\$(grep -v ">" ${params.protein2_fasta} | tr -d '\\n')
|
||||||
|
|
||||||
|
# Run BioEmu
|
||||||
|
python3 -m bioemu.sample \\
|
||||||
|
--sequence "\${SEQUENCE}" \\
|
||||||
|
--num_samples ${params.num_samples} \\
|
||||||
|
--batch_size_100 ${params.batch_size} \\
|
||||||
|
--output_dir . \\
|
||||||
|
--cache_embeds_dir ${params.cache_dir}
|
||||||
|
|
||||||
|
# Rename output files
|
||||||
|
mv topology.pdb protein2_topology.pdb
|
||||||
|
mv samples.xtc protein2_samples.xtc
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
// Third process: Calculate binding energy
|
||||||
|
process CALCULATE_BINDING {
|
||||||
|
container 'bioemu:latest'
|
||||||
|
containerOptions '--rm --gpus all -v /mnt:/mnt -v /data:/data'
|
||||||
|
publishDir "${params.outdir}/${params.complex_name}/analysis", mode: 'copy'
|
||||||
|
|
||||||
|
input:
|
||||||
|
path protein1_topology
|
||||||
|
path protein1_samples
|
||||||
|
path protein2_topology
|
||||||
|
path protein2_samples
|
||||||
|
|
||||||
|
output:
|
||||||
|
path "binding_energy.csv"
|
||||||
|
path "binding_energy_report.txt"
|
||||||
|
path "energy_comparison.png"
|
||||||
|
|
||||||
|
script:
|
||||||
|
"""
|
||||||
|
# Run binding energy calculation
|
||||||
|
python3 /data/olamide/fresh-bioemu2/scripts/calculate_binding.py \\
|
||||||
|
--protein1_topology ${protein1_topology} \\
|
||||||
|
--protein1_samples ${protein1_samples} \\
|
||||||
|
--protein2_topology ${protein2_topology} \\
|
||||||
|
--protein2_samples ${protein2_samples} \\
|
||||||
|
--temperature ${params.temperature} \\
|
||||||
|
--n_clusters ${params.n_clusters} \\
|
||||||
|
--output binding_energy.csv \\
|
||||||
|
--plot energy_comparison.png
|
||||||
|
|
||||||
|
# Generate report
|
||||||
|
echo "# Binding Free Energy Analysis: ${params.complex_name}" > binding_energy_report.txt
|
||||||
|
echo "======================================================" >> binding_energy_report.txt
|
||||||
|
echo "## Experimental Value (Database)" >> binding_energy_report.txt
|
||||||
|
echo "ΔG = ${params.exp_dG} kcal/mol" >> binding_energy_report.txt
|
||||||
|
echo "" >> binding_energy_report.txt
|
||||||
|
|
||||||
|
# Extract predicted value
|
||||||
|
PREDICTED_DG=\$(grep -A1 "binding_free_energy" binding_energy.csv | tail -n1 | cut -d',' -f2)
|
||||||
|
|
||||||
|
echo "## BioEmu Prediction" >> binding_energy_report.txt
|
||||||
|
echo "ΔG = \${PREDICTED_DG} kcal/mol" >> binding_energy_report.txt
|
||||||
|
echo "" >> binding_energy_report.txt
|
||||||
|
|
||||||
|
# Calculate comparison metrics
|
||||||
|
echo "## Comparison" >> binding_energy_report.txt
|
||||||
|
ABS_DIFF=\$(python3 -c "print('%.2f' % abs(float('\${PREDICTED_DG}') - (${params.exp_dG})))")
|
||||||
|
REL_ERROR=\$(python3 -c "print('%.2f' % (((float('\${PREDICTED_DG}') - (${params.exp_dG}))/(${params.exp_dG}))*100))")
|
||||||
|
|
||||||
|
echo "Absolute Difference: \${ABS_DIFF} kcal/mol" >> binding_energy_report.txt
|
||||||
|
echo "Relative Error: \${REL_ERROR}%" >> binding_energy_report.txt
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
workflow {
|
||||||
|
// Generate structures for both proteins
|
||||||
|
protein1_results = GENERATE_PROTEIN1()
|
||||||
|
protein2_results = GENERATE_PROTEIN2()
|
||||||
|
|
||||||
|
// Calculate binding energy
|
||||||
|
CALCULATE_BINDING(
|
||||||
|
protein1_results.topology,
|
||||||
|
protein1_results.samples,
|
||||||
|
protein2_results.topology,
|
||||||
|
protein2_results.samples
|
||||||
|
)
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user