bioemu/calculate_gibbs.py

#!/usr/bin/env python3

import argparse
import numpy as np
import mdtraj as md
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt
import os

def parse_args():
    parser = argparse.ArgumentParser(description='Calculate Gibbs free energy from protein structure ensembles')
    parser.add_argument('--samples', required=True, help='Path to XTC trajectory file with structure samples')
    parser.add_argument('--topology', required=True, help='Path to PDB topology file')
    parser.add_argument('--temperature', type=float, default=300.0, help='Temperature in Kelvin')
    parser.add_argument('--output', required=True, help='Output CSV file for free energy data')
    parser.add_argument('--n_clusters', type=int, default=5, help='Number of conformational clusters')
    parser.add_argument('--plot', help='Path to save energy plot (optional)')
    return parser.parse_args()

def main():
    args = parse_args()

    # Load trajectory
    print(f"Loading trajectory {args.samples} with topology {args.topology}")
    traj = md.load(args.samples, top=args.topology)

    # Calculate RMSD to first frame
    print("Calculating RMSD to reference structure")
    # Align to the first frame
    traj.superpose(traj, 0)
    # Calculate RMSD for CA atoms only
    atom_indices = traj.topology.select('name CA')
    distances = np.zeros(traj.n_frames)
    for i in range(traj.n_frames):
        # Fixed line - using slicing instead of frame parameter
        distances[i] = md.rmsd(traj[i:i+1], traj[0:1], atom_indices=atom_indices)[0]

    # Feature extraction for clustering
    # Use the RMSD and radius of gyration as features
    rg = md.compute_rg(traj)
    features = np.column_stack((distances, rg))

    # Cluster structures
    print(f"Clustering structures into {args.n_clusters} states")
    kmeans = KMeans(n_clusters=args.n_clusters, random_state=42)
    labels = kmeans.fit_predict(features)

    # Calculate state populations
    unique_labels, counts = np.unique(labels, return_counts=True)
    populations = counts / len(labels)

    # Calculate free energies
    R = 0.0019872041 # kcal/(mol·K)
    T = args.temperature
    RT = R * T

    # Reference state is the most populated one
    reference_idx = np.argmax(populations)
    reference_pop = populations[reference_idx]

    # Calculate ΔG values
    free_energies = -RT * np.log(populations / reference_pop)

    # Get representative structures from each cluster
    representatives = []
    for i in range(args.n_clusters):
        cluster_frames = np.where(labels == i)[0]
        if len(cluster_frames) > 0:
            # Find frame closest to cluster center
            cluster_features = features[cluster_frames]
            center_dists = np.linalg.norm(cluster_features - kmeans.cluster_centers_[i], axis=1)
            center_idx = cluster_frames[np.argmin(center_dists)]
            representatives.append(int(center_idx))
        else:
            representatives.append(-1)  # No members in this cluster

    # Create results dataframe
    results = pd.DataFrame({
        'Cluster': unique_labels,
        'Population': populations,
        'DeltaG_kcal_mol': free_energies,
        'Representative_Frame': representatives
    })

    # Sort by free energy
    results = results.sort_values('DeltaG_kcal_mol')

    # Save results
    results.to_csv(args.output, index=False)
    print(f"Results saved to {args.output}")

    # Print summary
    print("\nFree Energy Summary:")
    print(results)

    # Calculate overall free energy range
    print(f"\nFree energy range: {np.max(free_energies) - np.min(free_energies):.2f} kcal/mol")

    # Create plot if requested
    if args.plot:
        plt.figure(figsize=(10, 6))

        # Plot free energy for each cluster
        plt.bar(range(len(unique_labels)), free_energies, alpha=0.7)
        plt.xlabel('Cluster')
        plt.ylabel('ΔG (kcal/mol)')
        plt.title('Free Energy Landscape')
        plt.xticks(range(len(unique_labels)))
        plt.grid(axis='y', linestyle='--', alpha=0.7)

        # Add population as text on bars
        for i, (energy, pop) in enumerate(zip(free_energies, populations)):
            plt.text(i, energy + 0.1, f"{pop*100:.1f}%", ha='center')

        plt.tight_layout()
        plt.savefig(args.plot, dpi=300)
        print(f"Plot saved to {args.plot}")

if __name__ == "__main__":
    main()