Add BioEmu Nextflow pipeline implementation
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
.nextflow*
|
||||||
|
work/
|
||||||
48
Dockerfile
Normal file
48
Dockerfile
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04
|
||||||
|
|
||||||
|
# Set non-interactive installation
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
|
# Install system dependencies
|
||||||
|
RUN apt-get update && apt-get install -y \
|
||||||
|
wget \
|
||||||
|
git \
|
||||||
|
python3 \
|
||||||
|
python3-pip \
|
||||||
|
build-essential \
|
||||||
|
curl \
|
||||||
|
python3-tk \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Set working directory
|
||||||
|
WORKDIR /opt/bioemu
|
||||||
|
|
||||||
|
# Install Python dependencies
|
||||||
|
RUN python3 -m pip install --upgrade pip
|
||||||
|
|
||||||
|
# Install BioEmu from PyPI
|
||||||
|
RUN pip install bioemu
|
||||||
|
|
||||||
|
# Install dependencies for free energy calculation
|
||||||
|
RUN pip install mdtraj scikit-learn pandas matplotlib seaborn
|
||||||
|
|
||||||
|
# Create directory for ColabFold and set environment variable
|
||||||
|
ENV COLABFOLD_DIR=/opt/colabfold
|
||||||
|
|
||||||
|
# Create cache directories with proper permissions
|
||||||
|
RUN mkdir -p ${COLABFOLD_DIR}/embeds_cache /tmp/colabfold_temp && \
|
||||||
|
chmod -R 777 ${COLABFOLD_DIR} /tmp/colabfold_temp
|
||||||
|
|
||||||
|
# Pre-setup ColabFold during build to avoid runtime issues
|
||||||
|
RUN wget "https://raw.githubusercontent.com/YoshitakaMo/localcolabfold/5fc8775114b637b5672234179c50e694ab057db4/install_colabbatch_linux.sh" -O ${COLABFOLD_DIR}/install_colabbatch_linux.sh && \
|
||||||
|
chmod +x ${COLABFOLD_DIR}/install_colabbatch_linux.sh && \
|
||||||
|
bash ${COLABFOLD_DIR}/install_colabbatch_linux.sh
|
||||||
|
|
||||||
|
# Create directories for input/output with proper permissions
|
||||||
|
RUN mkdir -p /data /results && chmod -R 777 /data /results
|
||||||
|
|
||||||
|
RUN mkdir -p /opt/bioemu/scripts/
|
||||||
|
COPY calculate_gibbs.py /opt/bioemu/scripts/
|
||||||
|
RUN chmod +x /opt/bioemu/scripts/calculate_gibbs.py
|
||||||
|
|
||||||
|
CMD ["/bin/bash"]
|
||||||
10
README.md
Normal file
10
README.md
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# BioEmu Nextflow Pipeline
|
||||||
|
|
||||||
|
A Nextflow pipeline implementation for BioEmu to sample protein structures and calculate Gibbs free energy.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
- Sample structures for various protein sequences
|
||||||
|
- Calculate Gibbs free energy differences
|
||||||
|
- Containerized execution with Docker
|
||||||
|
|
||||||
|
## Usage
|
||||||
121
calculate_gibbs.py
Executable file
121
calculate_gibbs.py
Executable file
@@ -0,0 +1,121 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import numpy as np
|
||||||
|
import mdtraj as md
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import os
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser(description='Calculate Gibbs free energy from protein structure ensembles')
|
||||||
|
parser.add_argument('--samples', required=True, help='Path to XTC trajectory file with structure samples')
|
||||||
|
parser.add_argument('--topology', required=True, help='Path to PDB topology file')
|
||||||
|
parser.add_argument('--temperature', type=float, default=300.0, help='Temperature in Kelvin')
|
||||||
|
parser.add_argument('--output', required=True, help='Output CSV file for free energy data')
|
||||||
|
parser.add_argument('--n_clusters', type=int, default=5, help='Number of conformational clusters')
|
||||||
|
parser.add_argument('--plot', help='Path to save energy plot (optional)')
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
def main():
|
||||||
|
args = parse_args()
|
||||||
|
|
||||||
|
# Load trajectory
|
||||||
|
print(f"Loading trajectory {args.samples} with topology {args.topology}")
|
||||||
|
traj = md.load(args.samples, top=args.topology)
|
||||||
|
|
||||||
|
# Calculate RMSD to first frame
|
||||||
|
print("Calculating RMSD to reference structure")
|
||||||
|
# Align to the first frame
|
||||||
|
traj.superpose(traj, 0)
|
||||||
|
# Calculate RMSD for CA atoms only
|
||||||
|
atom_indices = traj.topology.select('name CA')
|
||||||
|
distances = np.zeros(traj.n_frames)
|
||||||
|
for i in range(traj.n_frames):
|
||||||
|
# Fixed line - using slicing instead of frame parameter
|
||||||
|
distances[i] = md.rmsd(traj[i:i+1], traj[0:1], atom_indices=atom_indices)[0]
|
||||||
|
|
||||||
|
# Feature extraction for clustering
|
||||||
|
# Use the RMSD and radius of gyration as features
|
||||||
|
rg = md.compute_rg(traj)
|
||||||
|
features = np.column_stack((distances, rg))
|
||||||
|
|
||||||
|
# Cluster structures
|
||||||
|
print(f"Clustering structures into {args.n_clusters} states")
|
||||||
|
kmeans = KMeans(n_clusters=args.n_clusters, random_state=42)
|
||||||
|
labels = kmeans.fit_predict(features)
|
||||||
|
|
||||||
|
# Calculate state populations
|
||||||
|
unique_labels, counts = np.unique(labels, return_counts=True)
|
||||||
|
populations = counts / len(labels)
|
||||||
|
|
||||||
|
# Calculate free energies
|
||||||
|
R = 0.0019872041 # kcal/(mol·K)
|
||||||
|
T = args.temperature
|
||||||
|
RT = R * T
|
||||||
|
|
||||||
|
# Reference state is the most populated one
|
||||||
|
reference_idx = np.argmax(populations)
|
||||||
|
reference_pop = populations[reference_idx]
|
||||||
|
|
||||||
|
# Calculate ΔG values
|
||||||
|
free_energies = -RT * np.log(populations / reference_pop)
|
||||||
|
|
||||||
|
# Get representative structures from each cluster
|
||||||
|
representatives = []
|
||||||
|
for i in range(args.n_clusters):
|
||||||
|
cluster_frames = np.where(labels == i)[0]
|
||||||
|
if len(cluster_frames) > 0:
|
||||||
|
# Find frame closest to cluster center
|
||||||
|
cluster_features = features[cluster_frames]
|
||||||
|
center_dists = np.linalg.norm(cluster_features - kmeans.cluster_centers_[i], axis=1)
|
||||||
|
center_idx = cluster_frames[np.argmin(center_dists)]
|
||||||
|
representatives.append(int(center_idx))
|
||||||
|
else:
|
||||||
|
representatives.append(-1) # No members in this cluster
|
||||||
|
|
||||||
|
# Create results dataframe
|
||||||
|
results = pd.DataFrame({
|
||||||
|
'Cluster': unique_labels,
|
||||||
|
'Population': populations,
|
||||||
|
'DeltaG_kcal_mol': free_energies,
|
||||||
|
'Representative_Frame': representatives
|
||||||
|
})
|
||||||
|
|
||||||
|
# Sort by free energy
|
||||||
|
results = results.sort_values('DeltaG_kcal_mol')
|
||||||
|
|
||||||
|
# Save results
|
||||||
|
results.to_csv(args.output, index=False)
|
||||||
|
print(f"Results saved to {args.output}")
|
||||||
|
|
||||||
|
# Print summary
|
||||||
|
print("\nFree Energy Summary:")
|
||||||
|
print(results)
|
||||||
|
|
||||||
|
# Calculate overall free energy range
|
||||||
|
print(f"\nFree energy range: {np.max(free_energies) - np.min(free_energies):.2f} kcal/mol")
|
||||||
|
|
||||||
|
# Create plot if requested
|
||||||
|
if args.plot:
|
||||||
|
plt.figure(figsize=(10, 6))
|
||||||
|
|
||||||
|
# Plot free energy for each cluster
|
||||||
|
plt.bar(range(len(unique_labels)), free_energies, alpha=0.7)
|
||||||
|
plt.xlabel('Cluster')
|
||||||
|
plt.ylabel('ΔG (kcal/mol)')
|
||||||
|
plt.title('Free Energy Landscape')
|
||||||
|
plt.xticks(range(len(unique_labels)))
|
||||||
|
plt.grid(axis='y', linestyle='--', alpha=0.7)
|
||||||
|
|
||||||
|
# Add population as text on bars
|
||||||
|
for i, (energy, pop) in enumerate(zip(free_energies, populations)):
|
||||||
|
plt.text(i, energy + 0.1, f"{pop*100:.1f}%", ha='center')
|
||||||
|
|
||||||
|
plt.tight_layout()
|
||||||
|
plt.savefig(args.plot, dpi=300)
|
||||||
|
print(f"Plot saved to {args.plot}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
5
entrypoint.sh
Normal file
5
entrypoint.sh
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# Execute the command passed to docker
|
||||||
|
exec "$@"
|
||||||
89
main.nf
Normal file
89
main.nf
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
#!/usr/bin/env nextflow
|
||||||
|
|
||||||
|
nextflow.enable.dsl=2
|
||||||
|
|
||||||
|
// Multiple FASTA files to process
|
||||||
|
params.fasta_list = [
|
||||||
|
"/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta",
|
||||||
|
"/mnt/OmicNAS/private/old/olamide/bioemu/input/trp_cage.fasta"
|
||||||
|
]
|
||||||
|
params.outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"
|
||||||
|
params.cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"
|
||||||
|
params.scripts_dir = "${baseDir}/scripts"
|
||||||
|
params.num_samples = 10
|
||||||
|
params.batch_size_100 = 10
|
||||||
|
params.temperature = 300
|
||||||
|
params.n_clusters = 5
|
||||||
|
|
||||||
|
process BIOEMU {
|
||||||
|
container 'bioemu:latest'
|
||||||
|
containerOptions '--rm --gpus all -v /mnt:/mnt -v /tmp:/tmp'
|
||||||
|
publishDir "${params.outdir}/${protein_id}", mode: 'copy'
|
||||||
|
|
||||||
|
input:
|
||||||
|
tuple val(protein_id), path(fasta)
|
||||||
|
|
||||||
|
output:
|
||||||
|
tuple val(protein_id), path("topology.pdb"), path("samples.xtc"), emit: structures
|
||||||
|
path "sequence.fasta", optional: true
|
||||||
|
path "batch_*.npz", optional: true
|
||||||
|
path "run.log"
|
||||||
|
|
||||||
|
script:
|
||||||
|
"""
|
||||||
|
# Make sure cache directory exists
|
||||||
|
mkdir -p ${params.cache_dir}
|
||||||
|
|
||||||
|
# Extract the sequence from the FASTA file
|
||||||
|
SEQUENCE=\$(grep -v ">" ${fasta} | tr -d '\\n')
|
||||||
|
|
||||||
|
# Run BioEmu with the extracted sequence
|
||||||
|
python3 -m bioemu.sample \
|
||||||
|
--sequence "\${SEQUENCE}" \
|
||||||
|
--num_samples ${params.num_samples} \
|
||||||
|
--batch_size_100 ${params.batch_size_100} \
|
||||||
|
--output_dir . \
|
||||||
|
--cache_embeds_dir ${params.cache_dir} 2>&1 | tee run.log
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
process CALCULATE_FREE_ENERGY {
|
||||||
|
container 'bioemu:latest'
|
||||||
|
containerOptions '--rm --gpus all -v /mnt:/mnt'
|
||||||
|
publishDir "${params.outdir}/${protein_id}/analysis", mode: 'copy'
|
||||||
|
|
||||||
|
input:
|
||||||
|
tuple val(protein_id), path(topology), path(samples)
|
||||||
|
|
||||||
|
output:
|
||||||
|
tuple val(protein_id), path("free_energy.csv"), emit: free_energy
|
||||||
|
path "energy_plot.png", optional: true
|
||||||
|
|
||||||
|
script:
|
||||||
|
"""
|
||||||
|
# Calculate free energy from sampled structures
|
||||||
|
python3 /opt/bioemu/scripts/calculate_gibbs.py \\
|
||||||
|
--samples ${samples} \\
|
||||||
|
--topology ${topology} \\
|
||||||
|
--temperature ${params.temperature} \\
|
||||||
|
--n_clusters ${params.n_clusters} \\
|
||||||
|
--output free_energy.csv \\
|
||||||
|
--plot energy_plot.png
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
workflow {
|
||||||
|
// Convert fasta_list to a channel of [protein_id, fasta_file] tuples
|
||||||
|
Channel.fromList(params.fasta_list)
|
||||||
|
.map { fasta_path ->
|
||||||
|
def file = file(fasta_path)
|
||||||
|
return [file.baseName, file]
|
||||||
|
}
|
||||||
|
.set { fasta_ch }
|
||||||
|
|
||||||
|
// Run BioEmu for each protein sequence
|
||||||
|
BIOEMU(fasta_ch)
|
||||||
|
|
||||||
|
// Calculate Gibbs free energy for each protein
|
||||||
|
CALCULATE_FREE_ENERGY(BIOEMU.out.structures)
|
||||||
|
}
|
||||||
38
nextflow.config
Normal file
38
nextflow.config
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
// Manifest for Nextflow metadata
|
||||||
|
manifest {
|
||||||
|
name = 'BioEmu-Nextflow'
|
||||||
|
author = 'Generated from BioEmu repository'
|
||||||
|
homePage = 'https://github.com/microsoft/bioemu'
|
||||||
|
description = 'Nextflow pipeline for BioEmu - Biomolecular Emulator for protein structure sampling'
|
||||||
|
mainScript = 'main.nf'
|
||||||
|
version = '1.0.0'
|
||||||
|
}
|
||||||
|
|
||||||
|
// Global default parameters
|
||||||
|
params {
|
||||||
|
fasta = "/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta"
|
||||||
|
outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"
|
||||||
|
cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"
|
||||||
|
num_samples = 10
|
||||||
|
batch_size_100 = 10
|
||||||
|
}
|
||||||
|
|
||||||
|
// Container configurations
|
||||||
|
docker {
|
||||||
|
enabled = true
|
||||||
|
runOptions = '--gpus all'
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process configurations
|
||||||
|
process {
|
||||||
|
cpus = 1
|
||||||
|
memory = '8 GB'
|
||||||
|
}
|
||||||
|
|
||||||
|
// Execution configurations
|
||||||
|
executor {
|
||||||
|
$local {
|
||||||
|
cpus = 4
|
||||||
|
memory = '8 GB'
|
||||||
|
}
|
||||||
|
}
|
||||||
106
params.json
Normal file
106
params.json
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
{
|
||||||
|
"params": {
|
||||||
|
"fasta_list": {
|
||||||
|
"type": "file[]",
|
||||||
|
"description": "FASTA files containing protein sequences",
|
||||||
|
"default": [],
|
||||||
|
"required": true,
|
||||||
|
"pipeline_io": "input",
|
||||||
|
"var_name": "params.fasta_list",
|
||||||
|
"examples": [
|
||||||
|
["/omic/olamide/examples/prot1.fasta", "/omic/olamide/examples/prot2.fasta"]
|
||||||
|
],
|
||||||
|
"pattern": ".*\\.fasta$",
|
||||||
|
"validation": {},
|
||||||
|
"notes": "Select one or more FASTA files with protein sequences"
|
||||||
|
},
|
||||||
|
"outdir": {
|
||||||
|
"type": "folder",
|
||||||
|
"description": "Output Directory",
|
||||||
|
"default": "/omic/olamide/output",
|
||||||
|
"required": true,
|
||||||
|
"pipeline_io": "output",
|
||||||
|
"var_name": "params.outdir",
|
||||||
|
"examples": [
|
||||||
|
"/omic/olamide/output"
|
||||||
|
],
|
||||||
|
"pattern": ".*",
|
||||||
|
"validation": {},
|
||||||
|
"notes": "Select where to save your analysis results"
|
||||||
|
},
|
||||||
|
"num_samples": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "Number of protein structure samples",
|
||||||
|
"default": 10,
|
||||||
|
"required": true,
|
||||||
|
"pipeline_io": "parameter",
|
||||||
|
"var_name": "params.num_samples",
|
||||||
|
"examples": [
|
||||||
|
"10"
|
||||||
|
],
|
||||||
|
"pattern": "^\\d+$",
|
||||||
|
"validation": {
|
||||||
|
"min": 1
|
||||||
|
},
|
||||||
|
"notes": "More samples provide better coverage of conformational space"
|
||||||
|
},
|
||||||
|
"batch_size_100": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "Batch size parameter",
|
||||||
|
"default": 10,
|
||||||
|
"required": false,
|
||||||
|
"pipeline_io": "parameter",
|
||||||
|
"var_name": "params.batch_size_100",
|
||||||
|
"hidden": true,
|
||||||
|
"examples": [
|
||||||
|
"10"
|
||||||
|
],
|
||||||
|
"pattern": "^\\d+$"
|
||||||
|
},
|
||||||
|
"temperature": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "Temperature (K) for free energy",
|
||||||
|
"default": 300,
|
||||||
|
"required": false,
|
||||||
|
"pipeline_io": "parameter",
|
||||||
|
"var_name": "params.temperature",
|
||||||
|
"examples": [
|
||||||
|
"300"
|
||||||
|
],
|
||||||
|
"pattern": "^\\d+$",
|
||||||
|
"validation": {
|
||||||
|
"min": 200,
|
||||||
|
"max": 500
|
||||||
|
},
|
||||||
|
"notes": "Temperature in Kelvin for free energy calculations"
|
||||||
|
},
|
||||||
|
"n_clusters": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "Number of conformational clusters",
|
||||||
|
"default": 5,
|
||||||
|
"required": false,
|
||||||
|
"pipeline_io": "parameter",
|
||||||
|
"var_name": "params.n_clusters",
|
||||||
|
"examples": [
|
||||||
|
"5"
|
||||||
|
],
|
||||||
|
"pattern": "^\\d+$",
|
||||||
|
"validation": {
|
||||||
|
"min": 2
|
||||||
|
},
|
||||||
|
"notes": "Number of clusters for free energy analysis"
|
||||||
|
},
|
||||||
|
"cache_dir": {
|
||||||
|
"type": "folder",
|
||||||
|
"description": "Embeddings cache directory",
|
||||||
|
"default": "/tmp/bioemu_cache",
|
||||||
|
"required": false,
|
||||||
|
"pipeline_io": "parameter",
|
||||||
|
"var_name": "params.cache_dir",
|
||||||
|
"hidden": true,
|
||||||
|
"examples": [
|
||||||
|
"/tmp/bioemu_cache"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
121
scripts/calculate_gibbs.py
Executable file
121
scripts/calculate_gibbs.py
Executable file
@@ -0,0 +1,121 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import numpy as np
|
||||||
|
import mdtraj as md
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import os
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser(description='Calculate Gibbs free energy from protein structure ensembles')
|
||||||
|
parser.add_argument('--samples', required=True, help='Path to XTC trajectory file with structure samples')
|
||||||
|
parser.add_argument('--topology', required=True, help='Path to PDB topology file')
|
||||||
|
parser.add_argument('--temperature', type=float, default=300.0, help='Temperature in Kelvin')
|
||||||
|
parser.add_argument('--output', required=True, help='Output CSV file for free energy data')
|
||||||
|
parser.add_argument('--n_clusters', type=int, default=5, help='Number of conformational clusters')
|
||||||
|
parser.add_argument('--plot', help='Path to save energy plot (optional)')
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
def main():
|
||||||
|
args = parse_args()
|
||||||
|
|
||||||
|
# Load trajectory
|
||||||
|
print(f"Loading trajectory {args.samples} with topology {args.topology}")
|
||||||
|
traj = md.load(args.samples, top=args.topology)
|
||||||
|
|
||||||
|
# Calculate RMSD to first frame
|
||||||
|
print("Calculating RMSD to reference structure")
|
||||||
|
# Align to the first frame
|
||||||
|
traj.superpose(traj, 0)
|
||||||
|
# Calculate RMSD for CA atoms only
|
||||||
|
atom_indices = traj.topology.select('name CA')
|
||||||
|
distances = np.zeros(traj.n_frames)
|
||||||
|
for i in range(traj.n_frames):
|
||||||
|
# Fixed line - using slicing instead of frame parameter
|
||||||
|
distances[i] = md.rmsd(traj[i:i+1], traj[0:1], atom_indices=atom_indices)[0]
|
||||||
|
|
||||||
|
# Feature extraction for clustering
|
||||||
|
# Use the RMSD and radius of gyration as features
|
||||||
|
rg = md.compute_rg(traj)
|
||||||
|
features = np.column_stack((distances, rg))
|
||||||
|
|
||||||
|
# Cluster structures
|
||||||
|
print(f"Clustering structures into {args.n_clusters} states")
|
||||||
|
kmeans = KMeans(n_clusters=args.n_clusters, random_state=42)
|
||||||
|
labels = kmeans.fit_predict(features)
|
||||||
|
|
||||||
|
# Calculate state populations
|
||||||
|
unique_labels, counts = np.unique(labels, return_counts=True)
|
||||||
|
populations = counts / len(labels)
|
||||||
|
|
||||||
|
# Calculate free energies
|
||||||
|
R = 0.0019872041 # kcal/(mol·K)
|
||||||
|
T = args.temperature
|
||||||
|
RT = R * T
|
||||||
|
|
||||||
|
# Reference state is the most populated one
|
||||||
|
reference_idx = np.argmax(populations)
|
||||||
|
reference_pop = populations[reference_idx]
|
||||||
|
|
||||||
|
# Calculate ΔG values
|
||||||
|
free_energies = -RT * np.log(populations / reference_pop)
|
||||||
|
|
||||||
|
# Get representative structures from each cluster
|
||||||
|
representatives = []
|
||||||
|
for i in range(args.n_clusters):
|
||||||
|
cluster_frames = np.where(labels == i)[0]
|
||||||
|
if len(cluster_frames) > 0:
|
||||||
|
# Find frame closest to cluster center
|
||||||
|
cluster_features = features[cluster_frames]
|
||||||
|
center_dists = np.linalg.norm(cluster_features - kmeans.cluster_centers_[i], axis=1)
|
||||||
|
center_idx = cluster_frames[np.argmin(center_dists)]
|
||||||
|
representatives.append(int(center_idx))
|
||||||
|
else:
|
||||||
|
representatives.append(-1) # No members in this cluster
|
||||||
|
|
||||||
|
# Create results dataframe
|
||||||
|
results = pd.DataFrame({
|
||||||
|
'Cluster': unique_labels,
|
||||||
|
'Population': populations,
|
||||||
|
'DeltaG_kcal_mol': free_energies,
|
||||||
|
'Representative_Frame': representatives
|
||||||
|
})
|
||||||
|
|
||||||
|
# Sort by free energy
|
||||||
|
results = results.sort_values('DeltaG_kcal_mol')
|
||||||
|
|
||||||
|
# Save results
|
||||||
|
results.to_csv(args.output, index=False)
|
||||||
|
print(f"Results saved to {args.output}")
|
||||||
|
|
||||||
|
# Print summary
|
||||||
|
print("\nFree Energy Summary:")
|
||||||
|
print(results)
|
||||||
|
|
||||||
|
# Calculate overall free energy range
|
||||||
|
print(f"\nFree energy range: {np.max(free_energies) - np.min(free_energies):.2f} kcal/mol")
|
||||||
|
|
||||||
|
# Create plot if requested
|
||||||
|
if args.plot:
|
||||||
|
plt.figure(figsize=(10, 6))
|
||||||
|
|
||||||
|
# Plot free energy for each cluster
|
||||||
|
plt.bar(range(len(unique_labels)), free_energies, alpha=0.7)
|
||||||
|
plt.xlabel('Cluster')
|
||||||
|
plt.ylabel('ΔG (kcal/mol)')
|
||||||
|
plt.title('Free Energy Landscape')
|
||||||
|
plt.xticks(range(len(unique_labels)))
|
||||||
|
plt.grid(axis='y', linestyle='--', alpha=0.7)
|
||||||
|
|
||||||
|
# Add population as text on bars
|
||||||
|
for i, (energy, pop) in enumerate(zip(free_energies, populations)):
|
||||||
|
plt.text(i, energy + 0.1, f"{pop*100:.1f}%", ha='center')
|
||||||
|
|
||||||
|
plt.tight_layout()
|
||||||
|
plt.savefig(args.plot, dpi=300)
|
||||||
|
print(f"Plot saved to {args.plot}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
21
setup.sh
Normal file
21
setup.sh
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Set up directory structure for BioEmu workflow
|
||||||
|
BASE_DIR="/mnt/OmicNAS/private/old/olamide/bioemu"
|
||||||
|
|
||||||
|
# Create necessary directories
|
||||||
|
mkdir -p ${BASE_DIR}/input
|
||||||
|
mkdir -p ${BASE_DIR}/output
|
||||||
|
mkdir -p ${BASE_DIR}/cache
|
||||||
|
mkdir -p ${BASE_DIR}/scripts
|
||||||
|
|
||||||
|
# Copy FASTA files
|
||||||
|
cp villin_headpiece.fasta ${BASE_DIR}/input/
|
||||||
|
cp trp_cage.fasta ${BASE_DIR}/input/
|
||||||
|
|
||||||
|
# Copy scripts
|
||||||
|
cp calculate_gibbs.py ${BASE_DIR}/scripts/
|
||||||
|
chmod +x ${BASE_DIR}/scripts/calculate_gibbs.py
|
||||||
|
|
||||||
|
echo "Directory structure set up at ${BASE_DIR}"
|
||||||
|
echo "FASTA files and scripts copied"
|
||||||
2
trp_cage.fasta
Normal file
2
trp_cage.fasta
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
>Trp_cage
|
||||||
|
NLYIQWLKDGGPSSGRPPPS
|
||||||
2
villin_headpiece.fasta
Normal file
2
villin_headpiece.fasta
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
>Villin_Headpiece
|
||||||
|
LSDEDFKAVFGMTRSAFANLPLWKQQNLKKEKGLF
|
||||||
Reference in New Issue
Block a user