Add BioEmu Nextflow pipeline implementation

This commit is contained in:
2025-03-04 09:38:55 -08:00
commit 2cfbf64e92
12 changed files with 565 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
.nextflow*
work/

48
Dockerfile Normal file
View File

@@ -0,0 +1,48 @@
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04
# Set non-interactive installation
ENV DEBIAN_FRONTEND=noninteractive
# Install system dependencies
RUN apt-get update && apt-get install -y \
wget \
git \
python3 \
python3-pip \
build-essential \
curl \
python3-tk \
&& rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /opt/bioemu
# Install Python dependencies
RUN python3 -m pip install --upgrade pip
# Install BioEmu from PyPI
RUN pip install bioemu
# Install dependencies for free energy calculation
RUN pip install mdtraj scikit-learn pandas matplotlib seaborn
# Create directory for ColabFold and set environment variable
ENV COLABFOLD_DIR=/opt/colabfold
# Create cache directories with proper permissions
RUN mkdir -p ${COLABFOLD_DIR}/embeds_cache /tmp/colabfold_temp && \
chmod -R 777 ${COLABFOLD_DIR} /tmp/colabfold_temp
# Pre-setup ColabFold during build to avoid runtime issues
RUN wget "https://raw.githubusercontent.com/YoshitakaMo/localcolabfold/5fc8775114b637b5672234179c50e694ab057db4/install_colabbatch_linux.sh" -O ${COLABFOLD_DIR}/install_colabbatch_linux.sh && \
chmod +x ${COLABFOLD_DIR}/install_colabbatch_linux.sh && \
bash ${COLABFOLD_DIR}/install_colabbatch_linux.sh
# Create directories for input/output with proper permissions
RUN mkdir -p /data /results && chmod -R 777 /data /results
RUN mkdir -p /opt/bioemu/scripts/
COPY calculate_gibbs.py /opt/bioemu/scripts/
RUN chmod +x /opt/bioemu/scripts/calculate_gibbs.py
CMD ["/bin/bash"]

10
README.md Normal file
View File

@@ -0,0 +1,10 @@
# BioEmu Nextflow Pipeline
A Nextflow pipeline implementation for BioEmu to sample protein structures and calculate Gibbs free energy.
## Features
- Sample structures for various protein sequences
- Calculate Gibbs free energy differences
- Containerized execution with Docker
## Usage

121
calculate_gibbs.py Executable file
View File

@@ -0,0 +1,121 @@
#!/usr/bin/env python3
import argparse
import numpy as np
import mdtraj as md
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt
import os
def parse_args():
parser = argparse.ArgumentParser(description='Calculate Gibbs free energy from protein structure ensembles')
parser.add_argument('--samples', required=True, help='Path to XTC trajectory file with structure samples')
parser.add_argument('--topology', required=True, help='Path to PDB topology file')
parser.add_argument('--temperature', type=float, default=300.0, help='Temperature in Kelvin')
parser.add_argument('--output', required=True, help='Output CSV file for free energy data')
parser.add_argument('--n_clusters', type=int, default=5, help='Number of conformational clusters')
parser.add_argument('--plot', help='Path to save energy plot (optional)')
return parser.parse_args()
def main():
args = parse_args()
# Load trajectory
print(f"Loading trajectory {args.samples} with topology {args.topology}")
traj = md.load(args.samples, top=args.topology)
# Calculate RMSD to first frame
print("Calculating RMSD to reference structure")
# Align to the first frame
traj.superpose(traj, 0)
# Calculate RMSD for CA atoms only
atom_indices = traj.topology.select('name CA')
distances = np.zeros(traj.n_frames)
for i in range(traj.n_frames):
# Fixed line - using slicing instead of frame parameter
distances[i] = md.rmsd(traj[i:i+1], traj[0:1], atom_indices=atom_indices)[0]
# Feature extraction for clustering
# Use the RMSD and radius of gyration as features
rg = md.compute_rg(traj)
features = np.column_stack((distances, rg))
# Cluster structures
print(f"Clustering structures into {args.n_clusters} states")
kmeans = KMeans(n_clusters=args.n_clusters, random_state=42)
labels = kmeans.fit_predict(features)
# Calculate state populations
unique_labels, counts = np.unique(labels, return_counts=True)
populations = counts / len(labels)
# Calculate free energies
R = 0.0019872041 # kcal/(mol·K)
T = args.temperature
RT = R * T
# Reference state is the most populated one
reference_idx = np.argmax(populations)
reference_pop = populations[reference_idx]
# Calculate ΔG values
free_energies = -RT * np.log(populations / reference_pop)
# Get representative structures from each cluster
representatives = []
for i in range(args.n_clusters):
cluster_frames = np.where(labels == i)[0]
if len(cluster_frames) > 0:
# Find frame closest to cluster center
cluster_features = features[cluster_frames]
center_dists = np.linalg.norm(cluster_features - kmeans.cluster_centers_[i], axis=1)
center_idx = cluster_frames[np.argmin(center_dists)]
representatives.append(int(center_idx))
else:
representatives.append(-1) # No members in this cluster
# Create results dataframe
results = pd.DataFrame({
'Cluster': unique_labels,
'Population': populations,
'DeltaG_kcal_mol': free_energies,
'Representative_Frame': representatives
})
# Sort by free energy
results = results.sort_values('DeltaG_kcal_mol')
# Save results
results.to_csv(args.output, index=False)
print(f"Results saved to {args.output}")
# Print summary
print("\nFree Energy Summary:")
print(results)
# Calculate overall free energy range
print(f"\nFree energy range: {np.max(free_energies) - np.min(free_energies):.2f} kcal/mol")
# Create plot if requested
if args.plot:
plt.figure(figsize=(10, 6))
# Plot free energy for each cluster
plt.bar(range(len(unique_labels)), free_energies, alpha=0.7)
plt.xlabel('Cluster')
plt.ylabel('ΔG (kcal/mol)')
plt.title('Free Energy Landscape')
plt.xticks(range(len(unique_labels)))
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Add population as text on bars
for i, (energy, pop) in enumerate(zip(free_energies, populations)):
plt.text(i, energy + 0.1, f"{pop*100:.1f}%", ha='center')
plt.tight_layout()
plt.savefig(args.plot, dpi=300)
print(f"Plot saved to {args.plot}")
if __name__ == "__main__":
main()

5
entrypoint.sh Normal file
View File

@@ -0,0 +1,5 @@
#!/bin/bash
set -e
# Execute the command passed to docker
exec "$@"

89
main.nf Normal file
View File

@@ -0,0 +1,89 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
// Multiple FASTA files to process
params.fasta_list = [
"/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta",
"/mnt/OmicNAS/private/old/olamide/bioemu/input/trp_cage.fasta"
]
params.outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"
params.cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"
params.scripts_dir = "${baseDir}/scripts"
params.num_samples = 10
params.batch_size_100 = 10
params.temperature = 300
params.n_clusters = 5
process BIOEMU {
container 'bioemu:latest'
containerOptions '--rm --gpus all -v /mnt:/mnt -v /tmp:/tmp'
publishDir "${params.outdir}/${protein_id}", mode: 'copy'
input:
tuple val(protein_id), path(fasta)
output:
tuple val(protein_id), path("topology.pdb"), path("samples.xtc"), emit: structures
path "sequence.fasta", optional: true
path "batch_*.npz", optional: true
path "run.log"
script:
"""
# Make sure cache directory exists
mkdir -p ${params.cache_dir}
# Extract the sequence from the FASTA file
SEQUENCE=\$(grep -v ">" ${fasta} | tr -d '\\n')
# Run BioEmu with the extracted sequence
python3 -m bioemu.sample \
--sequence "\${SEQUENCE}" \
--num_samples ${params.num_samples} \
--batch_size_100 ${params.batch_size_100} \
--output_dir . \
--cache_embeds_dir ${params.cache_dir} 2>&1 | tee run.log
"""
}
process CALCULATE_FREE_ENERGY {
container 'bioemu:latest'
containerOptions '--rm --gpus all -v /mnt:/mnt'
publishDir "${params.outdir}/${protein_id}/analysis", mode: 'copy'
input:
tuple val(protein_id), path(topology), path(samples)
output:
tuple val(protein_id), path("free_energy.csv"), emit: free_energy
path "energy_plot.png", optional: true
script:
"""
# Calculate free energy from sampled structures
python3 /opt/bioemu/scripts/calculate_gibbs.py \\
--samples ${samples} \\
--topology ${topology} \\
--temperature ${params.temperature} \\
--n_clusters ${params.n_clusters} \\
--output free_energy.csv \\
--plot energy_plot.png
"""
}
workflow {
// Convert fasta_list to a channel of [protein_id, fasta_file] tuples
Channel.fromList(params.fasta_list)
.map { fasta_path ->
def file = file(fasta_path)
return [file.baseName, file]
}
.set { fasta_ch }
// Run BioEmu for each protein sequence
BIOEMU(fasta_ch)
// Calculate Gibbs free energy for each protein
CALCULATE_FREE_ENERGY(BIOEMU.out.structures)
}

38
nextflow.config Normal file
View File

@@ -0,0 +1,38 @@
// Manifest for Nextflow metadata
manifest {
name = 'BioEmu-Nextflow'
author = 'Generated from BioEmu repository'
homePage = 'https://github.com/microsoft/bioemu'
description = 'Nextflow pipeline for BioEmu - Biomolecular Emulator for protein structure sampling'
mainScript = 'main.nf'
version = '1.0.0'
}
// Global default parameters
params {
fasta = "/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta"
outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"
cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"
num_samples = 10
batch_size_100 = 10
}
// Container configurations
docker {
enabled = true
runOptions = '--gpus all'
}
// Process configurations
process {
cpus = 1
memory = '8 GB'
}
// Execution configurations
executor {
$local {
cpus = 4
memory = '8 GB'
}
}

106
params.json Normal file
View File

@@ -0,0 +1,106 @@
{
"params": {
"fasta_list": {
"type": "file[]",
"description": "FASTA files containing protein sequences",
"default": [],
"required": true,
"pipeline_io": "input",
"var_name": "params.fasta_list",
"examples": [
["/omic/olamide/examples/prot1.fasta", "/omic/olamide/examples/prot2.fasta"]
],
"pattern": ".*\\.fasta$",
"validation": {},
"notes": "Select one or more FASTA files with protein sequences"
},
"outdir": {
"type": "folder",
"description": "Output Directory",
"default": "/omic/olamide/output",
"required": true,
"pipeline_io": "output",
"var_name": "params.outdir",
"examples": [
"/omic/olamide/output"
],
"pattern": ".*",
"validation": {},
"notes": "Select where to save your analysis results"
},
"num_samples": {
"type": "integer",
"description": "Number of protein structure samples",
"default": 10,
"required": true,
"pipeline_io": "parameter",
"var_name": "params.num_samples",
"examples": [
"10"
],
"pattern": "^\\d+$",
"validation": {
"min": 1
},
"notes": "More samples provide better coverage of conformational space"
},
"batch_size_100": {
"type": "integer",
"description": "Batch size parameter",
"default": 10,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.batch_size_100",
"hidden": true,
"examples": [
"10"
],
"pattern": "^\\d+$"
},
"temperature": {
"type": "integer",
"description": "Temperature (K) for free energy",
"default": 300,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.temperature",
"examples": [
"300"
],
"pattern": "^\\d+$",
"validation": {
"min": 200,
"max": 500
},
"notes": "Temperature in Kelvin for free energy calculations"
},
"n_clusters": {
"type": "integer",
"description": "Number of conformational clusters",
"default": 5,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.n_clusters",
"examples": [
"5"
],
"pattern": "^\\d+$",
"validation": {
"min": 2
},
"notes": "Number of clusters for free energy analysis"
},
"cache_dir": {
"type": "folder",
"description": "Embeddings cache directory",
"default": "/tmp/bioemu_cache",
"required": false,
"pipeline_io": "parameter",
"var_name": "params.cache_dir",
"hidden": true,
"examples": [
"/tmp/bioemu_cache"
]
}
}
}

121
scripts/calculate_gibbs.py Executable file
View File

@@ -0,0 +1,121 @@
#!/usr/bin/env python3
import argparse
import numpy as np
import mdtraj as md
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt
import os
def parse_args():
parser = argparse.ArgumentParser(description='Calculate Gibbs free energy from protein structure ensembles')
parser.add_argument('--samples', required=True, help='Path to XTC trajectory file with structure samples')
parser.add_argument('--topology', required=True, help='Path to PDB topology file')
parser.add_argument('--temperature', type=float, default=300.0, help='Temperature in Kelvin')
parser.add_argument('--output', required=True, help='Output CSV file for free energy data')
parser.add_argument('--n_clusters', type=int, default=5, help='Number of conformational clusters')
parser.add_argument('--plot', help='Path to save energy plot (optional)')
return parser.parse_args()
def main():
args = parse_args()
# Load trajectory
print(f"Loading trajectory {args.samples} with topology {args.topology}")
traj = md.load(args.samples, top=args.topology)
# Calculate RMSD to first frame
print("Calculating RMSD to reference structure")
# Align to the first frame
traj.superpose(traj, 0)
# Calculate RMSD for CA atoms only
atom_indices = traj.topology.select('name CA')
distances = np.zeros(traj.n_frames)
for i in range(traj.n_frames):
# Fixed line - using slicing instead of frame parameter
distances[i] = md.rmsd(traj[i:i+1], traj[0:1], atom_indices=atom_indices)[0]
# Feature extraction for clustering
# Use the RMSD and radius of gyration as features
rg = md.compute_rg(traj)
features = np.column_stack((distances, rg))
# Cluster structures
print(f"Clustering structures into {args.n_clusters} states")
kmeans = KMeans(n_clusters=args.n_clusters, random_state=42)
labels = kmeans.fit_predict(features)
# Calculate state populations
unique_labels, counts = np.unique(labels, return_counts=True)
populations = counts / len(labels)
# Calculate free energies
R = 0.0019872041 # kcal/(mol·K)
T = args.temperature
RT = R * T
# Reference state is the most populated one
reference_idx = np.argmax(populations)
reference_pop = populations[reference_idx]
# Calculate ΔG values
free_energies = -RT * np.log(populations / reference_pop)
# Get representative structures from each cluster
representatives = []
for i in range(args.n_clusters):
cluster_frames = np.where(labels == i)[0]
if len(cluster_frames) > 0:
# Find frame closest to cluster center
cluster_features = features[cluster_frames]
center_dists = np.linalg.norm(cluster_features - kmeans.cluster_centers_[i], axis=1)
center_idx = cluster_frames[np.argmin(center_dists)]
representatives.append(int(center_idx))
else:
representatives.append(-1) # No members in this cluster
# Create results dataframe
results = pd.DataFrame({
'Cluster': unique_labels,
'Population': populations,
'DeltaG_kcal_mol': free_energies,
'Representative_Frame': representatives
})
# Sort by free energy
results = results.sort_values('DeltaG_kcal_mol')
# Save results
results.to_csv(args.output, index=False)
print(f"Results saved to {args.output}")
# Print summary
print("\nFree Energy Summary:")
print(results)
# Calculate overall free energy range
print(f"\nFree energy range: {np.max(free_energies) - np.min(free_energies):.2f} kcal/mol")
# Create plot if requested
if args.plot:
plt.figure(figsize=(10, 6))
# Plot free energy for each cluster
plt.bar(range(len(unique_labels)), free_energies, alpha=0.7)
plt.xlabel('Cluster')
plt.ylabel('ΔG (kcal/mol)')
plt.title('Free Energy Landscape')
plt.xticks(range(len(unique_labels)))
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Add population as text on bars
for i, (energy, pop) in enumerate(zip(free_energies, populations)):
plt.text(i, energy + 0.1, f"{pop*100:.1f}%", ha='center')
plt.tight_layout()
plt.savefig(args.plot, dpi=300)
print(f"Plot saved to {args.plot}")
if __name__ == "__main__":
main()

21
setup.sh Normal file
View File

@@ -0,0 +1,21 @@
#!/bin/bash
# Set up directory structure for BioEmu workflow
BASE_DIR="/mnt/OmicNAS/private/old/olamide/bioemu"
# Create necessary directories
mkdir -p ${BASE_DIR}/input
mkdir -p ${BASE_DIR}/output
mkdir -p ${BASE_DIR}/cache
mkdir -p ${BASE_DIR}/scripts
# Copy FASTA files
cp villin_headpiece.fasta ${BASE_DIR}/input/
cp trp_cage.fasta ${BASE_DIR}/input/
# Copy scripts
cp calculate_gibbs.py ${BASE_DIR}/scripts/
chmod +x ${BASE_DIR}/scripts/calculate_gibbs.py
echo "Directory structure set up at ${BASE_DIR}"
echo "FASTA files and scripts copied"

2
trp_cage.fasta Normal file
View File

@@ -0,0 +1,2 @@
>Trp_cage
NLYIQWLKDGGPSSGRPPPS

2
villin_headpiece.fasta Normal file
View File

@@ -0,0 +1,2 @@
>Villin_Headpiece
LSDEDFKAVFGMTRSAFANLPLWKQQNLKKEKGLF