commit 2cfbf64e922130e9aa1a1fc8a40c5d15818c6b35 Author: olamide24 Date: Tue Mar 4 09:38:55 2025 -0800 Add BioEmu Nextflow pipeline implementation diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7db340a --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.nextflow* +work/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..1681e8a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,48 @@ +FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 + +# Set non-interactive installation +ENV DEBIAN_FRONTEND=noninteractive + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + wget \ + git \ + python3 \ + python3-pip \ + build-essential \ + curl \ + python3-tk \ + && rm -rf /var/lib/apt/lists/* + +# Set working directory +WORKDIR /opt/bioemu + +# Install Python dependencies +RUN python3 -m pip install --upgrade pip + +# Install BioEmu from PyPI +RUN pip install bioemu + +# Install dependencies for free energy calculation +RUN pip install mdtraj scikit-learn pandas matplotlib seaborn + +# Create directory for ColabFold and set environment variable +ENV COLABFOLD_DIR=/opt/colabfold + +# Create cache directories with proper permissions +RUN mkdir -p ${COLABFOLD_DIR}/embeds_cache /tmp/colabfold_temp && \ + chmod -R 777 ${COLABFOLD_DIR} /tmp/colabfold_temp + +# Pre-setup ColabFold during build to avoid runtime issues +RUN wget "https://raw.githubusercontent.com/YoshitakaMo/localcolabfold/5fc8775114b637b5672234179c50e694ab057db4/install_colabbatch_linux.sh" -O ${COLABFOLD_DIR}/install_colabbatch_linux.sh && \ + chmod +x ${COLABFOLD_DIR}/install_colabbatch_linux.sh && \ + bash ${COLABFOLD_DIR}/install_colabbatch_linux.sh + +# Create directories for input/output with proper permissions +RUN mkdir -p /data /results && chmod -R 777 /data /results + +RUN mkdir -p /opt/bioemu/scripts/ +COPY calculate_gibbs.py /opt/bioemu/scripts/ +RUN chmod +x /opt/bioemu/scripts/calculate_gibbs.py + +CMD ["/bin/bash"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..a1535b1 --- /dev/null +++ b/README.md @@ -0,0 +1,10 @@ +# BioEmu Nextflow Pipeline + +A Nextflow pipeline implementation for BioEmu to sample protein structures and calculate Gibbs free energy. + +## Features +- Sample structures for various protein sequences +- Calculate Gibbs free energy differences +- Containerized execution with Docker + +## Usage diff --git a/calculate_gibbs.py b/calculate_gibbs.py new file mode 100755 index 0000000..315b901 --- /dev/null +++ b/calculate_gibbs.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 + +import argparse +import numpy as np +import mdtraj as md +from sklearn.cluster import KMeans +import pandas as pd +import matplotlib.pyplot as plt +import os + +def parse_args(): + parser = argparse.ArgumentParser(description='Calculate Gibbs free energy from protein structure ensembles') + parser.add_argument('--samples', required=True, help='Path to XTC trajectory file with structure samples') + parser.add_argument('--topology', required=True, help='Path to PDB topology file') + parser.add_argument('--temperature', type=float, default=300.0, help='Temperature in Kelvin') + parser.add_argument('--output', required=True, help='Output CSV file for free energy data') + parser.add_argument('--n_clusters', type=int, default=5, help='Number of conformational clusters') + parser.add_argument('--plot', help='Path to save energy plot (optional)') + return parser.parse_args() + +def main(): + args = parse_args() + + # Load trajectory + print(f"Loading trajectory {args.samples} with topology {args.topology}") + traj = md.load(args.samples, top=args.topology) + + # Calculate RMSD to first frame + print("Calculating RMSD to reference structure") + # Align to the first frame + traj.superpose(traj, 0) + # Calculate RMSD for CA atoms only + atom_indices = traj.topology.select('name CA') + distances = np.zeros(traj.n_frames) + for i in range(traj.n_frames): + # Fixed line - using slicing instead of frame parameter + distances[i] = md.rmsd(traj[i:i+1], traj[0:1], atom_indices=atom_indices)[0] + + # Feature extraction for clustering + # Use the RMSD and radius of gyration as features + rg = md.compute_rg(traj) + features = np.column_stack((distances, rg)) + + # Cluster structures + print(f"Clustering structures into {args.n_clusters} states") + kmeans = KMeans(n_clusters=args.n_clusters, random_state=42) + labels = kmeans.fit_predict(features) + + # Calculate state populations + unique_labels, counts = np.unique(labels, return_counts=True) + populations = counts / len(labels) + + # Calculate free energies + R = 0.0019872041 # kcal/(mol·K) + T = args.temperature + RT = R * T + + # Reference state is the most populated one + reference_idx = np.argmax(populations) + reference_pop = populations[reference_idx] + + # Calculate ΔG values + free_energies = -RT * np.log(populations / reference_pop) + + # Get representative structures from each cluster + representatives = [] + for i in range(args.n_clusters): + cluster_frames = np.where(labels == i)[0] + if len(cluster_frames) > 0: + # Find frame closest to cluster center + cluster_features = features[cluster_frames] + center_dists = np.linalg.norm(cluster_features - kmeans.cluster_centers_[i], axis=1) + center_idx = cluster_frames[np.argmin(center_dists)] + representatives.append(int(center_idx)) + else: + representatives.append(-1) # No members in this cluster + + # Create results dataframe + results = pd.DataFrame({ + 'Cluster': unique_labels, + 'Population': populations, + 'DeltaG_kcal_mol': free_energies, + 'Representative_Frame': representatives + }) + + # Sort by free energy + results = results.sort_values('DeltaG_kcal_mol') + + # Save results + results.to_csv(args.output, index=False) + print(f"Results saved to {args.output}") + + # Print summary + print("\nFree Energy Summary:") + print(results) + + # Calculate overall free energy range + print(f"\nFree energy range: {np.max(free_energies) - np.min(free_energies):.2f} kcal/mol") + + # Create plot if requested + if args.plot: + plt.figure(figsize=(10, 6)) + + # Plot free energy for each cluster + plt.bar(range(len(unique_labels)), free_energies, alpha=0.7) + plt.xlabel('Cluster') + plt.ylabel('ΔG (kcal/mol)') + plt.title('Free Energy Landscape') + plt.xticks(range(len(unique_labels))) + plt.grid(axis='y', linestyle='--', alpha=0.7) + + # Add population as text on bars + for i, (energy, pop) in enumerate(zip(free_energies, populations)): + plt.text(i, energy + 0.1, f"{pop*100:.1f}%", ha='center') + + plt.tight_layout() + plt.savefig(args.plot, dpi=300) + print(f"Plot saved to {args.plot}") + +if __name__ == "__main__": + main() diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100644 index 0000000..544863e --- /dev/null +++ b/entrypoint.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# Execute the command passed to docker +exec "$@" diff --git a/main.nf b/main.nf new file mode 100644 index 0000000..24865ff --- /dev/null +++ b/main.nf @@ -0,0 +1,89 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl=2 + +// Multiple FASTA files to process +params.fasta_list = [ + "/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta", + "/mnt/OmicNAS/private/old/olamide/bioemu/input/trp_cage.fasta" +] +params.outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output" +params.cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache" +params.scripts_dir = "${baseDir}/scripts" +params.num_samples = 10 +params.batch_size_100 = 10 +params.temperature = 300 +params.n_clusters = 5 + +process BIOEMU { + container 'bioemu:latest' + containerOptions '--rm --gpus all -v /mnt:/mnt -v /tmp:/tmp' + publishDir "${params.outdir}/${protein_id}", mode: 'copy' + + input: + tuple val(protein_id), path(fasta) + + output: + tuple val(protein_id), path("topology.pdb"), path("samples.xtc"), emit: structures + path "sequence.fasta", optional: true + path "batch_*.npz", optional: true + path "run.log" + + script: + """ + # Make sure cache directory exists + mkdir -p ${params.cache_dir} + + # Extract the sequence from the FASTA file + SEQUENCE=\$(grep -v ">" ${fasta} | tr -d '\\n') + + # Run BioEmu with the extracted sequence + python3 -m bioemu.sample \ + --sequence "\${SEQUENCE}" \ + --num_samples ${params.num_samples} \ + --batch_size_100 ${params.batch_size_100} \ + --output_dir . \ + --cache_embeds_dir ${params.cache_dir} 2>&1 | tee run.log + """ +} + +process CALCULATE_FREE_ENERGY { + container 'bioemu:latest' + containerOptions '--rm --gpus all -v /mnt:/mnt' + publishDir "${params.outdir}/${protein_id}/analysis", mode: 'copy' + + input: + tuple val(protein_id), path(topology), path(samples) + + output: + tuple val(protein_id), path("free_energy.csv"), emit: free_energy + path "energy_plot.png", optional: true + + script: + """ + # Calculate free energy from sampled structures + python3 /opt/bioemu/scripts/calculate_gibbs.py \\ + --samples ${samples} \\ + --topology ${topology} \\ + --temperature ${params.temperature} \\ + --n_clusters ${params.n_clusters} \\ + --output free_energy.csv \\ + --plot energy_plot.png + """ +} + +workflow { + // Convert fasta_list to a channel of [protein_id, fasta_file] tuples + Channel.fromList(params.fasta_list) + .map { fasta_path -> + def file = file(fasta_path) + return [file.baseName, file] + } + .set { fasta_ch } + + // Run BioEmu for each protein sequence + BIOEMU(fasta_ch) + + // Calculate Gibbs free energy for each protein + CALCULATE_FREE_ENERGY(BIOEMU.out.structures) +} diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..7c45752 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,38 @@ +// Manifest for Nextflow metadata +manifest { + name = 'BioEmu-Nextflow' + author = 'Generated from BioEmu repository' + homePage = 'https://github.com/microsoft/bioemu' + description = 'Nextflow pipeline for BioEmu - Biomolecular Emulator for protein structure sampling' + mainScript = 'main.nf' + version = '1.0.0' +} + +// Global default parameters +params { + fasta = "/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta" + outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output" + cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache" + num_samples = 10 + batch_size_100 = 10 +} + +// Container configurations +docker { + enabled = true + runOptions = '--gpus all' +} + +// Process configurations +process { + cpus = 1 + memory = '8 GB' +} + +// Execution configurations +executor { + $local { + cpus = 4 + memory = '8 GB' + } +} diff --git a/params.json b/params.json new file mode 100644 index 0000000..65877f2 --- /dev/null +++ b/params.json @@ -0,0 +1,106 @@ +{ + "params": { + "fasta_list": { + "type": "file[]", + "description": "FASTA files containing protein sequences", + "default": [], + "required": true, + "pipeline_io": "input", + "var_name": "params.fasta_list", + "examples": [ + ["/omic/olamide/examples/prot1.fasta", "/omic/olamide/examples/prot2.fasta"] + ], + "pattern": ".*\\.fasta$", + "validation": {}, + "notes": "Select one or more FASTA files with protein sequences" + }, + "outdir": { + "type": "folder", + "description": "Output Directory", + "default": "/omic/olamide/output", + "required": true, + "pipeline_io": "output", + "var_name": "params.outdir", + "examples": [ + "/omic/olamide/output" + ], + "pattern": ".*", + "validation": {}, + "notes": "Select where to save your analysis results" + }, + "num_samples": { + "type": "integer", + "description": "Number of protein structure samples", + "default": 10, + "required": true, + "pipeline_io": "parameter", + "var_name": "params.num_samples", + "examples": [ + "10" + ], + "pattern": "^\\d+$", + "validation": { + "min": 1 + }, + "notes": "More samples provide better coverage of conformational space" + }, + "batch_size_100": { + "type": "integer", + "description": "Batch size parameter", + "default": 10, + "required": false, + "pipeline_io": "parameter", + "var_name": "params.batch_size_100", + "hidden": true, + "examples": [ + "10" + ], + "pattern": "^\\d+$" + }, + "temperature": { + "type": "integer", + "description": "Temperature (K) for free energy", + "default": 300, + "required": false, + "pipeline_io": "parameter", + "var_name": "params.temperature", + "examples": [ + "300" + ], + "pattern": "^\\d+$", + "validation": { + "min": 200, + "max": 500 + }, + "notes": "Temperature in Kelvin for free energy calculations" + }, + "n_clusters": { + "type": "integer", + "description": "Number of conformational clusters", + "default": 5, + "required": false, + "pipeline_io": "parameter", + "var_name": "params.n_clusters", + "examples": [ + "5" + ], + "pattern": "^\\d+$", + "validation": { + "min": 2 + }, + "notes": "Number of clusters for free energy analysis" + }, + "cache_dir": { + "type": "folder", + "description": "Embeddings cache directory", + "default": "/tmp/bioemu_cache", + "required": false, + "pipeline_io": "parameter", + "var_name": "params.cache_dir", + "hidden": true, + "examples": [ + "/tmp/bioemu_cache" + ] + } + } +} diff --git a/scripts/calculate_gibbs.py b/scripts/calculate_gibbs.py new file mode 100755 index 0000000..315b901 --- /dev/null +++ b/scripts/calculate_gibbs.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 + +import argparse +import numpy as np +import mdtraj as md +from sklearn.cluster import KMeans +import pandas as pd +import matplotlib.pyplot as plt +import os + +def parse_args(): + parser = argparse.ArgumentParser(description='Calculate Gibbs free energy from protein structure ensembles') + parser.add_argument('--samples', required=True, help='Path to XTC trajectory file with structure samples') + parser.add_argument('--topology', required=True, help='Path to PDB topology file') + parser.add_argument('--temperature', type=float, default=300.0, help='Temperature in Kelvin') + parser.add_argument('--output', required=True, help='Output CSV file for free energy data') + parser.add_argument('--n_clusters', type=int, default=5, help='Number of conformational clusters') + parser.add_argument('--plot', help='Path to save energy plot (optional)') + return parser.parse_args() + +def main(): + args = parse_args() + + # Load trajectory + print(f"Loading trajectory {args.samples} with topology {args.topology}") + traj = md.load(args.samples, top=args.topology) + + # Calculate RMSD to first frame + print("Calculating RMSD to reference structure") + # Align to the first frame + traj.superpose(traj, 0) + # Calculate RMSD for CA atoms only + atom_indices = traj.topology.select('name CA') + distances = np.zeros(traj.n_frames) + for i in range(traj.n_frames): + # Fixed line - using slicing instead of frame parameter + distances[i] = md.rmsd(traj[i:i+1], traj[0:1], atom_indices=atom_indices)[0] + + # Feature extraction for clustering + # Use the RMSD and radius of gyration as features + rg = md.compute_rg(traj) + features = np.column_stack((distances, rg)) + + # Cluster structures + print(f"Clustering structures into {args.n_clusters} states") + kmeans = KMeans(n_clusters=args.n_clusters, random_state=42) + labels = kmeans.fit_predict(features) + + # Calculate state populations + unique_labels, counts = np.unique(labels, return_counts=True) + populations = counts / len(labels) + + # Calculate free energies + R = 0.0019872041 # kcal/(mol·K) + T = args.temperature + RT = R * T + + # Reference state is the most populated one + reference_idx = np.argmax(populations) + reference_pop = populations[reference_idx] + + # Calculate ΔG values + free_energies = -RT * np.log(populations / reference_pop) + + # Get representative structures from each cluster + representatives = [] + for i in range(args.n_clusters): + cluster_frames = np.where(labels == i)[0] + if len(cluster_frames) > 0: + # Find frame closest to cluster center + cluster_features = features[cluster_frames] + center_dists = np.linalg.norm(cluster_features - kmeans.cluster_centers_[i], axis=1) + center_idx = cluster_frames[np.argmin(center_dists)] + representatives.append(int(center_idx)) + else: + representatives.append(-1) # No members in this cluster + + # Create results dataframe + results = pd.DataFrame({ + 'Cluster': unique_labels, + 'Population': populations, + 'DeltaG_kcal_mol': free_energies, + 'Representative_Frame': representatives + }) + + # Sort by free energy + results = results.sort_values('DeltaG_kcal_mol') + + # Save results + results.to_csv(args.output, index=False) + print(f"Results saved to {args.output}") + + # Print summary + print("\nFree Energy Summary:") + print(results) + + # Calculate overall free energy range + print(f"\nFree energy range: {np.max(free_energies) - np.min(free_energies):.2f} kcal/mol") + + # Create plot if requested + if args.plot: + plt.figure(figsize=(10, 6)) + + # Plot free energy for each cluster + plt.bar(range(len(unique_labels)), free_energies, alpha=0.7) + plt.xlabel('Cluster') + plt.ylabel('ΔG (kcal/mol)') + plt.title('Free Energy Landscape') + plt.xticks(range(len(unique_labels))) + plt.grid(axis='y', linestyle='--', alpha=0.7) + + # Add population as text on bars + for i, (energy, pop) in enumerate(zip(free_energies, populations)): + plt.text(i, energy + 0.1, f"{pop*100:.1f}%", ha='center') + + plt.tight_layout() + plt.savefig(args.plot, dpi=300) + print(f"Plot saved to {args.plot}") + +if __name__ == "__main__": + main() diff --git a/setup.sh b/setup.sh new file mode 100644 index 0000000..b24ccbf --- /dev/null +++ b/setup.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Set up directory structure for BioEmu workflow +BASE_DIR="/mnt/OmicNAS/private/old/olamide/bioemu" + +# Create necessary directories +mkdir -p ${BASE_DIR}/input +mkdir -p ${BASE_DIR}/output +mkdir -p ${BASE_DIR}/cache +mkdir -p ${BASE_DIR}/scripts + +# Copy FASTA files +cp villin_headpiece.fasta ${BASE_DIR}/input/ +cp trp_cage.fasta ${BASE_DIR}/input/ + +# Copy scripts +cp calculate_gibbs.py ${BASE_DIR}/scripts/ +chmod +x ${BASE_DIR}/scripts/calculate_gibbs.py + +echo "Directory structure set up at ${BASE_DIR}" +echo "FASTA files and scripts copied" diff --git a/trp_cage.fasta b/trp_cage.fasta new file mode 100644 index 0000000..cee001d --- /dev/null +++ b/trp_cage.fasta @@ -0,0 +1,2 @@ +>Trp_cage +NLYIQWLKDGGPSSGRPPPS diff --git a/villin_headpiece.fasta b/villin_headpiece.fasta new file mode 100644 index 0000000..cf2099b --- /dev/null +++ b/villin_headpiece.fasta @@ -0,0 +1,2 @@ +>Villin_Headpiece +LSDEDFKAVFGMTRSAFANLPLWKQQNLKKEKGLF