Add BioEmu Nextflow pipeline implementation

2025-03-04 09:38:55 -08:00
commit 2cfbf64e92
12 changed files with 565 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.nextflow*
+work/
--- a/48
+++ b/48
@@ -0,0 +1,48 @@
+FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04
+
+# Set non-interactive installation
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    wget \
+    git \
+    python3 \
+    python3-pip \
+    build-essential \
+    curl \
+    python3-tk \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /opt/bioemu
+
+# Install Python dependencies
+RUN python3 -m pip install --upgrade pip
+
+# Install BioEmu from PyPI
+RUN pip install bioemu
+
+# Install dependencies for free energy calculation
+RUN pip install mdtraj scikit-learn pandas matplotlib seaborn
+
+# Create directory for ColabFold and set environment variable
+ENV COLABFOLD_DIR=/opt/colabfold
+
+# Create cache directories with proper permissions
+RUN mkdir -p ${COLABFOLD_DIR}/embeds_cache /tmp/colabfold_temp && \
+    chmod -R 777 ${COLABFOLD_DIR} /tmp/colabfold_temp
+
+# Pre-setup ColabFold during build to avoid runtime issues
+RUN wget "https://raw.githubusercontent.com/YoshitakaMo/localcolabfold/5fc8775114b637b5672234179c50e694ab057db4/install_colabbatch_linux.sh" -O ${COLABFOLD_DIR}/install_colabbatch_linux.sh && \
+    chmod +x ${COLABFOLD_DIR}/install_colabbatch_linux.sh && \
+    bash ${COLABFOLD_DIR}/install_colabbatch_linux.sh
+
+# Create directories for input/output with proper permissions
+RUN mkdir -p /data /results && chmod -R 777 /data /results
+
+RUN mkdir -p /opt/bioemu/scripts/
+COPY calculate_gibbs.py /opt/bioemu/scripts/
+RUN chmod +x /opt/bioemu/scripts/calculate_gibbs.py
+
+CMD ["/bin/bash"]
--- a/README.md
+++ b/README.md
@@ -0,0 +1,10 @@
+# BioEmu Nextflow Pipeline
+
+A Nextflow pipeline implementation for BioEmu to sample protein structures and calculate Gibbs free energy.
+
+## Features
+- Sample structures for various protein sequences
+- Calculate Gibbs free energy differences
+- Containerized execution with Docker
+
+## Usage
--- a/calculate_gibbs.py
+++ b/calculate_gibbs.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+
+import argparse
+import numpy as np
+import mdtraj as md
+from sklearn.cluster import KMeans
+import pandas as pd
+import matplotlib.pyplot as plt
+import os
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Calculate Gibbs free energy from protein structure ensembles')
+    parser.add_argument('--samples', required=True, help='Path to XTC trajectory file with structure samples')
+    parser.add_argument('--topology', required=True, help='Path to PDB topology file')
+    parser.add_argument('--temperature', type=float, default=300.0, help='Temperature in Kelvin')
+    parser.add_argument('--output', required=True, help='Output CSV file for free energy data')
+    parser.add_argument('--n_clusters', type=int, default=5, help='Number of conformational clusters')
+    parser.add_argument('--plot', help='Path to save energy plot (optional)')
+    return parser.parse_args()
+
+def main():
+    args = parse_args()
+    
+    # Load trajectory
+    print(f"Loading trajectory {args.samples} with topology {args.topology}")
+    traj = md.load(args.samples, top=args.topology)
+    
+    # Calculate RMSD to first frame
+    print("Calculating RMSD to reference structure")
+    # Align to the first frame
+    traj.superpose(traj, 0)
+    # Calculate RMSD for CA atoms only
+    atom_indices = traj.topology.select('name CA')
+    distances = np.zeros(traj.n_frames)
+    for i in range(traj.n_frames):
+        # Fixed line - using slicing instead of frame parameter
+        distances[i] = md.rmsd(traj[i:i+1], traj[0:1], atom_indices=atom_indices)[0]
+    
+    # Feature extraction for clustering
+    # Use the RMSD and radius of gyration as features
+    rg = md.compute_rg(traj)
+    features = np.column_stack((distances, rg))
+    
+    # Cluster structures
+    print(f"Clustering structures into {args.n_clusters} states")
+    kmeans = KMeans(n_clusters=args.n_clusters, random_state=42)
+    labels = kmeans.fit_predict(features)
+    
+    # Calculate state populations
+    unique_labels, counts = np.unique(labels, return_counts=True)
+    populations = counts / len(labels)
+    
+    # Calculate free energies
+    R = 0.0019872041 # kcal/(mol·K)
+    T = args.temperature
+    RT = R * T
+    
+    # Reference state is the most populated one
+    reference_idx = np.argmax(populations)
+    reference_pop = populations[reference_idx]
+    
+    # Calculate ΔG values 
+    free_energies = -RT * np.log(populations / reference_pop)
+    
+    # Get representative structures from each cluster
+    representatives = []
+    for i in range(args.n_clusters):
+        cluster_frames = np.where(labels == i)[0]
+        if len(cluster_frames) > 0:
+            # Find frame closest to cluster center
+            cluster_features = features[cluster_frames]
+            center_dists = np.linalg.norm(cluster_features - kmeans.cluster_centers_[i], axis=1)
+            center_idx = cluster_frames[np.argmin(center_dists)]
+            representatives.append(int(center_idx))
+        else:
+            representatives.append(-1)  # No members in this cluster
+    
+    # Create results dataframe
+    results = pd.DataFrame({
+        'Cluster': unique_labels,
+        'Population': populations,
+        'DeltaG_kcal_mol': free_energies,
+        'Representative_Frame': representatives
+    })
+    
+    # Sort by free energy
+    results = results.sort_values('DeltaG_kcal_mol')
+    
+    # Save results
+    results.to_csv(args.output, index=False)
+    print(f"Results saved to {args.output}")
+    
+    # Print summary
+    print("\nFree Energy Summary:")
+    print(results)
+    
+    # Calculate overall free energy range
+    print(f"\nFree energy range: {np.max(free_energies) - np.min(free_energies):.2f} kcal/mol")
+    
+    # Create plot if requested
+    if args.plot:
+        plt.figure(figsize=(10, 6))
+        
+        # Plot free energy for each cluster
+        plt.bar(range(len(unique_labels)), free_energies, alpha=0.7)
+        plt.xlabel('Cluster')
+        plt.ylabel('ΔG (kcal/mol)')
+        plt.title('Free Energy Landscape')
+        plt.xticks(range(len(unique_labels)))
+        plt.grid(axis='y', linestyle='--', alpha=0.7)
+        
+        # Add population as text on bars
+        for i, (energy, pop) in enumerate(zip(free_energies, populations)):
+            plt.text(i, energy + 0.1, f"{pop*100:.1f}%", ha='center')
+        
+        plt.tight_layout()
+        plt.savefig(args.plot, dpi=300)
+        print(f"Plot saved to {args.plot}")
+
+if __name__ == "__main__":
+    main()
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+set -e
+
+# Execute the command passed to docker
+exec "$@"
--- a/main.nf
+++ b/main.nf
@@ -0,0 +1,89 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl=2
+
+// Multiple FASTA files to process
+params.fasta_list = [
+    "/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta",
+    "/mnt/OmicNAS/private/old/olamide/bioemu/input/trp_cage.fasta"
+]
+params.outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"
+params.cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"
+params.scripts_dir = "${baseDir}/scripts"
+params.num_samples = 10
+params.batch_size_100 = 10
+params.temperature = 300
+params.n_clusters = 5
+
+process BIOEMU {
+    container 'bioemu:latest'
+    containerOptions '--rm --gpus all -v /mnt:/mnt -v /tmp:/tmp'
+    publishDir "${params.outdir}/${protein_id}", mode: 'copy'
+    
+    input:
+        tuple val(protein_id), path(fasta)
+    
+    output:
+        tuple val(protein_id), path("topology.pdb"), path("samples.xtc"), emit: structures
+        path "sequence.fasta", optional: true
+        path "batch_*.npz", optional: true
+        path "run.log"
+    
+    script:
+    """
+    # Make sure cache directory exists
+    mkdir -p ${params.cache_dir}
+    
+    # Extract the sequence from the FASTA file
+    SEQUENCE=\$(grep -v ">" ${fasta} | tr -d '\\n')
+    
+    # Run BioEmu with the extracted sequence
+    python3 -m bioemu.sample \
+        --sequence "\${SEQUENCE}" \
+        --num_samples ${params.num_samples} \
+        --batch_size_100 ${params.batch_size_100} \
+        --output_dir . \
+        --cache_embeds_dir ${params.cache_dir} 2>&1 | tee run.log
+    """
+}
+
+process CALCULATE_FREE_ENERGY {
+    container 'bioemu:latest'
+    containerOptions '--rm --gpus all -v /mnt:/mnt'
+    publishDir "${params.outdir}/${protein_id}/analysis", mode: 'copy'
+    
+    input:
+        tuple val(protein_id), path(topology), path(samples)
+    
+    output:
+        tuple val(protein_id), path("free_energy.csv"), emit: free_energy
+        path "energy_plot.png", optional: true
+        
+    script:
+    """
+    # Calculate free energy from sampled structures
+    python3 /opt/bioemu/scripts/calculate_gibbs.py \\
+        --samples ${samples} \\
+        --topology ${topology} \\
+        --temperature ${params.temperature} \\
+        --n_clusters ${params.n_clusters} \\
+        --output free_energy.csv \\
+        --plot energy_plot.png
+    """
+}
+
+workflow {
+    // Convert fasta_list to a channel of [protein_id, fasta_file] tuples
+    Channel.fromList(params.fasta_list)
+           .map { fasta_path -> 
+               def file = file(fasta_path)
+               return [file.baseName, file]
+           }
+           .set { fasta_ch }
+    
+    // Run BioEmu for each protein sequence
+    BIOEMU(fasta_ch)
+    
+    // Calculate Gibbs free energy for each protein
+    CALCULATE_FREE_ENERGY(BIOEMU.out.structures)
+}
--- a/nextflow.config
+++ b/nextflow.config
@@ -0,0 +1,38 @@
+// Manifest for Nextflow metadata
+manifest {
+    name = 'BioEmu-Nextflow'
+    author = 'Generated from BioEmu repository'
+    homePage = 'https://github.com/microsoft/bioemu'
+    description = 'Nextflow pipeline for BioEmu - Biomolecular Emulator for protein structure sampling'
+    mainScript = 'main.nf'
+    version = '1.0.0'
+}
+
+// Global default parameters
+params {
+    fasta = "/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta"
+    outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"
+    cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"
+    num_samples = 10
+    batch_size_100 = 10
+}
+
+// Container configurations
+docker {
+    enabled = true
+    runOptions = '--gpus all'
+}
+
+// Process configurations
+process {
+    cpus = 1
+    memory = '8 GB'
+}
+
+// Execution configurations
+executor {
+    $local {
+        cpus = 4
+        memory = '8 GB'
+    }
+}
--- a/params.json
+++ b/params.json
@@ -0,0 +1,106 @@
+{
+    "params": {
+        "fasta_list": {
+            "type": "file[]",
+            "description": "FASTA files containing protein sequences",
+            "default": [],
+            "required": true,
+            "pipeline_io": "input",
+            "var_name": "params.fasta_list",
+            "examples": [
+                ["/omic/olamide/examples/prot1.fasta", "/omic/olamide/examples/prot2.fasta"]
+            ],
+            "pattern": ".*\\.fasta$",
+            "validation": {},
+            "notes": "Select one or more FASTA files with protein sequences"
+        },
+        "outdir": {
+            "type": "folder",
+            "description": "Output Directory",
+            "default": "/omic/olamide/output",
+            "required": true,
+            "pipeline_io": "output",
+            "var_name": "params.outdir",
+            "examples": [
+                "/omic/olamide/output"
+            ],
+            "pattern": ".*",
+            "validation": {},
+            "notes": "Select where to save your analysis results"
+        },
+        "num_samples": {
+            "type": "integer",
+            "description": "Number of protein structure samples",
+            "default": 10,
+            "required": true,
+            "pipeline_io": "parameter",
+            "var_name": "params.num_samples",
+            "examples": [
+                "10"
+            ],
+            "pattern": "^\\d+$",
+            "validation": {
+                "min": 1
+            },
+            "notes": "More samples provide better coverage of conformational space"
+        },
+        "batch_size_100": {
+            "type": "integer",
+            "description": "Batch size parameter",
+            "default": 10,
+            "required": false,
+            "pipeline_io": "parameter",
+            "var_name": "params.batch_size_100",
+            "hidden": true,
+            "examples": [
+                "10"
+            ],
+            "pattern": "^\\d+$"
+        },
+        "temperature": {
+            "type": "integer",
+            "description": "Temperature (K) for free energy",
+            "default": 300,
+            "required": false,
+            "pipeline_io": "parameter",
+            "var_name": "params.temperature",
+            "examples": [
+                "300"
+            ],
+            "pattern": "^\\d+$",
+            "validation": {
+                "min": 200,
+                "max": 500
+            },
+            "notes": "Temperature in Kelvin for free energy calculations"
+        },
+        "n_clusters": {
+            "type": "integer",
+            "description": "Number of conformational clusters",
+            "default": 5,
+            "required": false,
+            "pipeline_io": "parameter",
+            "var_name": "params.n_clusters",
+            "examples": [
+                "5"
+            ],
+            "pattern": "^\\d+$",
+            "validation": {
+                "min": 2
+            },
+            "notes": "Number of clusters for free energy analysis"
+        },
+        "cache_dir": {
+            "type": "folder",
+            "description": "Embeddings cache directory",
+            "default": "/tmp/bioemu_cache",
+            "required": false,
+            "pipeline_io": "parameter",
+            "var_name": "params.cache_dir",
+            "hidden": true,
+            "examples": [
+                "/tmp/bioemu_cache"
+            ]
+        }
+    }
+}
--- a/scripts/calculate_gibbs.py
+++ b/scripts/calculate_gibbs.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+
+import argparse
+import numpy as np
+import mdtraj as md
+from sklearn.cluster import KMeans
+import pandas as pd
+import matplotlib.pyplot as plt
+import os
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Calculate Gibbs free energy from protein structure ensembles')
+    parser.add_argument('--samples', required=True, help='Path to XTC trajectory file with structure samples')
+    parser.add_argument('--topology', required=True, help='Path to PDB topology file')
+    parser.add_argument('--temperature', type=float, default=300.0, help='Temperature in Kelvin')
+    parser.add_argument('--output', required=True, help='Output CSV file for free energy data')
+    parser.add_argument('--n_clusters', type=int, default=5, help='Number of conformational clusters')
+    parser.add_argument('--plot', help='Path to save energy plot (optional)')
+    return parser.parse_args()
+
+def main():
+    args = parse_args()
+    
+    # Load trajectory
+    print(f"Loading trajectory {args.samples} with topology {args.topology}")
+    traj = md.load(args.samples, top=args.topology)
+    
+    # Calculate RMSD to first frame
+    print("Calculating RMSD to reference structure")
+    # Align to the first frame
+    traj.superpose(traj, 0)
+    # Calculate RMSD for CA atoms only
+    atom_indices = traj.topology.select('name CA')
+    distances = np.zeros(traj.n_frames)
+    for i in range(traj.n_frames):
+        # Fixed line - using slicing instead of frame parameter
+        distances[i] = md.rmsd(traj[i:i+1], traj[0:1], atom_indices=atom_indices)[0]
+    
+    # Feature extraction for clustering
+    # Use the RMSD and radius of gyration as features
+    rg = md.compute_rg(traj)
+    features = np.column_stack((distances, rg))
+    
+    # Cluster structures
+    print(f"Clustering structures into {args.n_clusters} states")
+    kmeans = KMeans(n_clusters=args.n_clusters, random_state=42)
+    labels = kmeans.fit_predict(features)
+    
+    # Calculate state populations
+    unique_labels, counts = np.unique(labels, return_counts=True)
+    populations = counts / len(labels)
+    
+    # Calculate free energies
+    R = 0.0019872041 # kcal/(mol·K)
+    T = args.temperature
+    RT = R * T
+    
+    # Reference state is the most populated one
+    reference_idx = np.argmax(populations)
+    reference_pop = populations[reference_idx]
+    
+    # Calculate ΔG values 
+    free_energies = -RT * np.log(populations / reference_pop)
+    
+    # Get representative structures from each cluster
+    representatives = []
+    for i in range(args.n_clusters):
+        cluster_frames = np.where(labels == i)[0]
+        if len(cluster_frames) > 0:
+            # Find frame closest to cluster center
+            cluster_features = features[cluster_frames]
+            center_dists = np.linalg.norm(cluster_features - kmeans.cluster_centers_[i], axis=1)
+            center_idx = cluster_frames[np.argmin(center_dists)]
+            representatives.append(int(center_idx))
+        else:
+            representatives.append(-1)  # No members in this cluster
+    
+    # Create results dataframe
+    results = pd.DataFrame({
+        'Cluster': unique_labels,
+        'Population': populations,
+        'DeltaG_kcal_mol': free_energies,
+        'Representative_Frame': representatives
+    })
+    
+    # Sort by free energy
+    results = results.sort_values('DeltaG_kcal_mol')
+    
+    # Save results
+    results.to_csv(args.output, index=False)
+    print(f"Results saved to {args.output}")
+    
+    # Print summary
+    print("\nFree Energy Summary:")
+    print(results)
+    
+    # Calculate overall free energy range
+    print(f"\nFree energy range: {np.max(free_energies) - np.min(free_energies):.2f} kcal/mol")
+    
+    # Create plot if requested
+    if args.plot:
+        plt.figure(figsize=(10, 6))
+        
+        # Plot free energy for each cluster
+        plt.bar(range(len(unique_labels)), free_energies, alpha=0.7)
+        plt.xlabel('Cluster')
+        plt.ylabel('ΔG (kcal/mol)')
+        plt.title('Free Energy Landscape')
+        plt.xticks(range(len(unique_labels)))
+        plt.grid(axis='y', linestyle='--', alpha=0.7)
+        
+        # Add population as text on bars
+        for i, (energy, pop) in enumerate(zip(free_energies, populations)):
+            plt.text(i, energy + 0.1, f"{pop*100:.1f}%", ha='center')
+        
+        plt.tight_layout()
+        plt.savefig(args.plot, dpi=300)
+        print(f"Plot saved to {args.plot}")
+
+if __name__ == "__main__":
+    main()
--- a/setup.sh
+++ b/setup.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Set up directory structure for BioEmu workflow
+BASE_DIR="/mnt/OmicNAS/private/old/olamide/bioemu"
+
+# Create necessary directories
+mkdir -p ${BASE_DIR}/input
+mkdir -p ${BASE_DIR}/output
+mkdir -p ${BASE_DIR}/cache
+mkdir -p ${BASE_DIR}/scripts
+
+# Copy FASTA files
+cp villin_headpiece.fasta ${BASE_DIR}/input/
+cp trp_cage.fasta ${BASE_DIR}/input/
+
+# Copy scripts
+cp calculate_gibbs.py ${BASE_DIR}/scripts/
+chmod +x ${BASE_DIR}/scripts/calculate_gibbs.py
+
+echo "Directory structure set up at ${BASE_DIR}"
+echo "FASTA files and scripts copied"
--- a/trp_cage.fasta
+++ b/trp_cage.fasta
@@ -0,0 +1,2 @@
+>Trp_cage
+NLYIQWLKDGGPSSGRPPPS
--- a/villin_headpiece.fasta
+++ b/villin_headpiece.fasta
@@ -0,0 +1,2 @@
+>Villin_Headpiece
+LSDEDFKAVFGMTRSAFANLPLWKQQNLKKEKGLF