Add BioEmu Nextflow pipeline implementation

2025-03-04 09:38:55 -08:00
commit 2cfbf64e92
12 changed files with 565 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
 .nextflow*
 work/
--- a/48
+++ b/48
@@ -0,0 +1,48 @@
 FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04
 # Image for the BioEmu pipeline: CUDA runtime base + BioEmu, the Python
 # analysis stack used by calculate_gibbs.py, and a ColabFold installation.
 # Set non-interactive installation so apt-get never prompts during the build
 ENV DEBIAN_FRONTEND=noninteractive
 # Install system dependencies; the apt lists are removed in the same layer
 # so they do not end up in the image
 RUN apt-get update && apt-get install -y \
    wget \
    git \
    python3 \
    python3-pip \
    build-essential \
    curl \
    python3-tk \
    && rm -rf /var/lib/apt/lists/*
 # Set working directory (all following RUN instructions execute from here)
 WORKDIR /opt/bioemu
 # Install Python dependencies.
 # --no-cache-dir keeps pip's download cache out of the image layers.
 RUN python3 -m pip install --no-cache-dir --upgrade pip
 # Install BioEmu from PyPI
 RUN pip install --no-cache-dir bioemu
 # Install dependencies for free energy calculation
 RUN pip install --no-cache-dir mdtraj scikit-learn pandas matplotlib seaborn
 # Create directory for ColabFold and set environment variable
 ENV COLABFOLD_DIR=/opt/colabfold
 # Create cache directories with proper permissions (world-writable so the
 # container can be run as an arbitrary user)
 RUN mkdir -p ${COLABFOLD_DIR}/embeds_cache /tmp/colabfold_temp && \
    chmod -R 777 ${COLABFOLD_DIR} /tmp/colabfold_temp
 # Pre-setup ColabFold during build to avoid runtime issues.
 # The installer is pinned to a fixed commit so builds are reproducible.
 # NOTE(review): the installer script is saved under ${COLABFOLD_DIR} but is
 # executed from WORKDIR (/opt/bioemu) - confirm that it installs into the
 # location BioEmu looks for ColabFold at runtime.
 RUN wget "https://raw.githubusercontent.com/YoshitakaMo/localcolabfold/5fc8775114b637b5672234179c50e694ab057db4/install_colabbatch_linux.sh" -O ${COLABFOLD_DIR}/install_colabbatch_linux.sh && \
    chmod +x ${COLABFOLD_DIR}/install_colabbatch_linux.sh && \
    bash ${COLABFOLD_DIR}/install_colabbatch_linux.sh
 # Create directories for input/output with proper permissions
 RUN mkdir -p /data /results && chmod -R 777 /data /results
 # Ship the free-energy analysis script inside the image; main.nf calls it
 # by this absolute path
 RUN mkdir -p /opt/bioemu/scripts/
 COPY calculate_gibbs.py /opt/bioemu/scripts/
 RUN chmod +x /opt/bioemu/scripts/calculate_gibbs.py
 CMD ["/bin/bash"]
--- a/README.md
+++ b/README.md
@@ -0,0 +1,10 @@
 # BioEmu Nextflow Pipeline
 A Nextflow pipeline implementation for BioEmu to sample protein structures and calculate Gibbs free energy.
 ## Features
 - Sample structures for various protein sequences
 - Calculate Gibbs free energy differences
 - Containerized execution with Docker
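 ## Usage
 Build the Docker image as `bioemu:latest`, then run the pipeline with `nextflow run main.nf`. The input FASTA files, output directory, and sampling/analysis options are set through `params` in `main.nf` and `nextflow.config`.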
--- a/calculate_gibbs.py
+++ b/calculate_gibbs.py
@@ -0,0 +1,121 @@
 #!/usr/bin/env python3
 import argparse
 import numpy as np
 import mdtraj as md
 from sklearn.cluster import KMeans
 import pandas as pd
 import matplotlib.pyplot as plt
 import os
 def parse_args():
    """Build the command-line interface and return the parsed options.

    Returns:
        argparse.Namespace with ``samples``, ``topology``, ``temperature``,
        ``output``, ``n_clusters`` and ``plot`` attributes.
    """
    # Each entry is (flag, keyword arguments for add_argument); the order
    # here is the order shown in --help.
    option_specs = (
        ('--samples', dict(required=True, help='Path to XTC trajectory file with structure samples')),
        ('--topology', dict(required=True, help='Path to PDB topology file')),
        ('--temperature', dict(type=float, default=300.0, help='Temperature in Kelvin')),
        ('--output', dict(required=True, help='Output CSV file for free energy data')),
        ('--n_clusters', dict(type=int, default=5, help='Number of conformational clusters')),
        ('--plot', dict(help='Path to save energy plot (optional)')),
    )
    cli = argparse.ArgumentParser(
        description='Calculate Gibbs free energy from protein structure ensembles',
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
 def _state_free_energies(labels, temperature):
    """Compute populations and relative free energies of the occupied clusters.

    Args:
        labels: 1-D array of integer cluster labels, one per frame.
        temperature: Temperature in Kelvin.

    Returns:
        Tuple ``(state_ids, populations, delta_g)`` of equal-length arrays.
        ``delta_g`` is in kcal/mol relative to the most populated state,
        which therefore has a value of zero.
    """
    # np.unique only reports labels that actually occur, so every per-state
    # array derived from state_ids has the same length.
    state_ids, counts = np.unique(labels, return_counts=True)
    populations = counts / len(labels)
    gas_constant = 0.0019872041  # kcal/(mol·K)
    # Boltzmann inversion: dG_i = -RT * ln(p_i / p_ref)
    delta_g = -gas_constant * temperature * np.log(populations / populations.max())
    return state_ids, populations, delta_g


 def _representative_frames(features, labels, centers, state_ids):
    """Return, for each state in state_ids, the frame nearest its cluster center."""
    frames = []
    for state in state_ids:
        members = np.flatnonzero(labels == state)
        offsets = np.linalg.norm(features[members] - centers[state], axis=1)
        frames.append(int(members[np.argmin(offsets)]))
    return frames


 def main():
    """Cluster a structure ensemble and report per-cluster free energies.

    Loads the trajectory named on the command line, describes every frame by
    its CA RMSD to the first frame and its radius of gyration, clusters the
    frames with k-means, converts cluster populations into free energies
    relative to the most populated cluster, and writes the table to the
    ``--output`` CSV (plus a bar chart when ``--plot`` is given).

    Raises:
        SystemExit: If ``--n_clusters`` is below 1, ``--temperature`` is not
            positive, or the trajectory contains no frames.
    """
    args = parse_args()
    if args.n_clusters < 1:
        raise SystemExit("--n_clusters must be at least 1")
    if args.temperature <= 0:
        raise SystemExit("--temperature must be a positive value in Kelvin")
    # Load trajectory
    print(f"Loading trajectory {args.samples} with topology {args.topology}")
    traj = md.load(args.samples, top=args.topology)
    if traj.n_frames == 0:
        raise SystemExit(f"No frames found in {args.samples}")
    # Calculate RMSD to first frame
    print("Calculating RMSD to reference structure")
    # Align to the first frame
    traj.superpose(traj, 0)
    # CA-only RMSD of every frame against frame 0, computed in a single call
    atom_indices = traj.topology.select('name CA')
    distances = np.asarray(md.rmsd(traj, traj, 0, atom_indices=atom_indices), dtype=float)
    # Feature extraction for clustering
    # Use the RMSD and radius of gyration as features
    rg = md.compute_rg(traj)
    features = np.column_stack((distances, rg))
    # k-means cannot form more clusters than there are frames, so cap the
    # requested number instead of failing on small ensembles.
    n_clusters = min(args.n_clusters, traj.n_frames)
    if n_clusters < args.n_clusters:
        print(f"Warning: only {traj.n_frames} frames available; "
              f"using {n_clusters} clusters instead of {args.n_clusters}")
    # Cluster structures
    print(f"Clustering structures into {n_clusters} states")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(features)
    # Populations and free energies; reference state is the most populated one
    state_ids, populations, free_energies = _state_free_energies(labels, args.temperature)
    # Representative structure (frame closest to the center) of each state
    representatives = _representative_frames(
        features, labels, kmeans.cluster_centers_, state_ids
    )
    # Create results dataframe
    results = pd.DataFrame({
        'Cluster': state_ids,
        'Population': populations,
        'DeltaG_kcal_mol': free_energies,
        'Representative_Frame': representatives
    })
    # Sort by free energy
    results = results.sort_values('DeltaG_kcal_mol')
    # Save results
    results.to_csv(args.output, index=False)
    print(f"Results saved to {args.output}")
    # Print summary
    print("\nFree Energy Summary:")
    print(results)
    # Calculate overall free energy range
    print(f"\nFree energy range: {np.max(free_energies) - np.min(free_energies):.2f} kcal/mol")
    # Create plot if requested
    if args.plot:
        positions = range(len(state_ids))
        plt.figure(figsize=(10, 6))
        # Plot free energy for each cluster
        plt.bar(positions, free_energies, alpha=0.7)
        plt.xlabel('Cluster')
        plt.ylabel('ΔG (kcal/mol)')
        plt.title('Free Energy Landscape')
        # Label the bars with the actual cluster ids
        plt.xticks(positions, state_ids)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        # Add population as text on bars
        for i, (energy, pop) in enumerate(zip(free_energies, populations)):
            plt.text(i, energy + 0.1, f"{pop*100:.1f}%", ha='center')
        plt.tight_layout()
        plt.savefig(args.plot, dpi=300)
        # Release the figure once it has been written
        plt.close()
        print(f"Plot saved to {args.plot}")
 # Run the analysis only when this file is executed as a script, not when it
 # is imported as a module.
 if __name__ == "__main__":
    main()
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -0,0 +1,5 @@
 #!/bin/bash
 # Container entrypoint: stop at the first failing command, then hand over to
 # whatever command line was passed to the container.
 set -o errexit
 # exec replaces this shell with the requested command, so that command
 # receives the arguments unchanged.
 exec "$@"
--- a/main.nf
+++ b/main.nf
@@ -0,0 +1,89 @@
 #!/usr/bin/env nextflow
 // Use DSL2 syntax (process definitions plus an explicit workflow block)
 nextflow.enable.dsl=2
 // Default pipeline parameters.
 // NOTE(review): nextflow.config in this commit also assigns outdir,
 // cache_dir, num_samples and batch_size_100 - confirm which definition is
 // meant to be authoritative.
 // Multiple FASTA files to process
 params.fasta_list = [
    "/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta",
    "/mnt/OmicNAS/private/old/olamide/bioemu/input/trp_cage.fasta"
 ]
 // Root folder for published results (one sub-folder per protein id)
 params.outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"
 // Directory handed to bioemu.sample as --cache_embeds_dir
 params.cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"
 // NOTE(review): scripts_dir is not referenced by the processes in this
 // file; they call the script under /opt/bioemu/scripts inside the container.
 params.scripts_dir = "${baseDir}/scripts"
 // Sampling options forwarded to bioemu.sample
 params.num_samples = 10
 params.batch_size_100 = 10
 // Analysis options forwarded to calculate_gibbs.py
 params.temperature = 300
 params.n_clusters = 5
 // Sample structures for one protein with BioEmu.
 // Input:  [protein_id, FASTA file]
 // Output: `structures` channel of [protein_id, topology.pdb, samples.xtc];
 //         run.log plus optional sequence.fasta / batch_*.npz are published.
 process BIOEMU {
    container 'bioemu:latest'
    containerOptions '--rm --gpus all -v /mnt:/mnt -v /tmp:/tmp'
    publishDir "${params.outdir}/${protein_id}", mode: 'copy'
    input:
        tuple val(protein_id), path(fasta)
    output:
        tuple val(protein_id), path("topology.pdb"), path("samples.xtc"), emit: structures
        path "sequence.fasta", optional: true
        path "batch_*.npz", optional: true
        path "run.log"
    script:
    """
    # Make sure cache directory exists
    mkdir -p "${params.cache_dir}"
    # Extract the sequence from the FASTA file: drop header lines and strip
    # every whitespace character (newlines, carriage returns, spaces)
    SEQUENCE=\$(grep -v ">" "${fasta}" | tr -d '[:space:]')
    if [ -z "\${SEQUENCE}" ]; then
        echo "No sequence found in ${fasta}" >&2
        exit 1
    fi
    # Without pipefail the exit status of the pipeline below would be that of
    # tee, hiding a failure of the sampler
    set -o pipefail
    # Run BioEmu with the extracted sequence
    python3 -m bioemu.sample \\
        --sequence "\${SEQUENCE}" \\
        --num_samples ${params.num_samples} \\
        --batch_size_100 ${params.batch_size_100} \\
        --output_dir . \\
        --cache_embeds_dir "${params.cache_dir}" 2>&1 | tee run.log
    """
 }
 // Run calculate_gibbs.py on one protein's sampled ensemble.
 // Input:  [protein_id, topology file, trajectory file]
 // Output: `free_energy` channel of [protein_id, free_energy.csv]; the plot
 //         is declared optional.
 process CALCULATE_FREE_ENERGY {
    container 'bioemu:latest'
    // NOTE(review): this step requests all GPUs as well - confirm the
    // analysis script actually needs them.
    containerOptions '--rm --gpus all -v /mnt:/mnt'
    // Results are copied to <outdir>/<protein_id>/analysis
    publishDir "${params.outdir}/${protein_id}/analysis", mode: 'copy'
    input:
        tuple val(protein_id), path(topology), path(samples)
    output:
        tuple val(protein_id), path("free_energy.csv"), emit: free_energy
        path "energy_plot.png", optional: true
    // The script is referenced by its absolute path inside the container image
    script:
    """
    # Calculate free energy from sampled structures
    python3 /opt/bioemu/scripts/calculate_gibbs.py \\
        --samples ${samples} \\
        --topology ${topology} \\
        --temperature ${params.temperature} \\
        --n_clusters ${params.n_clusters} \\
        --output free_energy.csv \\
        --plot energy_plot.png
    """
 }
 // Entry workflow: sample structures for every FASTA file, then compute the
 // free-energy table for each sampled ensemble.
 workflow {
    // params.fasta_list is a list by default, but arrives as a single
    // comma-separated string when given on the command line; accept both.
    def fasta_paths = (params.fasta_list instanceof List) ?
        params.fasta_list :
        params.fasta_list.toString().tokenize(',').collect { it.trim() }
    // Convert fasta_list to a channel of [protein_id, fasta_file] tuples
    Channel.fromList(fasta_paths)
           .map { fasta_path ->
               // Use a name that does not shadow the file() helper, and fail
               // early with a clear error when an input file is missing
               def fasta_file = file(fasta_path, checkIfExists: true)
               return [fasta_file.baseName, fasta_file]
           }
           .set { fasta_ch }
    // Run BioEmu for each protein sequence
    BIOEMU(fasta_ch)
    // Calculate Gibbs free energy for each protein
    CALCULATE_FREE_ENERGY(BIOEMU.out.structures)
 }
--- a/nextflow.config
+++ b/nextflow.config
@@ -0,0 +1,38 @@
 // Manifest for Nextflow metadata
 manifest {
    name = 'BioEmu-Nextflow'
    author = 'Generated from BioEmu repository'
    homePage = 'https://github.com/microsoft/bioemu'
    description = 'Nextflow pipeline for BioEmu - Biomolecular Emulator for protein structure sampling'
    mainScript = 'main.nf'
    version = '1.0.0'
 }
 // Global default parameters
 // NOTE(review): main.nf in this commit reads params.fasta_list, not
 // params.fasta, and assigns its own defaults for the remaining keys -
 // confirm which values are intended to take effect.
 params {
    fasta = "/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta"
    outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"
    cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"
    num_samples = 10
    batch_size_100 = 10
 }
 // Container configurations: run every process in Docker and expose all GPUs
 docker {
    enabled = true
    runOptions = '--gpus all'
 }
 // Process configurations: default resources requested by each process
 process {
    cpus = 1
    memory = '8 GB'
 }
 // Execution configurations: settings for the local executor
 executor {
    $local {
        cpus = 4
        memory = '8 GB'
    }
 }
--- a/params.json
+++ b/params.json
@@ -0,0 +1,106 @@
 {
    "params": {
        "fasta_list": {
            "type": "file[]",
            "description": "FASTA files containing protein sequences",
            "default": [],
            "required": true,
            "pipeline_io": "input",
            "var_name": "params.fasta_list",
            "examples": [
                ["/omic/olamide/examples/prot1.fasta", "/omic/olamide/examples/prot2.fasta"]
            ],
            "pattern": ".*\\.fasta$",
            "validation": {},
            "notes": "Select one or more FASTA files with protein sequences"
        },
        "outdir": {
            "type": "folder",
            "description": "Output Directory",
            "default": "/omic/olamide/output",
            "required": true,
            "pipeline_io": "output",
            "var_name": "params.outdir",
            "examples": [
                "/omic/olamide/output"
            ],
            "pattern": ".*",
            "validation": {},
            "notes": "Select where to save your analysis results"
        },
        "num_samples": {
            "type": "integer",
            "description": "Number of protein structure samples",
            "default": 10,
            "required": true,
            "pipeline_io": "parameter",
            "var_name": "params.num_samples",
            "examples": [
                "10"
            ],
            "pattern": "^\\d+$",
            "validation": {
                "min": 1
            },
            "notes": "More samples provide better coverage of conformational space"
        },
        "batch_size_100": {
            "type": "integer",
            "description": "Batch size parameter",
            "default": 10,
            "required": false,
            "pipeline_io": "parameter",
            "var_name": "params.batch_size_100",
            "hidden": true,
            "examples": [
                "10"
            ],
            "pattern": "^\\d+$"
        },
        "temperature": {
            "type": "integer",
            "description": "Temperature (K) for free energy",
            "default": 300,
            "required": false,
            "pipeline_io": "parameter",
            "var_name": "params.temperature",
            "examples": [
                "300"
            ],
            "pattern": "^\\d+$",
            "validation": {
                "min": 200,
                "max": 500
            },
            "notes": "Temperature in Kelvin for free energy calculations"
        },
        "n_clusters": {
            "type": "integer",
            "description": "Number of conformational clusters",
            "default": 5,
            "required": false,
            "pipeline_io": "parameter",
            "var_name": "params.n_clusters",
            "examples": [
                "5"
            ],
            "pattern": "^\\d+$",
            "validation": {
                "min": 2
            },
            "notes": "Number of clusters for free energy analysis"
        },
        "cache_dir": {
            "type": "folder",
            "description": "Embeddings cache directory",
            "default": "/tmp/bioemu_cache",
            "required": false,
            "pipeline_io": "parameter",
            "var_name": "params.cache_dir",
            "hidden": true,
            "examples": [
                "/tmp/bioemu_cache"
            ]
        }
    }
 }
--- a/scripts/calculate_gibbs.py
+++ b/scripts/calculate_gibbs.py
@@ -0,0 +1,121 @@
 #!/usr/bin/env python3
 import argparse
 import numpy as np
 import mdtraj as md
 from sklearn.cluster import KMeans
 import pandas as pd
 import matplotlib.pyplot as plt
 import os
 def parse_args():
    """Build the command-line interface and return the parsed options.

    Returns:
        argparse.Namespace with ``samples``, ``topology``, ``temperature``,
        ``output``, ``n_clusters`` and ``plot`` attributes.
    """
    # Each entry is (flag, keyword arguments for add_argument); the order
    # here is the order shown in --help.
    option_specs = (
        ('--samples', dict(required=True, help='Path to XTC trajectory file with structure samples')),
        ('--topology', dict(required=True, help='Path to PDB topology file')),
        ('--temperature', dict(type=float, default=300.0, help='Temperature in Kelvin')),
        ('--output', dict(required=True, help='Output CSV file for free energy data')),
        ('--n_clusters', dict(type=int, default=5, help='Number of conformational clusters')),
        ('--plot', dict(help='Path to save energy plot (optional)')),
    )
    cli = argparse.ArgumentParser(
        description='Calculate Gibbs free energy from protein structure ensembles',
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
 def _state_free_energies(labels, temperature):
    """Compute populations and relative free energies of the occupied clusters.

    Args:
        labels: 1-D array of integer cluster labels, one per frame.
        temperature: Temperature in Kelvin.

    Returns:
        Tuple ``(state_ids, populations, delta_g)`` of equal-length arrays.
        ``delta_g`` is in kcal/mol relative to the most populated state,
        which therefore has a value of zero.
    """
    # np.unique only reports labels that actually occur, so every per-state
    # array derived from state_ids has the same length.
    state_ids, counts = np.unique(labels, return_counts=True)
    populations = counts / len(labels)
    gas_constant = 0.0019872041  # kcal/(mol·K)
    # Boltzmann inversion: dG_i = -RT * ln(p_i / p_ref)
    delta_g = -gas_constant * temperature * np.log(populations / populations.max())
    return state_ids, populations, delta_g


 def _representative_frames(features, labels, centers, state_ids):
    """Return, for each state in state_ids, the frame nearest its cluster center."""
    frames = []
    for state in state_ids:
        members = np.flatnonzero(labels == state)
        offsets = np.linalg.norm(features[members] - centers[state], axis=1)
        frames.append(int(members[np.argmin(offsets)]))
    return frames


 def main():
    """Cluster a structure ensemble and report per-cluster free energies.

    Loads the trajectory named on the command line, describes every frame by
    its CA RMSD to the first frame and its radius of gyration, clusters the
    frames with k-means, converts cluster populations into free energies
    relative to the most populated cluster, and writes the table to the
    ``--output`` CSV (plus a bar chart when ``--plot`` is given).

    Raises:
        SystemExit: If ``--n_clusters`` is below 1, ``--temperature`` is not
            positive, or the trajectory contains no frames.
    """
    args = parse_args()
    if args.n_clusters < 1:
        raise SystemExit("--n_clusters must be at least 1")
    if args.temperature <= 0:
        raise SystemExit("--temperature must be a positive value in Kelvin")
    # Load trajectory
    print(f"Loading trajectory {args.samples} with topology {args.topology}")
    traj = md.load(args.samples, top=args.topology)
    if traj.n_frames == 0:
        raise SystemExit(f"No frames found in {args.samples}")
    # Calculate RMSD to first frame
    print("Calculating RMSD to reference structure")
    # Align to the first frame
    traj.superpose(traj, 0)
    # CA-only RMSD of every frame against frame 0, computed in a single call
    atom_indices = traj.topology.select('name CA')
    distances = np.asarray(md.rmsd(traj, traj, 0, atom_indices=atom_indices), dtype=float)
    # Feature extraction for clustering
    # Use the RMSD and radius of gyration as features
    rg = md.compute_rg(traj)
    features = np.column_stack((distances, rg))
    # k-means cannot form more clusters than there are frames, so cap the
    # requested number instead of failing on small ensembles.
    n_clusters = min(args.n_clusters, traj.n_frames)
    if n_clusters < args.n_clusters:
        print(f"Warning: only {traj.n_frames} frames available; "
              f"using {n_clusters} clusters instead of {args.n_clusters}")
    # Cluster structures
    print(f"Clustering structures into {n_clusters} states")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(features)
    # Populations and free energies; reference state is the most populated one
    state_ids, populations, free_energies = _state_free_energies(labels, args.temperature)
    # Representative structure (frame closest to the center) of each state
    representatives = _representative_frames(
        features, labels, kmeans.cluster_centers_, state_ids
    )
    # Create results dataframe
    results = pd.DataFrame({
        'Cluster': state_ids,
        'Population': populations,
        'DeltaG_kcal_mol': free_energies,
        'Representative_Frame': representatives
    })
    # Sort by free energy
    results = results.sort_values('DeltaG_kcal_mol')
    # Save results
    results.to_csv(args.output, index=False)
    print(f"Results saved to {args.output}")
    # Print summary
    print("\nFree Energy Summary:")
    print(results)
    # Calculate overall free energy range
    print(f"\nFree energy range: {np.max(free_energies) - np.min(free_energies):.2f} kcal/mol")
    # Create plot if requested
    if args.plot:
        positions = range(len(state_ids))
        plt.figure(figsize=(10, 6))
        # Plot free energy for each cluster
        plt.bar(positions, free_energies, alpha=0.7)
        plt.xlabel('Cluster')
        plt.ylabel('ΔG (kcal/mol)')
        plt.title('Free Energy Landscape')
        # Label the bars with the actual cluster ids
        plt.xticks(positions, state_ids)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        # Add population as text on bars
        for i, (energy, pop) in enumerate(zip(free_energies, populations)):
            plt.text(i, energy + 0.1, f"{pop*100:.1f}%", ha='center')
        plt.tight_layout()
        plt.savefig(args.plot, dpi=300)
        # Release the figure once it has been written
        plt.close()
        print(f"Plot saved to {args.plot}")
 # Run the analysis only when this file is executed as a script, not when it
 # is imported as a module.
 if __name__ == "__main__":
    main()
--- a/setup.sh
+++ b/setup.sh
@@ -0,0 +1,21 @@
 #!/bin/bash
 # Set up directory structure for BioEmu workflow and copy the example FASTA
 # files and the analysis script into it. Run from the directory that
 # contains those files.
 # Abort on the first error (or unset variable) so the success messages at the
 # end are only printed when every step actually worked.
 set -euo pipefail
 # Target location; can be overridden by exporting BASE_DIR before running.
 BASE_DIR="${BASE_DIR:-/mnt/OmicNAS/private/old/olamide/bioemu}"
 # Create necessary directories
 mkdir -p "${BASE_DIR}/input" \
    "${BASE_DIR}/output" \
    "${BASE_DIR}/cache" \
    "${BASE_DIR}/scripts"
 # Copy FASTA files
 cp villin_headpiece.fasta trp_cage.fasta "${BASE_DIR}/input/"
 # Copy scripts
 cp calculate_gibbs.py "${BASE_DIR}/scripts/"
 chmod +x "${BASE_DIR}/scripts/calculate_gibbs.py"
 echo "Directory structure set up at ${BASE_DIR}"
 echo "FASTA files and scripts copied"
--- a/trp_cage.fasta
+++ b/trp_cage.fasta
@@ -0,0 +1,2 @@
 >Trp_cage
 NLYIQWLKDGGPSSGRPPPS
--- a/villin_headpiece.fasta
+++ b/villin_headpiece.fasta
@@ -0,0 +1,2 @@
 >Villin_Headpiece
 LSDEDFKAVFGMTRSAFANLPLWKQQNLKKEKGLF
		`@@ -0,0 +1,2 @@`
							`>Villin_Headpiece`
							`LSDEDFKAVFGMTRSAFANLPLWKQQNLKKEKGLF`