Add BioEmu Nextflow pipeline implementation

2025-03-04 09:38:55 -08:00
commit 2cfbf64e92
12 changed files with 565 additions and 0 deletions
--- a/main.nf
+++ b/main.nf
@@ -0,0 +1,89 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl=2
+
+// ---------------------------------------------------------------------------
+// Pipeline parameters. Each default can be overridden on the command line
+// (--<name> <value>) or through a params file.
+// ---------------------------------------------------------------------------
+
+// FASTA files to process; the workflow launches one BIOEMU task per entry.
+params.fasta_list = [
+    "/mnt/OmicNAS/private/old/olamide/bioemu/input/villin_headpiece.fasta",
+    "/mnt/OmicNAS/private/old/olamide/bioemu/input/trp_cage.fasta"
+]
+// Root of the published results; each protein gets its own sub-directory.
+params.outdir = "/mnt/OmicNAS/private/old/olamide/bioemu/output"
+// Created by BIOEMU if missing and handed to bioemu.sample as --cache_embeds_dir.
+params.cache_dir = "/mnt/OmicNAS/private/old/olamide/bioemu/cache"
+// Scripts directory next to this pipeline script.
+// `projectDir` is the supported replacement for the deprecated `baseDir`.
+// NOTE(review): no process in this file reads scripts_dir - confirm it is still needed.
+params.scripts_dir = "${projectDir}/scripts"
+// Forwarded to bioemu.sample as --num_samples.
+params.num_samples = 10
+// Forwarded to bioemu.sample as --batch_size_100.
+params.batch_size_100 = 10
+// Forwarded to calculate_gibbs.py as --temperature (presumably Kelvin - TODO confirm).
+params.temperature = 300
+// Forwarded to calculate_gibbs.py as --n_clusters.
+params.n_clusters = 5
+
+/*
+ * BIOEMU - run `python3 -m bioemu.sample` for a single protein.
+ *
+ * Input : tuple(protein_id, fasta)  - identifier and the FASTA file to sample.
+ * Output: `structures` channel      - tuple(protein_id, topology.pdb, samples.xtc)
+ *         sequence.fasta, batch_*.npz (optional) and run.log, all published
+ *         to ${params.outdir}/<protein_id>.
+ *
+ * The task fails when the FASTA yields no sequence or when the sampler exits
+ * non-zero.
+ */
+process BIOEMU {
+    container 'bioemu:latest'
+    containerOptions '--rm --gpus all -v /mnt:/mnt -v /tmp:/tmp'
+    publishDir "${params.outdir}/${protein_id}", mode: 'copy'
+
+    input:
+        tuple val(protein_id), path(fasta)
+
+    output:
+        tuple val(protein_id), path("topology.pdb"), path("samples.xtc"), emit: structures
+        path "sequence.fasta", optional: true
+        path "batch_*.npz", optional: true
+        path "run.log"
+
+    script:
+    """
+    # Make sure cache directory exists
+    mkdir -p "${params.cache_dir}"
+
+    # Extract the sequence from the FASTA file: drop header lines (those that
+    # START with '>') and strip all whitespace, including CR from CRLF files.
+    # NOTE: every record in a multi-record FASTA is concatenated into one sequence.
+    SEQUENCE=\$(grep -v '^>' "${fasta}" | tr -d '[:space:]')
+    if [ -z "\${SEQUENCE}" ]; then
+        echo "ERROR: no sequence found in ${fasta}" >&2
+        exit 1
+    fi
+
+    # Without pipefail the pipeline below would report tee's exit status and
+    # hide a failed sampler run.
+    set -o pipefail
+
+    # Run BioEmu with the extracted sequence
+    python3 -m bioemu.sample \\
+        --sequence "\${SEQUENCE}" \\
+        --num_samples ${params.num_samples} \\
+        --batch_size_100 ${params.batch_size_100} \\
+        --output_dir . \\
+        --cache_embeds_dir "${params.cache_dir}" 2>&1 | tee run.log
+    """
+}
+
+// CALCULATE_FREE_ENERGY - run calculate_gibbs.py on one protein's sampled structures.
+//
+// Input : tuple(protein_id, topology, samples), as emitted by BIOEMU's `structures` channel.
+// Output: `free_energy` channel - tuple(protein_id, free_energy.csv);
+//         energy_plot.png is collected when the script produces it.
+// Results are copied to ${params.outdir}/<protein_id>/analysis.
+//
+// NOTE(review): the script is invoked from a fixed path (/opt/bioemu/scripts), which is
+// not covered by the /mnt bind mount, so it presumably lives in the image - confirm.
+// params.scripts_dir is defined in this file but not used here.
+process CALCULATE_FREE_ENERGY {
+    container 'bioemu:latest'
+    containerOptions '--rm --gpus all -v /mnt:/mnt'
+    publishDir "${params.outdir}/${protein_id}/analysis", mode: 'copy'
+    
+    input:
+        tuple val(protein_id), path(topology), path(samples)
+    
+    output:
+        tuple val(protein_id), path("free_energy.csv"), emit: free_energy
+        // The plot is optional: the task still succeeds if it is not written.
+        path "energy_plot.png", optional: true
+        
+    script:
+    // Temperature and cluster count come from params.temperature and params.n_clusters.
+    """
+    # Calculate free energy from sampled structures
+    python3 /opt/bioemu/scripts/calculate_gibbs.py \\
+        --samples ${samples} \\
+        --topology ${topology} \\
+        --temperature ${params.temperature} \\
+        --n_clusters ${params.n_clusters} \\
+        --output free_energy.csv \\
+        --plot energy_plot.png
+    """
+}
+
+// Entry workflow: sample structures for every FASTA in params.fasta_list, then
+// compute the free energy for each set of sampled structures.
+workflow {
+    // params.fasta_list is a list by default, but arrives as a single string
+    // when given on the command line (--fasta_list a.fasta,b.fasta); accept both.
+    def fasta_paths = params.fasta_list instanceof List ?
+        params.fasta_list :
+        params.fasta_list.toString().tokenize(',').collect { it.trim() }
+
+    // Convert the paths to a channel of [protein_id, fasta_file] tuples.
+    // protein_id is the file name without its extension.
+    Channel.fromList(fasta_paths)
+           .map { fasta_path ->
+               // checkIfExists fails here instead of later inside the container.
+               // The local is not called `file`, so it does not shadow file().
+               def fasta_file = file(fasta_path, checkIfExists: true)
+               return [fasta_file.baseName, fasta_file]
+           }
+           .set { fasta_ch }
+
+    // Run BioEmu for each protein sequence
+    BIOEMU(fasta_ch)
+
+    // Calculate Gibbs free energy for each protein
+    CALCULATE_FREE_ENERGY(BIOEMU.out.structures)
+}