From e172ccc453830b3ed782e5446a9b1589e5ebe807 Mon Sep 17 00:00:00 2001 From: Olamide Isreal Date: Thu, 26 Mar 2026 14:55:33 +0100 Subject: [PATCH] Simplify workflow: separate processes per protein, avoid filter/map chains Split GENERATE_STRUCTURE into two explicit processes to avoid channel filter/map/first() chains that cause StackOverflowError in Nextflow 24.10.x --- main.nf | 69 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 22 deletions(-) diff --git a/main.nf b/main.nf index fc377bf..6754e39 100644 --- a/main.nf +++ b/main.nf @@ -1,7 +1,6 @@ #!/usr/bin/env nextflow nextflow.enable.dsl=2 -// Define parameters — PVC mount paths for k8s execution params.protein1_fasta = '/omic/eureka/bioemu/input/trp_cage.fasta' params.protein2_fasta = '/omic/eureka/bioemu/input/villin_headpiece.fasta' params.complex_name = 'protein_complex' @@ -13,15 +12,16 @@ params.batch_size = 5 params.temperature = 300 params.n_clusters = 5 -process GENERATE_STRUCTURE { +process GENERATE_STRUCTURE_1 { container 'harbor.cluster.omic.ai/omic/bioemu:latest' - publishDir "${params.outdir}/${params.complex_name}", mode: 'copy' + publishDir "${params.outdir}/${params.complex_name}/protein1", mode: 'copy' input: - tuple val(protein_id), path(fasta) + path fasta output: - tuple val(protein_id), path("${protein_id}_topology.pdb"), path("${protein_id}_samples.xtc") + path "protein1_topology.pdb", emit: topology + path "protein1_samples.xtc", emit: samples script: """ @@ -35,8 +35,36 @@ process GENERATE_STRUCTURE { --output_dir . \\ --cache_embeds_dir ${params.cache_dir} - mv topology.pdb ${protein_id}_topology.pdb - mv samples.xtc ${protein_id}_samples.xtc + mv topology.pdb protein1_topology.pdb + mv samples.xtc protein1_samples.xtc + """ +} + +process GENERATE_STRUCTURE_2 { + container 'harbor.cluster.omic.ai/omic/bioemu:latest' + publishDir "${params.outdir}/${params.complex_name}/protein2", mode: 'copy' + + input: + path fasta + + output: + path "protein2_topology.pdb", emit: topology + path "protein2_samples.xtc", emit: samples + + script: + """ + SEQUENCE=\$(grep -v ">" ${fasta} | tr -d '\\n') + mkdir -p ${params.cache_dir} + + python3 -m bioemu.sample \\ + --sequence "\${SEQUENCE}" \\ + --num_samples ${params.num_samples} \\ + --batch_size_100 ${params.batch_size} \\ + --output_dir . \\ + --cache_embeds_dir ${params.cache_dir} + + mv topology.pdb protein2_topology.pdb + mv samples.xtc protein2_samples.xtc """ } @@ -68,26 +96,23 @@ process CALCULATE_BINDING { --plot energy_comparison.png echo "# Binding Free Energy Analysis: ${params.complex_name}" > binding_energy_report.txt - echo "======================================================" >> binding_energy_report.txt - echo "## Experimental Value: ${params.exp_dG} kcal/mol" >> binding_energy_report.txt - echo "" >> binding_energy_report.txt + echo "Experimental: ${params.exp_dG} kcal/mol" >> binding_energy_report.txt PREDICTED_DG=\$(grep -A1 "binding_free_energy" binding_energy.csv | tail -n1 | cut -d',' -f2) - echo "## BioEmu Prediction: \${PREDICTED_DG} kcal/mol" >> binding_energy_report.txt + echo "Predicted: \${PREDICTED_DG} kcal/mol" >> binding_energy_report.txt """ } workflow { - protein1_ch = Channel.fromPath(params.protein1_fasta) - .map { fasta -> tuple("protein1", fasta) } - protein2_ch = Channel.fromPath(params.protein2_fasta) - .map { fasta -> tuple("protein2", fasta) } + fasta1 = Channel.fromPath(params.protein1_fasta) + fasta2 = Channel.fromPath(params.protein2_fasta) - all_proteins = protein1_ch.mix(protein2_ch) + GENERATE_STRUCTURE_1(fasta1) + GENERATE_STRUCTURE_2(fasta2) - GENERATE_STRUCTURE(all_proteins) - - p1 = GENERATE_STRUCTURE.out.filter { it[0] == "protein1" }.map { [it[1], it[2]] }.first() - p2 = GENERATE_STRUCTURE.out.filter { it[0] == "protein2" }.map { [it[1], it[2]] }.first() - - CALCULATE_BINDING(p1[0], p1[1], p2[0], p2[1]) + CALCULATE_BINDING( + GENERATE_STRUCTURE_1.out.topology, + GENERATE_STRUCTURE_1.out.samples, + GENERATE_STRUCTURE_2.out.topology, + GENERATE_STRUCTURE_2.out.samples + ) }