From 42d4e6cb87a4027a1d4b9a4ec8228dc9685860ff Mon Sep 17 00:00:00 2001
From: Olamide Isreal <olamide@omicmd.com>
Date: Mon, 23 Mar 2026 13:27:40 +0100
Subject: [PATCH] Add WES pipeline configuration for pocketminer

- Add Nextflow pipeline (main.nf) with Harbor container image
- Add nextflow.config with k8s/k8s_gpu/standard profiles
- Add params.json for TRS/WES parameter discovery
- Add Dockerfile, entrypoint.py, meta.yml from original implementation
- Update paths to use /omic/eureka/Pocketminer/ convention
- Update .gitignore to allow params.json
---
 .gitignore      |  37 ++++++
 Dockerfile      |  37 ++++++
 Makefile        |  66 +++++++++++
 entrypoint.py   | 293 ++++++++++++++++++++++++++++++++++++++++++++++++
 main.nf         |  44 ++++++++
 meta.yml        |  42 +++++++
 nextflow.config |  36 ++++++
 params.json     |  51 +++++++++
 8 files changed, 606 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 Makefile
 create mode 100644 entrypoint.py
 create mode 100644 main.nf
 create mode 100644 meta.yml
 create mode 100644 nextflow.config
 create mode 100644 params.json

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7bd8c4d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,37 @@
+# Nextflow
+.nextflow/
+.nextflow.log*
+work/
+results/
+*.html
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+*.egg-info/
+dist/
+build/
+
+# Data
+*.pdb
+*.npy
+output/
+data/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Docker
+.dockerignore
+
+# Models (large files)
+models/
+*.ckpt
+*.h5
+*.pkl
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..19da1ae
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,37 @@
+FROM continuumio/miniconda3:latest
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Update the base system and install the command-line tools used below (git, wget, procps, coreutils)
+RUN apt-get update -y \
+    && apt-get -y upgrade --fix-missing \
+    && apt-get -y install git procps coreutils wget \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+# Clone the gvp repository into /workspace/gvp and check out pocket_pred
+RUN git clone https://github.com/Mickdub/gvp.git \
+    && cd gvp \
+    && git checkout pocket_pred
+
+# Create the pocketminer conda environment (Python 3.9) and install dependencies from conda-forge
+RUN conda create -n pocketminer python=3.9 -y && \
+    conda install -n pocketminer -c conda-forge \
+    numpy scipy pandas tensorflow tqdm mdtraj pyyaml -y && \
+    conda clean -afy
+
+# Put the environment's bin directory first on PATH so `python` resolves to the pocketminer env
+ENV PATH=/opt/conda/envs/pocketminer/bin:$PATH
+ENV CONDA_DEFAULT_ENV=pocketminer
+
+# Copy entrypoint script
+COPY entrypoint.py /workspace/entrypoint.py
+RUN chmod +x /workspace/entrypoint.py
+
+# Add gvp/src to the Python path; an inherited PYTHONPATH is appended only when set (no trailing ':')
+ENV PYTHONPATH=/workspace/gvp/src${PYTHONPATH:+:$PYTHONPATH}
+
+# Default command: print the entrypoint's usage
+CMD ["python", "/workspace/entrypoint.py", "--help"]
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..9fa5d02
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,66 @@
+.PHONY: help build run test clean shell download-example
+
+# Default target (first rule in the file): print the list of available targets
+help:
+	@echo "PocketMiner - Cryptic Pocket Prediction Tool"
+	@echo ""
+	@echo "Available targets:"
+	@echo "  make build       - Build Docker image (conda-based, includes all dependencies)"
+	@echo "  make run         - Run test prediction (requires test.pdb)"
+	@echo "  make test        - Run Nextflow pipeline on test data"
+	@echo "  make clean       - Clean up generated files"
+	@echo "  make shell       - Open shell in Docker container"
+	@echo "  make download-example - Download example PDB file"
+	@echo ""
+
+# Build the Docker image from the Dockerfile in this directory and tag it pocketminer:latest
+build:
+	@echo "Building PocketMiner Docker image (conda-based)..."
+	docker build -t pocketminer:latest .
+	@echo "Build complete!"
+
+# Run a single test prediction on ./test.pdb; CURDIR (not PWD) keeps the mount right under `make -C`
+run:
+	@if [ ! -f test.pdb ]; then \
+		echo "Error: test.pdb not found. Please provide a test PDB file."; \
+		exit 1; \
+	fi
+	@echo "Running PocketMiner prediction on test.pdb..."
+	docker run --rm \
+		-v "$(CURDIR)":/data \
+		pocketminer:latest \
+		python /workspace/entrypoint.py \
+			--pdb /data/test.pdb \
+			--output-folder /data/output \
+			--output-name test
+	@echo "Results saved to output/"
+
+# Run the Nextflow pipeline (main.nf) with its default parameters
+test:
+	@echo "Running Nextflow pipeline..."
+	nextflow run main.nf
+	@echo "Pipeline complete!"
+
+# Clean generated files; only names written by entrypoint.py are removed, other *.txt/*.npy are kept
+clean:
+	@echo "Cleaning up..."
+	rm -rf output/
+	rm -rf .nextflow/
+	rm -f .nextflow.log*
+	rm -rf work/
+	rm -rf results/
+	rm -f *-preds.npy *-predictions.txt *_X.npy *_S.npy *_mask.npy
+	@echo "Clean complete!"
+
+# Open an interactive shell in the container with this directory mounted at /data
+shell:
+	docker run --rm -it \
+		-v "$(CURDIR)":/data \
+		pocketminer:latest \
+		/bin/bash
+
+# Download example PDB (needs internet); goes through a temp file so a failed download never truncates test.pdb
+download-example:
+	@echo "Downloading example PDB (1HSG - HIV protease)..."
+	wget -O test.pdb.tmp https://files.rcsb.org/download/1HSG.pdb && mv -f test.pdb.tmp test.pdb || { rm -f test.pdb.tmp; exit 1; }
+	@echo "Example downloaded as test.pdb"
diff --git a/entrypoint.py b/entrypoint.py
new file mode 100644
index 0000000..5cb46be
--- /dev/null
+++ b/entrypoint.py
@@ -0,0 +1,293 @@
+#!/usr/bin/env python3
+"""
+PocketMiner Entrypoint - Command-line wrapper for cryptic pocket prediction
+
+This script wraps the PocketMiner xtal_predict.py functionality with a proper
+command-line interface for Nextflow/Docker integration.
+"""
+
+import argparse
+import json
+import os
+import sys
+import numpy as np
+from pathlib import Path
+import warnings
+
+# Suppress TensorFlow warnings
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
+
+# Import PocketMiner components
+sys.path.insert(0, '/workspace/gvp/src')
+
+try:
+    import tensorflow as tf
+    import mdtraj as md
+    from models import MQAModel
+    from util import load_checkpoint
+    from validate_performance_on_xtals import process_strucs, predict_on_xtals
+except ImportError as e:
+    print(f"Error importing PocketMiner modules: {e}", file=sys.stderr)
+    print("Please ensure the GVP repository is properly cloned and models are available.", file=sys.stderr)
+    sys.exit(1)
+
+
+def load_model(model_path, dropout=0.1, num_layers=4, hidden_dim=100):
+    """Construct an MQAModel and load the checkpoint at `model_path` into it"""
+    # These hyper-parameters have to agree with the ones the checkpoint was saved with
+    architecture = dict(
+        node_features=(8, 50),
+        edge_features=(1, 32),
+        hidden_dim=(16, hidden_dim),  # (16, 100) for pocketminer checkpoint
+        num_layers=num_layers,
+        dropout=dropout,
+    )
+    network = MQAModel(**architecture)
+
+    # load_checkpoint takes an optimizer alongside the model, so hand it a fresh Adam
+    load_checkpoint(network, tf.keras.optimizers.Adam(), model_path)
+
+    return network
+
+
+def make_predictions(pdb_file, model, model_path, output_folder, output_name, debug=False):
+    """Predict per-residue pocket scores for one PDB file, write the result files, return the summary dict"""
+
+    # Load structure using mdtraj
+    try:
+        struc = md.load(pdb_file)
+        strucs = [struc]
+    except Exception as e:
+        raise ValueError(f"Failed to load PDB file {pdb_file}: {e}")
+
+    # Process structure to get features
+    X, S, mask = process_strucs(strucs)
+
+    # Get predictions using PocketMiner model
+    predictions = predict_on_xtals(model, model_path, X, S, mask)
+
+    # Extract predictions for the single structure
+    # predictions presumably has shape (batch, max_length) - TODO confirm against predict_on_xtals
+    pred_array = predictions[0]  # First (and only) structure
+    mask_array = mask[0]  # Corresponding mask
+
+    # Convert TensorFlow tensors to NumPy arrays explicitly
+    if hasattr(pred_array, 'numpy'):
+        pred_array = pred_array.numpy()
+    if hasattr(mask_array, 'numpy'):
+        mask_array = mask_array.numpy()
+
+    # Ensure arrays are NumPy (in case they weren't TensorFlow tensors)
+    pred_array = np.asarray(pred_array)
+    mask_array = np.asarray(mask_array)
+
+    # Keep only the residues whose mask entry is positive
+    valid_residues = mask_array > 0
+    pred_valid = pred_array[valid_residues]
+
+    # Save outputs
+    output_path = Path(output_folder)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    # Save binary predictions (valid residues only, i.e. the masked-out entries are not written)
+    pred_file = output_path / f"{output_name}-preds.npy"
+    np.save(pred_file, pred_valid)
+
+    # Save human-readable predictions
+    txt_file = output_path / f"{output_name}-predictions.txt"
+    np.savetxt(txt_file, pred_valid, fmt='%.4f')
+
+    # Calculate summary statistics
+    cryptic_pocket_score = float(np.mean(pred_valid))
+    high_confidence_residues = int(np.sum(pred_valid > 0.7))
+    medium_confidence_residues = int(np.sum((pred_valid > 0.4) & (pred_valid <= 0.7)))
+
+    # Save debug features if requested
+    if debug:
+        np.save(output_path / f"{output_name}_X.npy", X)
+        np.save(output_path / f"{output_name}_S.npy", S)
+        np.save(output_path / f"{output_name}_mask.npy", mask)
+
+    # Cluster residues scoring above 0.5 (note: a lower cutoff than the 0.7 used for "high confidence")
+    pocket_clusters = cluster_residues(pred_valid, threshold=0.5)
+
+    # Generate summary JSON
+    summary = {
+        "cryptic_pocket_score": cryptic_pocket_score,
+        "high_confidence_residues": high_confidence_residues,
+        "medium_confidence_residues": medium_confidence_residues,
+        "total_residues": len(pred_valid),
+        "pocket_clusters": pocket_clusters,
+        "output_files": {
+            "predictions_npy": str(pred_file),
+            "predictions_txt": str(txt_file)
+        }
+    }
+
+    summary_file = output_path / f"{output_name}-summary.json"
+    with open(summary_file, 'w') as f:
+        json.dump(summary, f, indent=2)
+
+    return summary
+
+
+def cluster_residues(predictions, threshold=0.5, min_cluster_size=3):
+    """
+    Group residues scoring above `threshold` into runs that are close in sequence
+
+    Closeness is judged on array index only; no 3D coordinates are involved.
+
+    Args:
+        predictions: per-residue scores (array-like, or an object with .numpy())
+        threshold: a residue is selected when its score is strictly greater
+        min_cluster_size: runs with fewer selected residues are dropped
+
+    Returns:
+        list of dicts with "residue_indices", "size" and "average_score",
+        ordered from highest to lowest average score
+    """
+    # Accept TensorFlow tensors as well as anything np.asarray understands
+    if hasattr(predictions, 'numpy'):
+        predictions = predictions.numpy()
+    predictions = np.asarray(predictions)
+
+    hits = [int(i) for i in np.where(predictions > threshold)[0]]
+    if not hits:
+        return []
+
+    clusters = []
+
+    def flush(run):
+        # Keep `run` as a cluster only if it reaches the minimum size
+        if len(run) < min_cluster_size:
+            return
+        clusters.append({
+            "residue_indices": run,
+            "size": len(run),
+            "average_score": float(np.mean(predictions[np.array(run)]))
+        })
+
+    # Walk the selected indices in ascending order, growing one run at a time
+    run = [hits[0]]
+    for idx in hits[1:]:
+        # An index step of at most 2 extends the run (at most one residue skipped)
+        if idx - run[-1] <= 2:
+            run.append(idx)
+        else:
+            flush(run)
+            run = [idx]
+    flush(run)
+
+    # Best-scoring clusters first
+    clusters.sort(key=lambda c: c['average_score'], reverse=True)
+
+    return clusters
+
+
+def main():
+    """Parse the command line, check the inputs, run the prediction and print a summary"""
+    cli = argparse.ArgumentParser(
+        description='PocketMiner: Predict cryptic binding pockets in protein structures'
+    )
+    cli.add_argument(
+        '--pdb',
+        help='Input PDB file path',
+        required=True
+    )
+
+    cli.add_argument(
+        '--output-folder',
+        help='Output directory for results (default: current directory)',
+        default='.'
+    )
+
+    cli.add_argument(
+        '--output-name',
+        help='Base name for output files',
+        required=True
+    )
+
+    cli.add_argument(
+        '--model-path',
+        help='Path to pre-trained model checkpoint',
+        default='/workspace/gvp/models/pocketminer'
+    )
+
+    cli.add_argument(
+        '--debug',
+        help='Save debug features (X, S, mask arrays)',
+        action='store_true'
+    )
+
+    cli.add_argument(
+        '--dropout',
+        help='Model dropout rate (default: 0.1)',
+        type=float,
+        default=0.1
+    )
+
+    cli.add_argument(
+        '--num-layers',
+        help='Number of model layers (default: 4)',
+        type=int,
+        default=4
+    )
+
+    cli.add_argument(
+        '--hidden-dim',
+        help='Hidden dimension size (default: 100)',
+        type=int,
+        default=100
+    )
+
+    opts = cli.parse_args()
+
+    # The input structure must exist before any model loading is attempted
+    if not os.path.exists(opts.pdb):
+        print(f"Error: PDB file not found: {opts.pdb}", file=sys.stderr)
+        sys.exit(1)
+
+    # model_path is a checkpoint prefix rather than a directory, so probe for its .index file
+    index_file = f"{opts.model_path}.index"
+    if not os.path.exists(index_file):
+        print(f"Error: Model checkpoint not found: {opts.model_path}", file=sys.stderr)
+        print(f"Looking for: {index_file}", file=sys.stderr)
+        print("Please ensure the pre-trained model is available.", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Loading PocketMiner model from {opts.model_path}...")
+    model = load_model(
+        opts.model_path,
+        dropout=opts.dropout,
+        num_layers=opts.num_layers,
+        hidden_dim=opts.hidden_dim
+    )
+
+    print(f"Processing structure: {opts.pdb}")
+    report = make_predictions(
+        pdb_file=opts.pdb,
+        model=model,
+        model_path=opts.model_path,
+        output_folder=opts.output_folder,
+        output_name=opts.output_name,
+        debug=opts.debug
+    )
+
+    rule = "=" * 60
+    print("\n" + rule)
+    print("PocketMiner Prediction Summary")
+    print(rule)
+    print(f"Overall cryptic pocket score: {report['cryptic_pocket_score']:.4f}")
+    print(f"High confidence residues (>0.7): {report['high_confidence_residues']}")
+    print(f"Medium confidence residues (0.4-0.7): {report['medium_confidence_residues']}")
+    print(f"Total residues analyzed: {report['total_residues']}")
+    print(f"\nPocket clusters identified: {len(report['pocket_clusters'])}")
+    for rank, cluster in enumerate(report['pocket_clusters'][:5], 1):
+        print(f"  Cluster {rank}: {cluster['size']} residues, score={cluster['average_score']:.4f}")
+
+    print(f"\nResults saved to: {opts.output_folder}")
+    print(rule + "\n")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/main.nf b/main.nf
new file mode 100644
index 0000000..a7bfe58
--- /dev/null
+++ b/main.nf
@@ -0,0 +1,44 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl=2
+
+// Default parameters; each can be overridden at launch (e.g. --pdb <path>)
+params.pdb = '/omic/eureka/Pocketminer/1HSG.pdb'
+params.outdir = '/omic/eureka/Pocketminer/output'
+params.model_path = '/workspace/gvp/models/pocketminer'
+params.debug = false
+
+// Process: run entrypoint.py on one staged PDB file and publish the result files to params.outdir
+process POCKETMINER {
+    container 'harbor.cluster.omic.ai/omic/pocketminer:latest'
+    publishDir params.outdir, mode: 'copy'
+    stageInMode 'copy'  // stage the input by copying it into the task directory
+
+    input:
+    path pdb_file
+
+    output:
+    path "*-preds.npy", emit: predictions_npy
+    path "*-predictions.txt", emit: predictions_txt
+    path "*-summary.json", emit: summary
+    path "*_X.npy", optional: true, emit: features_debug
+    path "*_S.npy", optional: true, emit: sequence_debug
+    path "*_mask.npy", optional: true, emit: mask_debug
+
+    script:
+    def pdb_basename = pdb_file.baseName
+    def debug_flag = params.debug ? '--debug' : ''
+    """
+    python /workspace/entrypoint.py \\
+        --pdb ${pdb_file} \\
+        --output-folder . \\
+        --output-name ${pdb_basename} \\
+        --model-path ${params.model_path} \\
+        ${debug_flag}
+    """
+}
+
+// Workflow: run POCKETMINER on the single input PDB, failing at launch if the file does not exist
+workflow {
+    POCKETMINER(Channel.of(file(params.pdb, checkIfExists: true)))
+}
diff --git a/meta.yml b/meta.yml
new file mode 100644
index 0000000..e87acff
--- /dev/null
+++ b/meta.yml
@@ -0,0 +1,42 @@
+params:
+  - outdir:
+      type: file
+      description: path where output files will be deposited
+      required: true
+  - model_path:
+      type: file
+      description: path to pre-trained PocketMiner model checkpoint
+      default: /workspace/gvp/models/pocketminer
+      required: false
+  - debug:
+      type: boolean
+      description: save debug features (X, S, mask arrays)
+      default: false
+      required: false
+input:
+  - pdb:
+      type: file
+      description: PDB file path for protein structure
+  - dynamics_optional:
+      type: file
+      description: optional MD trajectory or ensemble of conformers for enhanced cryptic pocket detection
+      required: false
+output:
+  - predictions_npy:
+      type: file
+      description: NumPy binary file containing per-residue cryptic pocket scores
+  - predictions_txt:
+      type: file
+      description: human-readable text file with per-residue cryptic pocket scores (4 decimal places)
+  - summary:
+      type: file
+      description: JSON file containing overall cryptic pocket score, high/medium confidence residue counts, pocket clusters, and metadata
+  - features_debug:
+      type: file
+      description: (optional) protein features array for debugging
+  - sequence_debug:
+      type: file
+      description: (optional) sequence data array for debugging
+  - mask_debug:
+      type: file
+      description: (optional) masking array for debugging
diff --git a/nextflow.config b/nextflow.config
new file mode 100644
index 0000000..065a56f
--- /dev/null
+++ b/nextflow.config
@@ -0,0 +1,36 @@
+profiles {
+    standard {  // profile used when no -profile option is given
+        docker {
+            enabled = true
+            temp = 'auto'  // fresh temporary directory mounted as /tmp for each container
+        }
+    }
+
+    k8s {
+        process {
+            executor = 'k8s'  // submit tasks to Kubernetes
+        }
+        docker {
+            enabled = true
+        }
+        k8s {
+            storageClaimName = 'eureka-pvc'  // persistent volume claim used for pipeline data
+            storageMountPath = '/omic/eureka'  // where that claim is mounted
+        }
+    }
+
+    k8s_gpu {  // same as k8s, plus GPU node selection and a GPU request
+        process {
+            executor = 'k8s'
+            pod = [[nodeSelector: 'nvidia.com/gpu.present=true']]  // schedule only on nodes with this label
+            accelerator = [request: 1, type: 'nvidia.com/gpu']  // one nvidia.com/gpu per task
+        }
+        docker {
+            enabled = true
+        }
+        k8s {
+            storageClaimName = 'eureka-pvc'
+            storageMountPath = '/omic/eureka'
+        }
+    }
+}
diff --git a/params.json b/params.json
new file mode 100644
index 0000000..f299235
--- /dev/null
+++ b/params.json
@@ -0,0 +1,51 @@
+{
+    "params": {
+        "pdb": {
+            "type": "file",
+            "description": "Path to input PDB file for cryptic pocket prediction",
+            "default": "s3://omic/eureka/Pocketminer/1HSG.pdb",
+            "required": true,
+            "pipeline_io": "input",
+            "var_name": "params.pdb",
+            "examples": [
+                "s3://omic/eureka/Pocketminer/1HSG.pdb",
+                "s3://omic/eureka/Pocketminer/protein.pdb"
+            ],
+            "pattern": ".*\\.pdb$",
+            "enum": [],
+            "validation": {},
+            "notes": "PDB file containing the protein structure for cryptic binding pocket prediction."
+        },
+        "outdir": {
+            "type": "folder",
+            "description": "Output directory for PocketMiner prediction results",
+            "default": "s3://omic/eureka/Pocketminer/output",
+            "required": true,
+            "pipeline_io": "output",
+            "var_name": "params.outdir",
+            "examples": [
+                "s3://omic/eureka/Pocketminer/output",
+                "s3://omic/eureka/Pocketminer/results"
+            ],
+            "pattern": ".*",
+            "enum": [],
+            "validation": {},
+            "notes": "Directory where prediction results (numpy arrays, text predictions, and JSON summary) will be stored."
+        },
+        "debug": {
+            "type": "boolean",
+            "description": "Save debug features (X, S, mask arrays)",
+            "default": false,
+            "required": false,
+            "pipeline_io": "parameter",
+            "var_name": "params.debug",
+            "examples": [
+                false,
+                true
+            ],
+            "enum": [true, false],
+            "validation": {},
+            "notes": "Enable to save intermediate feature arrays for debugging purposes."
+        }
+    }
+}