Add WES pipeline configuration for pocketminer

- Add Nextflow pipeline (main.nf) with Harbor container image
- Add nextflow.config with k8s/k8s_gpu/standard profiles
- Add params.json for TRS/WES parameter discovery
- Add Dockerfile, entrypoint.py, meta.yml from original implementation
- Update paths to use /omic/eureka/Pocketminer/ convention
- Update .gitignore to allow params.json
2026-03-23 13:27:40 +01:00
parent 6071e5ad1f
commit 42d4e6cb87
8 changed files with 606 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,37 @@
+# Nextflow
+.nextflow/
+.nextflow.log*
+work/
+results/
+*.html
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+*.egg-info/
+dist/
+build/
+
+# Data
+*.pdb
+*.npy
+output/
+data/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Docker
+# NOTE(review): this ignores the .dockerignore file itself, so any build-context
+# exclusions stay local to one checkout — confirm this is intended.
+.dockerignore
+
+# Models (large files)
+models/
+*.ckpt
+*.h5
+*.pkl
--- a/Dockerfile
+++ b/Dockerfile
@@ -0,0 +1,37 @@
+# Base image tag is pinned so that rebuilding the same commit yields the same image.
+# Override with: docker build --build-arg MINICONDA_VERSION=<tag> .
+ARG MINICONDA_VERSION=24.1.2-0
+FROM continuumio/miniconda3:${MINICONDA_VERSION}
+
+# Build-time only: keeps apt non-interactive without leaking into the runtime env
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install basic dependencies.
+# No blanket `apt-get upgrade`: OS updates come from bumping MINICONDA_VERSION,
+# which keeps the layer reproducible. ca-certificates is listed explicitly because
+# --no-install-recommends would otherwise not guarantee it for the HTTPS clone below.
+RUN apt-get update -y \
+    && apt-get -y install --no-install-recommends \
+        ca-certificates \
+        coreutils \
+        git \
+        procps \
+        wget \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+# Clone the pocket_pred branch of the PocketMiner (GVP) repository into /workspace/gvp
+RUN git clone --branch pocket_pred --single-branch https://github.com/Mickdub/gvp.git
+
+# Create conda environment and install dependencies
+RUN conda create -n pocketminer python=3.9 -y && \
+    conda install -n pocketminer -c conda-forge \
+    numpy scipy pandas tensorflow tqdm mdtraj pyyaml -y && \
+    conda clean -afy
+
+# Put the environment's binaries first on PATH so plain `python` resolves to it
+ENV PATH=/opt/conda/envs/pocketminer/bin:$PATH
+ENV CONDA_DEFAULT_ENV=pocketminer
+
+# Copy entrypoint script
+COPY entrypoint.py /workspace/entrypoint.py
+RUN chmod +x /workspace/entrypoint.py
+
+# Make the gvp/src modules importable.
+# Set without appending $PYTHONPATH: assuming the base image leaves it undefined,
+# appending would leave a trailing ':' that adds the current directory to the
+# import path.
+ENV PYTHONPATH=/workspace/gvp/src
+
+# Default command
+CMD ["python", "/workspace/entrypoint.py", "--help"]
--- a/Makefile
+++ b/Makefile
@@ -0,0 +1,66 @@
+# Every target here is a command, not a file, so all of them are declared phony;
+# otherwise a stray file named e.g. `shell` would stop that target from running.
+.PHONY: help build run test clean shell download-example
+
+# Default target
+help:
+	@echo "PocketMiner - Cryptic Pocket Prediction Tool"
+	@echo ""
+	@echo "Available targets:"
+	@echo "  make build       - Build Docker image (conda-based, includes all dependencies)"
+	@echo "  make run         - Run test prediction (requires test.pdb)"
+	@echo "  make test        - Run Nextflow pipeline on test data"
+	@echo "  make clean       - Clean up generated files"
+	@echo "  make shell       - Open shell in Docker container"
+	@echo "  make download-example - Download example PDB file"
+	@echo ""
+
+# Build Docker image (conda-based with all dependencies)
+build:
+	@echo "Building PocketMiner Docker image (conda-based)..."
+	docker build -t pocketminer:latest .
+	@echo "Build complete!"
+
+# Run single test prediction.
+# $(CURDIR) is set by make itself, so the bind mount stays correct under
+# `make -C <dir>`, where the inherited $(PWD) still names the caller's directory.
+run:
+	@if [ ! -f test.pdb ]; then \
+		echo "Error: test.pdb not found. Please provide a test PDB file."; \
+		exit 1; \
+	fi
+	@echo "Running PocketMiner prediction on test.pdb..."
+	docker run --rm \
+		-v "$(CURDIR)":/data \
+		pocketminer:latest \
+		python /workspace/entrypoint.py \
+			--pdb /data/test.pdb \
+			--output-folder /data/output \
+			--output-name test
+	@echo "Results saved to output/"
+
+# Run Nextflow pipeline
+test:
+	@echo "Running Nextflow pipeline..."
+	nextflow run main.nf
+	@echo "Pipeline complete!"
+
+# Clean generated files.
+# Text/JSON removal is limited to the names entrypoint.py writes
+# (<name>-predictions.txt, <name>-summary.json) so unrelated *.txt files survive.
+clean:
+	@echo "Cleaning up..."
+	rm -rf output/
+	rm -rf .nextflow/
+	rm -f .nextflow.log*
+	rm -rf work/
+	rm -rf results/
+	rm -f *.npy *-predictions.txt *-summary.json
+	@echo "Clean complete!"
+
+# Open shell in container
+shell:
+	docker run --rm -it \
+		-v "$(CURDIR)":/data \
+		pocketminer:latest \
+		/bin/bash
+
+# Download example PDB (if internet available).
+# The file is fetched under a temporary name and renamed only on success, so a
+# failed download never leaves an empty test.pdb that `make run` would accept.
+download-example:
+	@echo "Downloading example PDB (1HSG - HIV protease)..."
+	wget -O test.pdb.tmp https://files.rcsb.org/download/1HSG.pdb \
+		&& mv test.pdb.tmp test.pdb \
+		|| { rm -f test.pdb.tmp; exit 1; }
+	@echo "Example downloaded as test.pdb"
--- a/entrypoint.py
+++ b/entrypoint.py
@@ -0,0 +1,293 @@
+#!/usr/bin/env python3
+"""
+PocketMiner Entrypoint - Command-line wrapper for cryptic pocket prediction
+
+This script wraps the PocketMiner xtal_predict.py functionality with a proper
+command-line interface for Nextflow/Docker integration.
+"""
+
+import argparse
+import json
+import os
+import sys
+import numpy as np
+from pathlib import Path
+import warnings
+
+# Suppress TensorFlow warnings
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
+
+# Import PocketMiner components
+sys.path.insert(0, '/workspace/gvp/src')
+
+try:
+    import tensorflow as tf
+    import mdtraj as md
+    from models import MQAModel
+    from util import load_checkpoint
+    from validate_performance_on_xtals import process_strucs, predict_on_xtals
+except ImportError as e:
+    print(f"Error importing PocketMiner modules: {e}", file=sys.stderr)
+    print("Please ensure the GVP repository is properly cloned and models are available.", file=sys.stderr)
+    sys.exit(1)
+
+
+def load_model(model_path, dropout=0.1, num_layers=4, hidden_dim=100):
+    """Load pre-trained PocketMiner model"""
+    # Model architecture from original PocketMiner (must match checkpoint exactly)
+    model = MQAModel(
+        node_features=(8, 50),
+        edge_features=(1, 32),
+        hidden_dim=(16, hidden_dim),  # (16, 100) for pocketminer checkpoint
+        num_layers=num_layers,
+        dropout=dropout
+    )
+
+    # Load checkpoint
+    opt = tf.keras.optimizers.Adam()
+    load_checkpoint(model, opt, model_path)
+
+    return model
+
+
+def make_predictions(pdb_file, model, model_path, output_folder, output_name, debug=False):
+    """Make cryptic pocket predictions for a PDB structure"""
+
+    # Load structure using mdtraj
+    try:
+        struc = md.load(pdb_file)
+        strucs = [struc]
+    except Exception as e:
+        raise ValueError(f"Failed to load PDB file {pdb_file}: {e}")
+
+    # Process structure to get features
+    X, S, mask = process_strucs(strucs)
+
+    # Get predictions using PocketMiner model
+    predictions = predict_on_xtals(model, model_path, X, S, mask)
+
+    # Extract predictions for the single structure
+    # predictions shape: (batch, max_length)
+    pred_array = predictions[0]  # First (and only) structure
+    mask_array = mask[0]  # Corresponding mask
+
+    # Convert TensorFlow tensors to NumPy arrays explicitly
+    if hasattr(pred_array, 'numpy'):
+        pred_array = pred_array.numpy()
+    if hasattr(mask_array, 'numpy'):
+        mask_array = mask_array.numpy()
+
+    # Ensure arrays are NumPy (in case they weren't TensorFlow tensors)
+    pred_array = np.asarray(pred_array)
+    mask_array = np.asarray(mask_array)
+
+    # Get only valid (masked) residues
+    valid_residues = mask_array > 0
+    pred_valid = pred_array[valid_residues]
+
+    # Save outputs
+    output_path = Path(output_folder)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    # Save binary predictions (full array with padding)
+    pred_file = output_path / f"{output_name}-preds.npy"
+    np.save(pred_file, pred_valid)
+
+    # Save human-readable predictions
+    txt_file = output_path / f"{output_name}-predictions.txt"
+    np.savetxt(txt_file, pred_valid, fmt='%.4f')
+
+    # Calculate summary statistics
+    cryptic_pocket_score = float(np.mean(pred_valid))
+    high_confidence_residues = int(np.sum(pred_valid > 0.7))
+    medium_confidence_residues = int(np.sum((pred_valid > 0.4) & (pred_valid <= 0.7)))
+
+    # Save debug features if requested
+    if debug:
+        np.save(output_path / f"{output_name}_X.npy", X)
+        np.save(output_path / f"{output_name}_S.npy", S)
+        np.save(output_path / f"{output_name}_mask.npy", mask)
+
+    # Cluster high-confidence residues
+    pocket_clusters = cluster_residues(pred_valid, threshold=0.5)
+
+    # Generate summary JSON
+    summary = {
+        "cryptic_pocket_score": cryptic_pocket_score,
+        "high_confidence_residues": high_confidence_residues,
+        "medium_confidence_residues": medium_confidence_residues,
+        "total_residues": len(pred_valid),
+        "pocket_clusters": pocket_clusters,
+        "output_files": {
+            "predictions_npy": str(pred_file),
+            "predictions_txt": str(txt_file)
+        }
+    }
+
+    summary_file = output_path / f"{output_name}-summary.json"
+    with open(summary_file, 'w') as f:
+        json.dump(summary, f, indent=2)
+
+    return summary
+
+
+def cluster_residues(predictions, threshold=0.5, min_cluster_size=3):
+    """
+    Cluster high-scoring residues into spatial pockets
+
+    Returns list of clusters with residue indices and average scores
+    """
+    # Ensure predictions is a pure NumPy array
+    if hasattr(predictions, 'numpy'):
+        predictions = predictions.numpy()
+    predictions = np.asarray(predictions)
+
+    high_score_idx = np.where(predictions > threshold)[0]
+
+    if len(high_score_idx) == 0:
+        return []
+
+    # Simple sequential clustering (assumes residues are ordered by sequence)
+    # More sophisticated spatial clustering would require 3D coordinates
+    clusters = []
+    current_cluster = [int(high_score_idx[0])]  # Convert to Python int
+
+    for idx in high_score_idx[1:]:
+        idx = int(idx)  # Convert to Python int
+        if idx - current_cluster[-1] <= 2:  # Allow 2-residue gaps
+            current_cluster.append(idx)
+        else:
+            if len(current_cluster) >= min_cluster_size:
+                # Use NumPy array indexing for safety
+                cluster_indices = np.array(current_cluster)
+                cluster_score = float(np.mean(predictions[cluster_indices]))
+                clusters.append({
+                    "residue_indices": current_cluster,
+                    "size": len(current_cluster),
+                    "average_score": cluster_score
+                })
+            current_cluster = [idx]
+
+    # Add final cluster
+    if len(current_cluster) >= min_cluster_size:
+        cluster_indices = np.array(current_cluster)
+        cluster_score = float(np.mean(predictions[cluster_indices]))
+        clusters.append({
+            "residue_indices": current_cluster,
+            "size": len(current_cluster),
+            "average_score": cluster_score
+        })
+
+    # Sort by score
+    clusters.sort(key=lambda x: x['average_score'], reverse=True)
+
+    return clusters
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='PocketMiner: Predict cryptic binding pockets in protein structures'
+    )
+
+    parser.add_argument(
+        '--pdb',
+        required=True,
+        help='Input PDB file path'
+    )
+
+    parser.add_argument(
+        '--output-folder',
+        default='.',
+        help='Output directory for results (default: current directory)'
+    )
+
+    parser.add_argument(
+        '--output-name',
+        required=True,
+        help='Base name for output files'
+    )
+
+    parser.add_argument(
+        '--model-path',
+        default='/workspace/gvp/models/pocketminer',
+        help='Path to pre-trained model checkpoint'
+    )
+
+    parser.add_argument(
+        '--debug',
+        action='store_true',
+        help='Save debug features (X, S, mask arrays)'
+    )
+
+    parser.add_argument(
+        '--dropout',
+        type=float,
+        default=0.1,
+        help='Model dropout rate (default: 0.1)'
+    )
+
+    parser.add_argument(
+        '--num-layers',
+        type=int,
+        default=4,
+        help='Number of model layers (default: 4)'
+    )
+
+    parser.add_argument(
+        '--hidden-dim',
+        type=int,
+        default=100,
+        help='Hidden dimension size (default: 100)'
+    )
+
+    args = parser.parse_args()
+
+    # Validate inputs
+    if not os.path.exists(args.pdb):
+        print(f"Error: PDB file not found: {args.pdb}", file=sys.stderr)
+        sys.exit(1)
+
+    # Check if model checkpoint files exist (model_path is a prefix, not a directory)
+    model_index = f"{args.model_path}.index"
+    if not os.path.exists(model_index):
+        print(f"Error: Model checkpoint not found: {args.model_path}", file=sys.stderr)
+        print(f"Looking for: {model_index}", file=sys.stderr)
+        print("Please ensure the pre-trained model is available.", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Loading PocketMiner model from {args.model_path}...")
+    model = load_model(
+        args.model_path,
+        dropout=args.dropout,
+        num_layers=args.num_layers,
+        hidden_dim=args.hidden_dim
+    )
+
+    print(f"Processing structure: {args.pdb}")
+    summary = make_predictions(
+        pdb_file=args.pdb,
+        model=model,
+        model_path=args.model_path,
+        output_folder=args.output_folder,
+        output_name=args.output_name,
+        debug=args.debug
+    )
+
+    print("\n" + "="*60)
+    print("PocketMiner Prediction Summary")
+    print("="*60)
+    print(f"Overall cryptic pocket score: {summary['cryptic_pocket_score']:.4f}")
+    print(f"High confidence residues (>0.7): {summary['high_confidence_residues']}")
+    print(f"Medium confidence residues (0.4-0.7): {summary['medium_confidence_residues']}")
+    print(f"Total residues analyzed: {summary['total_residues']}")
+    print(f"\nPocket clusters identified: {len(summary['pocket_clusters'])}")
+
+    for i, cluster in enumerate(summary['pocket_clusters'][:5], 1):
+        print(f"  Cluster {i}: {cluster['size']} residues, score={cluster['average_score']:.4f}")
+
+    print(f"\nResults saved to: {args.output_folder}")
+    print("="*60 + "\n")
+
+
+if __name__ == '__main__':
+    main()
--- a/main.nf
+++ b/main.nf
@@ -0,0 +1,44 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl=2
+
+// Parameters
+params.pdb = '/omic/eureka/Pocketminer/1HSG.pdb'
+params.outdir = '/omic/eureka/Pocketminer/output'
+params.model_path = '/workspace/gvp/models/pocketminer'
+params.debug = false
+
+// Process definition
+// Runs entrypoint.py on one PDB file inside the pocketminer container and
+// publishes the prediction files to params.outdir.
+process POCKETMINER {
+    container 'harbor.cluster.omic.ai/omic/pocketminer:latest'
+    publishDir params.outdir, mode: 'copy'
+    stageInMode 'copy'
+
+    input:
+    path pdb_file
+
+    output:
+    path "*-preds.npy", emit: predictions_npy
+    path "*-predictions.txt", emit: predictions_txt
+    path "*-summary.json", emit: summary
+    // The three arrays below are only written when params.debug is set
+    path "*_X.npy", optional: true, emit: features_debug
+    path "*_S.npy", optional: true, emit: sequence_debug
+    path "*_mask.npy", optional: true, emit: mask_debug
+
+    script:
+    def pdb_basename = pdb_file.baseName
+    def debug_flag = params.debug ? '--debug' : ''
+    // Values are double-quoted for the shell so that file names or paths
+    // containing spaces are passed as a single argument.
+    """
+    python /workspace/entrypoint.py \\
+        --pdb "${pdb_file}" \\
+        --output-folder . \\
+        --output-name "${pdb_basename}" \\
+        --model-path "${params.model_path}" \\
+        ${debug_flag}
+    """
+}
+
+// Workflow
+workflow {
+    // checkIfExists aborts the run up front with a clear error when params.pdb
+    // does not resolve to an existing file, instead of failing at staging time.
+    POCKETMINER(Channel.fromPath(params.pdb, checkIfExists: true))
+}
--- a/meta.yml
+++ b/meta.yml
@@ -0,0 +1,42 @@
+# Parameter, input and output description for the PocketMiner pipeline.
+# The output names below match the `emit:` names of the POCKETMINER process in main.nf.
+params:
+  # NOTE(review): typed `file` here, while params.json declares outdir as `folder` — confirm.
+  - outdir:
+      type: file
+      description: path where output files will be deposited
+      required: true
+  # entrypoint.py treats this value as a checkpoint prefix and looks for `<model_path>.index`.
+  - model_path:
+      type: file
+      description: path to pre-trained PocketMiner model checkpoint
+      default: /workspace/gvp/models/pocketminer
+      required: false
+  - debug:
+      type: boolean
+      description: save debug features (X, S, mask arrays)
+      default: false
+      required: false
+input:
+  - pdb:
+      type: file
+      description: PDB file path for protein structure
+  # NOTE(review): main.nf declares a single `path pdb_file` input and entrypoint.py has
+  # no matching option, so nothing consumes this entry — confirm whether it is planned.
+  - dynamics_optional:
+      type: file
+      description: optional MD trajectory or ensemble of conformers for enhanced cryptic pocket detection
+      required: false
+output:
+  - predictions_npy:
+      type: file
+      description: NumPy binary file containing per-residue cryptic pocket scores
+  - predictions_txt:
+      type: file
+      description: human-readable text file with per-residue cryptic pocket scores (4 decimal places)
+  - summary:
+      type: file
+      description: JSON file containing overall cryptic pocket score, high/medium confidence residue counts, pocket clusters, and metadata
+  # The three debug outputs are produced only when `debug` is true.
+  - features_debug:
+      type: file
+      description: (optional) protein features array for debugging
+  - sequence_debug:
+      type: file
+      description: (optional) sequence data array for debugging
+  - mask_debug:
+      type: file
+      description: (optional) masking array for debugging
--- a/nextflow.config
+++ b/nextflow.config
@@ -0,0 +1,36 @@
+// Execution profiles: local Docker, Kubernetes, and Kubernetes on GPU nodes.
+profiles {
+    // Docker on the launching host
+    standard {
+        docker.enabled = true
+        docker.temp = 'auto'
+    }
+
+    // Kubernetes executor using the eureka-pvc claim mounted at /omic/eureka
+    k8s {
+        process.executor = 'k8s'
+        docker.enabled = true
+        k8s.storageClaimName = 'eureka-pvc'
+        k8s.storageMountPath = '/omic/eureka'
+    }
+
+    // Same as k8s, plus a node selector and one nvidia.com/gpu accelerator request
+    k8s_gpu {
+        process.executor = 'k8s'
+        process.pod = [[nodeSelector: 'nvidia.com/gpu.present=true']]
+        process.accelerator = [request: 1, type: 'nvidia.com/gpu']
+        docker.enabled = true
+        k8s.storageClaimName = 'eureka-pvc'
+        k8s.storageMountPath = '/omic/eureka'
+    }
+}
--- a/params.json
+++ b/params.json
@@ -0,0 +1,51 @@
+{
+    "params": {
+        "pdb": {
+            "type": "file",
+            "description": "Path to input PDB file for cryptic pocket prediction",
+            "default": "s3://omic/eureka/Pocketminer/1HSG.pdb",
+            "required": true,
+            "pipeline_io": "input",
+            "var_name": "params.pdb",
+            "examples": [
+                "s3://omic/eureka/Pocketminer/1HSG.pdb",
+                "s3://omic/eureka/Pocketminer/protein.pdb"
+            ],
+            "pattern": ".*\\.pdb$",
+            "enum": [],
+            "validation": {},
+            "notes": "PDB file containing the protein structure for cryptic binding pocket prediction."
+        },
+        "outdir": {
+            "type": "folder",
+            "description": "Output directory for PocketMiner prediction results",
+            "default": "s3://omic/eureka/Pocketminer/output",
+            "required": true,
+            "pipeline_io": "output",
+            "var_name": "params.outdir",
+            "examples": [
+                "s3://omic/eureka/Pocketminer/output",
+                "s3://omic/eureka/Pocketminer/results"
+            ],
+            "pattern": ".*",
+            "enum": [],
+            "validation": {},
+            "notes": "Directory where prediction results (numpy arrays, text predictions, and JSON summary) will be stored."
+        },
+        "debug": {
+            "type": "boolean",
+            "description": "Save debug features (X, S, mask arrays)",
+            "default": false,
+            "required": false,
+            "pipeline_io": "parameter",
+            "var_name": "params.debug",
+            "examples": [
+                false,
+                true
+            ],
+            "enum": [true, false],
+            "validation": {},
+            "notes": "Enable to save intermediate feature arrays for debugging purposes."
+        }
+    }
+}