diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7bd8c4d --- /dev/null +++ b/.gitignore @@ -0,0 +1,37 @@ +# Nextflow +.nextflow/ +.nextflow.log* +work/ +results/ +*.html + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +*.egg-info/ +dist/ +build/ + +# Data +*.pdb +*.npy +output/ +data/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# Docker +.dockerignore + +# Models (large files) +models/ +*.ckpt +*.h5 +*.pkl diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..19da1ae --- /dev/null +++ b/Dockerfile @@ -0,0 +1,37 @@ +FROM continuumio/miniconda3:latest + +ARG DEBIAN_FRONTEND=noninteractive + +# Update and install basic dependencies +RUN apt-get update -y \ + && apt-get -y upgrade --fix-missing \ + && apt-get -y install git procps coreutils wget \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +# Clone PocketMiner repository +RUN git clone https://github.com/Mickdub/gvp.git \ + && cd gvp \ + && git checkout pocket_pred + +# Create conda environment and install dependencies +RUN conda create -n pocketminer python=3.9 -y && \ + conda install -n pocketminer -c conda-forge \ + numpy scipy pandas tensorflow tqdm mdtraj pyyaml -y && \ + conda clean -afy + +# Activate environment and set up PATH +ENV PATH=/opt/conda/envs/pocketminer/bin:$PATH +ENV CONDA_DEFAULT_ENV=pocketminer + +# Copy entrypoint script +COPY entrypoint.py /workspace/entrypoint.py +RUN chmod +x /workspace/entrypoint.py + +# Set Python path to include the gvp/src directory +ENV PYTHONPATH=/workspace/gvp/src:$PYTHONPATH + +# Default command +CMD ["python", "/workspace/entrypoint.py", "--help"] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..9fa5d02 --- /dev/null +++ b/Makefile @@ -0,0 +1,66 @@ +.PHONY: help build run test clean + +# Default target +help: + @echo "PocketMiner - Cryptic Pocket Prediction Tool" + @echo "" + @echo "Available targets:" + @echo " make build - Build Docker image 
(conda-based, includes all dependencies)" + @echo " make run - Run test prediction (requires test.pdb)" + @echo " make test - Run Nextflow pipeline on test data" + @echo " make clean - Clean up generated files" + @echo " make shell - Open shell in Docker container" + @echo " make download-example - Download example PDB file" + @echo "" + +# Build Docker image (conda-based with all dependencies) +build: + @echo "Building PocketMiner Docker image (conda-based)..." + docker build -t pocketminer:latest . + @echo "Build complete!" + +# Run single test prediction +run: + @if [ ! -f test.pdb ]; then \ + echo "Error: test.pdb not found. Please provide a test PDB file."; \ + exit 1; \ + fi + @echo "Running PocketMiner prediction on test.pdb..." + docker run --rm \ + -v $(PWD):/data \ + pocketminer:latest \ + python /workspace/entrypoint.py \ + --pdb /data/test.pdb \ + --output-folder /data/output \ + --output-name test + @echo "Results saved to output/" + +# Run Nextflow pipeline +test: + @echo "Running Nextflow pipeline..." + nextflow run main.nf + @echo "Pipeline complete!" + +# Clean generated files +clean: + @echo "Cleaning up..." + rm -rf output/ + rm -rf .nextflow/ + rm -f .nextflow.log* + rm -rf work/ + rm -rf results/ + rm -f *.npy *.txt + @echo "Clean complete!" + +# Open shell in container +shell: + docker run --rm -it \ + -v $(PWD):/data \ + pocketminer:latest \ + /bin/bash + +# Download example PDB (if internet available) +download-example: + @echo "Downloading example PDB (1HSG - HIV protease)..." 
def _to_numpy(values):
    """Return *values* as a NumPy array, unwrapping objects that expose ``.numpy()``."""
    # TensorFlow eager tensors expose .numpy(); plain arrays/lists fall through.
    if hasattr(values, 'numpy'):
        values = values.numpy()
    return np.asarray(values)


def load_model(model_path, dropout=0.1, num_layers=4, hidden_dim=100):
    """Build the PocketMiner ``MQAModel`` and restore weights from a checkpoint.

    Args:
        model_path: Checkpoint path handed to ``load_checkpoint``.
        dropout: Dropout rate forwarded to ``MQAModel``.
        num_layers: Number of layers forwarded to ``MQAModel``.
        hidden_dim: Second component of the model's ``hidden_dim`` pair
            (the first component is fixed at 16).

    Returns:
        The ``MQAModel`` instance after ``load_checkpoint`` was called on it.
    """
    # The architecture has to match the checkpoint exactly; (16, 100) is the
    # hidden size used by the published pocketminer checkpoint.
    model = MQAModel(
        node_features=(8, 50),
        edge_features=(1, 32),
        hidden_dim=(16, hidden_dim),
        num_layers=num_layers,
        dropout=dropout
    )

    # load_checkpoint takes an optimizer next to the model, so a fresh Adam
    # instance is created only to satisfy that signature.
    opt = tf.keras.optimizers.Adam()
    load_checkpoint(model, opt, model_path)

    return model


def make_predictions(pdb_file, model, model_path, output_folder, output_name, debug=False):
    """Score one PDB structure and write prediction files plus a JSON summary.

    Files written into ``output_folder`` (created if missing):
        ``<output_name>-preds.npy``        scores of the valid residues
        ``<output_name>-predictions.txt``  the same scores, ``%.4f`` per line
        ``<output_name>-summary.json``     the returned summary dict
        ``<output_name>_X.npy`` / ``_S.npy`` / ``_mask.npy``  only if ``debug``

    Args:
        pdb_file: Path of the structure, loaded with ``mdtraj.load``.
        model: Model object passed to ``predict_on_xtals``.
        model_path: Checkpoint path passed to ``predict_on_xtals``.
        output_folder: Directory that receives the output files.
        output_name: Base name used for every output file.
        debug: When true, also save the raw ``X``, ``S`` and ``mask`` arrays.

    Returns:
        dict with the keys ``cryptic_pocket_score``, ``high_confidence_residues``,
        ``medium_confidence_residues``, ``total_residues``, ``pocket_clusters``
        and ``output_files``.

    Raises:
        ValueError: If the PDB file cannot be loaded, or if no residue is
            marked valid by the mask (the mean score would be NaN, which is
            not representable in strict JSON).
    """
    try:
        struc = md.load(pdb_file)
    except Exception as e:
        # Keep the original cause attached so the underlying parser error is visible.
        raise ValueError(f"Failed to load PDB file {pdb_file}: {e}") from e

    # Featurise the single structure and run the model on it.
    X, S, mask = process_strucs([struc])
    predictions = predict_on_xtals(model, model_path, X, S, mask)

    # Only one structure was submitted, so take the first entry of the batch.
    # NOTE(review): predictions are presumably (batch, max_length) - confirm upstream.
    pred_array = _to_numpy(predictions[0])
    mask_array = _to_numpy(mask[0])

    # Drop padded positions: keep only residues whose mask entry is positive.
    pred_valid = pred_array[mask_array > 0]
    if pred_valid.size == 0:
        raise ValueError(f"No valid residues found in {pdb_file}; nothing to score")

    output_path = Path(output_folder)
    output_path.mkdir(parents=True, exist_ok=True)

    # Binary predictions (valid residues only, padding already removed).
    pred_file = output_path / f"{output_name}-preds.npy"
    np.save(pred_file, pred_valid)

    # Human-readable predictions, one score per line.
    txt_file = output_path / f"{output_name}-predictions.txt"
    np.savetxt(txt_file, pred_valid, fmt='%.4f')

    # Summary statistics: >0.7 counts as high confidence, (0.4, 0.7] as medium.
    cryptic_pocket_score = float(np.mean(pred_valid))
    high_confidence_residues = int(np.sum(pred_valid > 0.7))
    medium_confidence_residues = int(np.sum((pred_valid > 0.4) & (pred_valid <= 0.7)))

    if debug:
        np.save(output_path / f"{output_name}_X.npy", X)
        np.save(output_path / f"{output_name}_S.npy", S)
        np.save(output_path / f"{output_name}_mask.npy", mask)

    # Clusters use their own 0.5 cut-off, which is looser than the 0.7
    # "high confidence" threshold used for the counts above.
    pocket_clusters = cluster_residues(pred_valid, threshold=0.5)

    summary = {
        "cryptic_pocket_score": cryptic_pocket_score,
        "high_confidence_residues": high_confidence_residues,
        "medium_confidence_residues": medium_confidence_residues,
        "total_residues": len(pred_valid),
        "pocket_clusters": pocket_clusters,
        "output_files": {
            "predictions_npy": str(pred_file),
            "predictions_txt": str(txt_file)
        }
    }

    summary_file = output_path / f"{output_name}-summary.json"
    with open(summary_file, 'w') as f:
        json.dump(summary, f, indent=2)

    return summary


def cluster_residues(predictions, threshold=0.5, min_cluster_size=3, max_gap=2):
    """Group high-scoring residues that lie close together in sequence order.

    This is a sequence-based grouping, not a spatial one: no 3D coordinates
    are used. Residues are identified by their 0-based position in
    ``predictions``.

    Args:
        predictions: Per-residue scores (array-like, or an object with ``.numpy()``).
        threshold: A residue is "high-scoring" when its score is strictly
            greater than this value.
        min_cluster_size: Clusters with fewer members are discarded.
        max_gap: Largest allowed index difference between consecutive
            high-scoring residues of one cluster. The default of 2 tolerates
            a single low-scoring residue in between.

    Returns:
        List of dicts with ``residue_indices`` (list of int), ``size`` (int)
        and ``average_score`` (float), sorted by ``average_score`` descending.
        Empty list when no residue exceeds ``threshold``.
    """
    predictions = _to_numpy(predictions)

    high_score_idx = np.where(predictions > threshold)[0]
    if len(high_score_idx) == 0:
        return []

    # Start a new run wherever the step to the next high-scoring index
    # exceeds max_gap; np.split then yields one index array per run.
    run_starts = np.where(np.diff(high_score_idx) > max_gap)[0] + 1

    clusters = []
    for run in np.split(high_score_idx, run_starts):
        if len(run) < min_cluster_size:
            continue
        clusters.append({
            # tolist() yields plain Python ints so the result is JSON-serialisable.
            "residue_indices": run.tolist(),
            "size": len(run),
            "average_score": float(np.mean(predictions[run]))
        })

    # Highest-scoring clusters first (stable for ties).
    clusters.sort(key=lambda c: c['average_score'], reverse=True)

    return clusters
def _build_parser():
    """Create the argument parser for the PocketMiner command-line interface."""
    parser = argparse.ArgumentParser(
        description='PocketMiner: Predict cryptic binding pockets in protein structures'
    )

    parser.add_argument(
        '--pdb',
        required=True,
        help='Input PDB file path'
    )
    parser.add_argument(
        '--output-folder',
        default='.',
        help='Output directory for results (default: current directory)'
    )
    parser.add_argument(
        '--output-name',
        required=True,
        help='Base name for output files'
    )
    parser.add_argument(
        '--model-path',
        default='/workspace/gvp/models/pocketminer',
        help='Path to pre-trained model checkpoint'
    )
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Save debug features (X, S, mask arrays)'
    )
    parser.add_argument(
        '--dropout',
        type=float,
        default=0.1,
        help='Model dropout rate (default: 0.1)'
    )
    parser.add_argument(
        '--num-layers',
        type=int,
        default=4,
        help='Number of model layers (default: 4)'
    )
    parser.add_argument(
        '--hidden-dim',
        type=int,
        default=100,
        help='Hidden dimension size (default: 100)'
    )

    return parser


def main(argv=None):
    """Run the PocketMiner command-line workflow.

    Parses the arguments, validates the input paths, loads the model, runs the
    prediction and prints a short summary to stdout.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the behaviour when the
            script is executed directly.

    Raises:
        SystemExit: With code 1 when the PDB file or the checkpoint index is
            missing or the prediction step reports a ``ValueError``; with
            code 2 when argparse rejects the arguments.
    """
    args = _build_parser().parse_args(argv)

    # Validate inputs before the (expensive) model load.
    if not os.path.exists(args.pdb):
        print(f"Error: PDB file not found: {args.pdb}", file=sys.stderr)
        sys.exit(1)

    # model_path is a checkpoint prefix, not a directory, so probe the
    # "<prefix>.index" file that accompanies it.
    model_index = f"{args.model_path}.index"
    if not os.path.exists(model_index):
        print(f"Error: Model checkpoint not found: {args.model_path}", file=sys.stderr)
        print(f"Looking for: {model_index}", file=sys.stderr)
        print("Please ensure the pre-trained model is available.", file=sys.stderr)
        sys.exit(1)

    print(f"Loading PocketMiner model from {args.model_path}...")
    model = load_model(
        args.model_path,
        dropout=args.dropout,
        num_layers=args.num_layers,
        hidden_dim=args.hidden_dim
    )

    print(f"Processing structure: {args.pdb}")
    try:
        summary = make_predictions(
            pdb_file=args.pdb,
            model=model,
            model_path=args.model_path,
            output_folder=args.output_folder,
            output_name=args.output_name,
            debug=args.debug
        )
    except ValueError as e:
        # make_predictions signals unusable input with ValueError; report it
        # the same way as the other input errors instead of dumping a traceback.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    print("\n" + "="*60)
    print("PocketMiner Prediction Summary")
    print("="*60)
    print(f"Overall cryptic pocket score: {summary['cryptic_pocket_score']:.4f}")
    print(f"High confidence residues (>0.7): {summary['high_confidence_residues']}")
    print(f"Medium confidence residues (0.4-0.7): {summary['medium_confidence_residues']}")
    print(f"Total residues analyzed: {summary['total_residues']}")
    print(f"\nPocket clusters identified: {len(summary['pocket_clusters'])}")

    # Only the five best-scoring clusters are listed on the console.
    for i, cluster in enumerate(summary['pocket_clusters'][:5], 1):
        print(f"  Cluster {i}: {cluster['size']} residues, score={cluster['average_score']:.4f}")

    print(f"\nResults saved to: {args.output_folder}")
    print("="*60 + "\n")


if __name__ == '__main__':
    main()
\\ + --output-name ${pdb_basename} \\ + --model-path ${params.model_path} \\ + ${debug_flag} + """ +} + +// Workflow +workflow { + POCKETMINER(Channel.of(file(params.pdb))) +} diff --git a/meta.yml b/meta.yml new file mode 100644 index 0000000..e87acff --- /dev/null +++ b/meta.yml @@ -0,0 +1,42 @@ +params: + - outdir: + type: file + description: path where output files will be deposited + required: true + - model_path: + type: file + description: path to pre-trained PocketMiner model checkpoint + default: /workspace/gvp/models/pocketminer + required: false + - debug: + type: boolean + description: save debug features (X, S, mask arrays) + default: false + required: false +input: + - pdb: + type: file + description: PDB file path for protein structure + - dynamics_optional: + type: file + description: optional MD trajectory or ensemble of conformers for enhanced cryptic pocket detection + required: false +output: + - predictions_npy: + type: file + description: NumPy binary file containing per-residue cryptic pocket scores + - predictions_txt: + type: file + description: human-readable text file with per-residue cryptic pocket scores (4 decimal places) + - summary: + type: file + description: JSON file containing overall cryptic pocket score, high/medium confidence residue counts, pocket clusters, and metadata + - features_debug: + type: file + description: (optional) protein features array for debugging + - sequence_debug: + type: file + description: (optional) sequence data array for debugging + - mask_debug: + type: file + description: (optional) masking array for debugging diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..065a56f --- /dev/null +++ b/nextflow.config @@ -0,0 +1,36 @@ +profiles { + standard { + docker { + enabled = true + temp = 'auto' + } + } + + k8s { + process { + executor = 'k8s' + } + docker { + enabled = true + } + k8s { + storageClaimName = 'eureka-pvc' + storageMountPath = '/omic/eureka' + } + } + + k8s_gpu 
{ + process { + executor = 'k8s' + pod = [[nodeSelector: 'nvidia.com/gpu.present=true']] + accelerator = [request: 1, type: 'nvidia.com/gpu'] + } + docker { + enabled = true + } + k8s { + storageClaimName = 'eureka-pvc' + storageMountPath = '/omic/eureka' + } + } +} diff --git a/params.json b/params.json new file mode 100644 index 0000000..f299235 --- /dev/null +++ b/params.json @@ -0,0 +1,51 @@ +{ + "params": { + "pdb": { + "type": "file", + "description": "Path to input PDB file for cryptic pocket prediction", + "default": "s3://omic/eureka/Pocketminer/1HSG.pdb", + "required": true, + "pipeline_io": "input", + "var_name": "params.pdb", + "examples": [ + "s3://omic/eureka/Pocketminer/1HSG.pdb", + "s3://omic/eureka/Pocketminer/protein.pdb" + ], + "pattern": ".*\\.pdb$", + "enum": [], + "validation": {}, + "notes": "PDB file containing the protein structure for cryptic binding pocket prediction." + }, + "outdir": { + "type": "folder", + "description": "Output directory for PocketMiner prediction results", + "default": "s3://omic/eureka/Pocketminer/output", + "required": true, + "pipeline_io": "output", + "var_name": "params.outdir", + "examples": [ + "s3://omic/eureka/Pocketminer/output", + "s3://omic/eureka/Pocketminer/results" + ], + "pattern": ".*", + "enum": [], + "validation": {}, + "notes": "Directory where prediction results (numpy arrays, text predictions, and JSON summary) will be stored." + }, + "debug": { + "type": "boolean", + "description": "Save debug features (X, S, mask arrays)", + "default": false, + "required": false, + "pipeline_io": "parameter", + "var_name": "params.debug", + "examples": [ + false, + true + ], + "enum": [true, false], + "validation": {}, + "notes": "Enable to save intermediate feature arrays for debugging purposes." + } + } +}