#!/usr/bin/env python3
"""
PocketMiner Entrypoint - Command-line wrapper for cryptic pocket prediction

This script wraps the PocketMiner xtal_predict.py functionality with a proper
command-line interface for Nextflow/Docker integration.
"""

import argparse
import json
import os
import sys
import warnings
from pathlib import Path

import numpy as np

# Quieten TensorFlow's C++ logging. Set before tensorflow is imported below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

# Import PocketMiner components
sys.path.insert(0, '/workspace/gvp/src')

try:
    import tensorflow as tf
    import mdtraj as md
    from models import MQAModel
    from util import load_checkpoint
    from validate_performance_on_xtals import process_strucs, predict_on_xtals
except ImportError as e:
    print(f"Error importing PocketMiner modules: {e}", file=sys.stderr)
    print("Please ensure the GVP repository is properly cloned and models are available.", file=sys.stderr)
    sys.exit(1)


def _to_numpy(values):
    """Return *values* as a NumPy array, unwrapping objects that expose ``.numpy()``."""
    if hasattr(values, 'numpy'):
        values = values.numpy()
    return np.asarray(values)


def load_model(model_path, dropout=0.1, num_layers=4, hidden_dim=100):
    """Build the PocketMiner model and load its pre-trained weights.

    Args:
        model_path: Checkpoint path passed to ``load_checkpoint``.
        dropout: Dropout rate passed to ``MQAModel``.
        num_layers: Number of layers passed to ``MQAModel``.
        hidden_dim: Second component of the model's hidden dimension tuple.

    Returns:
        The ``MQAModel`` instance after ``load_checkpoint`` has been applied.
    """
    # Model architecture from original PocketMiner (must match checkpoint exactly)
    model = MQAModel(
        node_features=(8, 50),
        edge_features=(1, 32),
        hidden_dim=(16, hidden_dim),  # (16, 100) for pocketminer checkpoint
        num_layers=num_layers,
        dropout=dropout,
    )

    # load_checkpoint takes an optimizer alongside the model; it is not used
    # for anything else in this script.
    opt = tf.keras.optimizers.Adam()
    load_checkpoint(model, opt, model_path)

    return model


def make_predictions(pdb_file, model, model_path, output_folder, output_name, debug=False):
    """Make cryptic pocket predictions for a PDB structure and write the results.

    Files written into ``output_folder`` (created if missing):
      * ``<output_name>-preds.npy``        - per-residue scores (valid residues only)
      * ``<output_name>-predictions.txt``  - the same scores, one per line
      * ``<output_name>-summary.json``     - the summary dict returned here
      * ``<output_name>_X.npy`` / ``_S.npy`` / ``_mask.npy`` - only when ``debug``

    Args:
        pdb_file: Path to the input structure, loaded with ``mdtraj.load``.
        model: Model returned by ``load_model``.
        model_path: Checkpoint path forwarded to ``predict_on_xtals``.
        output_folder: Directory for the output files.
        output_name: Base name used for every output file.
        debug: When true, also save the feature arrays ``X``, ``S`` and ``mask``.

    Returns:
        A JSON-serialisable dict with the overall score, residue counts,
        pocket clusters and the paths of the prediction files.

    Raises:
        ValueError: If the structure cannot be loaded, or if the mask marks
            no residue as valid (no score could be computed).
    """
    # Load structure using mdtraj
    try:
        struc = md.load(pdb_file)
    except Exception as e:
        # Chain the cause so the underlying parser error stays visible.
        raise ValueError(f"Failed to load PDB file {pdb_file}: {e}") from e
    strucs = [struc]

    # Process structure to get features
    X, S, mask = process_strucs(strucs)

    # Get predictions using PocketMiner model
    predictions = predict_on_xtals(model, model_path, X, S, mask)

    # Extract predictions for the single structure
    # predictions shape: (batch, max_length)
    pred_array = _to_numpy(predictions[0])  # First (and only) structure
    mask_array = _to_numpy(mask[0])  # Corresponding mask

    # Keep only the residues the mask marks as valid (drops padding)
    valid_residues = mask_array > 0
    pred_valid = pred_array[valid_residues]

    # Without this guard np.mean() would yield NaN and the summary file would
    # contain the non-standard JSON token ``NaN``.
    if pred_valid.size == 0:
        raise ValueError(f"No valid residues found in {pdb_file}")

    # Save outputs
    output_path = Path(output_folder)
    output_path.mkdir(parents=True, exist_ok=True)

    # Save binary predictions (valid residues only, padding removed)
    pred_file = output_path / f"{output_name}-preds.npy"
    np.save(pred_file, pred_valid)

    # Save human-readable predictions
    txt_file = output_path / f"{output_name}-predictions.txt"
    np.savetxt(txt_file, pred_valid, fmt='%.4f')

    # Calculate summary statistics
    cryptic_pocket_score = float(np.mean(pred_valid))
    high_confidence_residues = int(np.sum(pred_valid > 0.7))
    medium_confidence_residues = int(np.sum((pred_valid > 0.4) & (pred_valid <= 0.7)))

    # Save debug features if requested
    if debug:
        np.save(output_path / f"{output_name}_X.npy", X)
        np.save(output_path / f"{output_name}_S.npy", S)
        np.save(output_path / f"{output_name}_mask.npy", mask)

    # Group residues scoring above 0.5 into sequence-contiguous clusters
    pocket_clusters = cluster_residues(pred_valid, threshold=0.5)

    # Generate summary JSON
    summary = {
        "cryptic_pocket_score": cryptic_pocket_score,
        "high_confidence_residues": high_confidence_residues,
        "medium_confidence_residues": medium_confidence_residues,
        "total_residues": len(pred_valid),
        "pocket_clusters": pocket_clusters,
        "output_files": {
            "predictions_npy": str(pred_file),
            "predictions_txt": str(txt_file),
        },
    }

    summary_file = output_path / f"{output_name}-summary.json"
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    return summary


def _summarize_cluster(predictions, members):
    """Build the summary dict (indices, size, mean score) for one cluster."""
    return {
        "residue_indices": members,
        "size": len(members),
        "average_score": float(np.mean(predictions[np.array(members)])),
    }


def cluster_residues(predictions, threshold=0.5, min_cluster_size=3, max_index_gap=2):
    """Group high-scoring residues into clusters that are contiguous in sequence.

    This is a simple sequential clustering: it assumes ``predictions`` is
    ordered by sequence position and does not use 3D coordinates, so the
    clusters are not spatial pockets in the geometric sense.

    Args:
        predictions: Per-residue scores (NumPy array or an object with ``.numpy()``).
        threshold: Only residues scoring strictly above this value are clustered.
        min_cluster_size: Clusters with fewer members are discarded.
        max_index_gap: Largest allowed difference between consecutive member
            indices. The default of 2 tolerates one skipped residue.

    Returns:
        A list of dicts with keys ``residue_indices`` (zero-based positions in
        ``predictions``), ``size`` and ``average_score``, sorted by
        ``average_score`` in descending order. Empty if nothing qualifies.
    """
    predictions = _to_numpy(predictions)

    high_score_idx = np.where(predictions > threshold)[0]
    if high_score_idx.size == 0:
        return []

    # Split the ascending index list wherever the step exceeds max_index_gap.
    groups = []
    current_cluster = [int(high_score_idx[0])]  # Python ints keep the result JSON-serialisable
    for idx in high_score_idx[1:]:
        idx = int(idx)
        if idx - current_cluster[-1] <= max_index_gap:
            current_cluster.append(idx)
        else:
            groups.append(current_cluster)
            current_cluster = [idx]
    groups.append(current_cluster)

    clusters = [
        _summarize_cluster(predictions, group)
        for group in groups
        if len(group) >= min_cluster_size
    ]

    # Sort by score
    clusters.sort(key=lambda x: x['average_score'], reverse=True)

    return clusters


def main():
    """Parse command-line arguments, run the prediction and print a summary.

    Exits with status 1 when the PDB file or the model checkpoint is missing,
    or when the prediction step reports a ``ValueError``.
    """
    parser = argparse.ArgumentParser(
        description='PocketMiner: Predict cryptic binding pockets in protein structures'
    )
    parser.add_argument(
        '--pdb',
        required=True,
        help='Input PDB file path'
    )
    parser.add_argument(
        '--output-folder',
        default='.',
        help='Output directory for results (default: current directory)'
    )
    parser.add_argument(
        '--output-name',
        required=True,
        help='Base name for output files'
    )
    parser.add_argument(
        '--model-path',
        default='/workspace/gvp/models/pocketminer',
        help='Path to pre-trained model checkpoint'
    )
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Save debug features (X, S, mask arrays)'
    )
    parser.add_argument(
        '--dropout',
        type=float,
        default=0.1,
        help='Model dropout rate (default: 0.1)'
    )
    parser.add_argument(
        '--num-layers',
        type=int,
        default=4,
        help='Number of model layers (default: 4)'
    )
    parser.add_argument(
        '--hidden-dim',
        type=int,
        default=100,
        help='Hidden dimension size (default: 100)'
    )

    args = parser.parse_args()

    # Validate inputs
    if not os.path.exists(args.pdb):
        print(f"Error: PDB file not found: {args.pdb}", file=sys.stderr)
        sys.exit(1)

    # Check if model checkpoint files exist (model_path is a prefix, not a directory)
    model_index = f"{args.model_path}.index"
    if not os.path.exists(model_index):
        print(f"Error: Model checkpoint not found: {args.model_path}", file=sys.stderr)
        print(f"Looking for: {model_index}", file=sys.stderr)
        print("Please ensure the pre-trained model is available.", file=sys.stderr)
        sys.exit(1)

    print(f"Loading PocketMiner model from {args.model_path}...")
    model = load_model(
        args.model_path,
        dropout=args.dropout,
        num_layers=args.num_layers,
        hidden_dim=args.hidden_dim
    )

    print(f"Processing structure: {args.pdb}")
    try:
        summary = make_predictions(
            pdb_file=args.pdb,
            model=model,
            model_path=args.model_path,
            output_folder=args.output_folder,
            output_name=args.output_name,
            debug=args.debug
        )
    except ValueError as e:
        # Report unusable input the same way as the validation failures above.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    print("\n" + "="*60)
    print("PocketMiner Prediction Summary")
    print("="*60)
    print(f"Overall cryptic pocket score: {summary['cryptic_pocket_score']:.4f}")
    print(f"High confidence residues (>0.7): {summary['high_confidence_residues']}")
    print(f"Medium confidence residues (0.4-0.7): {summary['medium_confidence_residues']}")
    print(f"Total residues analyzed: {summary['total_residues']}")
    print(f"\nPocket clusters identified: {len(summary['pocket_clusters'])}")

    for i, cluster in enumerate(summary['pocket_clusters'][:5], 1):
        print(f"  Cluster {i}: {cluster['size']} residues, score={cluster['average_score']:.4f}")

    print(f"\nResults saved to: {args.output_folder}")
    print("="*60 + "\n")


if __name__ == '__main__':
    main()