#!/usr/bin/env nextflow

nextflow.enable.dsl=2

// Pipeline parameters
params.pdb        = 's3://omic/eureka/Pocketminer/1HSG.pdb'    // input PDB (s3:// URI, PVC path, or local file)
params.outdir     = 's3://omic/eureka/Pocketminer/output'      // publishDir target
params.model_path = '/workspace/gvp/models/pocketminer'        // PocketMiner model weights inside the container
params.debug      = false                                      // also emit *_X/*_S/*_mask debug arrays

/*
 * Run PocketMiner cryptic-pocket prediction on a single PDB file.
 *
 * Input resolution tries, in order:
 *   1. the path as given (local file or mounted volume),
 *   2. the s3:// URI remapped onto the PVC mount (s3://omic/... -> /omic/...),
 *   3. an S3/MinIO download via python (boto3, with a Signature-V2 urllib fallback).
 *
 * The download helper is written through a single-quoted heredoc so that
 * neither Groovy nor bash rewrites the python source (the previous inline
 * `python -c "..."` broke on Groovy's `\n` and `\"` escape handling).
 * All diagnostics are appended to run.log, which is emitted as an output.
 */
process POCKETMINER {
    container 'harbor.cluster.omic.ai/omic/pocketminer:latest'
    publishDir params.outdir, mode: 'copy'

    input:
    val pdb_path

    output:
    path "*-preds.npy", emit: predictions_npy
    path "*-predictions.txt", emit: predictions_txt
    path "*-summary.json", emit: summary
    path "*_X.npy", optional: true, emit: features_debug
    path "*_S.npy", optional: true, emit: sequence_debug
    path "*_mask.npy", optional: true, emit: mask_debug
    path "run.log", emit: log

    script:
    def debug_flag = params.debug ? '--debug' : ''
    """
    # Input acquisition is best-effort across several sources; defer fail-fast
    # until we know whether any of them produced a file.
    set +e
    touch run.log

    echo "=== Environment Debug ===" >> run.log
    echo "Input path: ${pdb_path}" >> run.log
    echo "AWS_ENDPOINT_URL: \${AWS_ENDPOINT_URL:-not set}" >> run.log
    echo "AWS_ACCESS_KEY_ID set: \${AWS_ACCESS_KEY_ID:+yes}" >> run.log
    ls /omic/eureka/ >> run.log 2>&1 || echo "/omic/eureka not mounted" >> run.log
    echo "=== End Debug ===" >> run.log

    PDB_FILE=""

    # Method 1: direct path (local file or already-mounted volume)
    if [ -f "${pdb_path}" ]; then
        echo "Found at direct path" >> run.log
        cp "${pdb_path}" input.pdb && PDB_FILE="input.pdb"
    fi

    # Method 2: remap s3://omic/... to /omic/... (PVC mount path)
    if [ -z "\$PDB_FILE" ]; then
        PVC_PATH="\$(echo '${pdb_path}' | sed 's|^s3://|/|')"
        if [ -f "\$PVC_PATH" ]; then
            echo "Found at PVC path: \$PVC_PATH" >> run.log
            cp "\$PVC_PATH" input.pdb && PDB_FILE="input.pdb"
        else
            echo "Not found at PVC path: \$PVC_PATH" >> run.log
        fi
    fi

    # Method 3: download from MinIO/S3 using python (boto3, urllib fallback)
    if [ -z "\$PDB_FILE" ]; then
        export S3_PATH="${pdb_path}"
        export S3_ENDPOINT="\${AWS_ENDPOINT_URL:-http://datalake-hl.datalake.svc.cluster.local:9000}"
        echo "Downloading via python S3 client from: \$S3_ENDPOINT" >> run.log
        echo "S3 path: \$S3_PATH" >> run.log
        echo "AWS_ACCESS_KEY_ID: \${AWS_ACCESS_KEY_ID:+set}" >> run.log
        echo "AWS_SECRET_ACCESS_KEY: \${AWS_SECRET_ACCESS_KEY:+set}" >> run.log
        echo "AWS_ENDPOINT_URL: \${AWS_ENDPOINT_URL:-not set}" >> run.log

        # Single-quoted heredoc: bash performs no expansion, so the python
        # source reaches the interpreter verbatim. Inputs come in via env vars.
        cat > download_s3.py <<'PYEOF'
import base64
import datetime
import hashlib
import hmac
import os
import sys
import urllib.request

s3_path = os.environ['S3_PATH']
endpoint = os.environ['S3_ENDPOINT']

# Parse s3://bucket/key
path = s3_path.replace('s3://', '', 1)
parts = path.split('/', 1)
bucket = parts[0]
key = parts[1] if len(parts) > 1 else ''
print(f'Bucket: {bucket}, Key: {key}')
print(f'Endpoint: {endpoint}')

try:
    import boto3
    from botocore.client import Config
    s3 = boto3.client(
        's3',
        endpoint_url=endpoint,
        aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID', ''),
        aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY', ''),
        config=Config(signature_version='s3v4'),
        region_name='us-east-1',
    )
    s3.download_file(bucket, key, 'input.pdb')
    size = os.path.getsize('input.pdb')
    print(f'Downloaded via boto3 ({size} bytes)')
except ImportError:
    # Fallback: plain HTTP GET signed with AWS Signature V2 (MinIO accepts it).
    print('boto3 not available, trying urllib with signing...')
    access_key = os.environ.get('AWS_ACCESS_KEY_ID', '')
    secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY', '')
    if not (access_key and secret_key):
        print('No AWS credentials available')
        sys.exit(1)
    date_str = datetime.datetime.now(datetime.timezone.utc).strftime(
        '%a, %d %b %Y %H:%M:%S GMT')
    string_to_sign = '\\n'.join(['GET', '', '', date_str, f'/{bucket}/{key}'])
    digest = hmac.new(secret_key.encode(), string_to_sign.encode(), hashlib.sha1)
    sig_b64 = base64.b64encode(digest.digest()).decode()
    req = urllib.request.Request(f'{endpoint}/{bucket}/{key}')
    req.add_header('Date', date_str)
    req.add_header('Authorization', f'AWS {access_key}:{sig_b64}')
    with urllib.request.urlopen(req) as resp, open('input.pdb', 'wb') as f:
        f.write(resp.read())
    size = os.path.getsize('input.pdb')
    print(f'Downloaded via signed URL ({size} bytes)')
except Exception as e:
    print(f'Download failed: {e}')
    sys.exit(1)
PYEOF
        python download_s3.py >> run.log 2>&1

        if [ -f input.pdb ] && [ -s input.pdb ]; then
            echo "S3 download successful (\$(wc -c < input.pdb) bytes)" >> run.log
            PDB_FILE="input.pdb"
        else
            echo "S3 download failed" >> run.log
            rm -f input.pdb
        fi
    fi

    # From here on, any failure should abort the task.
    set -e

    if [ -z "\$PDB_FILE" ] || [ ! -f input.pdb ]; then
        echo "ERROR: Could not retrieve PDB file from any source" >> run.log
        cat run.log
        exit 1
    fi
    echo "PDB file ready (\$(wc -c < input.pdb) bytes)" >> run.log

    # Output files are named after the input PDB, matching the output globs.
    PDB_BASENAME="\$(basename '${pdb_path}' .pdb)"
    python /workspace/entrypoint.py \\
        --pdb input.pdb \\
        --output-folder . \\
        --output-name "\$PDB_BASENAME" \\
        --model-path "${params.model_path}" \\
        ${debug_flag} 2>&1 | tee -a run.log

    echo "Pipeline completed successfully" >> run.log
    """
}

workflow {
    POCKETMINER(params.pdb)
}