Files
pocketminer/main.nf

128 lines
4.5 KiB
Plaintext

#!/usr/bin/env nextflow

// Opt in to the DSL2 syntax used by the process/workflow blocks in this script.
nextflow.enable.dsl = 2

// ---------------------------------------------------------------------------
// Pipeline parameters (defaults)
// ---------------------------------------------------------------------------

// Location of the input PDB structure and the directory results are published to.
params.pdb    = 's3://omic/eureka/Pocketminer/1HSG.pdb'
params.outdir = 's3://omic/eureka/Pocketminer/output'

// Model location handed to the entrypoint via --model-path.
params.model_path = '/workspace/gvp/models/pocketminer'

// When true, the entrypoint is invoked with --debug.
params.debug = false
// POCKETMINER
//
// Locates a PDB file, copies/downloads it to ./input.pdb in the task work
// directory and runs /workspace/entrypoint.py on it.
//
// Input:
//   pdb_path - location of the PDB file as a plain string (not staged by
//              Nextflow). Tried in order as: a direct filesystem path, the
//              same path with the leading "s3://" replaced by "/", and
//              finally an S3 object fetched with boto3.
//
// Outputs (copied to params.outdir):
//   predictions_npy  - *-preds.npy
//   predictions_txt  - *-predictions.txt
//   summary          - *-summary.json
//   features_debug   - *_X.npy     (optional)
//   sequence_debug   - *_S.npy     (optional)
//   mask_debug       - *_mask.npy  (optional)
//   log              - run.log, written by this script
//
// The task exits non-zero when no PDB file could be obtained or when the
// entrypoint itself fails.
process POCKETMINER {
    container 'harbor.cluster.omic.ai/omic/pocketminer:latest'
    publishDir params.outdir, mode: 'copy'

    input:
    val pdb_path

    output:
    path "*-preds.npy", emit: predictions_npy
    path "*-predictions.txt", emit: predictions_txt
    path "*-summary.json", emit: summary
    path "*_X.npy", optional: true, emit: features_debug
    path "*_S.npy", optional: true, emit: sequence_debug
    path "*_mask.npy", optional: true, emit: mask_debug
    path "run.log", emit: log

    script:
    // Only pass --debug to the entrypoint when params.debug is truthy.
    def debug_flag = params.debug ? '--debug' : ''
    // NOTE: the inline Python below is deliberately kept at column 0 so that
    // python -c never sees an unexpected indent, whatever indentation
    // handling is applied to this script block.
    """
    # The retrieval phase is best-effort: each method may fail, so errors must
    # not abort the script until all three have been tried.
    set +e
    touch run.log
    echo "=== Environment Debug ===" >> run.log
    echo "Input path: ${pdb_path}" >> run.log
    echo "AWS_ENDPOINT_URL: \${AWS_ENDPOINT_URL:-not set}" >> run.log
    echo "AWS_ACCESS_KEY_ID set: \${AWS_ACCESS_KEY_ID:+yes}" >> run.log
    ls /omic/eureka/ >> run.log 2>&1 || echo "/omic/eureka not mounted" >> run.log
    echo "=== End Debug ===" >> run.log

    # Stays empty until one of the methods below has produced input.pdb.
    PDB_FILE=""

    # Method 1: Direct path (if PVC mounted or local)
    if [ -f "${pdb_path}" ]; then
        echo "Found at direct path" >> run.log
        cp "${pdb_path}" input.pdb && PDB_FILE="input.pdb"
    fi

    # Method 2: Convert s3://omic/... to /omic/... (PVC mount path)
    if [ -z "\$PDB_FILE" ]; then
        PVC_PATH="\$(echo '${pdb_path}' | sed 's|^s3://|/|')"
        if [ -f "\$PVC_PATH" ]; then
            echo "Found at PVC path: \$PVC_PATH" >> run.log
            cp "\$PVC_PATH" input.pdb && PDB_FILE="input.pdb"
        else
            echo "Not found at PVC path: \$PVC_PATH" >> run.log
        fi
    fi

    # Method 3: Download from MinIO using python
    if [ -z "\$PDB_FILE" ]; then
        S3_PATH="${pdb_path}"
        ENDPOINT="\${AWS_ENDPOINT_URL:-http://datalake-hl.datalake.svc.cluster.local:9000}"
        echo "Downloading via python S3 from: \$ENDPOINT" >> run.log
        echo "S3 path: \$S3_PATH" >> run.log
        # Only record whether the credentials are present, never their values.
        echo "AWS_ACCESS_KEY_ID: \${AWS_ACCESS_KEY_ID:+set}" >> run.log
        echo "AWS_SECRET_ACCESS_KEY: \${AWS_SECRET_ACCESS_KEY:+set}" >> run.log
        echo "AWS_ENDPOINT_URL: \${AWS_ENDPOINT_URL:-not set}" >> run.log
        # Install boto3 if needed, then download
        pip install -q boto3 2>> run.log || conda install -y -q boto3 2>> run.log || true
        python -c "
import os, sys
s3_path = os.environ.get('S3_INPUT', '${pdb_path}')
endpoint = os.environ.get('AWS_ENDPOINT_URL', 'http://datalake-hl.datalake.svc.cluster.local:9000')
path = s3_path.replace('s3://', '')
parts = path.split('/', 1)
bucket, key = parts[0], parts[1]
print(f'Bucket: {bucket}, Key: {key}, Endpoint: {endpoint}')
import boto3
from botocore.client import Config
s3 = boto3.client('s3',
    endpoint_url=endpoint,
    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID', ''),
    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY', ''),
    config=Config(signature_version='s3v4'),
    region_name='us-east-1'
)
s3.download_file(bucket, key, 'input.pdb')
size = os.path.getsize('input.pdb')
print(f'Downloaded {size} bytes')
" >> run.log 2>&1
        # Accept the download only if a non-empty file was written.
        if [ -f input.pdb ] && [ -s input.pdb ]; then
            echo "S3 download successful (\$(wc -c < input.pdb) bytes)" >> run.log
            PDB_FILE="input.pdb"
        else
            echo "S3 download failed - check run.log" >> run.log
            cat run.log
            rm -f input.pdb
        fi
    fi

    # From here on every failure is fatal.
    set -e
    # Without pipefail the status of the entrypoint pipeline below would be
    # the status of tee, so a crashing entrypoint would go unnoticed and the
    # success message would still be logged.
    set -o pipefail

    if [ -z "\$PDB_FILE" ] || [ ! -f input.pdb ]; then
        echo "ERROR: Could not retrieve PDB file from any source" >> run.log
        cat run.log
        exit 1
    fi
    echo "PDB file ready (\$(wc -c < input.pdb) bytes)" >> run.log

    # Output files are named after the input file name without its .pdb suffix.
    PDB_BASENAME="\$(basename '${pdb_path}' .pdb)"

    # The model path is quoted so that a path containing spaces is passed as
    # a single argument.
    python /workspace/entrypoint.py \\
        --pdb input.pdb \\
        --output-folder . \\
        --output-name "\$PDB_BASENAME" \\
        --model-path "${params.model_path}" \\
        ${debug_flag} 2>&1 | tee -a run.log

    echo "Pipeline completed successfully" >> run.log
    """
}
// Entry workflow: run POCKETMINER once on the PDB location given by params.pdb.
workflow {
    def pdb_location = params.pdb
    POCKETMINER(pdb_location)
}