diff --git a/main.nf b/main.nf
index 022029c..1e499e2 100644
--- a/main.nf
+++ b/main.nf
@@ -10,32 +10,25 @@ params.debug = false
 
 process POCKETMINER {
     container 'harbor.cluster.omic.ai/omic/pocketminer:v2'
-    publishDir params.outdir, mode: 'copy'
 
     input:
     val pdb_path
 
     output:
-    path "*-preds.npy", emit: predictions_npy
-    path "*-predictions.txt", emit: predictions_txt
-    path "*-summary.json", emit: summary
-    path "*_X.npy", optional: true, emit: features_debug
-    path "*_S.npy", optional: true, emit: sequence_debug
-    path "*_mask.npy", optional: true, emit: mask_debug
+    path "results/*", emit: all_results
    path "run.log", emit: log
 
     script:
     def debug_flag = params.debug ? '--debug' : ''
+    def outdir = params.outdir
     """
     set +e
     touch run.log
+    mkdir -p results
 
     echo "=== Environment Debug ===" >> run.log
     echo "Input path: ${pdb_path}" >> run.log
-    echo "AWS_ENDPOINT_URL: \${AWS_ENDPOINT_URL:-not set}" >> run.log
-    echo "AWS_ACCESS_KEY_ID set: \${AWS_ACCESS_KEY_ID:+yes}" >> run.log
-    ls /omic/eureka/ >> run.log 2>&1 || echo "/omic/eureka not mounted" >> run.log
-    echo "=== End Debug ===" >> run.log
+    echo "Output path: ${outdir}" >> run.log
 
     PDB_FILE=""
 
@@ -56,65 +49,67 @@ process POCKETMINER {
         fi
     fi
 
-    # Method 3: Download from MinIO using python
+    # Method 3: Download from MinIO using boto3
     if [ -z "\$PDB_FILE" ]; then
-        S3_PATH="${pdb_path}"
         ENDPOINT="\${AWS_ENDPOINT_URL:-http://datalake-hl.datalake.svc.cluster.local:9000}"
-        echo "Downloading via python S3 from: \$ENDPOINT" >> run.log
-        echo "S3 path: \$S3_PATH" >> run.log
-        echo "AWS_ACCESS_KEY_ID: \${AWS_ACCESS_KEY_ID:+set}" >> run.log
-        echo "AWS_SECRET_ACCESS_KEY: \${AWS_SECRET_ACCESS_KEY:+set}" >> run.log
-        echo "AWS_ENDPOINT_URL: \${AWS_ENDPOINT_URL:-not set}" >> run.log
-
+        echo "Downloading via boto3 from: \$ENDPOINT" >> run.log
         python -c "
-import os, sys
-s3_path = os.environ.get('S3_INPUT', '${pdb_path}')
-endpoint = os.environ.get('AWS_ENDPOINT_URL', 'http://datalake-hl.datalake.svc.cluster.local:9000')
-path = s3_path.replace('s3://', '')
-parts = path.split('/', 1)
-bucket, key = parts[0], parts[1]
-print(f'Bucket: {bucket}, Key: {key}, Endpoint: {endpoint}')
-import boto3
+import os, sys, boto3
 from botocore.client import Config
-s3 = boto3.client('s3',
-    endpoint_url=endpoint,
+s3_path = '${pdb_path}'.replace('s3://', '')
+bucket, key = s3_path.split('/', 1)
+endpoint = os.environ.get('AWS_ENDPOINT_URL', 'http://datalake-hl.datalake.svc.cluster.local:9000')
+print(f'Bucket: {bucket}, Key: {key}')
+s3 = boto3.client('s3', endpoint_url=endpoint,
     aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID', ''),
     aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY', ''),
-    config=Config(signature_version='s3v4'),
-    region_name='us-east-1'
-)
+    config=Config(signature_version='s3v4'), region_name='us-east-1')
 s3.download_file(bucket, key, 'input.pdb')
 size = os.path.getsize('input.pdb')
 print(f'Downloaded {size} bytes')
 " >> run.log 2>&1
 
     if [ -f input.pdb ] && [ -s input.pdb ]; then
-        echo "S3 download successful (\$(wc -c < input.pdb) bytes)" >> run.log
         PDB_FILE="input.pdb"
-    else
-        echo "S3 download failed - check run.log" >> run.log
-        cat run.log
-        rm -f input.pdb
     fi
     fi
 
     set -e
 
     if [ -z "\$PDB_FILE" ] || [ ! -f input.pdb ]; then
-        echo "ERROR: Could not retrieve PDB file from any source" >> run.log
+        echo "ERROR: Could not retrieve PDB file" >> run.log
         cat run.log
         exit 1
     fi
 
-    echo "PDB file ready (\$(wc -c < input.pdb) bytes)" >> run.log
     PDB_BASENAME="\$(basename '${pdb_path}' .pdb)"
 
     python /workspace/entrypoint.py \\
         --pdb input.pdb \\
-        --output-folder . \\
+        --output-folder results \\
         --output-name "\$PDB_BASENAME" \\
         --model-path ${params.model_path} \\
         ${debug_flag} 2>&1 | tee -a run.log
 
+    # Upload results to S3
+    echo "Uploading results to ${outdir}..." >> run.log
+    python -c "
+import os, boto3
+from botocore.client import Config
+outdir = '${outdir}'.replace('s3://', '')
+bucket, prefix = outdir.split('/', 1)
+endpoint = os.environ.get('AWS_ENDPOINT_URL', 'http://datalake-hl.datalake.svc.cluster.local:9000')
+s3 = boto3.client('s3', endpoint_url=endpoint,
+    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID', ''),
+    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY', ''),
+    config=Config(signature_version='s3v4'), region_name='us-east-1')
+for f in os.listdir('results'):
+    filepath = os.path.join('results', f)
+    if os.path.isfile(filepath):
+        key = f'{prefix}/{f}'
+        s3.upload_file(filepath, bucket, key)
+        print(f'Uploaded {f} -> s3://{bucket}/{key}')
+" >> run.log 2>&1
+
+    echo "Pipeline completed successfully" >> run.log
     """
 }
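The download and upload steps above inline near-identical boto3 client setup in two `python -c` heredocs. For exercising the S3 plumbing outside the pipeline container, the shared logic can be pulled into a standalone script. A minimal sketch, assuming the same environment variables (`AWS_ENDPOINT_URL`, `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`) and the same MinIO endpoint default as the heredocs; the file name `s3_io.py` and the helper names are illustrative, not part of the pipeline:

```python
# s3_io.py -- hypothetical standalone version of the inline heredocs,
# for testing the MinIO plumbing outside the pipeline container.
import os

import boto3
from botocore.client import Config

# Same default endpoint as the heredocs above.
DEFAULT_ENDPOINT = 'http://datalake-hl.datalake.svc.cluster.local:9000'


def make_client():
    # Mirrors the client settings used by both heredocs: custom
    # endpoint, credentials from the environment, SigV4 signing.
    return boto3.client(
        's3',
        endpoint_url=os.environ.get('AWS_ENDPOINT_URL', DEFAULT_ENDPOINT),
        aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID', ''),
        aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY', ''),
        config=Config(signature_version='s3v4'),
        region_name='us-east-1',
    )


def split_s3_path(s3_path):
    # 's3://bucket/some/key' -> ('bucket', 'some/key')
    bucket, _, key = s3_path.replace('s3://', '', 1).partition('/')
    return bucket, key


def download(s3_path, dest):
    # Equivalent of the Method 3 download heredoc.
    bucket, key = split_s3_path(s3_path)
    make_client().download_file(bucket, key, dest)
    return os.path.getsize(dest)


def upload_dir(local_dir, s3_prefix):
    # Equivalent of the results-upload heredoc: one object per
    # top-level file in local_dir, keyed under the given prefix.
    bucket, prefix = split_s3_path(s3_prefix)
    s3 = make_client()
    for name in os.listdir(local_dir):
        path = os.path.join(local_dir, name)
        if os.path.isfile(path):
            s3.upload_file(path, bucket, f'{prefix}/{name}')
            print(f'Uploaded {name} -> s3://{bucket}/{prefix}/{name}')


if __name__ == '__main__':
    # Example round-trip; the S3 paths are placeholders.
    print(download('s3://bucket/path/to/protein.pdb', 'input.pdb'), 'bytes')
    upload_dir('results', 's3://bucket/outputs/run1')
```

Sharing one client constructor would also keep the two heredocs from drifting apart (the download copy still imports an unused `sys`, for example).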