Upload results to S3 via boto3 instead of publishDir (fixes S3 write)
This commit is contained in:
78
main.nf
78
main.nf
@@ -10,32 +10,25 @@ params.debug = false
|
|||||||
|
|
||||||
process POCKETMINER {
|
process POCKETMINER {
|
||||||
container 'harbor.cluster.omic.ai/omic/pocketminer:v2'
|
container 'harbor.cluster.omic.ai/omic/pocketminer:v2'
|
||||||
publishDir params.outdir, mode: 'copy'
|
|
||||||
|
|
||||||
input:
|
input:
|
||||||
val pdb_path
|
val pdb_path
|
||||||
|
|
||||||
output:
|
output:
|
||||||
path "*-preds.npy", emit: predictions_npy
|
path "results/*", emit: all_results
|
||||||
path "*-predictions.txt", emit: predictions_txt
|
|
||||||
path "*-summary.json", emit: summary
|
|
||||||
path "*_X.npy", optional: true, emit: features_debug
|
|
||||||
path "*_S.npy", optional: true, emit: sequence_debug
|
|
||||||
path "*_mask.npy", optional: true, emit: mask_debug
|
|
||||||
path "run.log", emit: log
|
path "run.log", emit: log
|
||||||
|
|
||||||
script:
|
script:
|
||||||
def debug_flag = params.debug ? '--debug' : ''
|
def debug_flag = params.debug ? '--debug' : ''
|
||||||
|
def outdir = params.outdir
|
||||||
"""
|
"""
|
||||||
set +e
|
set +e
|
||||||
touch run.log
|
touch run.log
|
||||||
|
mkdir -p results
|
||||||
|
|
||||||
echo "=== Environment Debug ===" >> run.log
|
echo "=== Environment Debug ===" >> run.log
|
||||||
echo "Input path: ${pdb_path}" >> run.log
|
echo "Input path: ${pdb_path}" >> run.log
|
||||||
echo "AWS_ENDPOINT_URL: \${AWS_ENDPOINT_URL:-not set}" >> run.log
|
echo "Output path: ${outdir}" >> run.log
|
||||||
echo "AWS_ACCESS_KEY_ID set: \${AWS_ACCESS_KEY_ID:+yes}" >> run.log
|
|
||||||
ls /omic/eureka/ >> run.log 2>&1 || echo "/omic/eureka not mounted" >> run.log
|
|
||||||
echo "=== End Debug ===" >> run.log
|
|
||||||
|
|
||||||
PDB_FILE=""
|
PDB_FILE=""
|
||||||
|
|
||||||
@@ -56,65 +49,66 @@ process POCKETMINER {
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Method 3: Download from MinIO using python
|
# Method 3: Download from MinIO using boto3
|
||||||
if [ -z "\$PDB_FILE" ]; then
|
if [ -z "\$PDB_FILE" ]; then
|
||||||
S3_PATH="${pdb_path}"
|
|
||||||
ENDPOINT="\${AWS_ENDPOINT_URL:-http://datalake-hl.datalake.svc.cluster.local:9000}"
|
ENDPOINT="\${AWS_ENDPOINT_URL:-http://datalake-hl.datalake.svc.cluster.local:9000}"
|
||||||
echo "Downloading via python S3 from: \$ENDPOINT" >> run.log
|
echo "Downloading via boto3 from: \$ENDPOINT" >> run.log
|
||||||
echo "S3 path: \$S3_PATH" >> run.log
|
|
||||||
echo "AWS_ACCESS_KEY_ID: \${AWS_ACCESS_KEY_ID:+set}" >> run.log
|
|
||||||
echo "AWS_SECRET_ACCESS_KEY: \${AWS_SECRET_ACCESS_KEY:+set}" >> run.log
|
|
||||||
echo "AWS_ENDPOINT_URL: \${AWS_ENDPOINT_URL:-not set}" >> run.log
|
|
||||||
|
|
||||||
python -c "
|
python -c "
|
||||||
import os, sys
|
import os, sys, boto3
|
||||||
s3_path = os.environ.get('S3_INPUT', '${pdb_path}')
|
|
||||||
endpoint = os.environ.get('AWS_ENDPOINT_URL', 'http://datalake-hl.datalake.svc.cluster.local:9000')
|
|
||||||
path = s3_path.replace('s3://', '')
|
|
||||||
parts = path.split('/', 1)
|
|
||||||
bucket, key = parts[0], parts[1]
|
|
||||||
print(f'Bucket: {bucket}, Key: {key}, Endpoint: {endpoint}')
|
|
||||||
import boto3
|
|
||||||
from botocore.client import Config
|
from botocore.client import Config
|
||||||
s3 = boto3.client('s3',
|
s3_path = '${pdb_path}'.replace('s3://', '')
|
||||||
endpoint_url=endpoint,
|
bucket, key = s3_path.split('/', 1)
|
||||||
|
endpoint = os.environ.get('AWS_ENDPOINT_URL', 'http://datalake-hl.datalake.svc.cluster.local:9000')
|
||||||
|
print(f'Bucket: {bucket}, Key: {key}')
|
||||||
|
s3 = boto3.client('s3', endpoint_url=endpoint,
|
||||||
aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID', ''),
|
aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID', ''),
|
||||||
aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY', ''),
|
aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY', ''),
|
||||||
config=Config(signature_version='s3v4'),
|
config=Config(signature_version='s3v4'), region_name='us-east-1')
|
||||||
region_name='us-east-1'
|
|
||||||
)
|
|
||||||
s3.download_file(bucket, key, 'input.pdb')
|
s3.download_file(bucket, key, 'input.pdb')
|
||||||
size = os.path.getsize('input.pdb')
|
print(f'Downloaded {os.path.getsize(\"input.pdb\")} bytes')
|
||||||
print(f'Downloaded {size} bytes')
|
|
||||||
" >> run.log 2>&1
|
" >> run.log 2>&1
|
||||||
if [ -f input.pdb ] && [ -s input.pdb ]; then
|
if [ -f input.pdb ] && [ -s input.pdb ]; then
|
||||||
echo "S3 download successful (\$(wc -c < input.pdb) bytes)" >> run.log
|
|
||||||
PDB_FILE="input.pdb"
|
PDB_FILE="input.pdb"
|
||||||
else
|
|
||||||
echo "S3 download failed - check run.log" >> run.log
|
|
||||||
cat run.log
|
|
||||||
rm -f input.pdb
|
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
if [ -z "\$PDB_FILE" ] || [ ! -f input.pdb ]; then
|
if [ -z "\$PDB_FILE" ] || [ ! -f input.pdb ]; then
|
||||||
echo "ERROR: Could not retrieve PDB file from any source" >> run.log
|
echo "ERROR: Could not retrieve PDB file" >> run.log
|
||||||
cat run.log
|
cat run.log
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "PDB file ready (\$(wc -c < input.pdb) bytes)" >> run.log
|
|
||||||
PDB_BASENAME="\$(basename '${pdb_path}' .pdb)"
|
PDB_BASENAME="\$(basename '${pdb_path}' .pdb)"
|
||||||
|
|
||||||
python /workspace/entrypoint.py \\
|
python /workspace/entrypoint.py \\
|
||||||
--pdb input.pdb \\
|
--pdb input.pdb \\
|
||||||
--output-folder . \\
|
--output-folder results \\
|
||||||
--output-name "\$PDB_BASENAME" \\
|
--output-name "\$PDB_BASENAME" \\
|
||||||
--model-path ${params.model_path} \\
|
--model-path ${params.model_path} \\
|
||||||
${debug_flag} 2>&1 | tee -a run.log
|
${debug_flag} 2>&1 | tee -a run.log
|
||||||
|
|
||||||
|
# Upload results to S3
|
||||||
|
echo "Uploading results to ${outdir}..." >> run.log
|
||||||
|
python -c "
|
||||||
|
import os, boto3
|
||||||
|
from botocore.client import Config
|
||||||
|
outdir = '${outdir}'.replace('s3://', '')
|
||||||
|
bucket, prefix = outdir.split('/', 1)
|
||||||
|
endpoint = os.environ.get('AWS_ENDPOINT_URL', 'http://datalake-hl.datalake.svc.cluster.local:9000')
|
||||||
|
s3 = boto3.client('s3', endpoint_url=endpoint,
|
||||||
|
aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID', ''),
|
||||||
|
aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY', ''),
|
||||||
|
config=Config(signature_version='s3v4'), region_name='us-east-1')
|
||||||
|
for f in os.listdir('results'):
|
||||||
|
filepath = os.path.join('results', f)
|
||||||
|
if os.path.isfile(filepath):
|
||||||
|
key = f'{prefix}/{f}'
|
||||||
|
s3.upload_file(filepath, bucket, key)
|
||||||
|
print(f'Uploaded {f} -> s3://{bucket}/{key}')
|
||||||
|
" >> run.log 2>&1
|
||||||
|
|
||||||
echo "Pipeline completed successfully" >> run.log
|
echo "Pipeline completed successfully" >> run.log
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user