diff --git a/main.nf b/main.nf
index 53b88a8..3f60b17 100644
--- a/main.nf
+++ b/main.nf
@@ -56,40 +56,77 @@ process POCKETMINER {
         fi
     fi
 
-    # Method 3: Download from MinIO via wget
+    # Method 3: Download from MinIO using python boto3/S3
     if [ -z "\$PDB_FILE" ]; then
-        S3_PATH="\$(echo '${pdb_path}' | sed 's|^s3://||')"
-        MINIO_URL="http://datalake-hl.datalake.svc.cluster.local:9000/\$S3_PATH"
-        echo "Downloading from MinIO: \$MINIO_URL" >> run.log
-        wget -q "\$MINIO_URL" -O input.pdb 2>> run.log || true
-        if [ -f input.pdb ] && [ -s input.pdb ]; then
-            echo "Downloaded from MinIO (\$(wc -c < input.pdb) bytes)" >> run.log
-            PDB_FILE="input.pdb"
-        else
-            echo "wget download failed, trying python..." >> run.log
-            rm -f input.pdb
-        fi
-    fi
-
-    # Method 4: Download from MinIO using python urllib
-    if [ -z "\$PDB_FILE" ]; then
-        S3_PATH="\$(echo '${pdb_path}' | sed 's|^s3://||')"
+        S3_PATH="${pdb_path}"
         ENDPOINT="\${AWS_ENDPOINT_URL:-http://datalake-hl.datalake.svc.cluster.local:9000}"
-        echo "Downloading with python from: \$ENDPOINT/\$S3_PATH" >> run.log
+        echo "Downloading via python S3 client from: \$ENDPOINT" >> run.log
+        echo "S3 path: \$S3_PATH" >> run.log
+        echo "AWS_ACCESS_KEY_ID: \${AWS_ACCESS_KEY_ID:+set}" >> run.log
+        echo "AWS_SECRET_ACCESS_KEY: \${AWS_SECRET_ACCESS_KEY:+set}" >> run.log
+        echo "AWS_ENDPOINT_URL: \${AWS_ENDPOINT_URL:-not set}" >> run.log
+        # A failed download exits non-zero; the trailing "|| rm -f input.pdb" discards any partial
+        # file and stops errexit (Nextflow runs tasks with bash -ue by default) from aborting here.
         python -c "
-import urllib.request
-url = '\$ENDPOINT/\$S3_PATH'
+import os, sys
+s3_path = '\$S3_PATH'
+endpoint = '\$ENDPOINT'
+
+# Parse s3://bucket/key
+path = s3_path.replace('s3://', '')
+parts = path.split('/', 1)
+bucket = parts[0]
+key = parts[1] if len(parts) > 1 else ''
+
+print(f'Bucket: {bucket}, Key: {key}')
+print(f'Endpoint: {endpoint}')
+
 try:
-    urllib.request.urlretrieve(url, 'input.pdb')
-    print(f'Downloaded from {url}')
+    import boto3
+    from botocore.client import Config
+    s3 = boto3.client('s3',
+        endpoint_url=endpoint,
+        aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
+        aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
+        config=Config(signature_version='s3v4'),
+        region_name='us-east-1'
+    )
+    s3.download_file(bucket, key, 'input.pdb')
+    # %-formatting instead of an f-string: a double-quoted filename would close the surrounding bash string
+    print('Downloaded via boto3 (%d bytes)' % os.path.getsize('input.pdb'))
+except ImportError:
+    print('boto3 not available, trying urllib with signing...')
+    # Fallback: stdlib urllib request signed with HMAC-SHA1
+    import urllib.request, hmac, hashlib, datetime
+    access_key = os.environ.get('AWS_ACCESS_KEY_ID', '')
+    secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY', '')
+    if access_key and secret_key:
+        # S3 GET with AWS Signature V2; newline escapes are doubled because Groovy unescapes the script first
+        date_str = datetime.datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')
+        string_to_sign = f'GET\\n\\n\\n{date_str}\\n/{bucket}/{key}'
+        signature = hmac.new(secret_key.encode(), string_to_sign.encode(), hashlib.sha1)
+        import base64
+        sig_b64 = base64.b64encode(signature.digest()).decode()
+        url = f'{endpoint}/{bucket}/{key}'
+        req = urllib.request.Request(url)
+        req.add_header('Date', date_str)
+        req.add_header('Authorization', f'AWS {access_key}:{sig_b64}')
+        with urllib.request.urlopen(req) as resp:
+            with open('input.pdb', 'wb') as f:
+                f.write(resp.read())
+        print('Downloaded via signed URL (%d bytes)' % os.path.getsize('input.pdb'))
+    else:
+        print('No AWS credentials available')
+        sys.exit(1)
 except Exception as e:
-    print(f'Failed: {e}')
-" >> run.log 2>&1
+    print(f'Download failed: {e}')
+    sys.exit(1)
+" >> run.log 2>&1 || rm -f input.pdb
         if [ -f input.pdb ] && [ -s input.pdb ]; then
-            echo "Downloaded via python (\$(wc -c < input.pdb) bytes)" >> run.log
+            echo "S3 download successful (\$(wc -c < input.pdb) bytes)" >> run.log
             PDB_FILE="input.pdb"
         else
-            echo "Python download failed" >> run.log
+            echo "S3 download failed" >> run.log
             rm -f input.pdb
         fi
     fi