Use boto3/signed URL for authenticated MinIO download (403 fix)

This commit is contained in:
2026-03-23 16:06:50 +01:00
parent 7abd6eb267
commit c0f0e38f27

84
main.nf
View File

@@ -56,40 +56,74 @@ process POCKETMINER {
fi
fi
# Method 3: Download from MinIO via wget
# Method 3: Download from MinIO using python boto3/S3
if [ -z "\$PDB_FILE" ]; then
S3_PATH="\$(echo '${pdb_path}' | sed 's|^s3://||')"
MINIO_URL="http://datalake-hl.datalake.svc.cluster.local:9000/\$S3_PATH"
echo "Downloading from MinIO: \$MINIO_URL" >> run.log
wget -q "\$MINIO_URL" -O input.pdb 2>> run.log || true
if [ -f input.pdb ] && [ -s input.pdb ]; then
echo "Downloaded from MinIO (\$(wc -c < input.pdb) bytes)" >> run.log
PDB_FILE="input.pdb"
else
echo "wget download failed, trying python..." >> run.log
rm -f input.pdb
fi
fi
# Method 4: Download from MinIO using python urllib
if [ -z "\$PDB_FILE" ]; then
S3_PATH="\$(echo '${pdb_path}' | sed 's|^s3://||')"
S3_PATH="${pdb_path}"
ENDPOINT="\${AWS_ENDPOINT_URL:-http://datalake-hl.datalake.svc.cluster.local:9000}"
echo "Downloading with python from: \$ENDPOINT/\$S3_PATH" >> run.log
echo "Downloading via python S3 client from: \$ENDPOINT" >> run.log
echo "S3 path: \$S3_PATH" >> run.log
echo "AWS_ACCESS_KEY_ID: \${AWS_ACCESS_KEY_ID:+set}" >> run.log
echo "AWS_SECRET_ACCESS_KEY: \${AWS_SECRET_ACCESS_KEY:+set}" >> run.log
echo "AWS_ENDPOINT_URL: \${AWS_ENDPOINT_URL:-not set}" >> run.log
python -c "
import urllib.request
url = '\$ENDPOINT/\$S3_PATH'
import os, sys
s3_path = '\$S3_PATH'
endpoint = '\$ENDPOINT'
# Parse s3://bucket/key
path = s3_path.replace('s3://', '')
parts = path.split('/', 1)
bucket = parts[0]
key = parts[1] if len(parts) > 1 else ''
print(f'Bucket: {bucket}, Key: {key}')
print(f'Endpoint: {endpoint}')
try:
urllib.request.urlretrieve(url, 'input.pdb')
print(f'Downloaded from {url}')
import boto3
from botocore.client import Config
s3 = boto3.client('s3',
endpoint_url=endpoint,
aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID', ''),
aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY', ''),
config=Config(signature_version='s3v4'),
region_name='us-east-1'
)
s3.download_file(bucket, key, 'input.pdb')
print(f'Downloaded via boto3 ({os.path.getsize(\"input.pdb\")} bytes)')
except ImportError:
print('boto3 not available, trying urllib with signing...')
# Fallback: sign the GET request manually (HMAC-SHA1, AWS Signature V2) and fetch it with urllib
import urllib.request, hmac, hashlib, datetime
access_key = os.environ.get('AWS_ACCESS_KEY_ID', '')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY', '')
if access_key and secret_key:
# Simple S3 GET with AWS Signature V2
date_str = datetime.datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')
string_to_sign = f'GET\n\n\n{date_str}\n/{bucket}/{key}'
signature = hmac.new(secret_key.encode(), string_to_sign.encode(), hashlib.sha1)
import base64
sig_b64 = base64.b64encode(signature.digest()).decode()
url = f'{endpoint}/{bucket}/{key}'
req = urllib.request.Request(url)
req.add_header('Date', date_str)
req.add_header('Authorization', f'AWS {access_key}:{sig_b64}')
with urllib.request.urlopen(req) as resp:
with open('input.pdb', 'wb') as f:
f.write(resp.read())
print(f'Downloaded via signed URL ({os.path.getsize(\"input.pdb\")} bytes)')
else:
print('No AWS credentials available')
sys.exit(1)
except Exception as e:
print(f'Failed: {e}')
print(f'Download failed: {e}')
sys.exit(1)
" >> run.log 2>&1
if [ -f input.pdb ] && [ -s input.pdb ]; then
echo "Downloaded via python (\$(wc -c < input.pdb) bytes)" >> run.log
echo "S3 download successful (\$(wc -c < input.pdb) bytes)" >> run.log
PDB_FILE="input.pdb"
else
echo "Python download failed" >> run.log
echo "S3 download failed" >> run.log
rm -f input.pdb
fi
fi