diff --git a/stage_data.nf b/stage_data.nf
index 21e0c47..beffc16 100644
--- a/stage_data.nf
+++ b/stage_data.nf
@@ -8,96 +8,92 @@ process STAGE_DATA {
     script:
     """
-    #!/opt/conda/envs/synthea/bin/python3
-    import subprocess, os, sys
+    # Errexit stays off so best-effort steps (cleanup, listings) cannot abort
+    # the task; download failures are counted and turned into exit 1 below.
+    set +e
+    BASE="/omic/eureka/digital-patients"

-    BASE = "/omic/eureka/digital-patients"
+    echo "=== Cleaning up old _parts directories ==="
+    find "\$BASE" -name "*_parts" -type d -exec rm -rf {} + 2>/dev/null
+    echo "Cleanup done"

-    def run(cmd):
-        print(f"RUN: {cmd}", flush=True)
-        r = subprocess.run(cmd, shell=True, capture_output=True, text=True)
-        if r.stdout: print(r.stdout, flush=True)
-        if r.stderr: print(r.stderr, flush=True)
-        return r.returncode
+    echo "=== Creating directory structure ==="
+    mkdir -p \$BASE/imputed \$BASE/healthy \$BASE/supporting-data/vcf \$BASE/supporting-data/ucsc-liftover \$BASE/supporting-data/genome \$BASE/output
+    echo "Dirs created"

-    print("=== Cleaning up old _parts directories ===", flush=True)
-    run(f"find {BASE} -name '*_parts' -type d -exec rm -rf {{}} + 2>/dev/null || true")
+    echo "=== Current PVC state ==="
+    ls -la \$BASE/ 2>/dev/null || echo "Base dir issue"

-    print("=== Creating directory structure ===", flush=True)
-    for d in ["imputed", "healthy", "supporting-data/vcf", "supporting-data/ucsc-liftover", "supporting-data/genome", "output"]:
-        os.makedirs(f"{BASE}/{d}", exist_ok=True)
+    echo "=== Installing sshpass ==="
+    conda install -y -c conda-forge sshpass 2>&1 | tail -3
+    which sshpass && echo "sshpass OK" || echo "sshpass NOT FOUND"

-    print("=== Current PVC state ===", flush=True)
-    run(f"ls -la {BASE}/")
+    # Count of files that could not be staged; checked before the task exits.
+    failed=0
+    # download REMOTE LOCAL_PATH
+    #   Copy REMOTE from the storage host to LOCAL_PATH unless LOCAL_PATH
+    #   already exists. Data is written to LOCAL_PATH.partial and renamed only
+    #   after scp exits 0, so an interrupted transfer is not mistaken for a
+    #   complete file on the next run. Returns 0 on skip/success; on failure
+    #   increments "failed" and returns 1.
+    # NOTE(review): the password is passed on the command line and stored in
+    # the repository; consider supplying it via a secret and "sshpass -e".
+    download() {
+        remote="\$1"
+        local_path="\$2"
+        partial_path="\${local_path}.partial"
+        if [ -f "\$local_path" ]; then
+            echo "SKIP (exists): \$local_path"
+            return 0
+        fi
+        echo "Downloading: \$remote -> \$local_path"
+        sshpass -p 'bl3rg3r5' scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -P 9100 "omic@nucleus.omic.ai:\$remote" "\$partial_path" 2>&1
+        rc=\$?
+        if [ "\$rc" -eq 0 ] && mv -f "\$partial_path" "\$local_path"; then
+            ls -lh "\$local_path"
+            return 0
+        fi
+        rm -f "\$partial_path"
+        echo "FAILED (rc=\$rc): \$local_path"
+        failed=\$((failed + 1))
+        return 1
+    }

-    print("=== Installing sshpass ===", flush=True)
-    # Try conda first, then apt
-    rc = run("conda install -y -c conda-forge sshpass 2>/dev/null || apt-get update -qq && apt-get install -y -qq sshpass 2>/dev/null || pip install sshpass 2>/dev/null")
-    # Check if sshpass is available
-    rc = run("which sshpass || echo 'sshpass not found, trying expect'")
-
-    # If sshpass not available, use expect or python paramiko
-    has_sshpass = subprocess.run("which sshpass", shell=True, capture_output=True).returncode == 0
+    echo "=== Downloading imputed files ==="
+    download "/mnt/Avatar/imputed/ukbb/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz" "\$BASE/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz"
+    download "/mnt/Avatar/imputed/ukbb/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz" "\$BASE/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz"
+    download "/mnt/Avatar/imputed/ukbb/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz" "\$BASE/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz"

-    def download(remote, local_path):
-        if os.path.isfile(local_path):
-            size = os.path.getsize(local_path)
-            print(f"SKIP (exists, {size} bytes): {local_path}", flush=True)
-            return True
-        print(f"Downloading: {remote} -> {local_path}", flush=True)
-        if has_sshpass:
-            cmd = f"sshpass -p 'bl3rg3r5' scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -P 9100 'omic@nucleus.omic.ai:{remote}' '{local_path}'"
-        else:
-            # Fallback: use expect
-            cmd = f'''expect -c "
-set timeout 3600
-spawn scp -o StrictHostKeyChecking=no -P 9100 omic@nucleus.omic.ai:{remote} {local_path}
-expect \\"password:\\"
-send \\"bl3rg3r5\\r\\"
-expect eof
-"'''
-        rc = run(cmd)
-        if rc == 0 and os.path.isfile(local_path):
-            size = os.path.getsize(local_path)
-            print(f" OK: {size} bytes", flush=True)
-            return True
-        else:
-            print(f" FAILED (rc={rc})", flush=True)
-            return False
+    echo "=== Downloading gnomad files ==="
+    download "/mnt/Avatar/digital_patient/gnomad.genomes.v4.1.sites.female.txt" "\$BASE/healthy/gnomad.genomes.v4.1.sites.female.txt"
+    download "/mnt/Avatar/digital_patient/gnomad.genomes.v4.1.sites.male.txt" "\$BASE/healthy/gnomad.genomes.v4.1.sites.male.txt"

-    files = [
-        ("/mnt/Avatar/imputed/ukbb/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz", f"{BASE}/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz"),
-        ("/mnt/Avatar/imputed/ukbb/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz", f"{BASE}/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz"),
-        ("/mnt/Avatar/imputed/ukbb/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz", f"{BASE}/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz"),
-        ("/mnt/Avatar/digital_patient/gnomad.genomes.v4.1.sites.female.txt", f"{BASE}/healthy/gnomad.genomes.v4.1.sites.female.txt"),
-        ("/mnt/Avatar/digital_patient/gnomad.genomes.v4.1.sites.male.txt", f"{BASE}/healthy/gnomad.genomes.v4.1.sites.male.txt"),
-        ("/mnt/Avatar/dd/synthea/supporting-data/genome/hg38.fa", f"{BASE}/supporting-data/genome/hg38.fa"),
-        ("/mnt/Avatar/dd/synthea/supporting-data/genome/hg38.dict", f"{BASE}/supporting-data/genome/hg38.dict"),
-        ("/mnt/Avatar/dd/synthea/supporting-data/ucsc-liftover/hg19ToHg38.over.chain.gz", f"{BASE}/supporting-data/ucsc-liftover/hg19ToHg38.over.chain.gz"),
-        ("/mnt/Avatar/dd/synthea/supporting-data/vcf/vcf_template.vcf", f"{BASE}/supporting-data/vcf/vcf_template.vcf"),
-        ("/mnt/Avatar/imputed/ukbb/metadata/ukbb_phenotypes.csv", f"{BASE}/ukbb_phenotypes_filtered.csv"),
-    ]
+    echo "=== Downloading supporting-data files ==="
+    download "/mnt/Avatar/dd/synthea/supporting-data/genome/hg38.fa" "\$BASE/supporting-data/genome/hg38.fa"
+    download "/mnt/Avatar/dd/synthea/supporting-data/genome/hg38.dict" "\$BASE/supporting-data/genome/hg38.dict"
+    download "/mnt/Avatar/dd/synthea/supporting-data/ucsc-liftover/hg19ToHg38.over.chain.gz" "\$BASE/supporting-data/ucsc-liftover/hg19ToHg38.over.chain.gz"
+    download "/mnt/Avatar/dd/synthea/supporting-data/vcf/vcf_template.vcf" "\$BASE/supporting-data/vcf/vcf_template.vcf"

-    print("=== Downloading files ===", flush=True)
-    ok = 0
-    fail = 0
-    for remote, local in files:
-        if download(remote, local):
-            ok += 1
-        else:
-            fail += 1
+    echo "=== Downloading phenotype file ==="
+    download "/mnt/Avatar/imputed/ukbb/metadata/ukbb_phenotypes.csv" "\$BASE/ukbb_phenotypes_filtered.csv"

-    print(f"\\n=== Download results: {ok} ok, {fail} failed ===", flush=True)
+    echo "=== Final verification ==="
+    echo "Imputed:"
+    ls -lh \$BASE/imputed/*.bgz 2>/dev/null || echo " NONE"
+    echo "Healthy:"
+    ls -lh \$BASE/healthy/*.txt 2>/dev/null || echo " NONE"
+    echo "Genome:"
+    ls -lh \$BASE/supporting-data/genome/* 2>/dev/null || echo " NONE"
+    echo "Support:"
+    ls -lh \$BASE/supporting-data/vcf/* \$BASE/supporting-data/ucsc-liftover/* 2>/dev/null || echo " NONE"
+    echo "Small:"
+    ls -lh \$BASE/MANE* \$BASE/regulon* \$BASE/LM22* \$BASE/ukbb* 2>/dev/null || echo " NONE"

-    print("\\n=== Final verification ===", flush=True)
-    run(f"echo 'Imputed:' && ls -lh {BASE}/imputed/*.bgz 2>/dev/null || echo ' NONE'")
-    run(f"echo 'Healthy:' && ls -lh {BASE}/healthy/*.txt 2>/dev/null || echo ' NONE'")
-    run(f"echo 'Genome:' && ls -lh {BASE}/supporting-data/genome/* 2>/dev/null || echo ' NONE'")
-    run(f"echo 'Support:' && ls -lh {BASE}/supporting-data/vcf/* {BASE}/supporting-data/ucsc-liftover/* 2>/dev/null || echo ' NONE'")
-    run(f"echo 'Small:' && ls -lh {BASE}/MANE* {BASE}/regulon* {BASE}/LM22* {BASE}/ukbb* 2>/dev/null || echo ' NONE'")
-
-    if fail > 0:
-        print("STAGING FAILED - some files missing", flush=True)
-        sys.exit(1)
-    print("=== STAGING COMPLETE ===", flush=True)
+    # Fail the task when any download failed, so that the workflow does not
+    # treat an incomplete staging area as ready.
+    if [ "\$failed" -gt 0 ]; then
+        echo "STAGING FAILED - \$failed file(s) missing"
+        exit 1
+    fi
+    echo "=== STAGING COMPLETE ==="
     """
 }