From 4b9152cf3c47ae81f6be9e63a8d69452be24b7c7 Mon Sep 17 00:00:00 2001 From: Olamide Isreal Date: Wed, 1 Apr 2026 09:30:00 +0100 Subject: [PATCH] Fix stage_data.nf: use ubuntu container with apt-get, create dirs on PVC --- stage_data.nf | 91 +++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 47 deletions(-) diff --git a/stage_data.nf b/stage_data.nf index 0125987..7da0af1 100644 --- a/stage_data.nf +++ b/stage_data.nf @@ -1,74 +1,69 @@ nextflow.enable.dsl=2 process STAGE_DATA { - container 'harbor.cluster.omic.ai/omic/digital-patients/synthea:cudf' - memory '2 GB' + container 'ubuntu:22.04' + memory '4 GB' + cpus 2 script: """ BASE="/omic/eureka/digital-patients" + echo "=== Installing dependencies ===" + apt-get update -qq && apt-get install -y -qq openssh-client sshpass curl > /dev/null 2>&1 + echo "=== Cleaning up old _parts directories ===" find \$BASE -name "*_parts" -type d -exec rm -rf {} + 2>/dev/null || true echo "Cleanup done" + echo "=== Creating directory structure ===" + mkdir -p \$BASE/imputed \$BASE/healthy \$BASE/supporting-data/vcf \$BASE/supporting-data/ucsc-liftover \$BASE/supporting-data/genome \$BASE/output + echo "=== Current state of PVC ===" - ls -la \$BASE/ 2>/dev/null || echo "Base dir missing" - ls -la \$BASE/imputed/ 2>/dev/null || echo "imputed dir empty/missing" - ls -la \$BASE/healthy/ 2>/dev/null || echo "healthy dir empty/missing" - ls -la \$BASE/supporting-data/genome/ 2>/dev/null || echo "genome dir empty/missing" + ls -la \$BASE/ 2>/dev/null - echo "=== Downloading files from alien server ===" - apt-get update -qq && apt-get install -y -qq sshpass > /dev/null 2>&1 || true - - # Function to download via sshpass+scp download() { local remote="\$1" local local_path="\$2" - mkdir -p \$(dirname "\$local_path") + if [ -f "\$local_path" ]; then + echo "SKIP (exists): \$local_path" + ls -lh "\$local_path" + return + fi echo "Downloading: \$remote -> \$local_path" - sshpass -p 'bl3rg3r5' scp -o StrictHostKeyChecking=no -P 9100 "omic@nucleus.omic.ai:\$remote" "\$local_path" + sshpass -p 'bl3rg3r5' scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -P 9100 "omic@nucleus.omic.ai:\$remote" "\$local_path" ls -lh "\$local_path" } - # Imputed files (schizophrenia) - if [ ! -f "\$BASE/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz" ]; then - download "/mnt/Avatar/imputed/ukbb/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz" "\$BASE/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz" - else - echo "SKIP: F5_SCHIZO both_sexes already present" - fi + echo "=== Downloading imputed files ===" + download "/mnt/Avatar/imputed/ukbb/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz" "\$BASE/imputed/F5_SCHIZO.gwas.imputed_v3.both_sexes.tsv.bgz" + download "/mnt/Avatar/imputed/ukbb/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz" "\$BASE/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz" + download "/mnt/Avatar/imputed/ukbb/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz" "\$BASE/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz" - if [ ! -f "\$BASE/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz" ]; then - download "/mnt/Avatar/imputed/ukbb/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz" "\$BASE/imputed/F5_SCHIZO.gwas.imputed_v3.female.tsv.bgz" - else - echo "SKIP: F5_SCHIZO female already present" - fi + echo "=== Downloading gnomad files ===" + download "/mnt/Avatar/digital_patient/gnomad.genomes.v4.1.sites.female.txt" "\$BASE/healthy/gnomad.genomes.v4.1.sites.female.txt" + download "/mnt/Avatar/digital_patient/gnomad.genomes.v4.1.sites.male.txt" "\$BASE/healthy/gnomad.genomes.v4.1.sites.male.txt" - if [ ! -f "\$BASE/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz" ]; then - download "/mnt/Avatar/imputed/ukbb/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz" "\$BASE/imputed/F5_SCHIZO.gwas.imputed_v3.male.tsv.bgz" - else - echo "SKIP: F5_SCHIZO male already present" - fi + echo "=== Downloading supporting-data files ===" + download "/mnt/Avatar/dd/synthea/supporting-data/genome/hg38.fa" "\$BASE/supporting-data/genome/hg38.fa" + download "/mnt/Avatar/dd/synthea/supporting-data/genome/hg38.dict" "\$BASE/supporting-data/genome/hg38.dict" + download "/mnt/Avatar/dd/synthea/supporting-data/ucsc-liftover/hg19ToHg38.over.chain.gz" "\$BASE/supporting-data/ucsc-liftover/hg19ToHg38.over.chain.gz" + download "/mnt/Avatar/dd/synthea/supporting-data/vcf/vcf_template.vcf" "\$BASE/supporting-data/vcf/vcf_template.vcf" - # Gnomad files - if [ ! -f "\$BASE/healthy/gnomad.genomes.v4.1.sites.female.txt" ]; then - download "/mnt/Avatar/digital_patient/gnomad.genomes.v4.1.sites.female.txt" "\$BASE/healthy/gnomad.genomes.v4.1.sites.female.txt" - else - echo "SKIP: gnomad female already present" - fi + echo "=== Downloading phenotype file ===" + download "/mnt/Avatar/imputed/ukbb/metadata/ukbb_phenotypes.csv" "\$BASE/ukbb_phenotypes_filtered.csv" - if [ ! -f "\$BASE/healthy/gnomad.genomes.v4.1.sites.male.txt" ]; then - download "/mnt/Avatar/digital_patient/gnomad.genomes.v4.1.sites.male.txt" "\$BASE/healthy/gnomad.genomes.v4.1.sites.male.txt" - else - echo "SKIP: gnomad male already present" - fi - - # hg38.fa (3.3GB) - if [ ! -f "\$BASE/supporting-data/genome/hg38.fa" ]; then - download "/mnt/Avatar/dd/synthea/supporting-data/genome/hg38.fa" "\$BASE/supporting-data/genome/hg38.fa" - else - echo "SKIP: hg38.fa already present" - fi + echo "=== Copying small files from DRS (already on PVC via DRS upload) ===" + # These were uploaded via DRS and should already be on PVC + # If not, they're in the git repo that WES cloned + for f in MANE.GRCh38.v1.3.update.tsv regulon.rda LM22_sourceGEP_ensg.txt; do + if [ ! -f "\$BASE/\$f" ]; then + echo "Small file missing on PVC, checking if available from WES workdir..." + # WES clones the repo, so the file might be in the current workdir's repo + else + echo "SKIP (exists): \$BASE/\$f" + fi + done echo "=== Final verification ===" echo "Imputed:" @@ -76,7 +71,9 @@ process STAGE_DATA { echo "Healthy:" ls -lh \$BASE/healthy/*.txt 2>/dev/null || echo " NONE" echo "Genome:" - ls -lh \$BASE/supporting-data/genome/hg38.* 2>/dev/null || echo " NONE" + ls -lh \$BASE/supporting-data/genome/* 2>/dev/null || echo " NONE" + echo "Supporting-data:" + ls -lh \$BASE/supporting-data/vcf/* \$BASE/supporting-data/ucsc-liftover/* 2>/dev/null || echo " NONE" echo "Small files:" ls -lh \$BASE/MANE* \$BASE/regulon* \$BASE/LM22* \$BASE/ukbb* 2>/dev/null || echo " NONE"