From d468509ec3d179d3915f162e03975fcc2d91706b Mon Sep 17 00:00:00 2001
From: Olamide Isreal
Date: Wed, 25 Mar 2026 12:31:34 +0100
Subject: [PATCH] Configure synthea-alldiseases for WES execution

- Rewrite params.json to match WES tool registry format
- Update main.nf to use Harbor container image
- Add k8s profile to nextflow.config for WES/Kubernetes execution
- Use s3://omic/eureka paths for output
---
Review notes (this region is ignored when the patch is applied):
- generatePatients: Synthea now runs inside a subshell. The previous
  `cd /app && ...` changed the task shell's working directory, so the later
  `tee -a run.log`, `mkdir -p fhir` and `cp ... fhir/` wrote under /app and
  the declared outputs (fhir/*.json, run.log) were empty or truncated.
- generatePatients: `set -o pipefail` so a non-zero exit from run_synthea is
  not hidden by the pipe into tee.
- generatePatients: the disease name is double-quoted so multi-word names
  (e.g. "Lung Cancer") reach run_synthea as a single argument.
- generatePatients: the seed is tested against null so `--seed 0` is honoured.
- NOTE(review): the previous revision selected the module with
  `-m <module_basename>`; confirm that `-- <disease name>` is what the
  image's run_synthea wrapper expects.
- Line counts per hunk are unchanged; the blob ids on the `index` lines still
  refer to the pre-review content.

 main.nf         | 146 +++++++++++++++---------------------------------
 nextflow.config |  82 ++++++++++++---------------
 params.json     | 127 +++++++++++++++++++++++++++++++++++++----
 3 files changed, 195 insertions(+), 160 deletions(-)

diff --git a/main.nf b/main.nf
index 2ef2c10..7eebb8a 100644
--- a/main.nf
+++ b/main.nf
@@ -2,127 +2,69 @@
 
 nextflow.enable.dsl=2
 
-/*
- * Synthea Disease Module Generator Pipeline
- *
- * A Nextflow pipeline to generate and manage Synthea disease modules
- */
-
-// Load API key from .env file if it exists
-def envFile = file('.env')
-if (envFile.exists()) {
-    envFile.eachLine { line ->
-        def (key, value) = line.tokenize('=')
-        if (key && value && key.trim() == 'ANTHROPIC_API_KEY') {
-            params.anthropic_api_key = value.trim()
-        }
-    }
-}
-
 // Default parameters
-params.disease_name = null // Disease name to generate patients for
-params.output_dir = "/mnt/OmicNAS/private/old/olamide/synthea/output/new" // Output directory
-params.modules_dir = "src/main/resources/modules" // Directory for module files
-params.population = 100 // Number of patients to generate
-params.gender = 0.5 // Decimal representing proportion female (0.0-1.0)
-params.min_age = 0 // Minimum age of generated patients
-params.max_age = 90 // Maximum age of generated patients
-params.seed = null // Random seed for reproducibility
-params.help = false // Show help message
-
-// Show help message
-if (params.help) {
-    log.info """
-    Synthea Patient Generator
-    ========================
-    Usage: nextflow run main.nf --disease_name "Disease Name"
-
-    Required Arguments:
-      --disease_name    Disease name to generate patients for
-
-    Optional Arguments:
-      --modules_dir     Module directory (default: modules)
-      --output_dir      Output directory (default: output)
-      --population      Number of patients (default: 100)
-      --gender          Gender ratio - female proportion 0.0-1.0 (default: 0.5)
-      --min_age         Minimum age (default: 0)
-      --max_age         Maximum age (default: 90)
-      --seed            Random seed (default: random)
-    """
-    exit 0
-}
+params.disease_name = null
+params.outdir = null
+params.population = 10
+params.gender = 0.5
+params.min_age = 0
+params.max_age = 90
+params.seed = null
 
 // Validate required parameters
-if (!params.disease_name && !params.help) {
+if (!params.disease_name) {
     error "Disease name is required. Please specify with --disease_name"
 }
 
-// Process to check if module exists and generate it if needed
-process checkAndGetModule {
-    container 'synthea-module-generator'
-    publishDir "${params.modules_dir}", mode: 'copy'
-
-    input:
-    val diseaseName
-
-    output:
-    path "*.json", emit: module_file
-
-    script:
-    def moduleFilename = diseaseName.toLowerCase().replaceAll(' ', '_') + '.json'
-    def fullPath = "/app/src/main/resources/modules/${moduleFilename}"
-    """
-    echo "Looking for module at ${fullPath}"
-    if [ -f "${fullPath}" ]; then
-        echo "Module exists, copying..."
-        cp "${fullPath}" .
-    else
-        echo "Module not found, generating..."
-        python3 /app/module_generator/module_generator.py --disease "${diseaseName}" --output "${moduleFilename}"
-        if [ -f "${moduleFilename}" ]; then
-            echo "Successfully generated module"
-        else
-            echo "Error: Failed to generate module"
-            exit 1
-        fi
-    fi
-    """
+if (!params.outdir) {
+    error "Output directory is required. Please specify with --outdir"
 }
 
 // Process to generate synthetic patients
 process generatePatients {
-    container 'synthea-module-generator'
-    publishDir "${params.output_dir}/${diseaseName.toLowerCase().replaceAll(' ', '_')}", mode: 'copy'
-    
+    container 'harbor.cluster.omic.ai/omic/synthea-alldiseases:latest'
+    publishDir params.outdir, mode: 'copy'
+
     input:
     val diseaseName
-    path moduleFile
-    
+
     output:
-    path "m/*", optional: true
-    path "f/*", optional: true
-    
+    path "fhir/*.json", optional: true, emit: fhir_output
+    path "run.log", emit: log_file
+
     script:
     def moduleBasename = diseaseName.toLowerCase().replaceAll(' ', '_')
     def genderArg = params.gender < 0.5 ? "M" : (params.gender > 0.5 ? "F" : "B")
-    def seedValue = params.seed ?: new Random().nextInt(1000000)
+    def seedArg = params.seed != null ? "-s ${params.seed}" : ""  // explicit null check so that seed 0 is still passed
     """
-    # Copy module and run Synthea
-    cp "${moduleFile}" /app/modules/
-    cd /app && ./run_synthea -p ${params.population} -g ${genderArg} -m ${moduleBasename} -a ${params.min_age}-${params.max_age} -s ${seedValue}
-    
-    # Organize output by gender
-    mkdir -p m f
-    find /app/output/fhir -type f -name "*.json" ! -name "*hospital*" ! -name "*practitioner*" | xargs grep -l '"gender":"male"' | xargs -I{} cp {} m/
-    find /app/output/fhir -type f -name "*.json" ! -name "*hospital*" ! -name "*practitioner*" | xargs grep -l '"gender":"female"' | xargs -I{} cp {} f/
+    # Check if a custom module exists, otherwise use built-in Synthea modules
+    MODULE_FILE="/app/src/main/resources/modules/${moduleBasename}.json"
+    if [ -f "\${MODULE_FILE}" ]; then
+        echo "Found custom module: \${MODULE_FILE}" | tee run.log
+    else
+        echo "Using built-in Synthea modules for: ${diseaseName}" | tee run.log
+    fi
+
+    # Run Synthea in a subshell so the task stays in its work dir (run.log and fhir/ are collected from there); pipefail keeps a Synthea failure visible through tee
+    set -o pipefail; (cd /app && ./run_synthea \
+        -p ${params.population} \
+        -g ${genderArg} \
+        -a ${params.min_age}-${params.max_age} \
+        ${seedArg} \
+        -- "${diseaseName}") 2>&1 | tee -a run.log
+
+    # Collect FHIR output
+    mkdir -p fhir
+    if [ -d /app/output/fhir ]; then
+        cp /app/output/fhir/*.json fhir/ 2>/dev/null || true
+        echo "Copied \$(ls fhir/*.json 2>/dev/null | wc -l) FHIR bundles" | tee -a run.log
+    else
+        echo "Warning: No FHIR output generated" | tee -a run.log
+    fi
     """
 }
 
-// Define workflow
+// Workflow
 workflow {
-    // First check if the module exists
-    checkAndGetModule(params.disease_name)
-
-    // Then generate patients
-    generatePatients(params.disease_name, checkAndGetModule.out.module_file)
+    generatePatients(params.disease_name)
 }
diff --git a/nextflow.config b/nextflow.config
index 8a0c2b1..9dd640b 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,54 +1,44 @@
-// Nextflow configuration file
 manifest {
-    description = 'Synthea Module Generator Pipeline'
+    name = 'synthea-alldiseases'
+    description = 'Synthea synthetic patient generator pipeline'
+    mainScript = 'main.nf'
+    version = '1.0.0'
 }
 
-// Load parameters from params.json
-def paramsJson = new File("$baseDir/params.json").text
-def paramsData = new groovy.json.JsonSlurper().parseText(paramsJson)
-
-// Keep work directories for debugging
-cleanup = false
-
-// Merge with defaults
 params {
-    local_output_dir = "$baseDir/local_output"
-    disease_name = paramsData.disease_name
-    modules_dir = paramsData.modules_dir ?: "/Users/richman/workspace/synthea-alldiseases/modules"
-    output_dir = paramsData.output_dir ?: "/mnt/OmicNAS/private/old/olamide/synthea/output/new"
-    population = paramsData.population ?: 100
-    gender = paramsData.gender ?: 0.5
-    min_age = paramsData.min_age ?: 0
-    max_age = paramsData.max_age ?: 90
-    seed = paramsData.seed
-    generate_patients = paramsData.generate_patients ?: true
-    publish_dir = paramsData.publish_dir ?: "published_output"
-    // Additional params
-    max_cost = 5.0
-    timeout = 300
-    anthropic_api_key = null
-    batch_size = 1
-    help = false
-}
-
-docker {
-    enabled = true
-    runOptions = "-v $baseDir/$params.modules_dir:/app/src/main/resources/modules -v $baseDir/src/main/python:/app/src/main/python -v $baseDir/src/main/resources:/app/src/main/resources -v $baseDir/.env:/app/.env -v $baseDir/module_generator:/app/module_generator -v $baseDir/modules:/app/modules"
-}
-
-process {
-    container = 'synthea-module-generator'
-    containerOptions = "-e MODULES_DIR=/app/src/main/resources/modules -e PYTHONPATH=/app -e ANTHROPIC_API_KEY=${params.anthropic_api_key}"
-}
-
-trace {
-    enabled = true
-    overwrite = true
-    file = "$baseDir/trace.txt"
+    disease_name = null
+    outdir = null
+    population = 10
+    gender = 0.5
+    min_age = 0
+    max_age = 90
+    seed = null
 }
 
 profiles {
-    standard {
-        process.executor = 'local'
-    }
+    standard {
+        docker {
+            enabled = true
+            temp = 'auto'
+        }
+    }
+
+    k8s {
+        process {
+            executor = 'k8s'
+            container = 'harbor.cluster.omic.ai/omic/synthea-alldiseases:latest'
+        }
+        docker {
+            enabled = true
+        }
+        k8s {
+            storageClaimName = 'eureka-pvc'
+            storageMountPath = '/omic/eureka'
+        }
+    }
+}
+
+process {
+    cpus = 2
+    memory = '4 GB'
 }
diff --git a/params.json b/params.json
index 991057a..411a71b 100644
--- a/params.json
+++ b/params.json
@@ -1,14 +1,117 @@
 {
-  "disease_name": "Parkinson's Disease",
-  "modules_dir": "modules",
-  "output_dir": "/mnt/OmicNAS/private/old/olamide/synthea/output/new",
-  "generate_patients": true,
-  "population": 10,
-  "gender": 0.5,
-  "min_age": 0,
-  "max_age": 90,
-  "analyze_patient_data": false,
-  "report_format": "html",
-  "force_generate": false,
-  "publish_dir": "published_output"
+  "params": {
+    "disease_name": {
+      "type": "string",
+      "description": "Disease name to generate synthetic patients for",
+      "default": "Diabetes",
+      "required": true,
+      "pipeline_io": "parameter",
+      "var_name": "params.disease_name",
+      "examples": [
+        "Diabetes",
+        "Hypertension",
+        "Lung Cancer"
+      ],
+      "pattern": ".*",
+      "enum": [],
+      "validation": {},
+      "notes": "The disease name used to find or generate a Synthea disease module. Case-insensitive."
+    },
+    "population": {
+      "type": "integer",
+      "description": "Number of synthetic patients to generate",
+      "default": 10,
+      "required": false,
+      "pipeline_io": "parameter",
+      "var_name": "params.population",
+      "examples": [
+        10,
+        100,
+        1000
+      ],
+      "pattern": "^\\d+$",
+      "enum": [],
+      "validation": {
+        "min": 1,
+        "max": 10000
+      },
+      "notes": "Higher numbers take longer to generate."
+    },
+    "gender": {
+      "type": "number",
+      "description": "Proportion of female patients (0.0 = all male, 1.0 = all female)",
+      "default": 0.5,
+      "required": false,
+      "pipeline_io": "parameter",
+      "var_name": "params.gender",
+      "examples": [
+        0.5,
+        0.0,
+        1.0
+      ],
+      "pattern": "^[01]\\.?\\d*$",
+      "enum": [],
+      "validation": {
+        "min": 0.0,
+        "max": 1.0
+      },
+      "notes": "Decimal between 0.0 and 1.0 representing the proportion of female patients."
+    },
+    "min_age": {
+      "type": "integer",
+      "description": "Minimum age of generated patients",
+      "default": 0,
+      "required": false,
+      "pipeline_io": "parameter",
+      "var_name": "params.min_age",
+      "examples": [
+        0,
+        18,
+        40
+      ],
+      "pattern": "^\\d+$",
+      "enum": [],
+      "validation": {
+        "min": 0,
+        "max": 140
+      },
+      "notes": "Minimum patient age in years."
+    },
+    "max_age": {
+      "type": "integer",
+      "description": "Maximum age of generated patients",
+      "default": 90,
+      "required": false,
+      "pipeline_io": "parameter",
+      "var_name": "params.max_age",
+      "examples": [
+        90,
+        65,
+        100
+      ],
+      "pattern": "^\\d+$",
+      "enum": [],
+      "validation": {
+        "min": 1,
+        "max": 140
+      },
+      "notes": "Maximum patient age in years."
+    },
+    "outdir": {
+      "type": "folder",
+      "description": "Output directory for generated patient data",
+      "default": "s3://omic/eureka/synthea-alldiseases/output",
+      "required": true,
+      "pipeline_io": "output",
+      "var_name": "params.outdir",
+      "examples": [
+        "s3://omic/eureka/synthea-alldiseases/output",
+        "s3://omic/eureka/synthea-alldiseases/results"
+      ],
+      "pattern": ".*",
+      "enum": [],
+      "validation": {},
+      "notes": "Directory where generated FHIR patient bundles will be stored."
+    }
+  }
 }