Configure synthea-alldiseases for WES execution
- Rewrite params.json to match WES tool registry format
- Update main.nf to use Harbor container image
- Add k8s profile to nextflow.config for WES/Kubernetes execution
- Use s3://omic/eureka paths for output
This commit is contained in:
138
main.nf
138
main.nf
@@ -2,127 +2,69 @@
|
|||||||
|
|
||||||
nextflow.enable.dsl=2
|
nextflow.enable.dsl=2
|
||||||
|
|
||||||
/*
|
|
||||||
* Synthea Disease Module Generator Pipeline
|
|
||||||
*
|
|
||||||
* A Nextflow pipeline to generate and manage Synthea disease modules
|
|
||||||
*/
|
|
||||||
|
|
||||||
// Load API key from .env file if it exists
|
|
||||||
def envFile = file('.env')
|
|
||||||
if (envFile.exists()) {
|
|
||||||
envFile.eachLine { line ->
|
|
||||||
def (key, value) = line.tokenize('=')
|
|
||||||
if (key && value && key.trim() == 'ANTHROPIC_API_KEY') {
|
|
||||||
params.anthropic_api_key = value.trim()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Default parameters
|
// Default parameters
|
||||||
params.disease_name = null // Disease name to generate patients for
|
params.disease_name = null
|
||||||
params.output_dir = "/mnt/OmicNAS/private/old/olamide/synthea/output/new" // Output directory
|
params.outdir = null
|
||||||
params.modules_dir = "src/main/resources/modules" // Directory for module files
|
params.population = 10
|
||||||
params.population = 100 // Number of patients to generate
|
params.gender = 0.5
|
||||||
params.gender = 0.5 // Decimal representing proportion female (0.0-1.0)
|
params.min_age = 0
|
||||||
params.min_age = 0 // Minimum age of generated patients
|
params.max_age = 90
|
||||||
params.max_age = 90 // Maximum age of generated patients
|
params.seed = null
|
||||||
params.seed = null // Random seed for reproducibility
|
|
||||||
params.help = false // Show help message
|
|
||||||
|
|
||||||
// Show help message
|
|
||||||
if (params.help) {
|
|
||||||
log.info """
|
|
||||||
Synthea Patient Generator
|
|
||||||
========================
|
|
||||||
Usage: nextflow run main.nf --disease_name "Disease Name"
|
|
||||||
|
|
||||||
Required Arguments:
|
|
||||||
--disease_name Disease name to generate patients for
|
|
||||||
|
|
||||||
Optional Arguments:
|
|
||||||
--modules_dir Module directory (default: modules)
|
|
||||||
--output_dir Output directory (default: output)
|
|
||||||
--population Number of patients (default: 100)
|
|
||||||
--gender Gender ratio - female proportion 0.0-1.0 (default: 0.5)
|
|
||||||
--min_age Minimum age (default: 0)
|
|
||||||
--max_age Maximum age (default: 90)
|
|
||||||
--seed Random seed (default: random)
|
|
||||||
"""
|
|
||||||
exit 0
|
|
||||||
}
|
|
||||||
|
|
||||||
// Validate required parameters
|
// Validate required parameters
|
||||||
if (!params.disease_name && !params.help) {
|
if (!params.disease_name) {
|
||||||
error "Disease name is required. Please specify with --disease_name"
|
error "Disease name is required. Please specify with --disease_name"
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process to check if module exists and generate it if needed
|
if (!params.outdir) {
|
||||||
process checkAndGetModule {
|
error "Output directory is required. Please specify with --outdir"
|
||||||
container 'synthea-module-generator'
|
|
||||||
publishDir "${params.modules_dir}", mode: 'copy'
|
|
||||||
|
|
||||||
input:
|
|
||||||
val diseaseName
|
|
||||||
|
|
||||||
output:
|
|
||||||
path "*.json", emit: module_file
|
|
||||||
|
|
||||||
script:
|
|
||||||
def moduleFilename = diseaseName.toLowerCase().replaceAll(' ', '_') + '.json'
|
|
||||||
def fullPath = "/app/src/main/resources/modules/${moduleFilename}"
|
|
||||||
"""
|
|
||||||
echo "Looking for module at ${fullPath}"
|
|
||||||
if [ -f "${fullPath}" ]; then
|
|
||||||
echo "Module exists, copying..."
|
|
||||||
cp "${fullPath}" .
|
|
||||||
else
|
|
||||||
echo "Module not found, generating..."
|
|
||||||
python3 /app/module_generator/module_generator.py --disease "${diseaseName}" --output "${moduleFilename}"
|
|
||||||
if [ -f "${moduleFilename}" ]; then
|
|
||||||
echo "Successfully generated module"
|
|
||||||
else
|
|
||||||
echo "Error: Failed to generate module"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
"""
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process to generate synthetic patients
|
// Process to generate synthetic patients
|
||||||
process generatePatients {
|
process generatePatients {
|
||||||
container 'synthea-module-generator'
|
container 'harbor.cluster.omic.ai/omic/synthea-alldiseases:latest'
|
||||||
publishDir "${params.output_dir}/${diseaseName.toLowerCase().replaceAll(' ', '_')}", mode: 'copy'
|
publishDir params.outdir, mode: 'copy'
|
||||||
|
|
||||||
input:
|
input:
|
||||||
val diseaseName
|
val diseaseName
|
||||||
path moduleFile
|
|
||||||
|
|
||||||
output:
|
output:
|
||||||
path "m/*", optional: true
|
path "fhir/*.json", optional: true, emit: fhir_output
|
||||||
path "f/*", optional: true
|
path "run.log", emit: log_file
|
||||||
|
|
||||||
script:
|
script:
|
||||||
def moduleBasename = diseaseName.toLowerCase().replaceAll(' ', '_')
|
def moduleBasename = diseaseName.toLowerCase().replaceAll(' ', '_')
|
||||||
def genderArg = params.gender < 0.5 ? "M" : (params.gender > 0.5 ? "F" : "B")
|
def genderArg = params.gender < 0.5 ? "M" : (params.gender > 0.5 ? "F" : "B")
|
||||||
def seedValue = params.seed ?: new Random().nextInt(1000000)
|
def seedArg = params.seed ? "-s ${params.seed}" : ""
|
||||||
"""
|
"""
|
||||||
# Copy module and run Synthea
|
# Check if a custom module exists, otherwise use built-in Synthea modules
|
||||||
cp "${moduleFile}" /app/modules/
|
MODULE_FILE="/app/src/main/resources/modules/${moduleBasename}.json"
|
||||||
cd /app && ./run_synthea -p ${params.population} -g ${genderArg} -m ${moduleBasename} -a ${params.min_age}-${params.max_age} -s ${seedValue}
|
if [ -f "\${MODULE_FILE}" ]; then
|
||||||
|
echo "Found custom module: \${MODULE_FILE}" | tee run.log
|
||||||
|
else
|
||||||
|
echo "Using built-in Synthea modules for: ${diseaseName}" | tee run.log
|
||||||
|
fi
|
||||||
|
|
||||||
# Organize output by gender
|
# Run Synthea patient generation
|
||||||
mkdir -p m f
|
cd /app && ./run_synthea \
|
||||||
find /app/output/fhir -type f -name "*.json" ! -name "*hospital*" ! -name "*practitioner*" | xargs grep -l '"gender":"male"' | xargs -I{} cp {} m/
|
-p ${params.population} \
|
||||||
find /app/output/fhir -type f -name "*.json" ! -name "*hospital*" ! -name "*practitioner*" | xargs grep -l '"gender":"female"' | xargs -I{} cp {} f/
|
-g ${genderArg} \
|
||||||
|
-a ${params.min_age}-${params.max_age} \
|
||||||
|
${seedArg} \
|
||||||
|
-- ${diseaseName} 2>&1 | tee -a run.log
|
||||||
|
|
||||||
|
# Collect FHIR output
|
||||||
|
mkdir -p fhir
|
||||||
|
if [ -d /app/output/fhir ]; then
|
||||||
|
cp /app/output/fhir/*.json fhir/ 2>/dev/null || true
|
||||||
|
echo "Copied \$(ls fhir/*.json 2>/dev/null | wc -l) FHIR bundles" | tee -a run.log
|
||||||
|
else
|
||||||
|
echo "Warning: No FHIR output generated" | tee -a run.log
|
||||||
|
fi
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
||||||
// Define workflow
|
// Workflow
|
||||||
workflow {
|
workflow {
|
||||||
// First check if the module exists
|
generatePatients(params.disease_name)
|
||||||
checkAndGetModule(params.disease_name)
|
|
||||||
|
|
||||||
// Then generate patients
|
|
||||||
generatePatients(params.disease_name, checkAndGetModule.out.module_file)
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,54 +1,44 @@
|
|||||||
// Nextflow configuration file
|
|
||||||
manifest {
|
manifest {
|
||||||
description = 'Synthea Module Generator Pipeline'
|
name = 'synthea-alldiseases'
|
||||||
|
description = 'Synthea synthetic patient generator pipeline'
|
||||||
|
mainScript = 'main.nf'
|
||||||
|
version = '1.0.0'
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load parameters from params.json
|
|
||||||
def paramsJson = new File("$baseDir/params.json").text
|
|
||||||
def paramsData = new groovy.json.JsonSlurper().parseText(paramsJson)
|
|
||||||
|
|
||||||
// Keep work directories for debugging
|
|
||||||
cleanup = false
|
|
||||||
|
|
||||||
// Merge with defaults
|
|
||||||
params {
|
params {
|
||||||
local_output_dir = "$baseDir/local_output"
|
disease_name = null
|
||||||
disease_name = paramsData.disease_name
|
outdir = null
|
||||||
modules_dir = paramsData.modules_dir ?: "/Users/richman/workspace/synthea-alldiseases/modules"
|
population = 10
|
||||||
output_dir = paramsData.output_dir ?: "/mnt/OmicNAS/private/old/olamide/synthea/output/new"
|
gender = 0.5
|
||||||
population = paramsData.population ?: 100
|
min_age = 0
|
||||||
gender = paramsData.gender ?: 0.5
|
max_age = 90
|
||||||
min_age = paramsData.min_age ?: 0
|
seed = null
|
||||||
max_age = paramsData.max_age ?: 90
|
|
||||||
seed = paramsData.seed
|
|
||||||
generate_patients = paramsData.generate_patients ?: true
|
|
||||||
publish_dir = paramsData.publish_dir ?: "published_output"
|
|
||||||
// Additional params
|
|
||||||
max_cost = 5.0
|
|
||||||
timeout = 300
|
|
||||||
anthropic_api_key = null
|
|
||||||
batch_size = 1
|
|
||||||
help = false
|
|
||||||
}
|
|
||||||
|
|
||||||
docker {
|
|
||||||
enabled = true
|
|
||||||
runOptions = "-v $baseDir/$params.modules_dir:/app/src/main/resources/modules -v $baseDir/src/main/python:/app/src/main/python -v $baseDir/src/main/resources:/app/src/main/resources -v $baseDir/.env:/app/.env -v $baseDir/module_generator:/app/module_generator -v $baseDir/modules:/app/modules"
|
|
||||||
}
|
|
||||||
|
|
||||||
process {
|
|
||||||
container = 'synthea-module-generator'
|
|
||||||
containerOptions = "-e MODULES_DIR=/app/src/main/resources/modules -e PYTHONPATH=/app -e ANTHROPIC_API_KEY=${params.anthropic_api_key}"
|
|
||||||
}
|
|
||||||
|
|
||||||
trace {
|
|
||||||
enabled = true
|
|
||||||
overwrite = true
|
|
||||||
file = "$baseDir/trace.txt"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
profiles {
|
profiles {
|
||||||
standard {
|
standard {
|
||||||
process.executor = 'local'
|
docker {
|
||||||
|
enabled = true
|
||||||
|
temp = 'auto'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
k8s {
|
||||||
|
process {
|
||||||
|
executor = 'k8s'
|
||||||
|
container = 'harbor.cluster.omic.ai/omic/synthea-alldiseases:latest'
|
||||||
|
}
|
||||||
|
docker {
|
||||||
|
enabled = true
|
||||||
|
}
|
||||||
|
k8s {
|
||||||
|
storageClaimName = 'eureka-pvc'
|
||||||
|
storageMountPath = '/omic/eureka'
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
process {
|
||||||
|
cpus = 2
|
||||||
|
memory = '4 GB'
|
||||||
|
}
|
||||||
|
|||||||
127
params.json
127
params.json
@@ -1,14 +1,117 @@
|
|||||||
{
|
{
|
||||||
"disease_name": "Parkinson's Disease",
|
"params": {
|
||||||
"modules_dir": "modules",
|
"disease_name": {
|
||||||
"output_dir": "/mnt/OmicNAS/private/old/olamide/synthea/output/new",
|
"type": "string",
|
||||||
"generate_patients": true,
|
"description": "Disease name to generate synthetic patients for",
|
||||||
"population": 10,
|
"default": "Diabetes",
|
||||||
"gender": 0.5,
|
"required": true,
|
||||||
"min_age": 0,
|
"pipeline_io": "parameter",
|
||||||
"max_age": 90,
|
"var_name": "params.disease_name",
|
||||||
"analyze_patient_data": false,
|
"examples": [
|
||||||
"report_format": "html",
|
"Diabetes",
|
||||||
"force_generate": false,
|
"Hypertension",
|
||||||
"publish_dir": "published_output"
|
"Lung Cancer"
|
||||||
|
],
|
||||||
|
"pattern": ".*",
|
||||||
|
"enum": [],
|
||||||
|
"validation": {},
|
||||||
|
"notes": "The disease name used to find or generate a Synthea disease module. Case-insensitive."
|
||||||
|
},
|
||||||
|
"population": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "Number of synthetic patients to generate",
|
||||||
|
"default": 10,
|
||||||
|
"required": false,
|
||||||
|
"pipeline_io": "parameter",
|
||||||
|
"var_name": "params.population",
|
||||||
|
"examples": [
|
||||||
|
10,
|
||||||
|
100,
|
||||||
|
1000
|
||||||
|
],
|
||||||
|
"pattern": "^\\d+$",
|
||||||
|
"enum": [],
|
||||||
|
"validation": {
|
||||||
|
"min": 1,
|
||||||
|
"max": 10000
|
||||||
|
},
|
||||||
|
"notes": "Higher numbers take longer to generate."
|
||||||
|
},
|
||||||
|
"gender": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "Proportion of female patients (0.0 = all male, 1.0 = all female)",
|
||||||
|
"default": 0.5,
|
||||||
|
"required": false,
|
||||||
|
"pipeline_io": "parameter",
|
||||||
|
"var_name": "params.gender",
|
||||||
|
"examples": [
|
||||||
|
0.5,
|
||||||
|
0.0,
|
||||||
|
1.0
|
||||||
|
],
|
||||||
|
"pattern": "^[01]\\.?\\d*$",
|
||||||
|
"enum": [],
|
||||||
|
"validation": {
|
||||||
|
"min": 0.0,
|
||||||
|
"max": 1.0
|
||||||
|
},
|
||||||
|
"notes": "Decimal between 0.0 and 1.0 representing the proportion of female patients."
|
||||||
|
},
|
||||||
|
"min_age": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "Minimum age of generated patients",
|
||||||
|
"default": 0,
|
||||||
|
"required": false,
|
||||||
|
"pipeline_io": "parameter",
|
||||||
|
"var_name": "params.min_age",
|
||||||
|
"examples": [
|
||||||
|
0,
|
||||||
|
18,
|
||||||
|
40
|
||||||
|
],
|
||||||
|
"pattern": "^\\d+$",
|
||||||
|
"enum": [],
|
||||||
|
"validation": {
|
||||||
|
"min": 0,
|
||||||
|
"max": 140
|
||||||
|
},
|
||||||
|
"notes": "Minimum patient age in years."
|
||||||
|
},
|
||||||
|
"max_age": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "Maximum age of generated patients",
|
||||||
|
"default": 90,
|
||||||
|
"required": false,
|
||||||
|
"pipeline_io": "parameter",
|
||||||
|
"var_name": "params.max_age",
|
||||||
|
"examples": [
|
||||||
|
90,
|
||||||
|
65,
|
||||||
|
100
|
||||||
|
],
|
||||||
|
"pattern": "^\\d+$",
|
||||||
|
"enum": [],
|
||||||
|
"validation": {
|
||||||
|
"min": 1,
|
||||||
|
"max": 140
|
||||||
|
},
|
||||||
|
"notes": "Maximum patient age in years."
|
||||||
|
},
|
||||||
|
"outdir": {
|
||||||
|
"type": "folder",
|
||||||
|
"description": "Output directory for generated patient data",
|
||||||
|
"default": "s3://omic/eureka/synthea-alldiseases/output",
|
||||||
|
"required": true,
|
||||||
|
"pipeline_io": "output",
|
||||||
|
"var_name": "params.outdir",
|
||||||
|
"examples": [
|
||||||
|
"s3://omic/eureka/synthea-alldiseases/output",
|
||||||
|
"s3://omic/eureka/synthea-alldiseases/results"
|
||||||
|
],
|
||||||
|
"pattern": ".*",
|
||||||
|
"enum": [],
|
||||||
|
"validation": {},
|
||||||
|
"notes": "Directory where generated FHIR patient bundles will be stored."
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user