Clean up pipeline files

- Remove unused variable and redundant comments/echo statements in main.nf
- Remove obsolete files: simple.nf, test.nf, generate_patients.sh,
  test_synthea.sh, trace.txt, docker-compose.yml
  (all referenced the local-only synthea-module-generator image)
This commit is contained in:
2026-03-25 15:09:07 +01:00
parent f8df39d9af
commit 67bd6692b0
7 changed files with 2 additions and 378 deletions

View File

@@ -1,47 +0,0 @@
---
# Docker Compose stack with two services, both built from the Dockerfile in
# this directory:
#   synthea          - kept running by a no-op `tail` command
#   module-generator - runs the module generator script; only started when the
#                      `generator` profile is requested
version: '3.8'

services:
  synthea:
    build:
      context: .
      dockerfile: Dockerfile
    volumes:
      - ./modules:/app/modules:ro  # Mount modules directory read-only
      - ./output:/app/output  # Mount output directory for patient data
      - ./.env:/app/.env:ro  # Mount environment variables file
    environment:
      # Value is interpolated from the environment of the `docker compose` caller.
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
    working_dir: /app
    command: tail -f /dev/null  # Keep container running
    healthcheck:
      test: ["CMD", "/app/healthcheck.sh"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 5s
    restart: unless-stopped
    ports:
      - "8080:8080"  # Only needed if you want to access the Synthea web interface

  module-generator:
    build:
      context: .
      dockerfile: Dockerfile
    volumes:
      - ./modules:/app/modules  # Mount modules directory for writing
      - ./module_generator:/app/module_generator
      - ./src:/app/src
      - ./scripts:/app/scripts
      - ./.env:/app/.env:ro  # Mount environment variables file
    environment:
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
    working_dir: /app
    command: python3 /app/module_generator/run_module_generator.py --batch-size 5 --max-modules 10 --prioritize
    depends_on:
      - synthea
    profiles:
      - generator  # This service won't start by default, only when explicitly requested

volumes:
  # Named volume declaration; no service in this file mounts it (both services
  # use bind mounts instead).
  synthea-output:
    driver: local

View File

@@ -1,78 +0,0 @@
#!/bin/bash
# Generate synthetic patients directly using modules created by the pipeline.
#
# Usage: ./generate_patients.sh <module_name> <output_directory> [population_size]
#
# The population (default 10) is split between one male and one female Synthea
# run; when the size is odd the female run receives the extra patient, so the
# two runs always add up to the requested size. Patient bundles are copied to
# <output_directory>/m and <output_directory>/f.
#
# Environment:
#   MODULES_DIR  Directory containing <module_name>.json
#                (default: /data/olamide/synthea-alldiseases/modules)

MODULE_NAME=$1
OUTPUT_DIR=$2
POPULATION=${3:-10}
MODULES_DIR=${MODULES_DIR:-/data/olamide/synthea-alldiseases/modules}

if [ -z "$MODULE_NAME" ] || [ -z "$OUTPUT_DIR" ]; then
    echo "Usage: $0 <module_name> <output_directory> [population_size]"
    echo "Example: $0 diabetes /path/to/output 20"
    exit 1
fi

# Reject anything that is not a plain non-negative integer before it reaches
# the arithmetic expansions below.
case "$POPULATION" in
    *[!0-9]*)
        echo "Population size must be a non-negative integer: $POPULATION"
        exit 1
        ;;
esac
# Force base 10 so a value such as "08" is not parsed as (invalid) octal.
POPULATION=$((10#$POPULATION))

# Create output directory
mkdir -p "$OUTPUT_DIR/m" "$OUTPUT_DIR/f"

# Location of module file
MODULE_PATH="$MODULES_DIR/${MODULE_NAME}.json"
if [ ! -f "$MODULE_PATH" ]; then
    echo "Module file not found: $MODULE_PATH"
    exit 1
fi

# Create a temporary directory for the container output; the trap removes it
# on every exit path, including early failures.
TEMP_DIR=$(mktemp -d) || { echo "Failed to create temporary directory"; exit 1; }
trap 'rm -rf "$TEMP_DIR"' EXIT
echo "Created temporary directory: $TEMP_DIR"

# Run Synthea for one gender and copy the resulting patient bundles.
#   $1 - Synthea gender flag (M or F)
#   $2 - number of patients to request
#   $3 - destination directory for the patient files
#   $4 - human-readable label used in progress messages
generate_for_gender() {
    local gender=$1 count=$2 dest=$3 label=$4

    if [ "$count" -eq 0 ]; then
        echo "Skipping ${label} patients (requested count is 0)"
        return 0
    fi

    echo "Generating ${label} patients..."
    # Values are passed as positional arguments rather than spliced into the
    # command string, so a module name with shell metacharacters stays inert.
    if ! docker run --rm \
        -v "$MODULE_PATH:/app/modules/${MODULE_NAME}.json" \
        -v "$TEMP_DIR:/app/output" \
        synthea-module-generator \
        bash -c 'cd /app && ./run_synthea -p "$1" -g "$2" -m "$3" Massachusetts' \
        _ "$count" "$gender" "$MODULE_NAME"; then
        # Best effort, as before: keep going and copy whatever was produced.
        echo "Warning: Synthea run for ${label} patients exited with an error"
    fi

    echo "Copying ${label} patient files..."
    if [ -d "$TEMP_DIR/fhir" ]; then
        find "$TEMP_DIR/fhir" -name "*.json" ! -name "*hospital*" ! -name "*practitioner*" -print0 |
            while IFS= read -r -d '' file; do
                # Check if it's a patient file by looking for gender field
                if grep -q '"gender"' "$file"; then
                    cp "$file" "$dest/"
                fi
            done
        # Clear the FHIR output so the next run starts from an empty directory
        rm -rf "${TEMP_DIR:?}/fhir"/*
    fi
}

MALE_POPULATION=$((POPULATION / 2))
FEMALE_POPULATION=$((POPULATION - MALE_POPULATION))

generate_for_gender M "$MALE_POPULATION" "$OUTPUT_DIR/m" male
generate_for_gender F "$FEMALE_POPULATION" "$OUTPUT_DIR/f" female

# Count the results
male_count=$(find "$OUTPUT_DIR/m" -type f -name "*.json" | wc -l)
female_count=$(find "$OUTPUT_DIR/f" -type f -name "*.json" | wc -l)

# Report results
echo "Patient generation complete. Results saved to $OUTPUT_DIR"
echo "Male patients: $male_count"
echo "Female patients: $female_count"

# The temp directory is removed by the EXIT trap installed above.

14
main.nf
View File

@@ -2,7 +2,6 @@
nextflow.enable.dsl=2 nextflow.enable.dsl=2
// Default parameters
params.disease_name = null params.disease_name = null
params.outdir = null params.outdir = null
params.population = 10 params.population = 10
@@ -11,7 +10,6 @@ params.min_age = 0
params.max_age = 90 params.max_age = 90
params.seed = null params.seed = null
// Validate required parameters
if (!params.disease_name) { if (!params.disease_name) {
error "Disease name is required. Please specify with --disease_name" error "Disease name is required. Please specify with --disease_name"
} }
@@ -20,7 +18,6 @@ if (!params.outdir) {
error "Output directory is required. Please specify with --outdir" error "Output directory is required. Please specify with --outdir"
} }
// Process to generate synthetic patients
process generatePatients { process generatePatients {
container 'harbor.cluster.omic.ai/omic/synthea-alldiseases:v3' container 'harbor.cluster.omic.ai/omic/synthea-alldiseases:v3'
publishDir params.outdir, mode: 'copy' publishDir params.outdir, mode: 'copy'
@@ -33,14 +30,13 @@ process generatePatients {
path "run.log", emit: log_file path "run.log", emit: log_file
script: script:
def moduleBasename = diseaseName.toLowerCase().replaceAll(' ', '_')
def genderArg = params.gender < 0.5 ? "-g M" : (params.gender > 0.5 ? "-g F" : "") def genderArg = params.gender < 0.5 ? "-g M" : (params.gender > 0.5 ? "-g F" : "")
def seedArg = params.seed ? "-s ${params.seed}" : "" def seedArg = params.seed ? "-s ${params.seed}" : ""
""" """
set +e set +e
WORKDIR=\$(pwd) WORKDIR=\$(pwd)
# Use pre-built jar directly (bypasses Gradle which needs write access to .gradle) # Run Synthea via pre-built jar (Gradle is not writable in K8s)
cd /app cd /app
java -jar /app/build/libs/synthea-with-dependencies.jar \ java -jar /app/build/libs/synthea-with-dependencies.jar \
-p ${params.population} \ -p ${params.population} \
@@ -49,18 +45,13 @@ process generatePatients {
${seedArg} 2>&1 | tee \${WORKDIR}/run.log ${seedArg} 2>&1 | tee \${WORKDIR}/run.log
JAVA_EXIT=\${PIPESTATUS[0]} JAVA_EXIT=\${PIPESTATUS[0]}
# Collect FHIR output back into Nextflow work dir
cd \${WORKDIR} cd \${WORKDIR}
mkdir -p fhir mkdir -p fhir
if [ -d /app/output/fhir ]; then if [ -d /app/output/fhir ]; then
cp /app/output/fhir/*.json fhir/ 2>/dev/null || true cp /app/output/fhir/*.json fhir/ 2>/dev/null || true
FHIR_COUNT=\$(ls fhir/*.json 2>/dev/null | wc -l)
echo "Copied \${FHIR_COUNT} FHIR bundles" | tee -a run.log
else
echo "Warning: No FHIR output directory found" | tee -a run.log
fi fi
# Exit 0 if we got FHIR output, regardless of java exit code # Succeed if FHIR output was produced
if [ -n "\$(ls fhir/*.json 2>/dev/null)" ]; then if [ -n "\$(ls fhir/*.json 2>/dev/null)" ]; then
exit 0 exit 0
else else
@@ -69,7 +60,6 @@ process generatePatients {
""" """
} }
// Workflow
workflow { workflow {
generatePatients(params.disease_name) generatePatients(params.disease_name)
} }

View File

@@ -1,77 +0,0 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
// Default parameters (each can be overridden on the command line)

// Default disease name
params.disease_name = "Diabetes"

// Directory for module files
params.modules_dir = "modules"

// Output directory
params.output_dir = "/mnt/OmicNAS/private/old/olamide/synthea/output/new"
// Generates patients for params.disease_name from the given module file.
// The task script uses a running container whose name matches "synthea" when
// Docker is available; otherwise it writes placeholder files so that the
// pipeline can still be exercised.
process generatePatients {
    publishDir "${params.output_dir}/${params.disease_name.toLowerCase().replaceAll(' ', '_')}", mode: 'copy'

    input:
    path moduleFile

    output:
    path "**"

    script:
    // Module name handed to run_synthea: lower-cased, spaces replaced by underscores
    def moduleName = params.disease_name.toLowerCase().replaceAll(' ', '_')
    """
    # Writes the placeholder FHIR and metadata files used when Synthea cannot be run
    write_mock_output() {
        mkdir -p ./fhir ./metadata
        echo "Mock FHIR data for ${params.disease_name}" > ./fhir/mock_patient.json
        echo "Mock metadata for ${params.disease_name}" > ./metadata/mock_stats.json
    }

    echo "Module file: ${moduleFile}"
    echo "Disease: ${params.disease_name}"

    if command -v docker &>/dev/null; then
        echo "Docker is available, looking for Synthea container..."
        container_id=\$(docker ps --format '{{.ID}}' --filter "name=synthea" | head -1)

        if [ -z "\$container_id" ]; then
            echo "No Synthea container found, creating mock output for testing"
            write_mock_output
        else
            echo "Using Synthea container \$container_id"

            # Place the module inside the container
            docker exec \$container_id mkdir -p /app/modules
            docker cp "${moduleFile}" \$container_id:/app/modules/

            # Single-patient run restricted to this module
            docker exec \$container_id bash -c "cd /app && ./run_synthea -p 1 -m ${moduleName}"

            # Fetch results; fall back to an empty directory if a copy fails
            docker cp \$container_id:/app/output/fhir ./ || mkdir -p ./fhir
            docker cp \$container_id:/app/output/metadata ./ || mkdir -p ./metadata

            echo "Completed patient generation"
        fi
    else
        echo "Docker not available, creating mock output for testing"
        write_mock_output
    fi
    """
}
// Entry workflow: resolve the module file for the requested disease, fail
// early if it is missing, then hand it to generatePatients.
workflow {
    def moduleName = params.disease_name.toLowerCase().replaceAll(' ', '_')
    def modulePath = file("${params.modules_dir}/${moduleName}.json")

    if (!modulePath.exists()) {
        error "Module file not found: ${modulePath}"
    }

    generatePatients(Channel.fromPath(modulePath))
}

141
test.nf
View File

@@ -1,141 +0,0 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
/*
* Synthea Disease Module Generator Pipeline
*
* A Nextflow pipeline to generate and manage Synthea disease modules
*/
// Load API key from .env file if it exists.
// Only the ANTHROPIC_API_KEY entry is read; it is stored in
// params.anthropic_api_key.
def envFile = file('.env')
if (envFile.exists()) {
    envFile.eachLine { line ->
        // Split on the first '=' only, so a value that itself contains '='
        // is kept whole instead of being cut at the second '='.
        def parts = line.split('=', 2)
        if (parts.length == 2) {
            def key = parts[0].trim()
            def value = parts[1].trim()
            if (key == 'ANTHROPIC_API_KEY' && value) {
                params.anthropic_api_key = value
            }
        }
    }
}

// Default parameters
params.disease_name = null // Disease name to generate patients for
params.output_dir = "/mnt/OmicNAS/private/old/olamide/synthea/output/new" // Output directory
params.modules_dir = "src/main/resources/modules" // Directory for module files
params.population = 100 // Number of patients to generate
params.gender = 0.5 // Decimal representing proportion female (0.0-1.0)
params.min_age = 0 // Minimum age of generated patients
params.max_age = 90 // Maximum age of generated patients
params.seed = null // Random seed for reproducibility
params.help = false // Show help message

// Show help message. The defaults printed here mirror the assignments above.
if (params.help) {
    log.info """
    Synthea Patient Generator
    ========================
    Usage: nextflow run main.nf --disease_name "Disease Name"
    Required Arguments:
    --disease_name Disease name to generate patients for
    Optional Arguments:
    --modules_dir Module directory (default: src/main/resources/modules)
    --output_dir Output directory (default: /mnt/OmicNAS/private/old/olamide/synthea/output/new)
    --population Number of patients (default: 100)
    --gender Gender ratio - female proportion 0.0-1.0 (default: 0.5)
    --min_age Minimum age (default: 0)
    --max_age Maximum age (default: 90)
    --seed Random seed (default: random)
    """.stripIndent()
    exit 0
}

// Validate required parameters
if (!params.disease_name && !params.help) {
    error "Disease name is required. Please specify with --disease_name"
}
// Provides the module JSON for a disease: copies it from the container's
// module directory when present, otherwise runs the generator script to
// create it. The resulting file is published to params.modules_dir.
process checkAndGetModule {
    container 'synthea-module-generator'
    publishDir "${params.modules_dir}", mode: 'copy'

    input:
    val diseaseName

    output:
    path "*.json", emit: module_file

    script:
    // File-name-safe form of the disease name: lower-cased, spaces turned
    // into underscores, any character outside [a-z0-9_] dropped
    def safeName = diseaseName.toLowerCase().replaceAll(' ', '_').replaceAll('[^a-z0-9_]', '')
    def moduleFilename = safeName + '.json'
    def fullPath = "/app/src/main/resources/modules/${moduleFilename}"
    """
    echo "Looking for module at ${fullPath}"

    if [ ! -f "${fullPath}" ]; then
        echo "Module not found, generating..."
        python3 /app/module_generator/simple_module_generator.py --disease "${diseaseName}" --output "${moduleFilename}"

        # The generator must have written the expected file
        if [ ! -f "${moduleFilename}" ]; then
            echo "Error: Failed to generate module"
            exit 1
        fi
        echo "Successfully generated module"
    else
        echo "Module exists, copying..."
        cp "${fullPath}" .
    fi
    """
}
// Prepares the per-disease output folder: empty m/ and f/ directories with
// marker files, a copy of the module, and a README containing the
// generate_patients.sh command line. The task script itself does not run
// Synthea.
process generatePatients {
    container 'synthea-module-generator'
    publishDir "${params.output_dir}/${diseaseName.toLowerCase().replaceAll(' ', '_').replaceAll('[^a-z0-9_]', '')}", mode: 'copy', failOnError: false

    input:
    val diseaseName
    path moduleFile

    output:
    path "m", optional: true
    path "f", optional: true
    path "module.json", optional: true
    path "README.txt", optional: true

    script:
    def moduleBasename = diseaseName.toLowerCase().replaceAll(' ', '_').replaceAll('[^a-z0-9_]', '')
    // Directory named in the README as the target for generated patients
    def patientsDir = "${params.output_dir}/${moduleBasename}/patients"
    """
    mkdir -p m f

    # Keep a copy of the module next to the output for reference
    cp "${moduleFile}" module.json

    # Instructions for producing the patients
    cat > README.txt << EOF
    This directory contains the module for ${diseaseName}.
    To generate patients, run:
    ./generate_patients.sh ${moduleBasename} ${patientsDir} 20
    EOF

    # Marker files so that the empty directories are kept
    touch m/.keep f/.keep

    # Always exit successfully
    exit 0
    """
}
// Entry workflow: obtain the module for the requested disease, then pass the
// disease name and the module file to generatePatients.
workflow {
    def diseaseName = params.disease_name

    checkAndGetModule(diseaseName)
    generatePatients(diseaseName, checkAndGetModule.out.module_file)
}

View File

@@ -1,21 +0,0 @@
#!/bin/bash
# Smoke test: run Synthea in a container against a single module and list the
# files it produced under ./test_output.
#
# Usage: ./test_synthea.sh [module_name]
#   module_name  Module to test; ./modules/<module_name>.json must exist
#                (default: diabetes)
#
# Exits with the status of the Synthea run.

# Set up environment and variables
MODULE_NAME="${1:-diabetes}"
JSON_PATH="$(pwd)/modules/${MODULE_NAME}.json"
OUTPUT_PATH="$(pwd)/test_output"

# Make sure we have the module file
if [ ! -f "$JSON_PATH" ]; then
    echo "Module file not found: $JSON_PATH"
    exit 1
fi

# Create the bind-mount source ourselves so that it is not created by the
# Docker daemon (which typically leaves it owned by root).
mkdir -p "$OUTPUT_PATH"

# Run Synthea directly in a container.
# - No -g flag: upstream Synthea accepts only M or F for -g, and omitting the
#   flag generates both genders. NOTE(review): confirm the run_synthea wrapper
#   in this image follows upstream here.
# - pipefail: without it the pipeline status is tee's, hiding Synthea failures.
docker run --rm -v "${JSON_PATH}:/app/modules/${MODULE_NAME}.json" \
    -v "${OUTPUT_PATH}:/app/output" \
    synthea-module-generator \
    bash -c "set -o pipefail; cd /app && ./run_synthea -p 10 -m ${MODULE_NAME} -a 0-90 -s 12345 | tee /app/output/synthea_run.log"
status=$?
if [ "$status" -ne 0 ]; then
    echo "Synthea run failed with exit code $status"
fi

# Check the output
echo "Checking output directory:"
find test_output -type f | sort

exit "$status"

View File

@@ -1,2 +0,0 @@
task_id hash native_id name status exit submit duration realtime %cpu peak_rss peak_vmem rchar wchar
1 48/6b3902 74409 checkAndGetModule FAILED 1 2025-03-23 11:46:23.178 1.1s 995ms - - - - -