Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 67bd6692b0 | |||
| f8df39d9af | |||
| 02d93f9360 | |||
| f29323323b | |||
| a871107728 |
15
Dockerfile
15
Dockerfile
@@ -50,21 +50,12 @@ RUN mkdir -p /app/modules
|
||||
# Test a simple module generation to ensure Synthea works
|
||||
RUN ./run_synthea -p 1 -m hypertension
|
||||
|
||||
# Set up a symlink from mounted modules to Synthea modules directory
|
||||
RUN echo '#!/bin/sh\n\
|
||||
# Update modules symlinks\n\
|
||||
# Load environment variables\n\
|
||||
if [ -f /app/.env ]; then\n\
|
||||
export $(grep -v "^#" /app/.env | xargs)\n\
|
||||
fi\n\
|
||||
\n\
|
||||
exec "$@"' > /app/entrypoint.sh && chmod +x /app/entrypoint.sh
|
||||
# Make directories writable for K8s (may run as non-root)
|
||||
RUN chmod -R 777 /app/.gradle /app/output /app/build /app/modules \
|
||||
/app/src/main/resources/modules
|
||||
|
||||
# Set PYTHONPATH to ensure modules can be found
|
||||
ENV PYTHONPATH="/app"
|
||||
|
||||
# Set entrypoint to use our script
|
||||
ENTRYPOINT ["/app/entrypoint.sh"]
|
||||
|
||||
# Default command when container runs
|
||||
CMD ["tail", "-f", "/dev/null"]
|
||||
@@ -1,47 +0,0 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
synthea:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
volumes:
|
||||
- ./modules:/app/modules:ro # Mount modules directory read-only
|
||||
- ./output:/app/output # Mount output directory for patient data
|
||||
- ./.env:/app/.env:ro # Mount environment variables file
|
||||
environment:
|
||||
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
|
||||
working_dir: /app
|
||||
command: tail -f /dev/null # Keep container running
|
||||
healthcheck:
|
||||
test: ["CMD", "/app/healthcheck.sh"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 5s
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "8080:8080" # Only needed if you want to access the Synthea web interface
|
||||
|
||||
module-generator:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
volumes:
|
||||
- ./modules:/app/modules # Mount modules directory for writing
|
||||
- ./module_generator:/app/module_generator
|
||||
- ./src:/app/src
|
||||
- ./scripts:/app/scripts
|
||||
- ./.env:/app/.env:ro # Mount environment variables file
|
||||
environment:
|
||||
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
|
||||
working_dir: /app
|
||||
command: python3 /app/module_generator/run_module_generator.py --batch-size 5 --max-modules 10 --prioritize
|
||||
depends_on:
|
||||
- synthea
|
||||
profiles:
|
||||
- generator # This service won't start by default, only when explicitly requested
|
||||
|
||||
volumes:
|
||||
synthea-output:
|
||||
driver: local
|
||||
@@ -1,78 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Script to generate synthetic patients directly using modules created by the pipeline
|
||||
# Usage: ./generate_patients.sh <module_name> <output_directory> <population_size>
|
||||
|
||||
MODULE_NAME=$1
|
||||
OUTPUT_DIR=$2
|
||||
POPULATION=${3:-10}
|
||||
|
||||
if [ -z "$MODULE_NAME" ] || [ -z "$OUTPUT_DIR" ]; then
|
||||
echo "Usage: $0 <module_name> <output_directory> [population_size]"
|
||||
echo "Example: $0 diabetes /path/to/output 20"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Create output directory
|
||||
mkdir -p "$OUTPUT_DIR/m" "$OUTPUT_DIR/f"
|
||||
|
||||
# Location of module file
|
||||
MODULE_PATH="/data/olamide/synthea-alldiseases/modules/${MODULE_NAME}.json"
|
||||
|
||||
if [ ! -f "$MODULE_PATH" ]; then
|
||||
echo "Module file not found: $MODULE_PATH"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Create a temporary directory for the container output
|
||||
TEMP_DIR=$(mktemp -d)
|
||||
echo "Created temporary directory: $TEMP_DIR"
|
||||
|
||||
# Run for male patients
|
||||
echo "Generating male patients..."
|
||||
docker run --rm \
|
||||
-v "$MODULE_PATH:/app/modules/${MODULE_NAME}.json" \
|
||||
-v "$TEMP_DIR:/app/output" \
|
||||
synthea-module-generator \
|
||||
bash -c "cd /app && ./run_synthea -p $((POPULATION/2)) -g M -m ${MODULE_NAME} Massachusetts"
|
||||
|
||||
# Copy male patient files to the output directory
|
||||
echo "Copying male patient files..."
|
||||
find "$TEMP_DIR/fhir" -name "*.json" ! -name "*hospital*" ! -name "*practitioner*" | while read file; do
|
||||
# Check if it's a patient file by looking for gender field
|
||||
if grep -q '"gender"' "$file"; then
|
||||
cp "$file" "$OUTPUT_DIR/m/"
|
||||
fi
|
||||
done
|
||||
|
||||
# Clear the temp directory
|
||||
rm -rf "$TEMP_DIR/fhir"/*
|
||||
|
||||
# Run for female patients
|
||||
echo "Generating female patients..."
|
||||
docker run --rm \
|
||||
-v "$MODULE_PATH:/app/modules/${MODULE_NAME}.json" \
|
||||
-v "$TEMP_DIR:/app/output" \
|
||||
synthea-module-generator \
|
||||
bash -c "cd /app && ./run_synthea -p $((POPULATION/2)) -g F -m ${MODULE_NAME} Massachusetts"
|
||||
|
||||
# Copy female patient files to the output directory
|
||||
echo "Copying female patient files..."
|
||||
find "$TEMP_DIR/fhir" -name "*.json" ! -name "*hospital*" ! -name "*practitioner*" | while read file; do
|
||||
# Check if it's a patient file by looking for gender field
|
||||
if grep -q '"gender"' "$file"; then
|
||||
cp "$file" "$OUTPUT_DIR/f/"
|
||||
fi
|
||||
done
|
||||
|
||||
# Count the results
|
||||
male_count=$(find "$OUTPUT_DIR/m" -type f -name "*.json" | wc -l)
|
||||
female_count=$(find "$OUTPUT_DIR/f" -type f -name "*.json" | wc -l)
|
||||
|
||||
# Report results
|
||||
echo "Patient generation complete. Results saved to $OUTPUT_DIR"
|
||||
echo "Male patients: $male_count"
|
||||
echo "Female patients: $female_count"
|
||||
|
||||
# Clean up temp directory
|
||||
rm -rf "$TEMP_DIR"
|
||||
39
main.nf
39
main.nf
@@ -2,7 +2,6 @@
|
||||
|
||||
nextflow.enable.dsl=2
|
||||
|
||||
// Default parameters
|
||||
params.disease_name = null
|
||||
params.outdir = null
|
||||
params.population = 10
|
||||
@@ -11,7 +10,6 @@ params.min_age = 0
|
||||
params.max_age = 90
|
||||
params.seed = null
|
||||
|
||||
// Validate required parameters
|
||||
if (!params.disease_name) {
|
||||
error "Disease name is required. Please specify with --disease_name"
|
||||
}
|
||||
@@ -20,9 +18,8 @@ if (!params.outdir) {
|
||||
error "Output directory is required. Please specify with --outdir"
|
||||
}
|
||||
|
||||
// Process to generate synthetic patients
|
||||
process generatePatients {
|
||||
container 'harbor.cluster.omic.ai/omic/synthea-alldiseases:latest'
|
||||
container 'harbor.cluster.omic.ai/omic/synthea-alldiseases:v3'
|
||||
publishDir params.outdir, mode: 'copy'
|
||||
|
||||
input:
|
||||
@@ -33,38 +30,36 @@ process generatePatients {
|
||||
path "run.log", emit: log_file
|
||||
|
||||
script:
|
||||
def moduleBasename = diseaseName.toLowerCase().replaceAll(' ', '_')
|
||||
def genderArg = params.gender < 0.5 ? "M" : (params.gender > 0.5 ? "F" : "B")
|
||||
def genderArg = params.gender < 0.5 ? "-g M" : (params.gender > 0.5 ? "-g F" : "")
|
||||
def seedArg = params.seed ? "-s ${params.seed}" : ""
|
||||
"""
|
||||
# Check if a custom module exists, otherwise use built-in Synthea modules
|
||||
MODULE_FILE="/app/src/main/resources/modules/${moduleBasename}.json"
|
||||
if [ -f "\${MODULE_FILE}" ]; then
|
||||
echo "Found custom module: \${MODULE_FILE}" | tee run.log
|
||||
else
|
||||
echo "Using built-in Synthea modules for: ${diseaseName}" | tee run.log
|
||||
fi
|
||||
set +e
|
||||
WORKDIR=\$(pwd)
|
||||
|
||||
# Run Synthea patient generation
|
||||
cd /app && ./run_synthea \
|
||||
# Run Synthea via pre-built jar (Gradle is not writable in K8s)
|
||||
cd /app
|
||||
java -jar /app/build/libs/synthea-with-dependencies.jar \
|
||||
-p ${params.population} \
|
||||
-g ${genderArg} \
|
||||
${genderArg} \
|
||||
-a ${params.min_age}-${params.max_age} \
|
||||
${seedArg} \
|
||||
-- ${diseaseName} 2>&1 | tee -a run.log
|
||||
${seedArg} 2>&1 | tee \${WORKDIR}/run.log
|
||||
JAVA_EXIT=\${PIPESTATUS[0]}
|
||||
|
||||
# Collect FHIR output
|
||||
cd \${WORKDIR}
|
||||
mkdir -p fhir
|
||||
if [ -d /app/output/fhir ]; then
|
||||
cp /app/output/fhir/*.json fhir/ 2>/dev/null || true
|
||||
echo "Copied \$(ls fhir/*.json 2>/dev/null | wc -l) FHIR bundles" | tee -a run.log
|
||||
fi
|
||||
|
||||
# Succeed if FHIR output was produced
|
||||
if [ -n "\$(ls fhir/*.json 2>/dev/null)" ]; then
|
||||
exit 0
|
||||
else
|
||||
echo "Warning: No FHIR output generated" | tee -a run.log
|
||||
exit \${JAVA_EXIT}
|
||||
fi
|
||||
"""
|
||||
}
|
||||
|
||||
// Workflow
|
||||
workflow {
|
||||
generatePatients(params.disease_name)
|
||||
}
|
||||
|
||||
@@ -26,7 +26,7 @@ profiles {
|
||||
k8s {
|
||||
process {
|
||||
executor = 'k8s'
|
||||
container = 'harbor.cluster.omic.ai/omic/synthea-alldiseases:latest'
|
||||
container = 'harbor.cluster.omic.ai/omic/synthea-alldiseases:v3'
|
||||
}
|
||||
docker {
|
||||
enabled = true
|
||||
|
||||
77
simple.nf
77
simple.nf
@@ -1,77 +0,0 @@
|
||||
#!/usr/bin/env nextflow
|
||||
|
||||
nextflow.enable.dsl=2
|
||||
|
||||
// Default parameters
|
||||
params.disease_name = "Diabetes" // Default disease name
|
||||
params.output_dir = "/mnt/OmicNAS/private/old/olamide/synthea/output/new" // Output directory
|
||||
params.modules_dir = "modules" // Directory for module files
|
||||
|
||||
// Process to generate synthetic patients
|
||||
process generatePatients {
|
||||
publishDir "${params.output_dir}/${params.disease_name.toLowerCase().replaceAll(' ', '_')}", mode: 'copy'
|
||||
|
||||
input:
|
||||
path moduleFile
|
||||
|
||||
output:
|
||||
path "**"
|
||||
|
||||
script:
|
||||
"""
|
||||
echo "Module file: ${moduleFile}"
|
||||
echo "Disease: ${params.disease_name}"
|
||||
|
||||
# Check if Docker is available
|
||||
if command -v docker &>/dev/null; then
|
||||
echo "Docker is available, looking for Synthea container..."
|
||||
|
||||
# Find the Synthea container
|
||||
container_id=\$(docker ps --format '{{.ID}}' --filter "name=synthea" | head -1)
|
||||
|
||||
if [ -n "\$container_id" ]; then
|
||||
echo "Using Synthea container \$container_id"
|
||||
|
||||
# Copy module to container
|
||||
docker exec \$container_id mkdir -p /app/modules
|
||||
docker cp "${moduleFile}" \$container_id:/app/modules/
|
||||
|
||||
# Run Synthea with minimal parameters
|
||||
docker exec \$container_id bash -c "cd /app && ./run_synthea -p 1 -m ${params.disease_name.toLowerCase().replaceAll(' ', '_')}"
|
||||
|
||||
# Copy output from container
|
||||
docker cp \$container_id:/app/output/fhir ./ || mkdir -p ./fhir
|
||||
docker cp \$container_id:/app/output/metadata ./ || mkdir -p ./metadata
|
||||
|
||||
echo "Completed patient generation"
|
||||
else
|
||||
echo "No Synthea container found, creating mock output for testing"
|
||||
mkdir -p ./fhir ./metadata
|
||||
echo "Mock FHIR data for ${params.disease_name}" > ./fhir/mock_patient.json
|
||||
echo "Mock metadata for ${params.disease_name}" > ./metadata/mock_stats.json
|
||||
fi
|
||||
else
|
||||
echo "Docker not available, creating mock output for testing"
|
||||
mkdir -p ./fhir ./metadata
|
||||
echo "Mock FHIR data for ${params.disease_name}" > ./fhir/mock_patient.json
|
||||
echo "Mock metadata for ${params.disease_name}" > ./metadata/mock_stats.json
|
||||
fi
|
||||
"""
|
||||
}
|
||||
|
||||
// Define workflow
|
||||
workflow {
|
||||
// Prepare module file
|
||||
moduleFilename = params.disease_name.toLowerCase().replaceAll(' ', '_') + '.json'
|
||||
moduleFile = file("${params.modules_dir}/${moduleFilename}")
|
||||
|
||||
if (!moduleFile.exists()) {
|
||||
error "Module file not found: ${moduleFile}"
|
||||
}
|
||||
|
||||
// Create a channel with the module file
|
||||
moduleChannel = Channel.fromPath(moduleFile)
|
||||
|
||||
// Generate patients
|
||||
generatePatients(moduleChannel)
|
||||
}
|
||||
141
test.nf
141
test.nf
@@ -1,141 +0,0 @@
|
||||
#!/usr/bin/env nextflow
|
||||
|
||||
nextflow.enable.dsl=2
|
||||
|
||||
/*
|
||||
* Synthea Disease Module Generator Pipeline
|
||||
*
|
||||
* A Nextflow pipeline to generate and manage Synthea disease modules
|
||||
*/
|
||||
|
||||
// Load API key from .env file if it exists
|
||||
def envFile = file('.env')
|
||||
if (envFile.exists()) {
|
||||
envFile.eachLine { line ->
|
||||
def (key, value) = line.tokenize('=')
|
||||
if (key && value && key.trim() == 'ANTHROPIC_API_KEY') {
|
||||
params.anthropic_api_key = value.trim()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Default parameters
|
||||
params.disease_name = null // Disease name to generate patients for
|
||||
params.output_dir = "/mnt/OmicNAS/private/old/olamide/synthea/output/new" // Output directory
|
||||
params.modules_dir = "src/main/resources/modules" // Directory for module files
|
||||
params.population = 100 // Number of patients to generate
|
||||
params.gender = 0.5 // Decimal representing proportion female (0.0-1.0)
|
||||
params.min_age = 0 // Minimum age of generated patients
|
||||
params.max_age = 90 // Maximum age of generated patients
|
||||
params.seed = null // Random seed for reproducibility
|
||||
params.help = false // Show help message
|
||||
|
||||
// Show help message
|
||||
if (params.help) {
|
||||
log.info """
|
||||
Synthea Patient Generator
|
||||
========================
|
||||
Usage: nextflow run main.nf --disease_name "Disease Name"
|
||||
|
||||
Required Arguments:
|
||||
--disease_name Disease name to generate patients for
|
||||
|
||||
Optional Arguments:
|
||||
--modules_dir Module directory (default: modules)
|
||||
--output_dir Output directory (default: output)
|
||||
--population Number of patients (default: 100)
|
||||
--gender Gender ratio - female proportion 0.0-1.0 (default: 0.5)
|
||||
--min_age Minimum age (default: 0)
|
||||
--max_age Maximum age (default: 90)
|
||||
--seed Random seed (default: random)
|
||||
"""
|
||||
exit 0
|
||||
}
|
||||
|
||||
// Validate required parameters
|
||||
if (!params.disease_name && !params.help) {
|
||||
error "Disease name is required. Please specify with --disease_name"
|
||||
}
|
||||
|
||||
// Process to check if module exists and generate it if needed
|
||||
process checkAndGetModule {
|
||||
container 'synthea-module-generator'
|
||||
publishDir "${params.modules_dir}", mode: 'copy'
|
||||
|
||||
input:
|
||||
val diseaseName
|
||||
|
||||
output:
|
||||
path "*.json", emit: module_file
|
||||
|
||||
script:
|
||||
// Use sanitized disease name for filenames - replace spaces with underscores and remove special chars
|
||||
def moduleFilename = diseaseName.toLowerCase().replaceAll(' ', '_').replaceAll('[^a-z0-9_]', '') + '.json'
|
||||
def fullPath = "/app/src/main/resources/modules/${moduleFilename}"
|
||||
"""
|
||||
echo "Looking for module at ${fullPath}"
|
||||
if [ -f "${fullPath}" ]; then
|
||||
echo "Module exists, copying..."
|
||||
cp "${fullPath}" .
|
||||
else
|
||||
echo "Module not found, generating..."
|
||||
# Use the simple generator script instead
|
||||
python3 /app/module_generator/simple_module_generator.py --disease "${diseaseName}" --output "${moduleFilename}"
|
||||
if [ -f "${moduleFilename}" ]; then
|
||||
echo "Successfully generated module"
|
||||
else
|
||||
echo "Error: Failed to generate module"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
"""
|
||||
}
|
||||
|
||||
// Process to generate synthetic patients
|
||||
process generatePatients {
|
||||
container 'synthea-module-generator'
|
||||
publishDir "${params.output_dir}/${diseaseName.toLowerCase().replaceAll(' ', '_').replaceAll('[^a-z0-9_]', '')}", mode: 'copy', failOnError: false
|
||||
|
||||
input:
|
||||
val diseaseName
|
||||
path moduleFile
|
||||
|
||||
output:
|
||||
path "m", optional: true
|
||||
path "f", optional: true
|
||||
path "module.json", optional: true
|
||||
path "README.txt", optional: true
|
||||
|
||||
script:
|
||||
def moduleBasename = diseaseName.toLowerCase().replaceAll(' ', '_').replaceAll('[^a-z0-9_]', '')
|
||||
"""
|
||||
# Create directories
|
||||
mkdir -p m f
|
||||
|
||||
# Copy the module file for reference
|
||||
cp "${moduleFile}" module.json
|
||||
|
||||
# Create a README file with instructions
|
||||
cat > README.txt << EOF
|
||||
This directory contains the module for ${diseaseName}.
|
||||
To generate patients, run:
|
||||
./generate_patients.sh ${moduleBasename} ${params.output_dir}/${moduleBasename}/patients 20
|
||||
EOF
|
||||
|
||||
# Create marker files
|
||||
touch m/.keep
|
||||
touch f/.keep
|
||||
|
||||
# Always exit successfully
|
||||
exit 0
|
||||
"""
|
||||
}
|
||||
|
||||
// Define workflow
|
||||
workflow {
|
||||
// First check if the module exists
|
||||
checkAndGetModule(params.disease_name)
|
||||
|
||||
// Then generate patients
|
||||
generatePatients(params.disease_name, checkAndGetModule.out.module_file)
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Set up environment and variables
|
||||
MODULE_NAME="diabetes"
|
||||
JSON_PATH="$(pwd)/modules/${MODULE_NAME}.json"
|
||||
|
||||
# Make sure we have the module file
|
||||
if [ ! -f "$JSON_PATH" ]; then
|
||||
echo "Module file not found: $JSON_PATH"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Run Synthea directly in a container
|
||||
docker run --rm -v "${JSON_PATH}:/app/modules/${MODULE_NAME}.json" \
|
||||
-v "$(pwd)/test_output:/app/output" \
|
||||
synthea-module-generator \
|
||||
bash -c "cd /app && ./run_synthea -p 10 -g B -m ${MODULE_NAME} -a 0-90 -s 12345 | tee /app/output/synthea_run.log"
|
||||
|
||||
# Check the output
|
||||
echo "Checking output directory:"
|
||||
find test_output -type f | sort
|
||||
Reference in New Issue
Block a user