Files
synthea-alldiseases/scripts/run_pipeline.sh

182 lines
5.3 KiB
Bash
Executable File

#!/bin/bash
# run_pipeline.sh
#
# Front end for the Nextflow pipeline that generates disease modules and,
# optionally, synthetic patients.

# Stop at the first command that fails.
set -o errexit

# ---------------------------------------------------------------------------
# Option defaults. Every value below can be replaced by a command line flag.
# ---------------------------------------------------------------------------

# Free-text settings.
DISEASE_NAME=""        # required; empty until --disease is given
SEED=""                # empty means "no seed supplied"
REPORT_FORMAT="html"

# On/off switches (the literal words true / false).
FORCE_GENERATE=false
GENERATE_PATIENTS=false
ANALYZE_DATA=false
VERIFY_DOCKER=true

# Patient generation settings.
POPULATION=100
GENDER=0.5             # proportion female
MIN_AGE=0
MAX_AGE=90
#######################################
# Print the usage text to stdout and terminate the script.
# Arguments: none
# Outputs:   usage, option list and an example invocation on stdout
# Returns:   never returns; always exits with status 1
#######################################
show_help() {
  # One printf call; each argument becomes one output line.
  printf '%s\n' \
    "Usage: $0 [options] --disease \"Disease Name\"" \
    "" \
    "Options:" \
    " --disease, -d NAME Disease name to generate a module for (required)" \
    " --force-generate, -f Force regeneration of disease module even if it exists" \
    " --patients, -p Generate synthetic patients (default: false)" \
    " --population, -n NUM Number of patients to generate (default: 100)" \
    " --gender, -g VALUE Gender distribution (0-1 for % female, default: 0.5)" \
    " --min-age, -a NUM Minimum patient age (default: 0)" \
    " --max-age, -m NUM Maximum patient age (default: 90)" \
    " --seed, -s SEED Random seed for reproducibility" \
    " --analyze, -A Analyze patient data after generation" \
    " --report-format, -r FMT Report format for analysis (html, json, csv, default: html)" \
    " --skip-docker-check Skip Docker container verification" \
    " --help, -h Show this help message" \
    "" \
    "Example:" \
    " $0 --disease \"Multiple Sclerosis\" --patients --population 50"
  exit 1
}
# ---------------------------------------------------------------------------
# Command line parsing
# ---------------------------------------------------------------------------

#######################################
# Abort with a clear message when a value-taking option has no value.
# Without this check, `shift 2` fails on a trailing option and, because the
# script runs under `set -e`, the script dies without any explanation.
# Arguments: $1 - the option as typed by the user
#            $2 - number of words left on the command line, option included
# Outputs:   error line plus usage (via show_help) when the value is missing
# Returns:   0 when a value is present; otherwise exits non-zero
#######################################
require_value() {
  if [ "$2" -lt 2 ]; then
    echo "ERROR: Option $1 requires a value"
    show_help
    exit 1 # safety net in case show_help ever returns
  fi
}

#######################################
# Translate command line words into the option globals.
# Globals:   DISEASE_NAME, FORCE_GENERATE, GENERATE_PATIENTS, POPULATION,
#            GENDER, MIN_AGE, MAX_AGE, SEED, ANALYZE_DATA, REPORT_FORMAT,
#            VERIFY_DOCKER (written)
# Arguments: the script's command line ("$@")
# Outputs:   usage text on --help, on an unknown option, or on a missing value
# Returns:   0 on success; exits 0 for --help and non-zero for usage errors
#######################################
parse_args() {
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --disease|-d)
        require_value "$1" "$#"
        DISEASE_NAME="$2"
        shift 2
        ;;
      --force-generate|-f)
        FORCE_GENERATE=true
        shift
        ;;
      --patients|-p)
        GENERATE_PATIENTS=true
        shift
        ;;
      --population|-n)
        require_value "$1" "$#"
        POPULATION="$2"
        shift 2
        ;;
      --gender|-g)
        require_value "$1" "$#"
        GENDER="$2"
        shift 2
        ;;
      --min-age|-a)
        require_value "$1" "$#"
        MIN_AGE="$2"
        shift 2
        ;;
      --max-age|-m)
        require_value "$1" "$#"
        MAX_AGE="$2"
        shift 2
        ;;
      --seed|-s)
        require_value "$1" "$#"
        SEED="$2"
        shift 2
        ;;
      --analyze|-A)
        ANALYZE_DATA=true
        shift
        ;;
      --report-format|-r)
        require_value "$1" "$#"
        REPORT_FORMAT="$2"
        shift 2
        ;;
      --skip-docker-check)
        VERIFY_DOCKER=false
        shift
        ;;
      --help|-h)
        # An explicit help request is not an error. show_help terminates the
        # shell it runs in, so run it in a subshell and exit 0 from here.
        ( show_help ) || true
        exit 0
        ;;
      *)
        echo "Unknown option: $1"
        show_help
        exit 1 # safety net in case show_help ever returns
        ;;
    esac
  done
}

parse_args "$@"
# The previous inline loop consumed every positional parameter; keep that
# post-condition for any code that runs after parsing.
set --
# The disease name is the one mandatory input: when it is empty, report the
# problem and hand over to show_help for the usage text.
[[ -n "$DISEASE_NAME" ]] || {
  echo "ERROR: Disease name is required!"
  show_help
}
# Container check. Only performed when patients will be generated and the
# check has not been disabled with --skip-docker-check.
if [[ "$VERIFY_DOCKER" == true && "$GENERATE_PATIENTS" == true ]]; then
  # Number of `docker ps` output lines that mention "synthea".
  CONTAINER_RUNNING=$(docker ps | grep synthea | wc -l)

  if (( CONTAINER_RUNNING != 0 )); then
    echo "Synthea Docker container is running. Proceeding with pipeline execution."
  else
    echo "No Synthea Docker containers are running!"
    echo "Running environment setup script to start containers..."

    # Use the first setup script that exists; scripts/ takes precedence.
    setup_script=""
    for candidate in "./scripts/prepare_environment.sh" "./prepare_environment.sh"; do
      if [[ -f "$candidate" ]]; then
        setup_script="$candidate"
        break
      fi
    done

    if [[ -z "$setup_script" ]]; then
      echo "ERROR: prepare_environment.sh not found!"
      exit 1
    fi
    "$setup_script"
  fi
fi
# ---------------------------------------------------------------------------
# params.json generation
# ---------------------------------------------------------------------------

#######################################
# Escape a string so it can be placed between double quotes in a JSON document.
# Handles backslash, double quote, newline, carriage return and tab; other
# control characters are passed through unchanged.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout, without a trailing newline
#######################################
json_escape() {
  local text=$1
  local backslash='\' quote='"'
  # Backslashes first, so the escapes added below are not escaped again.
  # Replacements are quoted so they are always taken literally.
  text=${text//"$backslash"/"$backslash$backslash"}
  text=${text//"$quote"/"$backslash$quote"}
  text=${text//$'\n'/"${backslash}n"}
  text=${text//$'\r'/"${backslash}r"}
  text=${text//$'\t'/"${backslash}t"}
  printf '%s' "$text"
}

#######################################
# Write the pipeline configuration as a JSON object.
# String settings are escaped; a disease name containing a double quote or a
# backslash previously produced an invalid JSON file.
# Globals:   DISEASE_NAME, GENERATE_PATIENTS, POPULATION, GENDER, MIN_AGE,
#            MAX_AGE, ANALYZE_DATA, REPORT_FORMAT, FORCE_GENERATE, SEED (read)
# Arguments: $1 - output path (default: params.json)
# Outputs:   the file named by $1 is created or overwritten
#######################################
write_params_json() {
  local out_file=${1:-params.json}
  local disease_json report_json
  disease_json=$(json_escape "$DISEASE_NAME")
  report_json=$(json_escape "$REPORT_FORMAT")

  {
    printf '{\n'
    printf '  "disease_name": "%s",\n' "$disease_json"
    printf '  "modules_dir": "modules",\n'
    printf '  "output_dir": "output",\n'
    # Booleans and numbers are emitted bare, exactly as supplied.
    printf '  "generate_patients": %s,\n' "$GENERATE_PATIENTS"
    printf '  "population": %s,\n' "$POPULATION"
    printf '  "gender": %s,\n' "$GENDER"
    printf '  "min_age": %s,\n' "$MIN_AGE"
    printf '  "max_age": %s,\n' "$MAX_AGE"
    printf '  "analyze_patient_data": %s,\n' "$ANALYZE_DATA"
    printf '  "report_format": "%s",\n' "$report_json"
    printf '  "force_generate": %s,\n' "$FORCE_GENERATE"
    printf '  "publish_dir": "published_output"'
    # The seed key is present only when a seed was supplied.
    if [ -n "$SEED" ]; then
      printf ',\n  "seed": %s' "$SEED"
    fi
    printf '\n}\n'
  } > "$out_file"
}

echo "Creating params.json file with pipeline configuration..."
write_params_json params.json
# Show the effective settings, framed by a banner line, before the run starts.
banner="=================================================="
printf '%s\n' "$banner"
printf 'Running Synthea Pipeline for: %s\n' "$DISEASE_NAME"
printf 'Generate patients: %s\n' "$GENERATE_PATIENTS"
# Patient settings are only listed when patients will be generated.
if [[ "$GENERATE_PATIENTS" == true ]]; then
  printf 'Population: %s\n' "$POPULATION"
  printf 'Gender ratio (proportion female): %s\n' "$GENDER"
  printf 'Age range: %s-%s\n' "$MIN_AGE" "$MAX_AGE"
  # The seed line appears only when a seed was supplied.
  [[ -z "$SEED" ]] || printf 'Random seed: %s\n' "$SEED"
fi
printf 'Force module generation: %s\n' "$FORCE_GENERATE"
printf 'Analyze patient data: %s\n' "$ANALYZE_DATA"
printf '%s\n' "$banner"
# ---------------------------------------------------------------------------
# Pipeline execution
# ---------------------------------------------------------------------------

#######################################
# Derive a directory-style name from a disease name.
# Arguments: $1 - disease name as typed by the user
# Outputs:   the name lower-cased with every space replaced by an underscore
#######################################
normalize_disease_name() {
  printf '%s' "$1" | tr '[:upper:]' '[:lower:]' | tr ' ' '_'
}

# The closing messages refer to DISEASE_NAME_NORMALIZED, which nothing in this
# script assigns, so they used to print "published_output/" with no directory.
# A value already present in the environment is kept.
# NOTE(review): lower-case plus underscores is an assumption about how the
# pipeline names its publish directory - TODO confirm against main.nf.
: "${DISEASE_NAME_NORMALIZED:=$(normalize_disease_name "$DISEASE_NAME")}"

echo "Starting Nextflow pipeline..."
# params.json is written earlier in this script. Nextflow only reads a
# parameter file when it is named with -params-file, so pass it explicitly.
nextflow run main.nf -params-file params.json

echo ""
echo "Pipeline execution complete!"
if [ "$GENERATE_PATIENTS" = true ]; then
  echo "Check the 'output' directory and 'published_output/$DISEASE_NAME_NORMALIZED' for generated patient data."
  if [ "$ANALYZE_DATA" = true ]; then
    echo "Analysis reports can be found in 'published_output/$DISEASE_NAME_NORMALIZED/analysis'."
  fi
fi
echo "Generated modules can be found in the 'modules' directory and 'published_output/modules'."
echo ""