#!/bin/bash
# run_pipeline.sh
#
# This script runs the Nextflow pipeline for generating disease modules and
# synthetic patients.
#
# Flow:
#   1. Parse and validate the command line options.
#   2. If patients are requested (and the check is not skipped), make sure a
#      Synthea Docker container is running, starting it through
#      prepare_environment.sh when none is found.
#   3. Write the configuration to params.json in the current directory.
#   4. Run `nextflow run main.nf` with that parameter file.
#
# Run with --help for the full list of options.

# Exit on error, on use of an unset variable, and on a failing pipeline stage.
set -euo pipefail

# Default values
DISEASE_NAME=""
FORCE_GENERATE=false
GENERATE_PATIENTS=false
POPULATION=100
GENDER=0.5
MIN_AGE=0
MAX_AGE=90
SEED=""
ANALYZE_DATA=false
REPORT_FORMAT="html"
VERIFY_DOCKER=true

# Patterns for values that are written into params.json as bare JSON numbers.
# JSON forbids leading zeros and a leading '.', so these are deliberately strict.
readonly UINT_RE='^[0-9]+$'
readonly INT_RE='^-?(0|[1-9][0-9]*)$'
readonly FRACTION_RE='^(0(\.[0-9]+)?|1(\.0+)?)$'

# Print an error message to stderr and abort with status 1.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Print the usage text and exit.
# Arguments: $1 - exit status (default: 1). Status 0 prints to stdout (the
#                 user asked for help); any other status prints to stderr.
show_help() {
  local exit_code="${1:-1}"
  local fd=1
  if [[ "$exit_code" != 0 ]]; then
    fd=2
  fi

  cat >&"$fd" <<EOF
Usage: $0 [options] --disease "Disease Name"

Options:
  --disease, -d NAME        Disease name to generate a module for (required)
  --force-generate, -f      Force regeneration of disease module even if it exists
  --patients, -p            Generate synthetic patients (default: false)
  --population, -n NUM      Number of patients to generate (default: 100)
  --gender, -g VALUE        Gender distribution (0-1 for % female, default: 0.5)
  --min-age, -a NUM         Minimum patient age (default: 0)
  --max-age, -m NUM         Maximum patient age (default: 90)
  --seed, -s SEED           Random seed for reproducibility
  --analyze, -A             Analyze patient data after generation
  --report-format, -r FMT   Report format for analysis (html, json, csv, default: html)
  --skip-docker-check       Skip Docker container verification
  --help, -h                Show this help message

Example:
  $0 --disease "Multiple Sclerosis" --patients --population 50
EOF
  exit "$exit_code"
}

# Abort with a clear message when an option that takes a value is the last
# word on the command line. Without this, `shift 2` fails under `set -e` and
# the script dies silently.
# Arguments: $1 - option as typed, $2 - number of remaining arguments
require_value() {
  if (( $2 < 2 )); then
    echo "ERROR: Option $1 requires a value!" >&2
    show_help
  fi
}

# Validate a non-negative integer and print it without leading zeros.
# Arguments: $1 - option name (for the error message), $2 - value
normalize_uint() {
  local option="$1" value="$2"
  [[ "$value" =~ $UINT_RE ]] \
    || die "$option must be a non-negative integer, got '$value'"
  # 10# forces base 10 so that e.g. "08" is not parsed as invalid octal.
  printf '%s' "$((10#$value))"
}

# Print $1 escaped for use inside a double-quoted JSON string.
# Only backslash and double quote are escaped; control characters are
# rejected outright because they have no place in these option values.
json_escape() {
  local raw="$1"
  if [[ "$raw" == *[[:cntrl:]]* ]]; then
    die "Value contains control characters and cannot be written to params.json: '$raw'"
  fi
  printf '%s\n' "$raw" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
}

# Parse arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --disease|-d)
      require_value "$1" "$#"
      DISEASE_NAME="$2"
      shift 2
      ;;
    --force-generate|-f)
      FORCE_GENERATE=true
      shift
      ;;
    --patients|-p)
      GENERATE_PATIENTS=true
      shift
      ;;
    --population|-n)
      require_value "$1" "$#"
      POPULATION="$2"
      shift 2
      ;;
    --gender|-g)
      require_value "$1" "$#"
      GENDER="$2"
      shift 2
      ;;
    --min-age|-a)
      require_value "$1" "$#"
      MIN_AGE="$2"
      shift 2
      ;;
    --max-age|-m)
      require_value "$1" "$#"
      MAX_AGE="$2"
      shift 2
      ;;
    --seed|-s)
      require_value "$1" "$#"
      SEED="$2"
      shift 2
      ;;
    --analyze|-A)
      ANALYZE_DATA=true
      shift
      ;;
    --report-format|-r)
      require_value "$1" "$#"
      REPORT_FORMAT="$2"
      shift 2
      ;;
    --skip-docker-check)
      VERIFY_DOCKER=false
      shift
      ;;
    --help|-h)
      show_help 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      show_help
      ;;
  esac
done

# Check if disease name is provided
if [[ -z "$DISEASE_NAME" ]]; then
  echo "ERROR: Disease name is required!" >&2
  show_help
fi

# Validate everything that ends up as a bare number in params.json, so that a
# typo fails here with a clear message instead of producing invalid JSON.
POPULATION=$(normalize_uint "--population" "$POPULATION") || exit 1
MIN_AGE=$(normalize_uint "--min-age" "$MIN_AGE") || exit 1
MAX_AGE=$(normalize_uint "--max-age" "$MAX_AGE") || exit 1
if (( MIN_AGE > MAX_AGE )); then
  die "--min-age ($MIN_AGE) must not be greater than --max-age ($MAX_AGE)"
fi
if [[ ! "$GENDER" =~ $FRACTION_RE ]]; then
  die "--gender must be a number between 0 and 1 (e.g. 0.5), got '$GENDER'"
fi
if [[ -n "$SEED" && ! "$SEED" =~ $INT_RE ]]; then
  die "--seed must be an integer, got '$SEED'"
fi

# Check if Docker container is running if we need to generate patients
if [[ "$VERIFY_DOCKER" == true && "$GENERATE_PATIENTS" == true ]]; then
  # A failing `docker ps` (daemon down, docker missing) is treated the same as
  # "no container running"; its own error output is left visible on stderr.
  docker_ps_output=$(docker ps || true)
  if ! grep -q synthea <<<"$docker_ps_output"; then
    echo "No Synthea Docker containers are running!"
    echo "Running environment setup script to start containers..."
    if [[ -f "./scripts/prepare_environment.sh" ]]; then
      ./scripts/prepare_environment.sh
    elif [[ -f "./prepare_environment.sh" ]]; then
      ./prepare_environment.sh
    else
      die "prepare_environment.sh not found!"
    fi
  else
    echo "Synthea Docker container is running. Proceeding with pipeline execution."
  fi
fi

# Create params.json file with our configuration
echo "Creating params.json file with pipeline configuration..."
disease_name_json=$(json_escape "$DISEASE_NAME") || exit 1
report_format_json=$(json_escape "$REPORT_FORMAT") || exit 1

# The seed is optional: when given it is appended as one more member,
# otherwise the key is omitted entirely.
seed_entry=""
if [[ -n "$SEED" ]]; then
  seed_entry=$',\n  "seed": '"$SEED"
fi

cat > params.json <<EOF
{
  "disease_name": "$disease_name_json",
  "modules_dir": "modules",
  "output_dir": "output",
  "generate_patients": $GENERATE_PATIENTS,
  "population": $POPULATION,
  "gender": $GENDER,
  "min_age": $MIN_AGE,
  "max_age": $MAX_AGE,
  "analyze_patient_data": $ANALYZE_DATA,
  "report_format": "$report_format_json",
  "force_generate": $FORCE_GENERATE,
  "publish_dir": "published_output"$seed_entry
}
EOF

# Display execution details
echo "=================================================="
echo "Running Synthea Pipeline for: $DISEASE_NAME"
echo "Generate patients: $GENERATE_PATIENTS"
if [[ "$GENERATE_PATIENTS" == true ]]; then
  echo "Population: $POPULATION"
  echo "Gender ratio (proportion female): $GENDER"
  echo "Age range: $MIN_AGE-$MAX_AGE"
  if [[ -n "$SEED" ]]; then
    echo "Random seed: $SEED"
  fi
fi
echo "Force module generation: $FORCE_GENERATE"
echo "Analyze patient data: $ANALYZE_DATA"
echo "=================================================="

# Execute the Nextflow command.
# Nextflow only reads a JSON parameter file when it is passed explicitly, so
# hand over the params.json written above.
echo "Starting Nextflow pipeline..."
nextflow run main.nf -params-file params.json

# Directory name used in the summary below. A value already present in the
# environment is respected.
# NOTE(review): lowercase + spaces-to-underscores is an assumption about how
# main.nf names the per-disease publish directory - confirm and keep in sync.
if [[ -z "${DISEASE_NAME_NORMALIZED:-}" ]]; then
  DISEASE_NAME_NORMALIZED=$(printf '%s' "$DISEASE_NAME" \
    | tr '[:upper:]' '[:lower:]' \
    | tr ' ' '_')
fi

echo ""
echo "Pipeline execution complete!"
if [[ "$GENERATE_PATIENTS" == true ]]; then
  echo "Check the 'output' directory and 'published_output/$DISEASE_NAME_NORMALIZED' for generated patient data."
  if [[ "$ANALYZE_DATA" == true ]]; then
    echo "Analysis reports can be found in 'published_output/$DISEASE_NAME_NORMALIZED/analysis'."
  fi
fi
echo "Generated modules can be found in the 'modules' directory and 'published_output/modules'."
echo ""