Trying to fix basic functionality again.

This commit is contained in:
2025-03-23 11:53:47 -07:00
parent ebda48190a
commit 2141e81f42
406 changed files with 173963 additions and 69 deletions

14
scripts/.nextflow.log Normal file
View File

@@ -0,0 +1,14 @@
Mar-20 17:36:01.178 [main] DEBUG nextflow.cli.Launcher - $> nextflow run main.nf --disease_name Migraine --generate_patients true --population 100 --gender 0.6 --modules_dir modules
Mar-20 17:36:01.428 [main] DEBUG nextflow.cli.CmdRun - N E X T F L O W ~ version 24.10.5
Mar-20 17:36:01.492 [main] DEBUG nextflow.plugin.PluginsFacade - Setting up plugin manager > mode=prod; embedded=false; plugins-dir=/Users/richman/.nextflow/plugins; core-plugins: nf-amazon@2.9.2,nf-azure@1.10.2,nf-cloudcache@0.4.2,nf-codecommit@0.2.2,nf-console@1.1.4,nf-google@1.15.4,nf-tower@1.9.3,nf-wave@1.7.4
Mar-20 17:36:01.528 [main] INFO o.pf4j.DefaultPluginStatusProvider - Enabled plugins: []
Mar-20 17:36:01.529 [main] INFO o.pf4j.DefaultPluginStatusProvider - Disabled plugins: []
Mar-20 17:36:01.538 [main] INFO org.pf4j.DefaultPluginManager - PF4J version 3.12.0 in 'deployment' mode
Mar-20 17:36:01.559 [main] INFO org.pf4j.AbstractPluginManager - No plugins
Mar-20 17:36:01.590 [main] DEBUG nextflow.scm.ProviderConfig - Using SCM config path: /Users/richman/.nextflow/scm
Mar-20 17:36:01.624 [main] DEBUG nextflow.cli.Launcher - Operation aborted
nextflow.exception.AbortOperationException: Cannot find script file: main.nf
at nextflow.cli.CmdRun.getScriptFile(CmdRun.groovy:536)
at nextflow.cli.CmdRun.run(CmdRun.groovy:325)
at nextflow.cli.Launcher.run(Launcher.groovy:503)
at nextflow.cli.Launcher.main(Launcher.groovy:658)

12
scripts/README.md Normal file
View File

@@ -0,0 +1,12 @@
# Utility Scripts
This directory contains utility scripts for working with Synthea modules:
- `analyze_patient_data.py` - Analyzes patient data generated by Synthea
- `check_json.py` - Validates JSON structure of module files
- `validate_module.py` - Performs comprehensive validation of modules
- `test_module_exists.py` - Checks if a module exists for a given disease
- `run_module.py` - Script to run a specific module with Synthea
- Other utility scripts for module generation and testing
These scripts complement the core module generator scripts found in the `module_generator/` directory.

View File

@@ -0,0 +1,304 @@
#!/usr/bin/env python3
import os
import sys
import json
import glob
import pandas as pd
from collections import Counter
from datetime import datetime
def _record_patient_demographics(patient, demographics, current_year):
    """Fold one Patient-shaped dict into the running demographic tallies.

    Args:
        patient: Dict that may carry ``gender``, ``birthDate`` and ``extension``
            keys (a FHIR Patient resource, or any dict exposing those fields).
        demographics: Accumulator with ``gender``/``race``/``ethnicity`` Counters
            and an ``age`` list; it is updated in place.
        current_year: Year used to turn a birth year into an age.
    """
    if 'gender' in patient:
        demographics['gender'][patient['gender']] += 1
    if 'birthDate' in patient:
        # Age is approximated from the birth year only; month and day are ignored.
        demographics['age'].append(current_year - int(patient['birthDate'][:4]))
    # Race and ethnicity are carried as nested extensions keyed by URL suffix.
    for ext in patient.get('extension', []):
        if 'url' not in ext or 'extension' not in ext:
            continue
        if ext['url'].endswith('us-core-race'):
            tally = demographics['race']
        elif ext['url'].endswith('us-core-ethnicity'):
            tally = demographics['ethnicity']
        else:
            continue
        for sub_ext in ext['extension']:
            if 'valueCoding' in sub_ext:
                tally[sub_ext['valueCoding'].get('display', 'Unknown')] += 1


def analyze_patient_data(input_dir, output_dir, report_format='html', disease_name=None):
    """
    Analyze Synthea-generated patient data and create reports.

    Every ``*.json`` file under ``input_dir`` (searched recursively) is read.
    Demographic fields are taken from the document's top level and from every
    ``Patient`` resource found under ``entry[].resource``, so both a bare
    Patient resource and a Bundle wrapping one are counted. Condition and
    MedicationRequest resources are tallied by the ``display`` of each coding.

    Args:
        input_dir: Directory containing patient JSON files
        output_dir: Directory to save analysis outputs (created if missing)
        report_format: Format for the report (html or csv); any other value
            produces only ``patient_stats.json``
        disease_name: Optional name of the disease being simulated

    Returns:
        None. ``patient_stats.json`` is always written to ``output_dir``; note
        that ``total_patients`` is the number of JSON files found, including
        files that failed to parse.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Find all patient JSON files
    patients_files = glob.glob(f"{input_dir}/**/*.json", recursive=True)
    print(f"Found {len(patients_files)} patient records for analysis")
    if not patients_files:
        print("No patient files found to analyze.")
        with open(os.path.join(output_dir, 'patient_stats.json'), 'w') as f:
            json.dump({"error": "No patient files found to analyze"}, f)
        return

    # Initialize data collectors
    condition_counts = Counter()
    medication_counts = Counter()
    demographics = {'gender': Counter(), 'age': [], 'race': Counter(), 'ethnicity': Counter()}
    current_year = datetime.now().year

    # resourceType -> (field holding the CodeableConcept, counter to update)
    coded_resources = {
        'Condition': ('code', condition_counts),
        'MedicationRequest': ('medicationCodeableConcept', medication_counts),
    }

    # Process each patient file
    for patient_file in patients_files:
        try:
            with open(patient_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if not isinstance(data, dict):
                # Nothing to extract from a top-level array or scalar.
                continue

            # A bare Patient resource exposes its demographics at the top level.
            _record_patient_demographics(data, demographics, current_year)

            for entry in data.get('entry', []):
                if 'resource' not in entry:
                    continue
                resource = entry['resource']
                resource_type = resource.get('resourceType')
                if resource_type == 'Patient':
                    # Bundle-wrapped Patient: previously skipped entirely.
                    _record_patient_demographics(resource, demographics, current_year)
                elif resource_type in coded_resources:
                    field, counter = coded_resources[resource_type]
                    for code in resource.get(field, {}).get('coding', []):
                        if 'display' in code:
                            counter[code['display']] += 1
        except Exception as e:
            # Best effort: one unreadable or malformed file must not stop the run.
            print(f"Error processing {patient_file}: {e}")

    ages = demographics['age']

    # Bucket ages into the four reporting bands.
    age_bands = {'0-18': 0, '19-44': 0, '45-64': 0, '65+': 0}
    for age in ages:
        if age <= 18:
            age_bands['0-18'] += 1
        elif age <= 44:
            age_bands['19-44'] += 1
        elif age <= 64:
            age_bands['45-64'] += 1
        else:
            age_bands['65+'] += 1

    # Prepare statistics
    stats = {
        'total_patients': len(patients_files),
        'disease_name': disease_name,
        'demographics': {
            'gender_distribution': dict(demographics['gender']),
            'age_distribution': {
                'min': min(ages) if ages else None,
                'max': max(ages) if ages else None,
                'average': sum(ages) / len(ages) if ages else None,
                'distribution': age_bands,
            },
            'race_distribution': dict(demographics['race']),
            'ethnicity_distribution': dict(demographics['ethnicity']),
        },
        'disease_stats': {
            'top_conditions': dict(condition_counts.most_common(15)),
            'top_medications': dict(medication_counts.most_common(15)),
        },
    }

    # Save statistics to file
    with open(os.path.join(output_dir, 'patient_stats.json'), 'w') as f:
        json.dump(stats, f, indent=2)

    # Generate report in requested format
    if report_format == 'html':
        generate_html_report(stats, output_dir)
    elif report_format == 'csv':
        generate_csv_reports(stats, output_dir)
    else:
        print(f"Unsupported report format: {report_format}. Only stats JSON file created.")
    print(f"Analysis complete. Reports generated in {os.path.abspath(output_dir)}")
def _html_count_rows(counts, total):
    """Render one ``<tr>`` per (label, count) pair, including its share of ``total``."""
    from html import escape

    rows = []
    for label, count in counts.items():
        percentage = (count / total) * 100 if total > 0 else 0
        # Labels come from the analysed files, so escape them before embedding.
        rows.append(
            f"<tr><td>{escape(str(label))}</td><td>{count}</td><td>{percentage:.1f}%</td></tr>\n"
        )
    return "".join(rows)


def generate_html_report(stats, output_dir):
    """Generate an HTML report from the patient statistics.

    Args:
        stats: Statistics dict with ``total_patients``, ``disease_name``,
            ``demographics`` and ``disease_stats`` keys, as written to
            ``patient_stats.json``.
        output_dir: Existing directory that receives ``patient_analysis.html``.

    Age figures that are ``None`` (no birth dates were found) are shown as
    ``N/A`` instead of raising a formatting error.
    """
    from html import escape

    total = stats['total_patients']
    demographics = stats['demographics']
    ages = demographics['age_distribution']
    disease_stats = stats['disease_stats']

    disease_title = f" - {escape(str(stats['disease_name']))}" if stats['disease_name'] else ""

    # min/max/average are None when no ages were collected.
    min_age = "N/A" if ages['min'] is None else ages['min']
    max_age = "N/A" if ages['max'] is None else ages['max']
    average_age = "N/A" if ages['average'] is None else f"{ages['average']:.1f}"

    page = f'''<!DOCTYPE html>
<html>
<head>
<title>Synthea Patient Analysis{disease_title}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
h1, h2, h3 {{ color: #333; }}
.container {{ max-width: 1000px; margin: 0 auto; }}
table {{ border-collapse: collapse; width: 100%; margin-bottom: 20px; }}
th, td {{ text-align: left; padding: 8px; border-bottom: 1px solid #ddd; }}
th {{ background-color: #f2f2f2; }}
tr:hover {{background-color: #f5f5f5;}}
.chart {{ margin: 20px 0; height: 300px; }}
</style>
</head>
<body>
<div class="container">
<h1>Synthea Patient Analysis{disease_title}</h1>
<p>Total patients: {total}</p>
<h2>Demographics</h2>
<h3>Gender Distribution</h3>
<table>
<tr><th>Gender</th><th>Count</th><th>Percentage</th></tr>
'''
    page += _html_count_rows(demographics['gender_distribution'], total)
    page += '''
</table>
<h3>Age Distribution</h3>
<table>
<tr><th>Age Group</th><th>Count</th><th>Percentage</th></tr>
'''
    page += _html_count_rows(ages['distribution'], total)
    page += f'''
</table>
<p>Min Age: {min_age}</p>
<p>Max Age: {max_age}</p>
<p>Average Age: {average_age}</p>
<h3>Race Distribution</h3>
<table>
<tr><th>Race</th><th>Count</th><th>Percentage</th></tr>
'''
    page += _html_count_rows(demographics['race_distribution'], total)
    page += '''
</table>
<h3>Ethnicity Distribution</h3>
<table>
<tr><th>Ethnicity</th><th>Count</th><th>Percentage</th></tr>
'''
    page += _html_count_rows(demographics['ethnicity_distribution'], total)
    page += '''
</table>
<h2>Disease Statistics</h2>
<h3>Top Conditions</h3>
<table>
<tr><th>Condition</th><th>Count</th><th>Percentage</th></tr>
'''
    page += _html_count_rows(disease_stats['top_conditions'], total)
    page += '''
</table>
<h3>Top Medications</h3>
<table>
<tr><th>Medication</th><th>Count</th><th>Percentage of Patients</th></tr>
'''
    page += _html_count_rows(disease_stats['top_medications'], total)
    page += '''
</table>
</div>
</body>
</html>'''
    with open(os.path.join(output_dir, 'patient_analysis.html'), 'w') as f:
        f.write(page)
def generate_csv_reports(stats, output_dir):
    """Write the patient statistics as three CSV files in ``output_dir``.

    ``demographics.csv`` holds gender, age-band, race and ethnicity counts;
    ``conditions.csv`` and ``medications.csv`` hold the top condition and
    medication counts. Every row carries the count's share of
    ``stats['total_patients']`` formatted to one decimal place.
    """
    import csv

    total = stats['total_patients']

    def share(count):
        # Percentage of all patients, guarding against a zero total.
        pct = (count / total) * 100 if total > 0 else 0
        return f"{pct:.1f}%"

    demo = stats['demographics']
    demographic_sections = (
        ('Gender', demo['gender_distribution']),
        ('Age', demo['age_distribution']['distribution']),
        ('Race', demo['race_distribution']),
        ('Ethnicity', demo['ethnicity_distribution']),
    )

    # Demographics CSV: one file, one row per (category, value) pair.
    with open(os.path.join(output_dir, 'demographics.csv'), 'w', newline='') as out:
        sheet = csv.writer(out)
        sheet.writerow(['Category', 'Type', 'Count', 'Percentage'])
        for category, counts in demographic_sections:
            for kind, count in counts.items():
                sheet.writerow([category, kind, count, share(count)])

    # Conditions and medications share the same three-column layout.
    ranked_outputs = (
        ('conditions.csv', 'Condition', 'top_conditions'),
        ('medications.csv', 'Medication', 'top_medications'),
    )
    for filename, heading, key in ranked_outputs:
        with open(os.path.join(output_dir, filename), 'w', newline='') as out:
            sheet = csv.writer(out)
            sheet.writerow([heading, 'Count', 'Percentage'])
            for label, count in stats['disease_stats'][key].items():
                sheet.writerow([label, count, share(count)])
if __name__ == "__main__":
    import argparse

    # Command-line entry point: collect the four options and hand them to
    # analyze_patient_data unchanged.
    cli = argparse.ArgumentParser(
        description="Analyze Synthea patient data and generate reports"
    )
    cli.add_argument(
        "--input_dir",
        required=True,
        help="Directory containing patient JSON files",
    )
    cli.add_argument(
        "--output_dir",
        default="analysis_output",
        help="Directory to save analysis outputs",
    )
    cli.add_argument(
        "--report_format",
        default="html",
        choices=["html", "csv"],
        help="Format for the reports",
    )
    cli.add_argument("--disease_name", help="Name of the disease being simulated")
    options = cli.parse_args()

    analyze_patient_data(
        options.input_dir,
        options.output_dir,
        options.report_format,
        options.disease_name,
    )

View File

@@ -0,0 +1,223 @@
#!/usr/bin/env python3
"""
Synthea Module Condition Structure Validator
This script scans all existing Synthea disease modules to check if any violate
the expected condition structure format, particularly looking for nested condition_type
objects that can cause errors during simulation.
Usage:
python check_condition_structure.py [--modules_dir DIRECTORY] [--verbose] [--fix]
Arguments:
--modules_dir DIRECTORY Path to the modules directory (default: src/main/resources/modules)
--verbose Enable verbose output with detailed issue descriptions
--fix Attempt to automatically fix simple structure issues (experimental)
Example:
python check_condition_structure.py --modules_dir ../modules --verbose
"""
import os
import sys
import json
import glob
import argparse
import logging
from typing import Dict, List, Any, Tuple
# Configure logging: INFO level and above, timestamped, written to stdout
# (a StreamHandler on sys.stdout rather than the default stderr stream).
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler(sys.stdout)
]
)
# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)
def validate_condition_format(module_json):
    """Check a module for ``condition`` objects whose ``condition_type`` is a dict.

    Args:
        module_json: The module as a JSON string or as an already-parsed object.

    Returns:
        ``(is_valid, issues)``. ``issues`` has one message per offending
        ``condition`` object, in depth-first document order, each naming the
        dotted path to the nested ``condition_type``. Any exception raised
        while parsing or walking is reported as
        ``(False, ["Validation error: ..."])`` instead of propagating.
    """

    def nested_condition_type_paths(node, trail=()):
        # Yield the dotted path of every dict-valued condition_type under ``node``.
        if isinstance(node, dict):
            condition = node.get("condition")
            if isinstance(condition, dict) and isinstance(condition.get("condition_type"), dict):
                yield '.'.join(trail + ("condition", "condition_type"))
            # Descend into every value, including the condition just inspected.
            for key, child in node.items():
                yield from nested_condition_type_paths(child, trail + (key,))
        elif isinstance(node, list):
            for index, child in enumerate(node):
                yield from nested_condition_type_paths(child, trail + (f"[{index}]",))

    try:
        module_dict = module_json
        if isinstance(module_json, str):
            module_dict = json.loads(module_json)
        issues = [
            f"Found nested condition_type in a condition object at path: {location}"
            for location in nested_condition_type_paths(module_dict)
        ]
        return not issues, issues
    except Exception as e:
        return False, [f"Validation error: {str(e)}"]
def fix_condition_structure(module_json):
    """Attempt to fix common condition structure issues.

    Wherever a ``condition`` object holds a dict-valued ``condition_type``, the
    dict is replaced by its first key as a plain string. An *empty* nested dict
    has no key to promote, so it is left untouched rather than aborting the
    whole pass; the validator will still report it afterwards.

    Args:
        module_json: The module as a JSON string or as an already-parsed
            object. A parsed object is modified in place.

    Returns:
        ``(module, fixed)``: the parsed (and possibly modified) module and
        whether at least one replacement was made. If an unexpected error
        occurs, the original ``module_json`` argument and ``False`` are returned.
    """
    try:
        # Parse the module if it's a string
        module_dict = json.loads(module_json) if isinstance(module_json, str) else module_json
        fixed = False

        def fix_conditions(obj):
            nonlocal fixed
            if isinstance(obj, dict):
                condition = obj.get("condition")
                if isinstance(condition, dict):
                    nested = condition.get("condition_type")
                    # Only a non-empty dict has a first key to promote.
                    if isinstance(nested, dict) and nested:
                        condition["condition_type"] = next(iter(nested))
                        fixed = True
                # Recursively fix all dictionary values
                for value in obj.values():
                    if isinstance(value, (dict, list)):
                        fix_conditions(value)
            elif isinstance(obj, list):
                # Recursively fix all list items
                for item in obj:
                    if isinstance(item, (dict, list)):
                        fix_conditions(item)

        # Fix the entire module
        fix_conditions(module_dict)
        # Return the fixed module and whether changes were made
        return module_dict, fixed
    except Exception as e:
        logger.error(f"Error fixing module structure: {e}")
        return module_json, False
def main():
    """Validate, and optionally repair, every ``*.json`` module in a directory.

    Parses ``--modules_dir``, ``--verbose`` and ``--fix``, checks each module
    with ``validate_condition_format`` and, when ``--fix`` is given, rewrites
    modules that ``fix_condition_structure`` managed to change. Exits with
    status 1 if the modules directory does not exist; a summary is logged at
    the end.
    """
    parser = argparse.ArgumentParser(description='Validate condition structure in Synthea modules')
    parser.add_argument('--modules_dir', type=str, default='src/main/resources/modules',
                        help='Path to the modules directory')
    parser.add_argument('--verbose', action='store_true',
                        help='Enable verbose output with detailed issue descriptions')
    parser.add_argument('--fix', action='store_true',
                        help='Attempt to automatically fix simple structure issues (experimental)')
    args = parser.parse_args()

    # Bail out early when there is nothing to scan.
    if not os.path.exists(args.modules_dir):
        logger.error(f"Modules directory not found: {args.modules_dir}")
        sys.exit(1)

    module_files = glob.glob(os.path.join(args.modules_dir, "*.json"))
    logger.info(f"Found {len(module_files)} module files to check")

    # Running totals for the summary.
    valid_modules = invalid_modules = fixed_modules = 0

    def log_issues(found):
        # Individual issue lines are only shown in verbose mode.
        if args.verbose:
            for issue in found:
                logger.warning(f" - {issue}")

    for module_path in module_files:
        module_name = os.path.basename(module_path)
        try:
            with open(module_path, 'r') as handle:
                module_content = handle.read()
            try:
                module_json = json.loads(module_content)
                valid, issues = validate_condition_format(module_json)

                if valid:
                    valid_modules += 1
                    if args.verbose:
                        logger.info(f"{module_name}: Valid condition structure")
                    continue

                invalid_modules += 1

                if not args.fix:
                    logger.warning(f"⚠️ {module_name}: Invalid condition structure")
                    log_issues(issues)
                    continue

                # Attempt to fix the module
                fixed_module, was_fixed = fix_condition_structure(module_json)
                if not was_fixed:
                    logger.warning(f"⚠️ {module_name}: Could not fix condition structure issues")
                    log_issues(issues)
                    continue

                # Write the fixed module back, then validate again to confirm.
                with open(module_path, 'w') as handle:
                    json.dump(fixed_module, handle, indent=2)
                valid_after_fix, remaining_issues = validate_condition_format(fixed_module)
                if valid_after_fix:
                    fixed_modules += 1
                    logger.info(f"🔧 {module_name}: Fixed condition structure issues")
                else:
                    logger.warning(f"⚠️ {module_name}: Could not fix all condition structure issues")
                    log_issues(remaining_issues)
            except json.JSONDecodeError as e:
                invalid_modules += 1
                logger.error(f"{module_name}: Invalid JSON format - {str(e)}")
        except Exception as e:
            invalid_modules += 1
            logger.error(f"{module_name}: Error processing file - {str(e)}")

    # Print summary
    logger.info("\nSummary:")
    logger.info(f"Total modules checked: {len(module_files)}")
    logger.info(f"Valid modules: {valid_modules}")
    logger.info(f"Invalid modules: {invalid_modules}")
    if args.fix:
        logger.info(f"Modules fixed: {fixed_modules}")

    if invalid_modules > 0:
        logger.warning("Some modules have condition structure issues that may cause problems in Synthea")
        if not args.fix:
            logger.info("Run with --fix to attempt automatic fixes for the issues")
    else:
        logger.info("All modules have valid condition structure")


if __name__ == "__main__":
    main()

14
scripts/check_json.py Normal file
View File

@@ -0,0 +1,14 @@
import json
import sys
def main(argv=None):
    """Report brace counts and JSON validity for one file.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. Only the first argument (the file path) is used.

    Returns:
        Process exit status: 2 when no path was given, otherwise 0. The
        validity verdict is printed, not encoded in the status, so existing
        callers see the same exit code as before for readable files.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        # Previously this surfaced as a bare IndexError traceback.
        print("Usage: check_json.py PATH", file=sys.stderr)
        return 2

    path = args[0]
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        content = f.read()

    # A quick hint for spotting unbalanced braces in hand-edited modules.
    print(f"File has {content.count('{')} opening braces and {content.count('}')} closing braces")
    try:
        json.loads(content)
    except (ValueError, RecursionError) as e:  # JSONDecodeError subclasses ValueError
        print(f'Invalid JSON: {e}')
    else:
        print('Valid JSON')
    return 0


if __name__ == "__main__":
    sys.exit(main())

262
scripts/cleanup.sh Executable file
View File

@@ -0,0 +1,262 @@
#!/bin/bash
# Synthea-All-Diseases Repository Cleanup Script
# This script removes unnecessary files and keeps only the essential ones for the Synthea module generator
#
# Run it from the repository root. Files that must survive are first copied into
# ./backup, the working tree is pruned, and the kept files are then restored into
# the simplified layout (module_generator/, modules/, scripts/).

# Print an error and stop before anything (further) destructive happens.
abort() {
    echo "Error: $*" >&2
    exit 1
}

# Every deletion below uses a relative path, so refuse to run anywhere but the
# repository root (identified by the main workflow file).
[ -f "main.nf" ] || abort "main.nf not found - run this script from the repository root."

echo "Starting repository cleanup..."
# Create backup directory
mkdir -p backup || abort "could not create backup directory"
mkdir -p backup/scripts || abort "could not create backup/scripts" # Create a directory for Python scripts
mkdir -p backup/modules || abort "could not create backup/modules" # Create a directory for modules
mkdir -p backup/module_generator || abort "could not create backup/module_generator" # Create a directory for module generator scripts

# 1. Keep essential Nextflow files, remove others
echo "Handling Nextflow files..."
cp main.nf backup/ || abort "could not back up main.nf"
rm -f build_synthea.nf synthea_module_generator_old.nf minimal_test.nf fresh_synthea.nf simple_synthea.nf test_workflow.nf new_synthea.nf minimal_working.nf
# Keep synthea_module_generator.nf around for reference but not used
mv synthea_module_generator.nf backup/

# 2. Keep essential Python scripts, Docker files, and configuration files
echo "Keeping essential files..."
if [ -d "src/main/python" ]; then
    # Copy the directory's contents ("/.") rather than the directory itself, so the
    # scripts are restored directly into module_generator/ - the location the
    # Dockerfile and main.nf rewrites in steps 9 and 10 point at.
    # src/ is deleted in step 4, so a failed backup must stop the script here.
    cp -r src/main/python/. backup/module_generator/ || abort "could not back up src/main/python"
fi
if [ -d "src/main/resources/modules" ]; then
    # "/." also succeeds on an empty directory, unlike an unmatched "/*" glob.
    cp -r src/main/resources/modules/. backup/modules/ || abort "could not back up src/main/resources/modules"
fi
cp Dockerfile docker-compose.yml CLAUDE.md .env.example nextflow.config README.md backup/
cp DOCKER_README.md SYNTHEA_GUIDE.md backup/ 2>/dev/null || : # Keep additional README files
# Backup .env file if it exists
if [ -f ".env" ]; then
    cp .env backup/
fi

# 3. Back up all Python scripts in the root directory
echo "Backing up Python scripts..."
# Key scripts - these will be kept in the scripts directory after cleanup
# The only key script we'll keep in the root is check_condition_structure.py for convenience
key_scripts=("check_condition_structure.py")
for script in "${key_scripts[@]}"; do
    if [ -f "$script" ]; then
        cp "$script" backup/
        echo " Backed up key script for root: $script"
    fi
done
# All other Python scripts - these will be moved to scripts/ directory
for script in *.py; do
    # Space-padded substring match against the joined list of key scripts.
    if [ -f "$script" ] && [[ ! " ${key_scripts[*]} " =~ " ${script} " ]]; then
        cp "$script" backup/scripts/
        echo " Backed up utility script: $script"
    fi
done

# 4. Remove ALL Synthea code - we'll clone it in Docker
echo "Removing ALL Synthea source code..."
echo "This will be cloned during Docker build based on Dockerfile..."
# Remove the entire src directory - we'll recreate what we need
rm -rf src/
# Remove any Synthea build files/directories
rm -rf build/ output/ simulator/ lib/ logs/
# Remove any Synthea run files
rm -f run_synthea run_synthea.bat synthea *.jar

# 5. Remove other experimental or temporary files
echo "Removing experimental and temporary files..."
rm -f *.log *.txt trace.txt
rm -f error_output.txt generate_module.sh run_synthetic_data_generation.sh
rm -f build_docker.sh entrypoint.sh run_module_generator.sh generate_samples.sh test_run.sh
rm -f run_flexporter .DS_Store

# 6. Remove gradle files and other unnecessary files
echo "Removing additional unnecessary files..."
rm -f build.gradle gradlew gradlew.bat settings.gradle
rm -rf gradle/ .gradle/
# Note: Keeping .git, .gitignore, .github for version control
rm -rf .nextflow/ .nextflow.log* nextflow-*.zip
# Keep LICENSE files but remove other documentation that will be included from Synthea
rm -f CODE_OF_CONDUCT.md NOTICE
rm -f *.bak *~

# 7. Create clean directory structure for the repository
echo "Creating clean directory structure..."
mkdir -p modules # Top-level modules directory
mkdir -p module_generator # Module generator directory (renamed from python)
mkdir -p scripts # Utility scripts directory

# 8. Restore files to the cleaned structure
echo "Restoring files to cleaned structure..."
# Restore modules
if [ -d "backup/modules" ]; then
    cp -r backup/modules/* modules/ 2>/dev/null || :
    echo " Restored modules to modules/ directory"
fi
# Restore Module generator core scripts
if [ -d "backup/module_generator" ]; then
    cp -r backup/module_generator/* module_generator/ 2>/dev/null || :
    echo " Restored core generator scripts to module_generator/ directory"
fi
# Restore key Python scripts to root directory
for script in "${key_scripts[@]}"; do
    if [ -f "backup/$script" ]; then
        cp "backup/$script" ./
        echo " Restored key script to root: $script"
    fi
done
# Restore utility Python scripts to scripts/ directory
if [ -d "backup/scripts" ]; then
    cp backup/scripts/* scripts/ 2>/dev/null || :
    echo " Restored utility scripts to scripts/ directory"
fi
# Restore root files
cp backup/main.nf backup/Dockerfile backup/docker-compose.yml backup/CLAUDE.md backup/nextflow.config backup/.env.example backup/README.md ./ 2>/dev/null || :
if [ -f "backup/DOCKER_README.md" ]; then
    cp backup/DOCKER_README.md ./
fi
if [ -f "backup/SYNTHEA_GUIDE.md" ]; then
    cp backup/SYNTHEA_GUIDE.md ./
fi
# Restore .env if it existed
if [ -f "backup/.env" ]; then
    cp backup/.env ./
    echo " Restored .env file"
fi

# 9. Update the Dockerfile to reference the new directory structure
echo "Updating Dockerfile references if needed..."
if [ -f "Dockerfile" ]; then
    # Update the Dockerfile to use the new directory structure
    sed -i.bak 's|COPY src/main/python/|COPY module_generator/|g' Dockerfile
    sed -i.bak 's|COPY python/|COPY module_generator/|g' Dockerfile
    sed -i.bak 's|COPY src/main/resources/modules/|COPY modules/|g' Dockerfile
    sed -i.bak 's|src/main/python/|module_generator/|g' Dockerfile
    sed -i.bak 's|src/main/resources/modules|modules|g' Dockerfile
    # Update script paths if needed
    sed -i.bak 's|/app/src/main/python/|/app/module_generator/|g' Dockerfile
    rm -f Dockerfile.bak
fi

# 10. Update the main.nf file to reference the new directory structure
echo "Updating main.nf references if needed..."
if [ -f "main.nf" ]; then
    # Update the main.nf file to use the new directory structure
    sed -i.bak 's|src/main/resources/modules|modules|g' main.nf
    sed -i.bak 's|src/main/python|module_generator|g' main.nf
    sed -i.bak 's|python/|module_generator/|g' main.nf
    rm -f main.nf.bak
fi

# 11. Create a simple README for the scripts directory
echo "Creating README for scripts directory..."
cat > scripts/README.md << 'EOF'
# Utility Scripts
This directory contains utility scripts for working with Synthea modules:
- `analyze_patient_data.py` - Analyzes patient data generated by Synthea
- `check_json.py` - Validates JSON structure of module files
- `validate_module.py` - Performs comprehensive validation of modules
- `test_module_exists.py` - Checks if a module exists for a given disease
- `run_module.py` - Script to run a specific module with Synthea
- Other utility scripts for module generation and testing
These scripts complement the core module generator scripts found in the `module_generator/` directory.
EOF

# Update README.md with instructions on using the cleaned repo
echo "Updating README..."
cat > README.md << 'EOF'
# Synthea All Diseases
This repository contains a workflow for generating Synthea disease modules using Claude AI and generating synthetic patient data.
## Repository Structure
- `module_generator/` - Core module generation scripts
- `modules/` - Generated disease modules
- `scripts/` - Utility Python scripts and tools
- `main.nf` - Nextflow workflow for module generation and patient data generation
- `Dockerfile` and `docker-compose.yml` - Docker configuration
## Key Files
- `check_condition_structure.py` - Validates condition structure in modules
- `main.nf` - Main workflow file for generating modules and patient data
## Running the Pipeline
### Prerequisites
- Docker and Docker Compose
- Nextflow
- Anthropic API key (for Claude AI)
### Setup
1. Clone this repository
2. Copy `.env.example` to `.env` and add your Anthropic API key
3. Run with Docker Compose:
```
docker-compose up
```
### Running the Workflow
```
nextflow run main.nf --disease_name "Disease Name" [OPTIONS]
```
For a full list of options, run:
```
nextflow run main.nf --help
```
## Key Features
- Generate modules for diseases that don't exist in Synthea
- Validate condition structure in generated modules
- Generate synthetic patient data using the modules
- Analyze generated patient data
## Documentation
- See `CLAUDE.md` for additional development guidelines
- See `DOCKER_README.md` for Docker setup and usage
- See `SYNTHEA_GUIDE.md` for detailed usage examples and scenarios
- See `scripts/README.md` for information about utility scripts
## License
This project uses the same license as Synthea.
EOF

# 12. Update docker-compose.yml to reference the new directory structure
echo "Updating docker-compose.yml references if needed..."
if [ -f "docker-compose.yml" ]; then
    # No changes needed as it uses relative paths and volume mounts
    echo " No changes needed for docker-compose.yml"
fi

echo "Cleanup complete! All essential files have been kept, and unnecessary files have been removed."
echo "Repository structure has been simplified to:"
echo " - module_generator/ (core module generation scripts)"
echo " - modules/ (disease modules)"
echo " - scripts/ (utility scripts)"
echo " - check_condition_structure.py (in root for easy access)"
echo ""
echo "The following important files were preserved:"
echo " - .git/ (version control repository)"
echo " - .gitignore (version control configuration)"
echo " - .env (environment configuration if it existed)"
echo " - .github/ (GitHub configuration if it existed)"
echo ""
echo "Optional: Remove the backup directory after verifying everything works: rm -rf backup"

175
scripts/generate_batch.py Executable file
View File

@@ -0,0 +1,175 @@
#!/usr/bin/env python3
import os
import sys
import csv
import time
import argparse
import subprocess
import concurrent.futures
import re
# Global variables
ARGS = None
def normalize_disease_name(name):
    """Turn a disease name into a lowercase, underscore-separated file stem.

    Every run of characters other than ``a-z`` and ``0-9`` (after lowercasing)
    collapses to a single underscore, and leading/trailing underscores are
    dropped. A name with nothing left becomes ``"unknown_disease"``.
    """
    slug = re.sub(r'[^a-z0-9]+', '_', name.lower()).strip('_')
    return slug or "unknown_disease"
def process_disease(disease_entry):
    """Generate the module for one CSV row by invoking ``generate_module.py``.

    Args:
        disease_entry: Mapping with optional ``disease_name``, ``id`` (passed
            on as the ICD-10 code) and ``disease_category`` keys.

    Returns:
        A dict with ``name``, ``status`` (``"skipped"``, ``"success"`` or
        ``"error"``) and ``path``; error results also carry ``error``.

    Reads the module-level ``ARGS`` for the ``force`` and ``auto_fallback``
    options.
    """
    disease_name = disease_entry.get("disease_name", "")
    icd10 = disease_entry.get("id", "")
    category = disease_entry.get("disease_category", "")
    normalized_name = normalize_disease_name(disease_name)

    print("\n" + "=" * 80)
    print(f"Processing disease: {disease_name}")
    print(f"ICD-10 code: {icd10}")
    print(f"Category: {category}")

    # An existing module is left alone unless --force was given.
    module_path = f"src/main/resources/modules/{normalized_name}.json"
    if not ARGS.force and os.path.exists(module_path):
        print(f"✅ Module already exists at {module_path}, skipping")
        return {"name": disease_name, "status": "skipped", "path": module_path}

    # Assemble the generator command; optional values are added only when non-empty.
    cmd = ["python3", "generate_module.py", "--disease", disease_name, "--no-interactive"]
    for flag, value in (("--icd10", icd10), ("--category", category)):
        if value:
            cmd += [flag, value]
    if ARGS.auto_fallback:
        cmd.append("--auto-fallback")

    print(f"Executing: {' '.join(cmd)}")
    try:
        completed = subprocess.run(cmd, check=True, text=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to generate module for {disease_name}")
        print(f"Error: {str(e)}")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        return {"name": disease_name, "status": "error", "error": str(e), "path": None}

    print(f"✅ Successfully generated module for {disease_name}")
    print(completed.stdout.strip())
    return {"name": disease_name, "status": "success", "path": module_path}
def main():
    """Generate Synthea modules for the diseases listed in a CSV file.

    Reads the disease list, applies the ``--category``, ``--disease`` and
    ``--limit`` filters, runs process_disease() for every remaining row
    (optionally on several workers) and prints a summary.

    Exits with status 1 when the CSV is missing or unreadable, when
    ``--disease`` matches nothing, or when any disease failed to generate.
    """
    global ARGS
    parser = argparse.ArgumentParser(description='Generate Synthea modules for diseases')
    parser.add_argument('--category', help='Only process diseases in this category')
    parser.add_argument('--disease', help='Only process a specific disease (by name)')
    parser.add_argument('--limit', type=int, help='Limit number of diseases to process')
    parser.add_argument('--parallel', type=int, default=1, help='Number of parallel processes')
    # NOTE: --skip-existing is accepted but never read in this function;
    # process_disease() skips existing modules whenever --force is absent.
    parser.add_argument('--skip-existing', action='store_true', help='Skip diseases that already have modules')
    parser.add_argument('--csv-path', default='src/main/resources/disease_list.csv', help='Path to disease list CSV')
    parser.add_argument('--force', action='store_true', help='Force generation even if module already exists')
    parser.add_argument('--auto-fallback', action='store_true', help='Enable auto-fallback option')
    args = parser.parse_args()
    ARGS = args

    # Read the disease list CSV
    if not os.path.exists(args.csv_path):
        print(f"Error: Disease list CSV not found at {args.csv_path}")
        sys.exit(1)
    print(f"Reading disease list from {args.csv_path}")
    try:
        # newline='' lets the csv module handle line endings itself, so
        # quoted fields with embedded newlines parse correctly.
        with open(args.csv_path, 'r', newline='', encoding='utf-8') as f:
            diseases = list(csv.DictReader(f))
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"Error reading CSV: {str(e)}")
        sys.exit(1)
    print(f"Found {len(diseases)} diseases in the CSV")

    # Filter diseases
    if args.category:
        wanted_category = args.category.lower()
        diseases = [d for d in diseases if (d.get('disease_category') or '').lower() == wanted_category]
        print(f"Filtered to {len(diseases)} diseases in category '{args.category}'")
    if args.disease:
        # Try to find an exact match first
        disease_name_lower = args.disease.lower()
        exact_match = [d for d in diseases if (d.get("disease_name") or "").lower() == disease_name_lower]
        if exact_match:
            diseases = exact_match
        else:
            # Try to find a disease that contains the specified name
            partial_matches = [d for d in diseases if disease_name_lower in (d.get("disease_name") or "").lower()]
            if partial_matches:
                diseases = partial_matches
                print(f"Found {len(diseases)} partial matches for '{args.disease}'")
            else:
                print(f"No matches found for disease '{args.disease}'")
                sys.exit(1)
    if args.limit and args.limit > 0:
        diseases = diseases[:args.limit]
        print(f"Limited to {args.limit} diseases")

    # Process the diseases
    total = len(diseases)
    print(f"\nProcessing {total} diseases with {args.parallel} parallel workers")
    start_time = time.time()
    if args.parallel > 1:
        # Threads rather than processes: each task only waits on a child
        # interpreter, and worker processes started with the "spawn" method
        # re-import this module without running main(), which would leave
        # ARGS as None inside process_disease().
        with concurrent.futures.ThreadPoolExecutor(max_workers=args.parallel) as executor:
            results = list(executor.map(process_disease, diseases))
    else:
        results = [process_disease(disease) for disease in diseases]
    elapsed = time.time() - start_time

    # Summarize results
    success_count = sum(1 for r in results if r["status"] == "success")
    skipped_count = sum(1 for r in results if r["status"] == "skipped")
    error_count = sum(1 for r in results if r["status"] == "error")
    print("\n" + "="*80)
    print(f"SUMMARY: Processed {total} diseases in {elapsed:.2f} seconds")
    print(f"- Successfully generated: {success_count}")
    print(f"- Skipped (already exist): {skipped_count}")
    print(f"- Failed: {error_count}")

    # List errors if any
    if error_count > 0:
        print("\nFAILED DISEASES:")
        for result in results:
            if result["status"] == "error":
                print(f"- {result['name']}: {result.get('error', 'Unknown error')}")
        # Exit with error code if any failures
        sys.exit(1)
    print("\nAll done! 🎉")


if __name__ == "__main__":
    main()

468
scripts/generate_module.py Executable file
View File

@@ -0,0 +1,468 @@
#!/usr/bin/env python3
import os
import sys
import json
import re
import anthropic
import csv
import argparse
# Define global variables
# Run configuration shared between main() and generate_module(); main()
# overwrites these from the command line before generation starts.
# Disease name exactly as passed with --disease.
DISEASE_NAME = ""
# Path of the module JSON file to write.
OUTPUT_FILE = ""
# ICD-10 code from --icd10, or looked up in the disease list CSV.
icd10_code = ""
# Disease category from --category, or looked up in the disease list CSV.
disease_category = ""
# Cleared by --no-interactive; no code visible in this section reads it.
use_interactive = True
# Set by --auto-fallback; no code visible in this section reads it.
use_auto_fallback = False
def normalize_disease_name(name):
    """Return the module file stem for a disease name.

    The name is lower-cased, every run of characters outside ``a-z0-9`` is
    collapsed into one underscore, and underscores at either end are removed.
    A name that reduces to nothing becomes ``"unknown_disease"``.

    This is deliberately the same rule the batch driver in this commit uses
    to decide whether a module already exists. Keeping accented letters here
    (as ``str.isalnum`` does), or returning an empty stem, would make this
    script write a file the driver never finds.
    """
    normalized_name = re.sub(r'[^a-z0-9]+', '_', name.lower()).strip('_')
    return normalized_name or "unknown_disease"
# Section titles the prompt asks Claude for, in the order it numbers them.
_SECTION_TITLES = (
    "DESCRIPTION",
    "RISK_FACTORS",
    "SYMPTOMS",
    "DIAGNOSTIC_TESTS",
    "TREATMENTS",
    "COMPLICATIONS",
    "PROGRESSION",
    "FOLLOW_UP",
)

# One list item per line: "-", "*", "•", "1." or "1)" at the start of the
# line, followed by whitespace and the item text. Anchoring at the line start
# keeps hyphens inside words ("one-sided") from being read as bullets.
_BULLET_RE = re.compile(r'^[ \t]*(?:[-*•]|\d+[.)])[ \t]+(.+?)[ \t]*$', re.MULTILINE)

# Whole words that mark a bullet as a description of an effect rather than a
# drug name. Whole-word matching keeps names such as "Metformin" (which
# merely contains "for") from being discarded.
_NON_MEDICATION_RE = re.compile(
    r'\b(?:reduc(?:e|es|ing)|block(?:s|ing)?|prevent(?:s|ing)?|for)\b',
    re.IGNORECASE,
)


def _clinical_details_prompt(disease_name, icd10, category):
    """Build the prompt asking Claude for the eight clinical sections."""
    disease_info = f"Disease name: {disease_name}"
    if icd10:
        disease_info += f"\nICD-10 code: {icd10}"
    if category:
        disease_info += f"\nCategory: {category}"
    # Construct a prompt that asks for structured data, not JSON
    return f"""I need information to create a Synthea disease module for {disease_name}.
{disease_info}
Please provide the following information in a structured format (NOT JSON):
1. DESCRIPTION: A brief description of the disease (3-5 sentences)
2. RISK_FACTORS: Age ranges, gender factors, and/or other risk factors for this disease
3. SYMPTOMS: List the main symptoms of the disease
4. DIAGNOSTIC_TESTS: Tests typically used to diagnose this condition
5. TREATMENTS: List treatments for this disease, including:
- Medications (with their class and purpose)
- Procedures
- Other interventions
6. COMPLICATIONS: Possible complications of this disease
7. PROGRESSION: Typical progression of the disease, including:
- How it starts
- How it typically develops
- Possible outcomes
8. FOLLOW_UP: Typical follow-up care needed
For medications, just provide generic names - I'll handle the RxNorm codes.
For conditions and procedures, just provide plain English names - I'll handle the coding.
Please provide this information in a straightforward, structured text format. Do NOT use JSON.
"""


def _split_sections(text):
    """Split Claude's reply into ``{TITLE: body}`` for every heading found.

    A heading is the section number and title at the start of a line, with
    optional Markdown decoration ("## 1. DESCRIPTION", "**1. DESCRIPTION:**",
    "1. DESCRIPTION:"). The prompt does not prescribe Markdown, so none of
    the decoration is required. A section's body runs to the next heading
    found, or to the end of the text.
    """
    headings = []
    for number, title in enumerate(_SECTION_TITLES, start=1):
        # Accept "RISK FACTORS" as well as "RISK_FACTORS".
        words = '[_ ]'.join(title.split('_'))
        heading = re.search(
            rf'^[#*\s]*{number}\.\s*\**\s*{words}\b[*:]*',
            text,
            re.MULTILINE | re.IGNORECASE,
        )
        if heading:
            headings.append((heading.start(), heading.end(), title))
    headings.sort()

    sections = {}
    for index, (_, body_start, title) in enumerate(headings):
        body_end = headings[index + 1][0] if index + 1 < len(headings) else len(text)
        sections[title] = text[body_start:body_end].strip()
    return sections


def _bullet_items(text):
    """Return the non-empty list items of *text* with bold markers removed."""
    items = (item.replace('**', '').strip() for item in _BULLET_RE.findall(text))
    return [item for item in items if item]


def _medication_names(treatments):
    """Extract medication names from the TREATMENTS section text.

    Only the "Medications" part of the section is scanned, up to a
    "Procedures" or "Other interventions" heading or the end of the text.
    Each bullet contributes the text before its first "(", ": " or spaced
    dash. Comma-separated names are split, and a name already contained in
    an earlier one (case-insensitively) is not repeated.
    """
    section = re.search(
        r'Medications?\**:?\**[ \t]*(.*?)'
        r'(?=^[ \t#*-]*(?:Procedures?|Other[ \t]+interventions?)\b|\Z)',
        treatments,
        re.DOTALL | re.MULTILINE,
    )
    if not section:
        return []

    names = []
    for item in _bullet_items(section.group(1)):
        if item.endswith(':'):
            # A sub-heading bullet such as "Acute treatment:", not a drug.
            continue
        # Extract just the medication name, not the explanation. Only a dash
        # surrounded by spaces separates an explanation; a hyphen inside the
        # name itself ("Co-trimoxazole") is kept.
        name = re.split(r'\s*\(|:\s|\s+[-–—]\s+', item, maxsplit=1)[0].strip()
        # Skip descriptive phrases that aren't medications
        if not name or _NON_MEDICATION_RE.search(name):
            continue
        for candidate in name.split(','):
            candidate = candidate.strip()
            if candidate and not any(candidate.lower() in known.lower() for known in names):
                names.append(candidate)
    return names


def _build_module(disease_name, description, risk_factors, symptom_list, tests, medications, icd10, category):
    """Assemble the Synthea module dict from the extracted clinical details.

    Symptoms (at most 3), diagnostic tests (at most 2) and medications (at
    most 3) are chained between the diagnosis Encounter and its
    EncounterEnd. Synthea's module framework documents Procedure and
    MedicationOrder states as valid only during an encounter, so they must
    not follow the EncounterEnd.
    """
    remarks = [line.strip() for line in description.split('\n') if line.strip()]
    # Add ICD-10 code if available
    if icd10:
        remarks.append(f"ICD-10 code: {icd10}")
    # Add category if available
    if category:
        remarks.append(f"Category: {category}")
    # Make sure remarks is not empty
    if not remarks:
        remarks.append(f"Generated module for {disease_name}")
    # Add risk factors to remarks if available
    if risk_factors:
        # Flattened outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        flattened_risk_factors = risk_factors.replace('\n', ' ')
        remarks.append(f"Risk factors: {flattened_risk_factors}")

    states = {
        "Initial": {
            "type": "Initial",
            "direct_transition": "Age_Guard"
        },
        "Age_Guard": {
            "type": "Guard",
            "allow": {
                "condition_type": "Age",
                "operator": ">=",
                "quantity": 18,
                "unit": "years"
            },
            "direct_transition": "Delay_Until_Onset"
        },
        "Delay_Until_Onset": {
            "type": "Delay",
            "range": {
                "low": 0,
                "high": 20,
                "unit": "years"
            },
            "direct_transition": "Onset"
        },
        "Onset": {
            "type": "ConditionOnset",
            "target_encounter": "Diagnosis_Encounter",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "64109004",
                    "display": disease_name
                }
            ],
            "direct_transition": "Diagnosis_Encounter"
        },
        "Diagnosis_Encounter": {
            "type": "Encounter",
            "encounter_class": "ambulatory",
            "reason": "Onset",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "185347001",
                    "display": "Encounter for problem"
                }
            ],
            # Re-pointed below when clinical states are inserted.
            "direct_transition": "End_Diagnosis_Encounter"
        }
    }

    # Name of the last state in the chain built so far.
    last_state = "Diagnosis_Encounter"

    def append_state(name, definition):
        """Add a state and route the previous state in the chain to it."""
        nonlocal last_state
        states[name] = definition
        states[last_state]["direct_transition"] = name
        last_state = name

    # Add symptoms if available
    for i, symptom in enumerate(symptom_list[:3]):  # Limit to 3 symptoms
        append_state(f"Symptom_{i}", {
            "type": "ConditionOnset",
            "target_encounter": "Diagnosis_Encounter",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "418107008",  # Default symptom code
                    "display": symptom
                }
            ]
        })

    # Add diagnostic tests if available
    for i, test in enumerate(tests[:2]):  # Limit to 2 tests
        append_state(f"DiagnosticTest_{i}", {
            "type": "Procedure",
            "target_encounter": "Diagnosis_Encounter",
            "reason": "Onset",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "386053000",  # Default diagnostic procedure code
                    "display": test
                }
            ]
        })

    # Add medications if found
    for i, med in enumerate(medications[:3]):  # Limit to 3 medications to keep module manageable
        append_state(f"Prescribe_{i}", {
            "type": "MedicationOrder",
            "target_encounter": "Diagnosis_Encounter",
            "reason": "Onset",
            "codes": [
                {
                    "system": "RxNorm",
                    "code": "308047",  # Default code
                    "display": med
                }
            ]
        })

    # Close the diagnosis encounter after the last clinical state, then
    # continue with the follow-up visit.
    append_state("End_Diagnosis_Encounter", {
        "type": "EncounterEnd"
    })
    append_state("Follow_Up_Encounter", {
        "type": "Encounter",
        "encounter_class": "ambulatory",
        "reason": "Onset",
        "codes": [
            {
                "system": "SNOMED-CT",
                "code": "390906007",
                "display": "Follow-up encounter"
            }
        ]
    })
    append_state("End_Follow_Up_Encounter", {
        "type": "EncounterEnd"
    })
    append_state("Terminal", {
        "type": "Terminal"
    })

    return {
        "name": disease_name,
        "remarks": remarks,
        "states": states
    }


def generate_module():
    """Ask Claude for clinical details and write a Synthea module from them.

    Reads the module-level ``DISEASE_NAME``, ``OUTPUT_FILE``, ``icd10_code``
    and ``disease_category``. The raw reply is saved next to the module as
    ``<OUTPUT_FILE>.details``; on failure the message is saved as
    ``<OUTPUT_FILE>.error`` when that file can be written.

    Returns:
        0 when the module was written, 1 when anything failed.

    Exits the process with status 1 when ANTHROPIC_API_KEY is not set.
    """
    # Initialize the Anthropic client
    api_key = os.environ.get('ANTHROPIC_API_KEY')
    if not api_key:
        print("Error: ANTHROPIC_API_KEY environment variable is not set")
        sys.exit(1)
    client = anthropic.Anthropic(api_key=api_key)

    # Step 1: Ask Claude for clinical details in a structured format, not JSON
    print("Step 1: Getting clinical details from Claude...")
    prompt = _clinical_details_prompt(DISEASE_NAME, icd10_code, disease_category)

    try:
        # Create the output directory before calling the API so that the
        # .details, module and .error files all have somewhere to go.
        output_dir = os.path.dirname(OUTPUT_FILE)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Get clinical details
        response = client.messages.create(
            model="claude-3-7-sonnet-20250219",
            max_tokens=4000,
            temperature=0.2,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        # Extract the response text
        clinical_details = response.content[0].text

        # Save the raw clinical details for reference
        with open(f"{OUTPUT_FILE}.details", "w", encoding="utf-8") as f:
            f.write(clinical_details)
        print(f"Clinical details saved to {OUTPUT_FILE}.details")

        # Step 2: Now build a valid module with the clinical details
        print("Step 2: Building module with clinical details...")
        sections = _split_sections(clinical_details)
        description = sections.get("DESCRIPTION", f"A module for {DISEASE_NAME}")
        risk_factors = sections.get("RISK_FACTORS", "")
        symptoms = sections.get("SYMPTOMS", "")
        diagnostic_tests = sections.get("DIAGNOSTIC_TESTS", "")
        treatments = sections.get("TREATMENTS", "")
        print(f"Extracted: {len(description)} chars description, {len(symptoms)} chars symptoms, {len(treatments)} chars treatments")

        # Extract symptoms as a list: bullet items when there are any,
        # otherwise one symptom per non-empty line.
        symptom_list = _bullet_items(symptoms)
        if not symptom_list:
            symptom_list = [line.strip() for line in symptoms.split('\n') if line.strip()]
        print(f"Found {len(symptom_list)} symptoms")

        # Extract medications from treatments
        medications = _medication_names(treatments)
        print(f"Found {len(medications)} medications: {', '.join(medications[:5])}")

        module = _build_module(
            DISEASE_NAME,
            description,
            risk_factors,
            symptom_list,
            _bullet_items(diagnostic_tests),
            medications,
            icd10_code,
            disease_category,
        )

        # Format the JSON for output
        formatted_json = json.dumps(module, indent=2)
        # Save the module
        with open(OUTPUT_FILE, "w") as f:
            f.write(formatted_json)
        print(f"✅ Successfully generated module and saved to {OUTPUT_FILE}")
        return 0
    except Exception as e:
        print(f"❌ Error: {str(e)}")
        # Write the error to a file. This is best effort: failing to record
        # the error must not replace the status returned to the caller.
        try:
            with open(f"{OUTPUT_FILE}.error", "w", encoding="utf-8") as f:
                f.write(f"Error: {str(e)}")
        except OSError as write_error:
            print(f"Could not write {OUTPUT_FILE}.error: {write_error}")
        return 1
def main():
    """Parse the command line and generate one Synthea module.

    Sets the module-level run configuration, skips the work when the module
    file already exists (unless ``--force`` is given), fills in a missing
    ICD-10 code and category from the disease list CSV, and then calls
    generate_module().

    Returns:
        Process exit status: 0 when the module already existed or was
        generated, 1 when ANTHROPIC_API_KEY is missing or generation failed.
    """
    global DISEASE_NAME, OUTPUT_FILE, use_interactive, use_auto_fallback
    global icd10_code, disease_category

    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Generate a Synthea module for a disease')
    parser.add_argument('--disease', type=str, required=True, help='Disease name')
    parser.add_argument('--icd10', type=str, help='ICD-10 code')
    parser.add_argument('--category', type=str, help='Disease category')
    parser.add_argument('--no-interactive', action='store_true', help='Skip interactive prompts, for batch processing')
    parser.add_argument('--auto-fallback', action='store_true', help='Automatically use fallback template if JSON parsing fails')
    parser.add_argument('--force', action='store_true', help='Force regeneration even if module already exists')
    args = parser.parse_args()

    # Set global variables
    DISEASE_NAME = args.disease
    normalized_name = normalize_disease_name(DISEASE_NAME)
    OUTPUT_FILE = f"src/main/resources/modules/{normalized_name}.json"
    use_interactive = not args.no_interactive
    use_auto_fallback = args.auto_fallback
    # Use provided ICD-10 code or category if specified
    icd10_code = args.icd10 or ""
    disease_category = args.category or ""

    print(f"Disease name: {DISEASE_NAME}")
    print(f"Normalized filename: {normalized_name}.json")

    # Check if module already exists
    if os.path.exists(OUTPUT_FILE):
        print(f"✅ Module already exists at: {OUTPUT_FILE}")
        if not args.force:
            return 0
        print("Force flag set. Regenerating module.")
    else:
        print(f"❌ Module not found at: {OUTPUT_FILE}")

    # If no ICD-10 code provided, look up in disease list CSV
    if not icd10_code:
        csv_path = "src/main/resources/disease_list.csv"
        if os.path.exists(csv_path):
            print(f"Looking up disease information in {csv_path}...")
            try:
                with open(csv_path, 'r', newline='', encoding='utf-8') as f:
                    for row in csv.DictReader(f):
                        if (row.get("disease_name") or "").lower() == DISEASE_NAME.lower():
                            icd10_code = row.get("id") or ""
                            # A category given on the command line wins over
                            # the CSV value.
                            disease_category = disease_category or row.get("disease_category") or ""
                            print("Found matching disease in CSV:")
                            print(f"- ICD-10 code: {icd10_code}")
                            print(f"- Category: {disease_category}")
                            break
                    else:
                        print("No matching disease found in CSV. Continuing without ICD-10 code.")
            except Exception as e:
                # The lookup is optional; report the problem and carry on.
                print(f"Error reading CSV: {str(e)}")
                print("Continuing without ICD-10 code.")

    # Ensure ANTHROPIC_API_KEY is set
    if not os.environ.get('ANTHROPIC_API_KEY'):
        print("Error: ANTHROPIC_API_KEY environment variable is not set")
        return 1

    # Ask Claude to generate the module. Its status becomes the exit code so
    # that callers checking the return code see generation failures.
    return generate_module()


if __name__ == "__main__":
    sys.exit(main())

321
scripts/patient_analysis.py Executable file
View File

@@ -0,0 +1,321 @@
#!/usr/bin/env python3
import os
import sys
import json
import glob
import argparse
from collections import Counter
from datetime import datetime
def _record_patient_demographics(patient, demographics):
    """Add one FHIR Patient resource's gender, age, race and ethnicity to the tallies."""
    if 'gender' in patient:
        demographics['gender'][patient['gender']] += 1
    if 'birthDate' in patient:
        # Calculate age based on birth year only (month and day are ignored)
        birth_year = int(patient['birthDate'][:4])
        demographics['age'].append(datetime.now().year - birth_year)
    # Process race and ethnicity extensions
    for ext in patient.get('extension', []):
        if 'url' not in ext or 'extension' not in ext:
            continue
        if ext['url'].endswith('us-core-race'):
            tally = demographics['race']
        elif ext['url'].endswith('us-core-ethnicity'):
            tally = demographics['ethnicity']
        else:
            continue
        for nested in ext['extension']:
            if 'valueCoding' in nested:
                tally[nested['valueCoding'].get('display', 'Unknown')] += 1


def analyze_patient_data(disease_name, input_dir, output_dir, format_type="html"):
    """Summarize FHIR patient files and write the requested reports.

    Every ``*.json`` file directly inside *input_dir* is read, except files
    whose name contains ``hospitalInformation`` or
    ``practitionerInformation``. A file may hold a bare Patient resource or a
    Bundle; from Bundles, Condition and MedicationRequest display names are
    counted as well. Files that cannot be processed are reported and skipped.

    Args:
        disease_name: Used in report titles and report file names.
        input_dir: Directory containing the FHIR JSON files.
        output_dir: Directory for the reports; created when missing.
        format_type: ``"html"``, ``"csv"``, ``"json"`` or ``"all"``
            (case-insensitive). Any other value produces no report.
    """
    print(f"Analyzing patient data for {disease_name}...")
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    report_stem = disease_name.lower().replace(' ', '_')

    # Find all patient JSON files. The exclusion is tested against the file
    # name only, so a directory that happens to contain one of the markers
    # in its name does not exclude every file inside it.
    excluded_markers = ('hospitalInformation', 'practitionerInformation')
    patients_files = [
        path for path in glob.glob(f"{input_dir}/*.json")
        if not any(marker in os.path.basename(path) for marker in excluded_markers)
    ]
    print(f"Found {len(patients_files)} patient records for analysis")

    if not patients_files:
        print("No patient files found to analyze.")
        with open(os.path.join(output_dir, f"{report_stem}_report.html"), 'w') as f:
            f.write(f"<html><body><h1>Analysis Report for {disease_name}</h1><p>No patient files found to analyze.</p></body></html>")
        # Create empty CSV and JSON files
        with open(os.path.join(output_dir, f"{report_stem}_report.csv"), 'w') as f:
            f.write("No patient files found to analyze.\n")
        with open(os.path.join(output_dir, f"{report_stem}_report.json"), 'w') as f:
            f.write('{"error": "No patient files found to analyze."}\n')
        return

    # Initialize data collectors
    demographics = {'gender': Counter(), 'age': [], 'race': Counter(), 'ethnicity': Counter()}
    condition_counts = Counter()
    medication_counts = Counter()

    # Process each patient file
    for patient_file in patients_files:
        try:
            with open(patient_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if not isinstance(data, dict):
                # Not a FHIR resource object; nothing to collect.
                continue

            resource_type = data.get('resourceType')
            if resource_type == 'Patient':
                _record_patient_demographics(data, demographics)
            elif resource_type == 'Bundle' and 'entry' in data:
                for entry in data['entry']:
                    if 'resource' not in entry:
                        continue
                    resource = entry['resource']
                    kind = resource.get('resourceType')
                    if kind == 'Patient':
                        _record_patient_demographics(resource, demographics)
                    elif kind == 'Condition':
                        for code in resource.get('code', {}).get('coding', []):
                            if 'display' in code:
                                condition_counts[code['display']] += 1
                    elif kind == 'MedicationRequest':
                        for code in resource.get('medicationCodeableConcept', {}).get('coding', []):
                            if 'display' in code:
                                medication_counts[code['display']] += 1
        except Exception as e:
            # Best effort per file: report the problem and keep going.
            print(f"Error processing {patient_file}: {e}")

    # Calculate total patients (one per Patient resource with a gender)
    total_patients = sum(demographics['gender'].values())
    if total_patients == 0:
        # The file list is known to be non-empty here, so this fallback also
        # keeps the percentage calculations from dividing by zero.
        print("Warning: No patient demographics found. Setting total_patients to file count.")
        total_patients = len(patients_files)
    print(f"Total patients found: {total_patients}")
    print(f"Gender distribution: {dict(demographics['gender'])}")

    requested = format_type.lower()
    # Generate HTML report
    if requested in ("html", "all"):
        create_html_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients)
    # Generate CSV report
    if requested in ("csv", "all"):
        create_csv_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients)
    # Generate JSON report
    if requested in ("json", "all"):
        create_json_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients)
    print(f"Analysis complete. Reports generated in {output_dir}")
def create_html_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients):
    """Write ``<disease>_report.html`` with demographics and the top 15 conditions and medications.

    Every value taken from the data (disease name, gender, condition and
    medication names) is HTML-escaped, so characters such as ``&`` or ``<``
    in a name cannot corrupt the page. The file is written as UTF-8 and
    declares that charset.

    Args:
        disease_name: Used in the title and, lower-cased with spaces
            replaced by underscores, in the file name.
        output_dir: Existing directory to write the report into.
        demographics: Dict with a ``gender`` counter and an ``age`` list.
        condition_counts: Mapping of condition name to count.
        medication_counts: Mapping of medication name to count.
        total_patients: Non-zero denominator for the percentages.
    """
    # Local import: the import block of this file does not bring in html.
    from html import escape

    def count_rows(pairs):
        """Render (label, count) pairs as table rows with a percentage column."""
        return "".join(
            f"<tr><td>{escape(str(label))}</td><td>{count}</td>"
            f"<td>{(count / total_patients) * 100:.1f}%</td></tr>\n"
            for label, count in pairs
        )

    def top(counts):
        """Return the 15 highest counts, highest first."""
        return sorted(counts.items(), key=lambda x: x[1], reverse=True)[:15]

    title = escape(str(disease_name))
    report_path = os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report.html")
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(f'''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Synthea Patient Analysis - {title}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
h1, h2, h3 {{ color: #333; }}
.container {{ max-width: 1000px; margin: 0 auto; }}
table {{ border-collapse: collapse; width: 100%; margin-bottom: 20px; }}
th, td {{ text-align: left; padding: 8px; border-bottom: 1px solid #ddd; }}
th {{ background-color: #f2f2f2; }}
tr:hover {{background-color: #f5f5f5;}}
</style>
</head>
<body>
<div class="container">
<h1>Synthea Patient Analysis - {title}</h1>
<p>Total patients analyzed: {total_patients}</p>
<h2>Demographics</h2>
<h3>Gender Distribution</h3>
<table>
<tr><th>Gender</th><th>Count</th><th>Percentage</th></tr>
''')
        f.write(count_rows(demographics['gender'].items()))
        f.write('''
</table>
<h3>Age Statistics</h3>
<table>
''')
        ages = demographics['age']
        if ages:
            f.write(f"<tr><td>Minimum Age</td><td>{min(ages)}</td></tr>\n")
            f.write(f"<tr><td>Maximum Age</td><td>{max(ages)}</td></tr>\n")
            f.write(f"<tr><td>Average Age</td><td>{sum(ages) / len(ages):.1f}</td></tr>\n")
        else:
            f.write("<tr><td colspan='2'>No age data available</td></tr>\n")
        f.write('''
</table>
<h3>Top Conditions</h3>
<table>
<tr><th>Condition</th><th>Count</th><th>Percentage of Patients</th></tr>
''')
        f.write(count_rows(top(condition_counts)))
        f.write('''
</table>
<h3>Top Medications</h3>
<table>
<tr><th>Medication</th><th>Count</th><th>Percentage of Patients</th></tr>
''')
        f.write(count_rows(top(medication_counts)))
        f.write('''
</table>
</div>
</body>
</html>''')
def create_csv_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients):
    """Write ``<disease>_report.csv`` with demographics and the top 15 conditions and medications.

    Rows go through ``csv.writer``, so a name containing a comma, quote or
    newline is quoted instead of shifting the columns. For values without
    such characters the output is the same text as a plain comma join, with
    ``\\n`` line endings.

    Args:
        disease_name: Used in the heading and, lower-cased with spaces
            replaced by underscores, in the file name.
        output_dir: Existing directory to write the report into.
        demographics: Dict with a ``gender`` counter and an ``age`` list.
        condition_counts: Mapping of condition name to count.
        medication_counts: Mapping of medication name to count.
        total_patients: Non-zero denominator for the percentages.
    """
    # Local import: the import block of this file does not bring in csv.
    import csv

    def write_counts(writer, pairs):
        """Write one ``label,count,percentage`` row per pair."""
        for label, count in pairs:
            percentage = (count / total_patients) * 100
            writer.writerow([label, count, f"{percentage:.1f}%"])

    def top(counts):
        """Return the 15 highest counts, highest first."""
        return sorted(counts.items(), key=lambda x: x[1], reverse=True)[:15]

    report_path = os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report.csv")
    # newline='' hands line-ending control to the csv writer.
    with open(report_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')

        # Write header
        writer.writerow([f"Synthea Patient Analysis - {disease_name}"])
        writer.writerow(["Total patients analyzed", total_patients])
        writer.writerow([])

        # Gender distribution
        writer.writerow(["Gender Distribution"])
        writer.writerow(["Gender", "Count", "Percentage"])
        write_counts(writer, demographics['gender'].items())
        writer.writerow([])

        # Age statistics
        writer.writerow(["Age Statistics"])
        ages = demographics['age']
        if ages:
            writer.writerow(["Minimum Age", min(ages)])
            writer.writerow(["Maximum Age", max(ages)])
            writer.writerow(["Average Age", f"{sum(ages) / len(ages):.1f}"])
        else:
            writer.writerow(["No age data available"])
        writer.writerow([])

        # Top conditions
        writer.writerow(["Top Conditions"])
        writer.writerow(["Condition", "Count", "Percentage of Patients"])
        write_counts(writer, top(condition_counts))
        writer.writerow([])

        # Top medications
        writer.writerow(["Top Medications"])
        writer.writerow(["Medication", "Count", "Percentage of Patients"])
        write_counts(writer, top(medication_counts))
def create_json_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients):
    """Write ``<disease>_report.json`` with demographics, age statistics and top-15 rankings.

    The file name is the disease name lower-cased with spaces replaced by
    underscores. ``age_statistics`` is an empty object when no ages were
    collected. Percentages are counts divided by *total_patients*, times 100.
    """
    def ranked(counts):
        """Return the 15 largest counts as name/count/percentage records."""
        leaders = sorted(counts.items(), key=lambda x: x[1], reverse=True)[:15]
        return [
            {"name": label, "count": tally, "percentage": (tally / total_patients) * 100}
            for label, tally in leaders
        ]

    ages = demographics['age']
    age_statistics = {}
    if ages:
        age_statistics = {
            "min_age": min(ages),
            "max_age": max(ages),
            "avg_age": sum(ages) / len(ages)
        }

    # Key order matches the layout of the written file.
    report_data = {
        "disease": disease_name,
        "total_patients": total_patients,
        "demographics": {
            "gender": dict(demographics['gender'].items()),
            "race": dict(demographics['race'].items()),
            "ethnicity": dict(demographics['ethnicity'].items())
        },
        "age_statistics": age_statistics,
        "top_conditions": ranked(condition_counts),
        "top_medications": ranked(medication_counts)
    }

    # Write to JSON file
    target = os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report.json")
    with open(target, 'w') as f:
        json.dump(report_data, f, indent=2)
if __name__ == "__main__":
    # Script entry point: read the report options from the command line and
    # pass them straight to analyze_patient_data().
    cli = argparse.ArgumentParser(description="Analyze Synthea patient data")
    cli.add_argument("--disease", required=True, help="Disease name")
    cli.add_argument("--input_dir", required=True, help="Input directory with FHIR files")
    cli.add_argument("--output_dir", default=".", help="Output directory for reports")
    cli.add_argument(
        "--format",
        default="html",
        choices=["html", "csv", "json", "all"],
        help="Output format (html, csv, json, or all)",
    )
    options = cli.parse_args()
    analyze_patient_data(options.disease, options.input_dir, options.output_dir, options.format)

127
scripts/prepare_environment.sh Executable file
View File

@@ -0,0 +1,127 @@
#!/bin/bash
# prepare_environment.sh
#
# This script prepares the environment for running the Synthea pipeline.
# It rebuilds the Docker containers, ensures the directory structure is correct,
# and starts the necessary services.
#
# Usage: prepare_environment.sh [--generate-modules]
#   --generate-modules  also run the module-generator service after setup
set -e # Exit on error

# Display a header
echo "=================================================="
echo "Synthea All Diseases Pipeline - Environment Setup"
echo "=================================================="

# Check for Docker
if ! command -v docker &> /dev/null; then
    echo "ERROR: Docker is not installed or not in the PATH."
    echo "Please install Docker and try again."
    exit 1
fi

# Check for docker-compose. The standalone binary is preferred so existing
# setups behave as before; installations that only ship the "docker compose"
# plugin are accepted as well.
if command -v docker-compose &> /dev/null; then
    COMPOSE=(docker-compose)
elif docker compose version &> /dev/null; then
    COMPOSE=(docker compose)
else
    echo "ERROR: docker-compose is not installed or not in the PATH."
    echo "Please install docker-compose and try again."
    exit 1
fi

# Create necessary directories if they don't exist
echo "Creating required directories..."
mkdir -p module_generator
mkdir -p src/main/resources/modules
mkdir -p src/main/resources/disease
mkdir -p modules
mkdir -p output

# Check if the module_generator directory has the required scripts
if [ ! -f "module_generator/module_generator.py" ] || [ ! -f "module_generator/run_module_generator.py" ]; then
    # Check if they're in the python directory and we need to move them
    if [ -d "python" ] && [ -f "python/module_generator.py" ] && [ -f "python/run_module_generator.py" ]; then
        echo "Found scripts in python directory, moving them to module_generator directory..."
        mkdir -p module_generator
        mv python/module_generator.py module_generator/
        mv python/run_module_generator.py module_generator/
        # Move the README too if it exists
        if [ -f "python/README_module_generator.md" ]; then
            mv python/README_module_generator.md module_generator/
        fi
    else
        echo "ERROR: Required Python scripts not found in module_generator directory!"
        echo "Scripts not found in python directory either. Please ensure the scripts are in the module_generator directory."
        exit 1
    fi
fi

# Make the Python scripts executable
chmod +x module_generator/module_generator.py
chmod +x module_generator/run_module_generator.py

# Create symlinks for convenience
ln -sf "$(pwd)/module_generator/run_module_generator.py" "$(pwd)/run_module_generator.py"
ln -sf "$(pwd)/module_generator/module_generator.py" "$(pwd)/module_generator.py"

# Create an empty disease_list.json if it doesn't exist
if [ ! -f "src/main/resources/disease_list.json" ]; then
    echo "Creating empty disease_list.json..."
    echo "[]" > src/main/resources/disease_list.json
fi

# Check if .env file exists
if [ ! -f ".env" ]; then
    echo ".env file is missing. Creating from .env.example..."
    if [ -f ".env.example" ]; then
        cp .env.example .env
        echo " Created .env from .env.example. Please edit it with your API key."
    else
        echo "WARNING: .env.example not found. Creating minimal .env file..."
        echo "ANTHROPIC_API_KEY=your_api_key_here" > .env
    fi
fi

# Stop any running containers
echo "Stopping any running containers..."
"${COMPOSE[@]}" down || true

# Rebuild the containers
echo "Building Docker containers..."
"${COMPOSE[@]}" build

# Start the Synthea container
echo "Starting Synthea container..."
"${COMPOSE[@]}" up -d synthea

# Give the container a moment to start
echo "Waiting for Synthea container to be ready..."
sleep 10

# Check that the container exists
CONTAINER_ID=$("${COMPOSE[@]}" ps -q synthea)
if [ -z "$CONTAINER_ID" ]; then
    echo "ERROR: Failed to start Synthea container."
    exit 1
fi

# Check the health status. When the inspect call fails the status is reported
# as "unknown" and no warning is printed.
HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' "$CONTAINER_ID" 2>/dev/null || echo "unknown")
if [ "$HEALTH_STATUS" != "healthy" ] && [ "$HEALTH_STATUS" != "unknown" ]; then
    echo "WARNING: Synthea container is not reporting as healthy (status: $HEALTH_STATUS)."
    echo "It may take some time for the container to become fully operational."
    echo "You can proceed, but the pipeline may fail if the container is not ready."
fi

# Check if we should generate modules
if [ "${1:-}" = "--generate-modules" ]; then
    echo "Starting module generator..."
    "${COMPOSE[@]}" --profile generator up module-generator
fi

echo "=================================================="
echo "Environment is ready!"
echo ""
echo "To generate patients for a disease module:"
echo "nextflow run main.nf --disease_name \"Disease Name\" --generate_patients true"
echo ""
echo "To generate a new module and patients:"
echo "nextflow run main.nf --disease_name \"Disease Name\" --force_generate true --generate_patients true"
echo "=================================================="

77
scripts/run_module.py Normal file
View File

@@ -0,0 +1,77 @@
#!/usr/bin/env python3
import os
import sys
import json
import anthropic
# Disease to generate a module for, and the file the formatted JSON is written to.
DISEASE_NAME = "Excessive frequent and irregular menstruation"
OUTPUT_FILE = "excessive_frequent_and_irregular_menstruation.json"

# Anthropic client. The key is read from the ANTHROPIC_API_KEY environment
# variable and passed explicitly (None when the variable is unset).
_api_key = os.environ.get("ANTHROPIC_API_KEY")
client = anthropic.Anthropic(api_key=_api_key)
# Define the prompt for generating the module
def generate_module_prompt(disease_name):
    """Build the user prompt that asks the model for a Synthea module.

    Args:
        disease_name: Name of the disease, interpolated into the prompt text.

    Returns:
        The complete prompt string (it starts and ends with a newline).
    """
    prompt = f"""
You are a medical expert creating a disease module for the Synthea patient simulation system.
I need you to create a structured JSON module for {disease_name}.
The module will be used to simulate patients with this condition in the Synthea healthcare simulation system.
The JSON should follow the Synthea module format which includes:
1. Basic module information (name, remarks)
2. States representing the progression of the disease
3. Transitions between states
4. Guard conditions based on patient attributes where appropriate
5. Care plans and medications that would be prescribed
Your output should be valid JSON that follows the Synthea module structure precisely. Format it as a complete,
well-structured Synthea module. Make sure it's medically accurate and includes all relevant clinical details,
treatment options, and disease progression patterns.
Output only the JSON with no additional commentary or markdown formatting. The output will be directly saved as a file.
"""
    return prompt
print(f"Generating module for {DISEASE_NAME}...")

# Filled in as soon as the model answers, so the error handler can persist the
# raw text. Previously only the exception message was saved, which made JSON
# parsing failures impossible to debug.
raw_response = None
try:
    # Send a single user message to Claude.
    message = client.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=4000,
        temperature=0,
        messages=[
            {"role": "user", "content": generate_module_prompt(DISEASE_NAME)}
        ]
    )

    raw_response = message.content[0].text
    module_json = raw_response

    # Keep only the span from the first '{' to the last '}' so that any text
    # the model wrapped around the object is discarded.
    start = module_json.find('{')
    end = module_json.rfind('}') + 1
    if start >= 0 and end > start:
        module_json = module_json[start:end]

    # Round-trip through the parser: validates the JSON and normalises layout.
    parsed = json.loads(module_json)
    formatted_json = json.dumps(parsed, indent=2)

    with open(OUTPUT_FILE, 'w') as f:
        f.write(formatted_json)
    print(f"✅ Successfully generated module and saved to {OUTPUT_FILE}")
except Exception as e:
    # Top-level boundary of the script: report, persist diagnostics, exit non-zero.
    print(f"Error: {e}")
    # UTF-8 so that non-ASCII model output cannot raise while saving diagnostics.
    with open("error_output.txt", 'w', encoding="utf-8") as f:
        f.write(str(e))
        if raw_response is not None:
            f.write("\n\nRaw response:\n")
            f.write(raw_response)
    print("Error details saved to error_output.txt")
    sys.exit(1)

182
scripts/run_pipeline.sh Executable file
View File

@@ -0,0 +1,182 @@
#!/bin/bash
# run_pipeline.sh
#
# This script runs the Nextflow pipeline for generating disease modules and synthetic patients
set -e # Exit on error
# Default values. Each one can be overridden by the matching command-line
# option handled in the argument-parsing loop further down.
DISEASE_NAME="" # --disease: required; the script aborts when it is still empty after parsing
FORCE_GENERATE=false # --force-generate: regenerate the module even if it exists
GENERATE_PATIENTS=false # --patients: also generate synthetic patients
POPULATION=100 # --population: number of patients to generate
GENDER=0.5 # --gender: 0-1, proportion of female patients
MIN_AGE=0 # --min-age: minimum patient age
MAX_AGE=90 # --max-age: maximum patient age
SEED="" # --seed: random seed; only written to params.json when non-empty
ANALYZE_DATA=false # --analyze: analyze patient data after generation
REPORT_FORMAT="html" # --report-format: html, json or csv
VERIFY_DOCKER=true # --skip-docker-check sets this to false
# Parse command line arguments
function show_help {
    # Print the usage text (program name taken from $0) and terminate the
    # script with status 1. This function never returns to its caller.
    cat << EOF
Usage: $0 [options] --disease "Disease Name"

Options:
 --disease, -d NAME Disease name to generate a module for (required)
 --force-generate, -f Force regeneration of disease module even if it exists
 --patients, -p Generate synthetic patients (default: false)
 --population, -n NUM Number of patients to generate (default: 100)
 --gender, -g VALUE Gender distribution (0-1 for % female, default: 0.5)
 --min-age, -a NUM Minimum patient age (default: 0)
 --max-age, -m NUM Maximum patient age (default: 90)
 --seed, -s SEED Random seed for reproducibility
 --analyze, -A Analyze patient data after generation
 --report-format, -r FMT Report format for analysis (html, json, csv, default: html)
 --skip-docker-check Skip Docker container verification
 --help, -h Show this help message

Example:
 $0 --disease "Multiple Sclerosis" --patients --population 50
EOF
    exit 1
}
# Parse arguments
while [[ $# -gt 0 ]]; do
    # Options that consume a value must be followed by one. Without this check
    # "shift 2" fails when the option is the last argument and, because of
    # "set -e", the script would exit silently with no explanation.
    case "$1" in
        --disease|-d|--population|-n|--gender|-g|--min-age|-a|--max-age|-m|--seed|-s|--report-format|-r)
            if [[ $# -lt 2 ]]; then
                echo "ERROR: Option $1 requires a value."
                show_help
            fi
            ;;
    esac

    # Flags shift one argument; value options shift the option and its value.
    case "$1" in
        --disease|-d)
            DISEASE_NAME="$2"
            shift 2
            ;;
        --force-generate|-f)
            FORCE_GENERATE=true
            shift
            ;;
        --patients|-p)
            GENERATE_PATIENTS=true
            shift
            ;;
        --population|-n)
            POPULATION="$2"
            shift 2
            ;;
        --gender|-g)
            GENDER="$2"
            shift 2
            ;;
        --min-age|-a)
            MIN_AGE="$2"
            shift 2
            ;;
        --max-age|-m)
            MAX_AGE="$2"
            shift 2
            ;;
        --seed|-s)
            SEED="$2"
            shift 2
            ;;
        --analyze|-A)
            ANALYZE_DATA=true
            shift
            ;;
        --report-format|-r)
            REPORT_FORMAT="$2"
            shift 2
            ;;
        --skip-docker-check)
            VERIFY_DOCKER=false
            shift
            ;;
        --help|-h)
            show_help
            ;;
        *)
            # show_help exits, so an unknown option never loops forever.
            echo "Unknown option: $1"
            show_help
            ;;
    esac
done
# --disease is mandatory: report the problem and let show_help print the usage
# text and terminate the script.
[ -n "$DISEASE_NAME" ] || {
    echo "ERROR: Disease name is required!"
    show_help
}
# Patient generation needs a running Synthea container. The check is skipped
# when --skip-docker-check was given or no patients are requested.
if [ "$VERIFY_DOCKER" = true ] && [ "$GENERATE_PATIENTS" = true ]; then
    # Number of "docker ps" output lines that mention "synthea".
    CONTAINER_RUNNING=$(docker ps | grep synthea | wc -l)
    if [ "$CONTAINER_RUNNING" -ne 0 ]; then
        echo "Synthea Docker container is running. Proceeding with pipeline execution."
    else
        echo "No Synthea Docker containers are running!"
        echo "Running environment setup script to start containers..."
        # Look for the setup script relative to the current directory,
        # trying the scripts/ location first.
        SETUP_SCRIPT=""
        for candidate in ./scripts/prepare_environment.sh ./prepare_environment.sh; do
            if [ -f "$candidate" ]; then
                SETUP_SCRIPT="$candidate"
                break
            fi
        done
        if [ -z "$SETUP_SCRIPT" ]; then
            echo "ERROR: prepare_environment.sh not found!"
            exit 1
        fi
        "$SETUP_SCRIPT"
    fi
fi
# Create params.json file with our configuration
echo "Creating params.json file with pipeline configuration..."

# Escape backslashes and double quotes so that free-text values (for example a
# disease name containing quotes) cannot produce an invalid JSON document.
# Control characters such as newlines or tabs are not handled.
json_escape() {
    printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
}

DISEASE_NAME_JSON=$(json_escape "$DISEASE_NAME")
REPORT_FORMAT_JSON=$(json_escape "$REPORT_FORMAT")

# Booleans and numbers are written bare; the closing brace is appended below
# because the optional "seed" member has to come before it.
cat > params.json << EOF
{
"disease_name": "$DISEASE_NAME_JSON",
"modules_dir": "modules",
"output_dir": "output",
"generate_patients": $GENERATE_PATIENTS,
"population": $POPULATION,
"gender": $GENDER,
"min_age": $MIN_AGE,
"max_age": $MAX_AGE,
"analyze_patient_data": $ANALYZE_DATA,
"report_format": "$REPORT_FORMAT_JSON",
"force_generate": $FORCE_GENERATE,
"publish_dir": "published_output"
EOF

# Add seed if provided. It is written unquoted, so it must be numeric.
if [ -n "$SEED" ]; then
    echo ", \"seed\": $SEED" >> params.json
fi

# Close JSON object
echo "}" >> params.json
# Summarise the effective configuration before the pipeline starts. The
# patient-related settings are only shown when patients will be generated.
printf '%s\n' \
    "==================================================" \
    "Running Synthea Pipeline for: $DISEASE_NAME" \
    "Generate patients: $GENERATE_PATIENTS"
if [ "$GENERATE_PATIENTS" = true ]; then
    printf '%s\n' \
        "Population: $POPULATION" \
        "Gender ratio (proportion female): $GENDER" \
        "Age range: ${MIN_AGE}-${MAX_AGE}"
    if [ -n "$SEED" ]; then
        printf '%s\n' "Random seed: $SEED"
    fi
fi
printf '%s\n' \
    "Force module generation: $FORCE_GENERATE" \
    "Analyze patient data: $ANALYZE_DATA" \
    "=================================================="
# Execute the Nextflow command. The configuration written to params.json is
# handed over explicitly; without -params-file Nextflow would not receive it
# on the command line.
echo "Starting Nextflow pipeline..."
nextflow run main.nf -params-file params.json

# Directory-friendly form of the disease name used in the messages below. It
# was previously never set, so the printed paths ended in "published_output/".
# The rule mirrors normalize_filename() in scripts/test_module_exists.py
# (lower-case, runs of non-alphanumerics become "_", no leading/trailing "_").
# NOTE(review): confirm main.nf publishes under the same normalized name.
if [ -z "${DISEASE_NAME_NORMALIZED:-}" ]; then
    DISEASE_NAME_NORMALIZED=$(printf '%s' "$DISEASE_NAME" \
        | tr '[:upper:]' '[:lower:]' \
        | sed -E -e 's/[^a-z0-9]+/_/g' -e 's/^_+//' -e 's/_+$//')
fi

echo ""
echo "Pipeline execution complete!"
if [ "$GENERATE_PATIENTS" = true ]; then
    echo "Check the 'output' directory and 'published_output/$DISEASE_NAME_NORMALIZED' for generated patient data."
    if [ "$ANALYZE_DATA" = true ]; then
        echo "Analysis reports can be found in 'published_output/$DISEASE_NAME_NORMALIZED/analysis'."
    fi
fi
echo "Generated modules can be found in the 'modules' directory and 'published_output/modules'."
echo ""

99
scripts/simple_module.py Executable file
View File

@@ -0,0 +1,99 @@
#!/usr/bin/env python3
import os
import sys
import json
import anthropic
DISEASE_NAME = "Excessive frequent and irregular menstruation"
OUTPUT_FILE = "excessive_frequent_and_irregular_menstruation.json"

# Initialize the Anthropic client
client = anthropic.Anthropic()


def _strip_line_comment(line):
    """Return *line* without a trailing ``//`` comment.

    Only a ``//`` that occurs outside a double-quoted JSON string counts as a
    comment, so string values such as URLs are left untouched. The line is
    returned unchanged when it carries no comment.
    """
    in_string = False
    escaped = False
    for index, char in enumerate(line):
        if in_string:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
        elif char == '"':
            in_string = True
        elif char == "/" and line.startswith("//", index):
            return line[:index].rstrip()
    return line


def _clean_model_json(text):
    """Remove content from *text* that is known to break JSON parsing.

    Lines containing template markers (``{%`` or ``%}``) are dropped entirely.
    ``//`` comments are cut off; a line that held nothing but a comment is
    dropped. Every removal is reported on stdout.
    """
    kept = []
    for line in text.split('\n'):
        if "{%" in line or "%}" in line:
            print(f"Removing invalid line: {line}")
            continue
        without_comment = _strip_line_comment(line)
        if without_comment != line:
            if not without_comment.strip():
                print(f"Removing invalid line: {line}")
                continue
            print(f"Removing comment from line: {line}")
        kept.append(without_comment)
    return '\n'.join(kept)


print(f"Generating module for {DISEASE_NAME}...")
try:
    # Send a message to Claude
    message = client.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=4000,
        temperature=0,
        messages=[
            {"role": "user", "content": f"""Create a Synthea disease module for {DISEASE_NAME} in JSON format.
The module should follow this structure:
{{
"name": "Module Name",
"remarks": [
"Description of the module"
],
"states": {{
"Initial": {{
"type": "Initial",
"direct_transition": "Next State"
}},
"Terminal": {{
"type": "Terminal"
}}
// Additional states with appropriate transitions
}}
}}
Make sure the JSON is properly formatted with no syntax errors.
Do not include any markdown formatting, comments, or explanations outside the JSON.
Output only the valid JSON object."""}
        ]
    )

    # Extract the text of the response
    module_json = message.content[0].text

    # Save the raw response for debugging (UTF-8: the text may not be ASCII)
    with open(f"{OUTPUT_FILE}.raw", "w", encoding="utf-8") as f:
        f.write(module_json)
    print(f"Raw response saved to {OUTPUT_FILE}.raw")

    # Find the first { and last } to extract just the JSON part
    start = module_json.find("{")
    end = module_json.rfind("}") + 1
    if start >= 0 and end > start:
        module_json = module_json[start:end]

    cleaned_json = _clean_model_json(module_json)

    try:
        parsed = json.loads(cleaned_json)
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        # Record exactly the text the parser rejected, so the line/column in
        # the error message can be looked up in the saved file.
        with open(f"{OUTPUT_FILE}.error", "w", encoding="utf-8") as f:
            f.write(f"Error: {str(e)}\n\n")
            f.write("JSON that failed to parse:\n")
            f.write(cleaned_json)
        print(f"Error details saved to {OUTPUT_FILE}.error")
        # No module was produced: signal failure instead of exiting with 0.
        sys.exit(1)

    formatted_json = json.dumps(parsed, indent=2)

    # Write to file
    with open(OUTPUT_FILE, "w") as f:
        f.write(formatted_json)
    print(f"Successfully generated module and saved to {OUTPUT_FILE}")
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)

1
scripts/test.py Normal file
View File

@@ -0,0 +1 @@
# Smoke test: send a one-word prompt to the Anthropic API and print the text
# of the first content block of the reply.
import anthropic
import os

client = anthropic.Anthropic()
response = client.messages.create(
    model="claude-3-7-sonnet-20250219",
    max_tokens=1000,
    messages=[{"role": "user", "content": "Hello"}],
)
print(response.content[0].text)

123
scripts/test_module_exists.py Executable file
View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
Simple script to check if a module exists for a given disease and generate it if not.
"""
import os
import sys
import json
import subprocess
import re
# Configuration. The disease comes from the first command-line argument when
# one is given; the paths are relative to the current working directory.
_DEFAULT_DISEASE_NAME = "Excessive frequent and irregular menstruation"
DISEASE_NAME = sys.argv[1] if len(sys.argv) > 1 else _DEFAULT_DISEASE_NAME

MODULES_DIR = "src/main/resources/modules"
RUN_MODULE_GENERATOR_PATH = "src/main/python/run_module_generator.py"
DISEASE_LIST_PATH = "src/main/resources/disease_list.json"
# Function to normalize disease name for filenames
def normalize_filename(name):
    """Return *name* as a lower-case, underscore-separated file stem.

    Every run of characters other than ASCII letters and digits becomes a
    single underscore, and underscores at either end are removed.
    """
    collapsed = re.sub(r'[^a-zA-Z0-9]+', '_', name.lower())
    return collapsed.strip('_')
# Main function
def main():
    """Make sure a module file exists for DISEASE_NAME, generating it if needed.

    When the module is missing, the disease list is temporarily replaced by a
    single-entry list (see create_disease_list) and the module generator is
    run as a subprocess with a batch size of 1. The disease list that existed
    beforehand is written back afterwards, whether or not generation worked.

    Exits the process with status 1 when the generator fails or does not
    produce the expected module file; returns normally otherwise.
    """
    print(f"Checking if module exists for: {DISEASE_NAME}")

    # Normalize disease name for filename
    normalized_name = normalize_filename(DISEASE_NAME)
    module_path = os.path.join(MODULES_DIR, f"{normalized_name}.json")

    # Nothing to do when the module is already there.
    if os.path.exists(module_path):
        print(f"✅ Module already exists at: {module_path}")
        return

    print(f"❌ Module not found at: {module_path}")
    print(f"Creating disease list entry for {DISEASE_NAME}...")

    # create_disease_list() overwrites DISEASE_LIST_PATH with one entry. Keep a
    # byte-exact copy of the existing file so the "temporary" list does not
    # permanently destroy the real one.
    original_list = None
    if os.path.exists(DISEASE_LIST_PATH):
        try:
            with open(DISEASE_LIST_PATH, 'rb') as f:
                original_list = f.read()
        except OSError as e:
            print(f"Warning: could not back up disease list: {e}")

    generated = False
    try:
        # Create temporary disease list with just this disease
        create_disease_list(DISEASE_NAME)

        # Run the module generator
        print(f"Running module generator for {DISEASE_NAME}...")
        try:
            result = subprocess.run(
                [sys.executable, RUN_MODULE_GENERATOR_PATH, "--batch-size", "1"],
                capture_output=True,
                text=True,
                check=True
            )
        except subprocess.CalledProcessError as e:
            print(f"Error running module generator: {e}")
            print("STDOUT:")
            print(e.stdout)
            print("STDERR:")
            print(e.stderr)
        else:
            print("Module generator output:")
            print(result.stdout)

            # A zero exit status alone does not prove the file was written.
            if os.path.exists(module_path):
                print(f"✅ Module created successfully at: {module_path}")
                generated = True
            else:
                print(f"❌ Module generation failed, file not found at: {module_path}")
                print("Error output:")
                print(result.stderr)
    finally:
        # A list created from scratch (no original) is left in place.
        if original_list is not None:
            with open(DISEASE_LIST_PATH, 'wb') as f:
                f.write(original_list)
            print(f"Restored original disease list at: {DISEASE_LIST_PATH}")

    if not generated:
        # Let callers (shell scripts, pipelines) see that generation failed.
        sys.exit(1)
def _blank_entry(disease_name):
    """Return a disease-list entry for *disease_name* with an empty ICD-10 id."""
    return {
        "id": "",  # ICD-10 code (empty as we don't have it)
        "disease_name": disease_name,
        "ICD-10_name": disease_name
    }


def create_disease_list(disease_name):
    """Write a one-entry disease list for *disease_name* to DISEASE_LIST_PATH.

    If the file already exists and holds an entry whose "disease_name" equals
    *disease_name* ignoring case, that entry is reused; otherwise (including
    when the file cannot be read or parsed) a blank entry is created. The file
    is then overwritten with the single selected entry, and its parent
    directory is created when missing.
    """
    selected = []

    if not os.path.exists(DISEASE_LIST_PATH):
        selected.append(_blank_entry(disease_name))
        print(f"Created new disease list with '{disease_name}'")
    else:
        try:
            with open(DISEASE_LIST_PATH, 'r') as fh:
                known_diseases = json.load(fh)

            # First entry whose name matches case-insensitively, if any.
            wanted = disease_name.lower()
            match = next(
                (entry for entry in known_diseases
                 if entry.get("disease_name", "").lower() == wanted),
                None,
            )
            if match is not None:
                selected.append(match)
                print(f"Found existing disease entry for '{disease_name}'")
            else:
                selected.append(_blank_entry(disease_name))
                print(f"Created new disease entry for '{disease_name}'")
        except Exception as e:
            # Unreadable or malformed list: fall back to a fresh entry.
            print(f"Error reading disease list: {e}")
            selected.append(_blank_entry(disease_name))

    # Save disease list
    os.makedirs(os.path.dirname(DISEASE_LIST_PATH), exist_ok=True)
    with open(DISEASE_LIST_PATH, 'w') as fh:
        json.dump(selected, fh, indent=2)
    print(f"Saved disease list with {len(selected)} entries.")


if __name__ == "__main__":
    main()

305
scripts/validate_module.py Executable file
View File

@@ -0,0 +1,305 @@
#!/usr/bin/env python3
"""
Disease Module Validator for Synthea
This script validates a Synthea disease module for JSON correctness and
checks for common issues in the module structure.
Usage:
python validate_module.py <path_to_module.json>
"""
import json
import sys
import os
import re
# ANSI colour codes used by the helpers below.
_RED = "91"
_GREEN = "92"
_YELLOW = "93"


def color_text(text, color_code):
    """Wrap *text* in the ANSI escape sequence for *color_code*, then reset."""
    return "\033[{}m{}\033[0m".format(color_code, text)


def red(text):
    """Return *text* coloured with ANSI code 91."""
    return color_text(text, _RED)


def green(text):
    """Return *text* coloured with ANSI code 92."""
    return color_text(text, _GREEN)


def yellow(text):
    """Return *text* coloured with ANSI code 93."""
    return color_text(text, _YELLOW)
def check_required_fields(module_json):
    """Return the required top-level keys that *module_json* lacks.

    The required keys are 'name', 'states' and 'gmf_version'; the result
    keeps that order and is empty when all of them are present.
    """
    return [
        field
        for field in ('name', 'states', 'gmf_version')
        if field not in module_json
    ]
def check_transitions(module_json):
    """Check that every non-Terminal state has a transition to existing states.

    Recognised transition keys are direct_transition, distributed_transition,
    conditional_transition, complex_transition (including the targets nested
    in its 'distributions') and lookup_table_transition. States of type
    'Terminal' are skipped.

    Args:
        module_json: The parsed module (a dict).

    Returns:
        A list of error messages; empty when no problem was found. When the
        module has no 'states' key the list holds a single message saying so.
    """
    # Skip if no states
    if 'states' not in module_json:
        return ["No 'states' field found"]

    states = module_json['states']
    state_names = set(states.keys())
    errors = []

    def is_missing(target):
        # A non-string target (for example a nested object) can never name a
        # state and may be unhashable, so it is rejected before the set lookup.
        return not isinstance(target, str) or target not in state_names

    # Transition kinds whose value is a list of entries, each of which may
    # carry a 'transition' target. Order matters for the order of messages.
    list_transition_types = (
        'distributed_transition',
        'conditional_transition',
        'complex_transition',
        'lookup_table_transition',
    )

    for state_name, state_config in states.items():
        # Terminal states should not have transitions.
        if state_config.get('type') == 'Terminal':
            continue

        transition_found = False

        if 'direct_transition' in state_config:
            transition_found = True
            target = state_config['direct_transition']
            if is_missing(target):
                errors.append(f"State '{state_name}' has invalid direct_transition to non-existent state '{target}'")

        for transition_type in list_transition_types:
            if transition_type not in state_config:
                continue
            transition_found = True
            for transition in state_config[transition_type]:
                if 'transition' in transition and is_missing(transition['transition']):
                    errors.append(f"State '{state_name}' has invalid {transition_type} to non-existent state '{transition['transition']}'")
                # Only complex transitions nest further targets in 'distributions'.
                if transition_type == 'complex_transition' and 'distributions' in transition:
                    for dist in transition['distributions']:
                        if 'transition' in dist and is_missing(dist['transition']):
                            errors.append(f"State '{state_name}' has invalid complex_transition distribution to non-existent state '{dist['transition']}'")

        if not transition_found:
            errors.append(f"State '{state_name}' has no transition defined")

    return errors
def check_codes(module_json):
    """Warn about code systems for which the module has no well-formed code.

    The module is walked recursively. Every object that has string values
    under both "system" and "code" counts as a code entry, whatever the order
    of its keys. A code system is considered present when at least one of its
    entries has a code of the expected shape:

    * SNOMED-CT and RxNorm: digits only
    * LOINC: digits and hyphens
    * ICD-10: an upper-case letter followed by digits, optionally followed by
      a dot and further digits/upper-case letters (for example "N92.1")

    Args:
        module_json: The parsed module (any JSON-compatible structure).

    Returns:
        One warning string per system without a matching code, in the order
        SNOMED-CT, LOINC, RxNorm, ICD-10.
    """
    code_patterns = {
        'SNOMED-CT': re.compile(r'[0-9]+'),
        'LOINC': re.compile(r'[0-9\-]+'),
        'RxNorm': re.compile(r'[0-9]+'),
        'ICD-10': re.compile(r'[A-Z][0-9]+(?:\.[0-9A-Z]+)?'),
    }
    systems_found = set()

    def visit(node):
        if isinstance(node, dict):
            system = node.get('system')
            code = node.get('code')
            if isinstance(system, str) and isinstance(code, str):
                pattern = code_patterns.get(system)
                if pattern is not None and pattern.fullmatch(code):
                    systems_found.add(system)
            for value in node.values():
                visit(value)
        elif isinstance(node, list):
            for item in node:
                visit(item)

    visit(module_json)

    return [
        f"No {code_type} codes found. This may be normal depending on the module."
        for code_type in code_patterns
        if code_type not in systems_found
    ]
def check_prevalence(module_json):
    """Warn when the module never mentions prevalence-like data.

    The module is serialised to JSON text and searched for the quoted word
    "prevalence" or the substrings 'incidence' / 'probability'. Returns an
    empty list as soon as one of them is found, otherwise a list with a
    single warning.
    """
    serialized = json.dumps(module_json)
    for marker in ('"prevalence"', 'incidence', 'probability'):
        if marker in serialized:
            return []
    return ["No prevalence, incidence, or probability data found. This may affect realism."]
def check_circular_references(module_json):
    """Report cycles in the state-transition graph of the module.

    Edges are taken from direct, distributed, conditional, complex (including
    the targets nested in 'distributions') and lookup-table transitions.
    Targets that do not name an existing state are ignored here, so a
    dangling transition cannot make this check fail.

    Args:
        module_json: The parsed module (a dict).

    Returns:
        A list with one message per detected cycle, in the form
        "Circular reference detected: A -> B -> A"; empty when the module has
        no 'states' key or no cycle was found.
    """
    # Skip if no states
    if 'states' not in module_json:
        return []

    states = module_json['states']
    errors = []

    # Transition kinds whose value is a list of entries with a 'transition' key.
    list_transition_types = (
        'distributed_transition',
        'conditional_transition',
        'complex_transition',
        'lookup_table_transition',
    )

    def targets_of(state_config):
        """Collect every transition target mentioned by one state."""
        targets = []
        if 'direct_transition' in state_config:
            targets.append(state_config['direct_transition'])
        for transition_type in list_transition_types:
            for transition in state_config.get(transition_type, []):
                if 'transition' in transition:
                    targets.append(transition['transition'])
                if transition_type == 'complex_transition':
                    for dist in transition.get('distributions', []):
                        if 'transition' in dist:
                            targets.append(dist['transition'])
        return targets

    # Directed graph restricted to known states; unknown or non-string
    # targets would otherwise raise KeyError/TypeError during the traversal.
    graph = {
        state_name: [
            target for target in targets_of(state_config)
            if isinstance(target, str) and target in states
        ]
        for state_name, state_config in states.items()
    }

    visited = set()
    path = []  # states on the current DFS branch, in visiting order

    def dfs(node):
        visited.add(node)
        path.append(node)
        for neighbor in graph[node]:
            if neighbor in path:
                # Back edge to a state on the current branch: that is a cycle.
                cycle_start = path.index(neighbor)
                errors.append(f"Circular reference detected: {' -> '.join(path[cycle_start:] + [neighbor])}")
            elif neighbor not in visited:
                dfs(neighbor)
        path.pop()

    # Start a traversal from every state not reached yet, so disconnected
    # parts of the module are covered as well.
    for state_name in states:
        if state_name not in visited:
            dfs(state_name)

    return errors
def _repair_json_text(content):
    """Apply best-effort textual fixes for common JSON mistakes.

    Trailing commas before '}' or ']' are removed and the number of closing
    braces is adjusted to match the number of opening braces. The counts are
    purely textual (braces inside string values are counted too), so the
    result still has to be re-parsed by the caller.
    """
    # Fix trailing commas
    fixed = re.sub(r',\s*}', '}', content)
    fixed = re.sub(r',\s*]', ']', fixed)

    # Count braces
    open_braces = fixed.count('{')
    close_braces = fixed.count('}')
    if open_braces > close_braces:
        missing = open_braces - close_braces
        print(yellow(f"Adding {missing} missing closing braces"))
        fixed += '}' * missing
    elif close_braces > open_braces:
        excess = close_braces - open_braces
        print(yellow(f"Removing {excess} excess closing braces"))
        # Drop exactly one trailing brace per excess brace. The previous
        # rstrip('}') removed every trailing brace at once and so damaged
        # documents that legitimately end in several closing braces.
        for _ in range(excess):
            trimmed = fixed.rstrip()
            if not trimmed.endswith('}'):
                break
            fixed = trimmed[:-1]
    return fixed


def main():
    """Validate the module file given on the command line and print a report.

    The file is parsed as JSON; if that fails, a textual repair is attempted
    and, when the repaired text parses, the file is rewritten with the
    re-serialised module. The structural checks are then run and their
    findings printed, followed by a few statistics about the module.

    Exits with status 1 on a usage error, a missing file, JSON that cannot be
    repaired, a JSON root that is not an object, or any unexpected exception.
    Findings of the individual checks do not change the exit status.
    """
    if len(sys.argv) != 2:
        print(f"Usage: python {sys.argv[0]} <path_to_module.json>")
        sys.exit(1)

    module_path = sys.argv[1]
    if not os.path.exists(module_path):
        print(red(f"Error: File {module_path} does not exist"))
        sys.exit(1)

    try:
        with open(module_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # First check for valid JSON
        try:
            module_json = json.loads(content)
            print(green("✓ Valid JSON structure"))
        except json.JSONDecodeError as e:
            print(red(f"✗ Invalid JSON: {e}"))

            # Try to fix common issues
            print(yellow("Attempting to fix common JSON issues..."))
            fixed_content = _repair_json_text(content)
            try:
                module_json = json.loads(fixed_content)
            except json.JSONDecodeError as repair_error:
                print(red(f"✗ Could not fix JSON: {repair_error}"))
                sys.exit(1)
            print(green("✓ Fixed JSON issues successfully"))

            # Write the fixed content back to the file
            with open(module_path, 'w', encoding='utf-8') as f:
                f.write(json.dumps(module_json, indent=2))

        # Every check below expects a JSON object at the top level.
        if not isinstance(module_json, dict):
            print(red("✗ Module root must be a JSON object"))
            sys.exit(1)

        # Check required fields
        missing_fields = check_required_fields(module_json)
        if missing_fields:
            print(red(f"✗ Missing required fields: {', '.join(missing_fields)}"))
        else:
            print(green("✓ All required fields present"))

        # Check for invalid transitions
        transition_errors = check_transitions(module_json)
        if transition_errors:
            print(red("✗ Invalid transitions found:"))
            for error in transition_errors:
                print(red(f" - {error}"))
        else:
            print(green("✓ All transitions valid"))

        # Check for circular references
        circular_errors = check_circular_references(module_json)
        if circular_errors:
            print(red("✗ Circular references found:"))
            for error in circular_errors:
                print(red(f" - {error}"))
        else:
            print(green("✓ No circular references detected"))

        # Check for medical codes
        code_warnings = check_codes(module_json)
        if code_warnings:
            print(yellow("⚠ Possible code issues:"))
            for warning in code_warnings:
                print(yellow(f" - {warning}"))
        else:
            print(green("✓ Medical codes look good"))

        # Check for prevalence information
        prevalence_warnings = check_prevalence(module_json)
        if prevalence_warnings:
            print(yellow("⚠ Possible prevalence issues:"))
            for warning in prevalence_warnings:
                print(yellow(f" - {warning}"))
        else:
            print(green("✓ Prevalence information looks good"))

        # Display module stats
        print("\nModule Statistics:")
        print(f"- Name: {module_json.get('name', 'Unknown')}")
        print(f"- GMF Version: {module_json.get('gmf_version', 'Unknown')}")
        print(f"- States: {len(module_json.get('states', {}))}")
        print(f"- Remarks: {len(module_json.get('remarks', []))}")
    except Exception as e:
        # Last-resort boundary of the script; sys.exit() above is not caught
        # here because SystemExit does not derive from Exception.
        print(red(f"Error: {e}"))
        sys.exit(1)


if __name__ == "__main__":
    main()