Trying to fix basic functionality again.
This commit is contained in:
14
scripts/.nextflow.log
Normal file
14
scripts/.nextflow.log
Normal file
@@ -0,0 +1,14 @@
|
||||
Mar-20 17:36:01.178 [main] DEBUG nextflow.cli.Launcher - $> nextflow run main.nf --disease_name Migraine --generate_patients true --population 100 --gender 0.6 --modules_dir modules
|
||||
Mar-20 17:36:01.428 [main] DEBUG nextflow.cli.CmdRun - N E X T F L O W ~ version 24.10.5
|
||||
Mar-20 17:36:01.492 [main] DEBUG nextflow.plugin.PluginsFacade - Setting up plugin manager > mode=prod; embedded=false; plugins-dir=/Users/richman/.nextflow/plugins; core-plugins: nf-amazon@2.9.2,nf-azure@1.10.2,nf-cloudcache@0.4.2,nf-codecommit@0.2.2,nf-console@1.1.4,nf-google@1.15.4,nf-tower@1.9.3,nf-wave@1.7.4
|
||||
Mar-20 17:36:01.528 [main] INFO o.pf4j.DefaultPluginStatusProvider - Enabled plugins: []
|
||||
Mar-20 17:36:01.529 [main] INFO o.pf4j.DefaultPluginStatusProvider - Disabled plugins: []
|
||||
Mar-20 17:36:01.538 [main] INFO org.pf4j.DefaultPluginManager - PF4J version 3.12.0 in 'deployment' mode
|
||||
Mar-20 17:36:01.559 [main] INFO org.pf4j.AbstractPluginManager - No plugins
|
||||
Mar-20 17:36:01.590 [main] DEBUG nextflow.scm.ProviderConfig - Using SCM config path: /Users/richman/.nextflow/scm
|
||||
Mar-20 17:36:01.624 [main] DEBUG nextflow.cli.Launcher - Operation aborted
|
||||
nextflow.exception.AbortOperationException: Cannot find script file: main.nf
|
||||
at nextflow.cli.CmdRun.getScriptFile(CmdRun.groovy:536)
|
||||
at nextflow.cli.CmdRun.run(CmdRun.groovy:325)
|
||||
at nextflow.cli.Launcher.run(Launcher.groovy:503)
|
||||
at nextflow.cli.Launcher.main(Launcher.groovy:658)
|
||||
12
scripts/README.md
Normal file
12
scripts/README.md
Normal file
@@ -0,0 +1,12 @@
|
||||
# Utility Scripts
|
||||
|
||||
This directory contains utility scripts for working with Synthea modules:
|
||||
|
||||
- `analyze_patient_data.py` - Analyzes patient data generated by Synthea
|
||||
- `check_json.py` - Validates JSON structure of module files
|
||||
- `validate_module.py` - Performs comprehensive validation of modules
|
||||
- `test_module_exists.py` - Checks if a module exists for a given disease
|
||||
- `run_module.py` - Script to run a specific module with Synthea
|
||||
- Other utility scripts for module generation and testing
|
||||
|
||||
These scripts complement the core module generator scripts found in the `module_generator/` directory.
|
||||
304
scripts/analyze_patient_data.py
Normal file
304
scripts/analyze_patient_data.py
Normal file
@@ -0,0 +1,304 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import glob
|
||||
import pandas as pd
|
||||
from collections import Counter
|
||||
from datetime import datetime
|
||||
|
||||
def _record_demographics(resource, demographics):
    """Fold one patient record's demographic fields into *demographics*.

    Args:
        resource: Parsed JSON object that may carry ``gender``, ``birthDate``
            and race/ethnicity ``extension`` entries (URLs ending in
            ``us-core-race`` / ``us-core-ethnicity``).
        demographics: Accumulator holding ``gender``/``race``/``ethnicity``
            Counters and an ``age`` list; it is updated in place.
    """
    if 'gender' in resource:
        demographics['gender'][resource['gender']] += 1

    if 'birthDate' in resource:
        # Age is approximated from the birth year alone; month and day are ignored.
        birth_year = int(resource['birthDate'][:4])
        demographics['age'].append(datetime.now().year - birth_year)

    if 'extension' not in resource:
        return

    for ext in resource['extension']:
        if 'url' not in ext or 'extension' not in ext:
            continue
        if ext['url'].endswith('us-core-race'):
            counter = demographics['race']
        elif ext['url'].endswith('us-core-ethnicity'):
            counter = demographics['ethnicity']
        else:
            continue
        for sub_ext in ext['extension']:
            if 'valueCoding' in sub_ext:
                counter[sub_ext['valueCoding'].get('display', 'Unknown')] += 1


def analyze_patient_data(input_dir, output_dir, report_format='html', disease_name=None):
    """
    Analyze Synthea-generated patient data and create reports.

    Always writes ``patient_stats.json`` into ``output_dir``; for the ``html``
    and ``csv`` formats the matching report generator is called as well.

    Args:
        input_dir: Directory containing patient JSON files (searched recursively)
        output_dir: Directory to save analysis outputs (created if missing)
        report_format: Format for the report (html or csv)
        disease_name: Optional name of the disease being simulated
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    stats_path = os.path.join(output_dir, 'patient_stats.json')

    # Find all patient JSON files
    patients_files = glob.glob(f"{input_dir}/**/*.json", recursive=True)
    print(f"Found {len(patients_files)} patient records for analysis")

    if not patients_files:
        print("No patient files found to analyze.")
        with open(stats_path, 'w', encoding='utf-8') as f:
            json.dump({"error": "No patient files found to analyze"}, f)
        return

    # Initialize data collectors
    condition_counts = Counter()
    medication_counts = Counter()
    demographics = {'gender': Counter(), 'age': [], 'race': Counter(), 'ethnicity': Counter()}

    # Process each patient file
    for patient_file in patients_files:
        try:
            with open(patient_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # The file itself may be a bare patient record ...
            _record_demographics(data, demographics)

            if 'entry' in data:
                for entry in data['entry']:
                    if 'resource' not in entry:
                        continue
                    resource = entry['resource']
                    resource_type = resource.get('resourceType')

                    if resource_type == 'Patient':
                        # ... or a bundle whose demographics live on the
                        # Patient resource next to the conditions/medications.
                        _record_demographics(resource, demographics)
                    elif resource_type == 'Condition':
                        if 'code' in resource and 'coding' in resource['code']:
                            for code in resource['code']['coding']:
                                if 'display' in code:
                                    condition_counts[code['display']] += 1
                    elif resource_type == 'MedicationRequest':
                        if ('medicationCodeableConcept' in resource
                                and 'coding' in resource['medicationCodeableConcept']):
                            for code in resource['medicationCodeableConcept']['coding']:
                                if 'display' in code:
                                    medication_counts[code['display']] += 1

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole analysis.
            print(f"Error processing {patient_file}: {e}")

    # Bucket the collected ages
    ages = demographics['age']
    age_buckets = {'0-18': 0, '19-44': 0, '45-64': 0, '65+': 0}
    for age in ages:
        if age <= 18:
            age_buckets['0-18'] += 1
        elif age <= 44:
            age_buckets['19-44'] += 1
        elif age <= 64:
            age_buckets['45-64'] += 1
        else:
            age_buckets['65+'] += 1

    # Prepare statistics
    stats = {
        'total_patients': len(patients_files),
        'disease_name': disease_name,
        'demographics': {
            'gender_distribution': dict(demographics['gender']),
            'age_distribution': {
                'min': min(ages) if ages else None,
                'max': max(ages) if ages else None,
                'average': sum(ages) / len(ages) if ages else None,
                'distribution': age_buckets
            },
            'race_distribution': dict(demographics['race']),
            'ethnicity_distribution': dict(demographics['ethnicity'])
        },
        'disease_stats': {
            'top_conditions': dict(condition_counts.most_common(15)),
            'top_medications': dict(medication_counts.most_common(15))
        }
    }

    # Save statistics to file
    with open(stats_path, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)

    # Generate report in requested format
    if report_format == 'html':
        generate_html_report(stats, output_dir)
    elif report_format == 'csv':
        generate_csv_reports(stats, output_dir)
    else:
        print(f"Unsupported report format: {report_format}. Only stats JSON file created.")

    print(f"Analysis complete. Reports generated in {os.path.abspath(output_dir)}")
|
||||
|
||||
def generate_html_report(stats, output_dir):
    """Generate an HTML report from the patient statistics.

    Writes ``patient_analysis.html`` into ``output_dir``.

    Args:
        stats: Statistics dictionary with ``total_patients``, ``disease_name``,
            ``demographics`` and ``disease_stats`` entries.
        output_dir: Directory that receives the report file.
    """
    from html import escape

    total = stats['total_patients']
    demographics = stats['demographics']
    age_stats = demographics['age_distribution']
    disease_stats = stats['disease_stats']

    def safe(value):
        """Return *value* as text that is safe to place inside an HTML element."""
        return escape(str(value), quote=False)

    def optional(value, fmt="{}"):
        """Format *value*, or show ``N/A`` when no ages were collected (``None``)."""
        return "N/A" if value is None else fmt.format(value)

    def table(heading, first_column, distribution, last_column="Percentage"):
        """Render one titled table with a row per (label, count) pair."""
        rows = []
        for label, count in distribution.items():
            percentage = (count / total) * 100 if total > 0 else 0
            rows.append(
                f"<tr><td>{safe(label)}</td><td>{count}</td>"
                f"<td>{percentage:.1f}%</td></tr>\n"
            )
        return (
            f"<h3>{heading}</h3>\n"
            "<table>\n"
            f"<tr><th>{first_column}</th><th>Count</th><th>{last_column}</th></tr>\n"
            + ''.join(rows)
            + "</table>\n"
        )

    # The disease name comes from the command line, so escape it like any label.
    disease_title = f" - {safe(stats['disease_name'])}" if stats['disease_name'] else ""

    # Kept as a plain string so the CSS braces need no f-string escaping.
    style = (
        "<style>\n"
        "body { font-family: Arial, sans-serif; margin: 20px; }\n"
        "h1, h2, h3 { color: #333; }\n"
        ".container { max-width: 1000px; margin: 0 auto; }\n"
        "table { border-collapse: collapse; width: 100%; margin-bottom: 20px; }\n"
        "th, td { text-align: left; padding: 8px; border-bottom: 1px solid #ddd; }\n"
        "th { background-color: #f2f2f2; }\n"
        "tr:hover {background-color: #f5f5f5;}\n"
        ".chart { margin: 20px 0; height: 300px; }\n"
        "</style>\n"
    )

    page = ''.join([
        "<!DOCTYPE html>\n",
        "<html>\n",
        "<head>\n",
        # The file is written as UTF-8 below; declare it so browsers agree.
        '<meta charset="utf-8">\n',
        f"<title>Synthea Patient Analysis{disease_title}</title>\n",
        style,
        "</head>\n",
        "<body>\n",
        '<div class="container">\n',
        f"<h1>Synthea Patient Analysis{disease_title}</h1>\n",
        f"<p>Total patients: {total}</p>\n",
        "\n",
        "<h2>Demographics</h2>\n",
        table("Gender Distribution", "Gender", demographics['gender_distribution']),
        "\n",
        table("Age Distribution", "Age Group", age_stats['distribution']),
        f"<p>Min Age: {optional(age_stats['min'])}</p>\n",
        f"<p>Max Age: {optional(age_stats['max'])}</p>\n",
        f"<p>Average Age: {optional(age_stats['average'], '{:.1f}')}</p>\n",
        "\n",
        table("Race Distribution", "Race", demographics['race_distribution']),
        "\n",
        table("Ethnicity Distribution", "Ethnicity", demographics['ethnicity_distribution']),
        "\n",
        "<h2>Disease Statistics</h2>\n",
        table("Top Conditions", "Condition", disease_stats['top_conditions']),
        "\n",
        table("Top Medications", "Medication", disease_stats['top_medications'],
              last_column="Percentage of Patients"),
        "</div>\n",
        "</body>\n",
        "</html>",
    ])

    with open(os.path.join(output_dir, 'patient_analysis.html'), 'w', encoding='utf-8') as f:
        f.write(page)
|
||||
|
||||
def generate_csv_reports(stats, output_dir):
    """Write the statistics as CSV files inside ``output_dir``.

    Produces ``demographics.csv`` (gender, age, race and ethnicity rows),
    ``conditions.csv`` and ``medications.csv``. Every row carries the count
    and that count's share of ``stats['total_patients']``.
    """
    import csv

    def share(count):
        """Format *count* as a percentage of all patients (0 when there are none)."""
        patient_total = stats['total_patients']
        value = (count / patient_total) * 100 if patient_total > 0 else 0
        return f"{value:.1f}%"

    # Demographics: one file, one section per category, in a fixed order
    with open(os.path.join(output_dir, 'demographics.csv'), 'w', newline='') as out:
        sheet = csv.writer(out)
        sheet.writerow(['Category', 'Type', 'Count', 'Percentage'])

        demo = stats['demographics']
        for category, pick in (
            ('Gender', lambda d: d['gender_distribution']),
            ('Age', lambda d: d['age_distribution']['distribution']),
            ('Race', lambda d: d['race_distribution']),
            ('Ethnicity', lambda d: d['ethnicity_distribution']),
        ):
            for kind, n in pick(demo).items():
                sheet.writerow([category, kind, n, share(n)])

    # Conditions and medications share the same three-column layout
    for filename, label, key in (
        ('conditions.csv', 'Condition', 'top_conditions'),
        ('medications.csv', 'Medication', 'top_medications'),
    ):
        with open(os.path.join(output_dir, filename), 'w', newline='') as out:
            sheet = csv.writer(out)
            sheet.writerow([label, 'Count', 'Percentage'])
            for item, n in stats['disease_stats'][key].items():
                sheet.writerow([item, n, share(n)])
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Command-line entry point: collect the four options and run the analysis.
    cli = argparse.ArgumentParser(description="Analyze Synthea patient data and generate reports")
    cli.add_argument("--input_dir", required=True, help="Directory containing patient JSON files")
    cli.add_argument("--output_dir", default="analysis_output", help="Directory to save analysis outputs")
    cli.add_argument("--report_format", default="html", choices=["html", "csv"], help="Format for the reports")
    cli.add_argument("--disease_name", help="Name of the disease being simulated")
    options = cli.parse_args()

    analyze_patient_data(
        options.input_dir,
        options.output_dir,
        options.report_format,
        options.disease_name,
    )
|
||||
223
scripts/check_condition_structure.py
Executable file
223
scripts/check_condition_structure.py
Executable file
@@ -0,0 +1,223 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Synthea Module Condition Structure Validator
|
||||
|
||||
This script scans all existing Synthea disease modules to check if any violate
|
||||
the expected condition structure format, particularly looking for nested condition_type
|
||||
objects that can cause errors during simulation.
|
||||
|
||||
Usage:
|
||||
python check_condition_structure.py [--modules_dir DIRECTORY] [--verbose]
|
||||
|
||||
Arguments:
|
||||
--modules_dir DIRECTORY Path to the modules directory (default: src/main/resources/modules)
|
||||
--verbose Enable verbose output with detailed issue descriptions
|
||||
--fix Attempt to automatically fix simple structure issues (experimental)
|
||||
|
||||
Example:
|
||||
python check_condition_structure.py --modules_dir ../modules --verbose
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import glob
|
||||
import argparse
|
||||
import logging
|
||||
from typing import Dict, List, Any, Tuple
|
||||
|
||||
# Configure logging: INFO and above, timestamped, written to stdout.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)
|
||||
|
||||
def validate_condition_format(module_json):
    """Check a module for ``condition`` objects whose ``condition_type`` is a dict.

    Args:
        module_json: Module as a JSON string or as an already parsed object.

    Returns:
        ``(is_valid, issues)``. ``issues`` holds one message per offending
        ``condition`` object, each naming its dotted path. Any exception
        (for example unparsable JSON) yields ``(False, ["Validation error: ..."])``.
    """
    def iter_issues(node, trail=()):
        # Depth-first walk: report this node first, then descend in order.
        if isinstance(node, dict):
            nested = node.get("condition")
            if isinstance(nested, dict) and isinstance(nested.get("condition_type"), dict):
                where = '.'.join((*trail, "condition", "condition_type"))
                yield f"Found nested condition_type in a condition object at path: {where}"
            for name, child in node.items():
                yield from iter_issues(child, (*trail, name))
        elif isinstance(node, list):
            for position, child in enumerate(node):
                yield from iter_issues(child, (*trail, f"[{position}]"))

    try:
        parsed = json.loads(module_json) if isinstance(module_json, str) else module_json
        found = list(iter_issues(parsed))
    except Exception as e:
        return False, [f"Validation error: {str(e)}"]

    return len(found) == 0, found
|
||||
|
||||
def fix_condition_structure(module_json):
    """Attempt to fix common condition structure issues.

    Every ``condition`` object whose ``condition_type`` is a non-empty dict
    has that dict replaced by its first key. An empty dict offers no key to
    promote and is left untouched, so validation can still report it.

    Args:
        module_json: Module as a JSON string or as an already parsed object.
            A parsed object is modified in place.

    Returns:
        ``(module, fixed)`` where ``fixed`` is True when at least one
        replacement was made. On an unexpected error the input is returned
        unchanged in type together with False.
    """
    try:
        # Parse the module if it's a string
        module_dict = json.loads(module_json) if isinstance(module_json, str) else module_json
        fixed = False

        # Explicit stack instead of recursion; the visiting order does not
        # affect the result because each node is fixed independently.
        pending = [module_dict]
        while pending:
            node = pending.pop()

            if isinstance(node, dict):
                condition = node.get("condition")
                if isinstance(condition, dict):
                    nested = condition.get("condition_type")
                    if isinstance(nested, dict) and nested:
                        # Replace nested dict with its first key as a string
                        condition["condition_type"] = next(iter(nested))
                        fixed = True
                children = node.values()
            elif isinstance(node, list):
                children = node
            else:
                continue

            # Queue children after the fix so a replaced dict is not descended into.
            pending.extend(child for child in children if isinstance(child, (dict, list)))

        # Return the fixed module and whether changes were made
        return module_dict, fixed

    except Exception as e:
        logger.error(f"Error fixing module structure: {e}")
        return module_json, False
|
||||
|
||||
def _log_issues(issues):
    """Log each issue description as its own indented warning line."""
    for issue in issues:
        logger.warning(f" - {issue}")


def main():
    """Check every ``*.json`` module in ``--modules_dir`` and log a summary.

    With ``--fix``, modules that fail validation are passed to
    ``fix_condition_structure`` and written back when something changed.
    Exits with status 1 when the modules directory does not exist.
    """
    parser = argparse.ArgumentParser(description='Validate condition structure in Synthea modules')
    parser.add_argument('--modules_dir', type=str, default='src/main/resources/modules',
                        help='Path to the modules directory')
    parser.add_argument('--verbose', action='store_true',
                        help='Enable verbose output with detailed issue descriptions')
    parser.add_argument('--fix', action='store_true',
                        help='Attempt to automatically fix simple structure issues (experimental)')
    args = parser.parse_args()

    # Check if modules directory exists
    if not os.path.exists(args.modules_dir):
        logger.error(f"Modules directory not found: {args.modules_dir}")
        sys.exit(1)

    # Get all module files; sorted so the log order is reproducible
    module_files = sorted(glob.glob(os.path.join(args.modules_dir, "*.json")))
    logger.info(f"Found {len(module_files)} module files to check")

    # Track statistics
    valid_modules = 0
    invalid_modules = 0
    fixed_modules = 0

    # Check each module
    for module_path in module_files:
        module_name = os.path.basename(module_path)

        try:
            # Load module content
            with open(module_path, 'r', encoding='utf-8') as f:
                module_json = json.loads(f.read())

            # Validate condition structure
            valid, issues = validate_condition_format(module_json)

            if valid:
                valid_modules += 1
                if args.verbose:
                    logger.info(f"✅ {module_name}: Valid condition structure")
                continue

            invalid_modules += 1

            if not args.fix:
                logger.warning(f"⚠️ {module_name}: Invalid condition structure")
                if args.verbose:
                    _log_issues(issues)
                continue

            # Attempt to fix the module
            fixed_module, was_fixed = fix_condition_structure(module_json)

            if not was_fixed:
                logger.warning(f"⚠️ {module_name}: Could not fix condition structure issues")
                if args.verbose:
                    _log_issues(issues)
                continue

            # Write the fixed module back
            with open(module_path, 'w', encoding='utf-8') as f:
                json.dump(fixed_module, f, indent=2)

            # Validate again to confirm
            valid_after_fix, remaining_issues = validate_condition_format(fixed_module)

            if valid_after_fix:
                fixed_modules += 1
                logger.info(f"🔧 {module_name}: Fixed condition structure issues")
            else:
                logger.warning(f"⚠️ {module_name}: Could not fix all condition structure issues")
                if args.verbose:
                    _log_issues(remaining_issues)

        except json.JSONDecodeError as e:
            invalid_modules += 1
            logger.error(f"❌ {module_name}: Invalid JSON format - {str(e)}")

        except Exception as e:
            invalid_modules += 1
            logger.error(f"❌ {module_name}: Error processing file - {str(e)}")

    # Print summary
    logger.info("\nSummary:")
    logger.info(f"Total modules checked: {len(module_files)}")
    logger.info(f"Valid modules: {valid_modules}")
    logger.info(f"Invalid modules: {invalid_modules}")

    if args.fix:
        logger.info(f"Modules fixed: {fixed_modules}")

    # Modules repaired in this run no longer have issues, so only the
    # remainder should trigger the closing warning.
    unresolved_modules = invalid_modules - fixed_modules
    if unresolved_modules > 0:
        logger.warning("Some modules have condition structure issues that may cause problems in Synthea")
        if not args.fix:
            logger.info("Run with --fix to attempt automatic fixes for the issues")
    else:
        logger.info("All modules have valid condition structure")


if __name__ == "__main__":
    main()
|
||||
14
scripts/check_json.py
Normal file
14
scripts/check_json.py
Normal file
@@ -0,0 +1,14 @@
|
||||
import json
|
||||
import sys
|
||||
|
||||
# Usage: python check_json.py <path-to-json-file>
# Prints the brace counts of the file and whether it parses as JSON.
if len(sys.argv) < 2:
    # Exit with a usage message instead of an IndexError traceback.
    sys.exit(f"Usage: {sys.argv[0]} <path-to-json-file>")

path = sys.argv[1]
with open(path, 'r', encoding='utf-8') as f:
    content = f.read()

# Raw character counts (braces inside strings are counted too); a mismatch is
# only a hint about where a truncated or unbalanced file went wrong.
opening_braces = content.count('{')
closing_braces = content.count('}')
print(f"File has {opening_braces} opening braces and {closing_braces} closing braces")

try:
    json.loads(content)
except Exception as e:
    print(f'Invalid JSON: {e}')
else:
    print('Valid JSON')
|
||||
262
scripts/cleanup.sh
Executable file
262
scripts/cleanup.sh
Executable file
@@ -0,0 +1,262 @@
|
||||
#!/bin/bash

# Synthea-All-Diseases Repository Cleanup Script
# This script removes unnecessary files and keeps only the essential ones for the Synthea module generator
#
# Run it from the repository root: every path below is relative to the current
# directory, and several steps delete files by glob.

# Print an error to stderr and stop before anything (else) is deleted.
abort() {
    echo "Error: $1" >&2
    exit 1
}

echo "Starting repository cleanup..."

# main.nf marks the repository root; refuse to delete files anywhere else.
if [ ! -f "main.nf" ]; then
    abort "main.nf not found in $(pwd); run this script from the repository root."
fi

# Create backup directory
mkdir -p backup
mkdir -p backup/scripts # Create a directory for Python scripts
mkdir -p backup/modules # Create a directory for modules
mkdir -p backup/module_generator # Create a directory for module generator scripts

# 1. Keep essential Nextflow files, remove others
echo "Handling Nextflow files..."
cp main.nf backup/
rm -f build_synthea.nf synthea_module_generator_old.nf minimal_test.nf fresh_synthea.nf simple_synthea.nf test_workflow.nf new_synthea.nf minimal_working.nf
# Keep synthea_module_generator.nf around for reference but not used
if [ -f "synthea_module_generator.nf" ]; then
    mv synthea_module_generator.nf backup/
fi

# 2. Keep essential Python scripts, Docker files, and configuration files
echo "Keeping essential files..."
# src/ is deleted in step 4, so a failed copy here must stop the script.
# "dir/." copies the directory's contents (not the directory itself), so the
# scripts land directly in module_generator/ where the sed rewrites expect them.
if [ -d "src/main/python" ]; then
    cp -r src/main/python/. backup/module_generator/ || abort "could not back up src/main/python"
fi
if [ -d "src/main/resources/modules" ]; then
    cp -r src/main/resources/modules/. backup/modules/ || abort "could not back up src/main/resources/modules"
fi
cp Dockerfile docker-compose.yml CLAUDE.md .env.example nextflow.config README.md backup/
cp DOCKER_README.md SYNTHEA_GUIDE.md backup/ 2>/dev/null || : # Keep additional README files
# Backup .env file if it exists
if [ -f ".env" ]; then
    cp .env backup/
fi

# 3. Back up all Python scripts in the root directory
echo "Backing up Python scripts..."
# Key scripts - these stay in the root directory after cleanup
# The only key script we'll keep in the root is check_condition_structure.py for convenience
key_scripts=("check_condition_structure.py")
for script in "${key_scripts[@]}"; do
    if [ -f "$script" ]; then
        cp "$script" backup/
        echo " Backed up key script for root: $script"
    fi
done

# All other Python scripts - these are copied to the scripts/ directory
for script in *.py; do
    # [*] joins the array into one string for the substring match below.
    if [ -f "$script" ] && [[ ! " ${key_scripts[*]} " =~ " ${script} " ]]; then
        cp "$script" backup/scripts/
        echo " Backed up utility script: $script"
    fi
done

# 4. Remove ALL Synthea code - we'll clone it in Docker
echo "Removing ALL Synthea source code..."
echo "This will be cloned during Docker build based on Dockerfile..."
# Remove the entire src directory - we'll recreate what we need
rm -rf src/
# Remove any Synthea build files/directories
rm -rf build/ output/ simulator/ lib/ logs/
# Remove any Synthea run files
rm -f run_synthea run_synthea.bat synthea *.jar

# 5. Remove other experimental or temporary files
echo "Removing experimental and temporary files..."
rm -f *.log *.txt trace.txt
rm -f error_output.txt generate_module.sh run_synthetic_data_generation.sh
rm -f build_docker.sh entrypoint.sh run_module_generator.sh generate_samples.sh test_run.sh
rm -f run_flexporter .DS_Store

# 6. Remove gradle files and other unnecessary files
echo "Removing additional unnecessary files..."
rm -f build.gradle gradlew gradlew.bat settings.gradle
rm -rf gradle/ .gradle/
# Note: Keeping .git, .gitignore, .github for version control
rm -rf .nextflow/ .nextflow.log* nextflow-*.zip
# Keep LICENSE files but remove other documentation that will be included from Synthea
rm -f CODE_OF_CONDUCT.md NOTICE
rm -f *.bak *~

# 7. Create clean directory structure for the repository
echo "Creating clean directory structure..."
mkdir -p modules # Top-level modules directory
mkdir -p module_generator # Module generator directory (renamed from python)
mkdir -p scripts # Utility scripts directory

# 8. Restore files to the cleaned structure
echo "Restoring files to cleaned structure..."

# Restore modules
if [ -d "backup/modules" ]; then
    cp -r backup/modules/* modules/ 2>/dev/null || :
    echo " Restored modules to modules/ directory"
fi

# Restore Module generator core scripts
if [ -d "backup/module_generator" ]; then
    cp -r backup/module_generator/* module_generator/ 2>/dev/null || :
    echo " Restored core generator scripts to module_generator/ directory"
fi

# Restore key Python scripts to root directory
for script in "${key_scripts[@]}"; do
    if [ -f "backup/$script" ]; then
        cp "backup/$script" ./
        echo " Restored key script to root: $script"
    fi
done

# Restore utility Python scripts to scripts/ directory
if [ -d "backup/scripts" ]; then
    cp backup/scripts/* scripts/ 2>/dev/null || :
    echo " Restored utility scripts to scripts/ directory"
fi

# Restore root files
cp backup/main.nf backup/Dockerfile backup/docker-compose.yml backup/CLAUDE.md backup/nextflow.config backup/.env.example backup/README.md ./ 2>/dev/null || :
if [ -f "backup/DOCKER_README.md" ]; then
    cp backup/DOCKER_README.md ./
fi
if [ -f "backup/SYNTHEA_GUIDE.md" ]; then
    cp backup/SYNTHEA_GUIDE.md ./
fi
# Restore .env if it existed
if [ -f "backup/.env" ]; then
    cp backup/.env ./
    echo " Restored .env file"
fi

# 9. Update the Dockerfile to reference the new directory structure
echo "Updating Dockerfile references if needed..."
if [ -f "Dockerfile" ]; then
    # Update the Dockerfile to use the new directory structure
    # (-i.bak works with both GNU and BSD sed; the backup is removed afterwards)
    sed -i.bak 's|COPY src/main/python/|COPY module_generator/|g' Dockerfile
    sed -i.bak 's|COPY python/|COPY module_generator/|g' Dockerfile
    sed -i.bak 's|COPY src/main/resources/modules/|COPY modules/|g' Dockerfile
    sed -i.bak 's|src/main/python/|module_generator/|g' Dockerfile
    sed -i.bak 's|src/main/resources/modules|modules|g' Dockerfile
    # Update script paths if needed
    sed -i.bak 's|/app/src/main/python/|/app/module_generator/|g' Dockerfile
    rm -f Dockerfile.bak
fi

# 10. Update the main.nf file to reference the new directory structure
echo "Updating main.nf references if needed..."
if [ -f "main.nf" ]; then
    # Update the main.nf file to use the new directory structure
    sed -i.bak 's|src/main/resources/modules|modules|g' main.nf
    sed -i.bak 's|src/main/python|module_generator|g' main.nf
    sed -i.bak 's|python/|module_generator/|g' main.nf
    rm -f main.nf.bak
fi

# 11. Create a simple README for the scripts directory
echo "Creating README for scripts directory..."
cat > scripts/README.md << 'EOF'
# Utility Scripts

This directory contains utility scripts for working with Synthea modules:

- `analyze_patient_data.py` - Analyzes patient data generated by Synthea
- `check_json.py` - Validates JSON structure of module files
- `validate_module.py` - Performs comprehensive validation of modules
- `test_module_exists.py` - Checks if a module exists for a given disease
- `run_module.py` - Script to run a specific module with Synthea
- Other utility scripts for module generation and testing

These scripts complement the core module generator scripts found in the `module_generator/` directory.
EOF

# Update README.md with instructions on using the cleaned repo
echo "Updating README..."
cat > README.md << 'EOF'
# Synthea All Diseases

This repository contains a workflow for generating Synthea disease modules using Claude AI and generating synthetic patient data.

## Repository Structure

- `module_generator/` - Core module generation scripts
- `modules/` - Generated disease modules
- `scripts/` - Utility Python scripts and tools
- `main.nf` - Nextflow workflow for module generation and patient data generation
- `Dockerfile` and `docker-compose.yml` - Docker configuration

## Key Files

- `check_condition_structure.py` - Validates condition structure in modules
- `main.nf` - Main workflow file for generating modules and patient data

## Running the Pipeline

### Prerequisites

- Docker and Docker Compose
- Nextflow
- Anthropic API key (for Claude AI)

### Setup

1. Clone this repository
2. Copy `.env.example` to `.env` and add your Anthropic API key
3. Run with Docker Compose:

```
docker-compose up
```

### Running the Workflow

```
nextflow run main.nf --disease_name "Disease Name" [OPTIONS]
```

For a full list of options, run:

```
nextflow run main.nf --help
```

## Key Features

- Generate modules for diseases that don't exist in Synthea
- Validate condition structure in generated modules
- Generate synthetic patient data using the modules
- Analyze generated patient data

## Documentation

- See `CLAUDE.md` for additional development guidelines
- See `DOCKER_README.md` for Docker setup and usage
- See `SYNTHEA_GUIDE.md` for detailed usage examples and scenarios
- See `scripts/README.md` for information about utility scripts

## License

This project uses the same license as Synthea.
EOF

# 12. Update docker-compose.yml to reference the new directory structure
echo "Updating docker-compose.yml references if needed..."
if [ -f "docker-compose.yml" ]; then
    # No changes needed as it uses relative paths and volume mounts
    echo " No changes needed for docker-compose.yml"
fi

echo "Cleanup complete! All essential files have been kept, and unnecessary files have been removed."
echo "Repository structure has been simplified to:"
echo " - module_generator/ (core module generation scripts)"
echo " - modules/ (disease modules)"
echo " - scripts/ (utility scripts)"
echo " - check_condition_structure.py (in root for easy access)"
echo ""
echo "The following important files were preserved:"
echo " - .git/ (version control repository)"
echo " - .gitignore (version control configuration)"
echo " - .env (environment configuration if it existed)"
echo " - .github/ (GitHub configuration if it existed)"
echo ""
echo "Optional: Remove the backup directory after verifying everything works: rm -rf backup"
|
||||
175
scripts/generate_batch.py
Executable file
175
scripts/generate_batch.py
Executable file
@@ -0,0 +1,175 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import sys
|
||||
import csv
|
||||
import time
|
||||
import argparse
|
||||
import subprocess
|
||||
import concurrent.futures
|
||||
import re
|
||||
|
||||
# Global variables
|
||||
ARGS = None
|
||||
|
||||
def normalize_disease_name(name):
    """Return a lowercase, underscore-separated slug for a disease name.

    Every run of characters other than ASCII letters and digits collapses
    into a single underscore, and underscores at either end are dropped.
    A name that reduces to nothing yields "unknown_disease".
    """
    slug = re.sub(r'[^a-z0-9]+', '_', name.lower()).strip('_')
    return slug or "unknown_disease"
|
||||
|
||||
def process_disease(disease_entry):
    """Generate the Synthea module for one row of the disease list CSV.

    Args:
        disease_entry: Mapping for a single CSV row. The keys read are
            "disease_name", "id" (forwarded as the ICD-10 code) and
            "disease_category"; missing keys default to "".

    Returns:
        A dict with "name", "status" ("skipped", "success" or "error") and
        "path". Error results also carry an "error" message and have
        "path" set to None.
    """
    disease_name = disease_entry.get("disease_name", "")
    normalized_name = normalize_disease_name(disease_name)
    icd10 = disease_entry.get("id", "")
    category = disease_entry.get("disease_category", "")

    print(f"\n{'='*80}")
    print(f"Processing disease: {disease_name}")
    print(f"ICD-10 code: {icd10}")
    print(f"Category: {category}")

    # Skip if module already exists (unless --force flag is used)
    module_path = f"src/main/resources/modules/{normalized_name}.json"
    if os.path.exists(module_path) and not ARGS.force:
        print(f"✅ Module already exists at {module_path}, skipping")
        return {"name": disease_name, "status": "skipped", "path": module_path}

    # Prefer a generator in the working directory (the historical lookup);
    # otherwise fall back to the copy that sits next to this script so the
    # batch also works when launched from another directory.
    generator_script = "generate_module.py"
    if not os.path.exists(generator_script):
        generator_script = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "generate_module.py"
        )

    # Run the generator with the interpreter running this script so that it
    # sees the same installed packages.
    cmd = [
        sys.executable or "python3",
        generator_script,
        "--disease", disease_name,
        "--no-interactive",
    ]

    if icd10:
        cmd.extend(["--icd10", icd10])

    if category:
        cmd.extend(["--category", category])

    # The generator returns early on an existing module unless it is told to
    # force, so the batch-level flag has to be forwarded.
    if ARGS.force:
        cmd.append("--force")

    # Add auto-fallback option if requested
    if ARGS.auto_fallback:
        cmd.append("--auto-fallback")

    # Run the generator
    try:
        print(f"Executing: {' '.join(cmd)}")
        process = subprocess.run(cmd, check=True, text=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to generate module for {disease_name}")
        print(f"Error: {str(e)}")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        return {"name": disease_name, "status": "error", "error": str(e), "path": None}
    except OSError as e:
        # The child could not be started at all (e.g. interpreter missing).
        print(f"❌ Failed to generate module for {disease_name}")
        print(f"Error: {str(e)}")
        return {"name": disease_name, "status": "error", "error": str(e), "path": None}

    print(f"✅ Successfully generated module for {disease_name}")
    print(process.stdout.strip())
    return {"name": disease_name, "status": "success", "path": module_path}
|
||||
|
||||
def main():
    """Read the disease list CSV, filter it, and generate a module per row.

    Command-line options select a category, a single disease (exact match
    first, then substring match), a limit on the number of rows and the
    number of parallel workers. The parsed arguments are stored in the
    module-level ARGS so that process_disease can read them.

    Exits with status 1 when the CSV is missing or unreadable, when
    --disease matches nothing, or when at least one disease failed.
    """
    global ARGS
    parser = argparse.ArgumentParser(description='Generate Synthea modules for diseases')
    parser.add_argument('--category', help='Only process diseases in this category')
    parser.add_argument('--disease', help='Only process a specific disease (by name)')
    parser.add_argument('--limit', type=int, help='Limit number of diseases to process')
    parser.add_argument('--parallel', type=int, default=1, help='Number of parallel processes')
    # NOTE: existing modules are skipped by default (see --force); this flag is
    # accepted for compatibility but is not read anywhere in this function.
    parser.add_argument('--skip-existing', action='store_true', help='Skip diseases that already have modules')
    parser.add_argument('--csv-path', default='src/main/resources/disease_list.csv', help='Path to disease list CSV')
    parser.add_argument('--force', action='store_true', help='Force generation even if module already exists')
    parser.add_argument('--auto-fallback', action='store_true', help='Enable auto-fallback option')
    args = parser.parse_args()

    ARGS = args

    # Read the disease list CSV
    if not os.path.exists(args.csv_path):
        print(f"Error: Disease list CSV not found at {args.csv_path}")
        sys.exit(1)

    print(f"Reading disease list from {args.csv_path}")
    try:
        # newline='' lets the csv module handle embedded newlines itself.
        with open(args.csv_path, 'r', newline='') as f:
            diseases = list(csv.DictReader(f))
    except (OSError, csv.Error, UnicodeError) as e:
        print(f"Error reading CSV: {str(e)}")
        sys.exit(1)

    print(f"Found {len(diseases)} diseases in the CSV")

    # Filter diseases
    if args.category:
        wanted_category = args.category.lower()
        diseases = [d for d in diseases if (d.get('disease_category', '').lower() == wanted_category)]
        print(f"Filtered to {len(diseases)} diseases in category '{args.category}'")

    if args.disease:
        # Try to find an exact match first
        disease_name_lower = args.disease.lower()
        exact_match = [d for d in diseases if d.get("disease_name", "").lower() == disease_name_lower]

        if exact_match:
            diseases = exact_match
        else:
            # Try to find a disease that contains the specified name
            partial_matches = [d for d in diseases if disease_name_lower in d.get("disease_name", "").lower()]

            if partial_matches:
                diseases = partial_matches
                print(f"Found {len(diseases)} partial matches for '{args.disease}'")
            else:
                print(f"No matches found for disease '{args.disease}'")
                sys.exit(1)

    if args.limit and args.limit > 0:
        diseases = diseases[:args.limit]
        print(f"Limited to {args.limit} diseases")

    # Process the diseases
    total = len(diseases)
    print(f"\nProcessing {total} diseases with {args.parallel} parallel workers")

    start_time = time.time()

    if args.parallel > 1:
        # Threads, not processes: each task only waits on a child process, and
        # worker threads share this module's ARGS. Worker processes started
        # with the "spawn" method re-import the module and would see
        # ARGS = None.
        with concurrent.futures.ThreadPoolExecutor(max_workers=args.parallel) as executor:
            results = list(executor.map(process_disease, diseases))
    else:
        results = [process_disease(disease) for disease in diseases]

    elapsed = time.time() - start_time

    # Summarize results
    success_count = sum(1 for r in results if r["status"] == "success")
    skipped_count = sum(1 for r in results if r["status"] == "skipped")
    error_count = sum(1 for r in results if r["status"] == "error")

    print("\n" + "="*80)
    print(f"SUMMARY: Processed {total} diseases in {elapsed:.2f} seconds")
    print(f"- Successfully generated: {success_count}")
    print(f"- Skipped (already exist): {skipped_count}")
    print(f"- Failed: {error_count}")

    # List errors if any
    if error_count > 0:
        print("\nFAILED DISEASES:")
        for result in results:
            if result["status"] == "error":
                print(f"- {result['name']}: {result.get('error', 'Unknown error')}")

        # Exit with error code if any failures
        sys.exit(1)

    print("\nAll done! 🎉")


if __name__ == "__main__":
    main()
|
||||
468
scripts/generate_module.py
Executable file
468
scripts/generate_module.py
Executable file
@@ -0,0 +1,468 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
import anthropic
|
||||
import csv
|
||||
import argparse
|
||||
|
||||
# Define global variables
|
||||
DISEASE_NAME = ""
|
||||
OUTPUT_FILE = ""
|
||||
icd10_code = ""
|
||||
disease_category = ""
|
||||
use_interactive = True
|
||||
use_auto_fallback = False
|
||||
|
||||
def normalize_disease_name(name):
    """Return the filename slug used for a disease's module file.

    The name is lowercased, every run of characters other than ASCII
    letters and digits becomes a single underscore, and leading/trailing
    underscores are removed. This is the same rule the batch script uses
    to decide whether a module already exists, so both agree on the path.
    A name with no usable characters yields "unknown_disease" rather than
    an empty filename.
    """
    normalized_name = re.sub(r'[^a-z0-9]+', '_', name.lower()).strip('_')
    return normalized_name or "unknown_disease"
|
||||
|
||||
def generate_module():
    """Ask Claude for clinical details and write a Synthea module JSON file.

    Reads the module-level DISEASE_NAME, OUTPUT_FILE, icd10_code and
    disease_category. The raw model answer is saved next to the module as
    "<OUTPUT_FILE>.details"; the module itself is written to OUTPUT_FILE.

    Returns:
        0 when the module was written, 1 when any step after the client was
        created raised (the message is also written to "<OUTPUT_FILE>.error"
        when that file can be created).

    Exits the process with status 1 if ANTHROPIC_API_KEY is not set.
    """
    # Initialize the Anthropic client
    api_key = os.environ.get('ANTHROPIC_API_KEY')
    if not api_key:
        print("Error: ANTHROPIC_API_KEY environment variable is not set")
        sys.exit(1)

    client = anthropic.Anthropic(api_key=api_key)

    # Step 1: Ask Claude for clinical details in a structured format, not JSON
    print("Step 1: Getting clinical details from Claude...")

    # Prepare disease information for the prompt
    disease_info = f"Disease name: {DISEASE_NAME}"
    if icd10_code:
        disease_info += f"\nICD-10 code: {icd10_code}"
    if disease_category:
        disease_info += f"\nCategory: {disease_category}"

    # Construct a prompt that asks for structured data, not JSON. The heading
    # format is spelled out because the section parser below only recognises
    # "## <number>. <NAME>" headings.
    prompt = f"""I need information to create a Synthea disease module for {DISEASE_NAME}.

{disease_info}

Please provide the following information in a structured format (NOT JSON):

1. DESCRIPTION: A brief description of the disease (3-5 sentences)

2. RISK_FACTORS: Age ranges, gender factors, and/or other risk factors for this disease

3. SYMPTOMS: List the main symptoms of the disease

4. DIAGNOSTIC_TESTS: Tests typically used to diagnose this condition

5. TREATMENTS: List treatments for this disease, including:
- Medications (with their class and purpose)
- Procedures
- Other interventions

6. COMPLICATIONS: Possible complications of this disease

7. PROGRESSION: Typical progression of the disease, including:
- How it starts
- How it typically develops
- Possible outcomes

8. FOLLOW_UP: Typical follow-up care needed

For medications, just provide generic names - I'll handle the RxNorm codes.
For conditions and procedures, just provide plain English names - I'll handle the coding.

Start each section with a heading of the exact form "## <number>. <SECTION_NAME>", for example "## 1. DESCRIPTION".

Please provide this information in a straightforward, structured text format. Do NOT use JSON.
"""

    try:
        # Create the output directory before anything can fail, so that the
        # error handler below always has somewhere to write its report.
        output_dir = os.path.dirname(OUTPUT_FILE)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Get clinical details
        response = client.messages.create(
            model="claude-3-7-sonnet-20250219",
            max_tokens=4000,
            temperature=0.2,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )

        # Extract the response text
        clinical_details = response.content[0].text

        # Save the raw clinical details for reference
        with open(f"{OUTPUT_FILE}.details", "w", encoding="utf-8") as f:
            f.write(clinical_details)

        print(f"Clinical details saved to {OUTPUT_FILE}.details")

        # Step 2: Now build a valid module with the clinical details
        print("Step 2: Building module with clinical details...")

        # Extract information from the structured response
        description_match = re.search(r'##\s*1\.\s*DESCRIPTION(.*?)(?=##\s*2\.)', clinical_details, re.DOTALL)
        risk_factors_match = re.search(r'##\s*2\.\s*RISK_FACTORS(.*?)(?=##\s*3\.)', clinical_details, re.DOTALL)
        symptoms_match = re.search(r'##\s*3\.\s*SYMPTOMS(.*?)(?=##\s*4\.)', clinical_details, re.DOTALL)
        diagnostic_tests_match = re.search(r'##\s*4\.\s*DIAGNOSTIC_TESTS(.*?)(?=##\s*5\.)', clinical_details, re.DOTALL)
        treatments_match = re.search(r'##\s*5\.\s*TREATMENTS(.*?)(?=##\s*6\.)', clinical_details, re.DOTALL)
        complications_match = re.search(r'##\s*6\.\s*COMPLICATIONS(.*?)(?=##\s*7\.)', clinical_details, re.DOTALL)
        progression_match = re.search(r'##\s*7\.\s*PROGRESSION(.*?)(?=##\s*8\.)', clinical_details, re.DOTALL)
        follow_up_match = re.search(r'##\s*8\.\s*FOLLOW_UP(.*?)(?=$)', clinical_details, re.DOTALL)

        # Extract text from matches
        description = description_match.group(1).strip() if description_match else f"A module for {DISEASE_NAME}"
        risk_factors = risk_factors_match.group(1).strip() if risk_factors_match else ""
        symptoms = symptoms_match.group(1).strip() if symptoms_match else ""
        diagnostic_tests = diagnostic_tests_match.group(1).strip() if diagnostic_tests_match else ""
        treatments = treatments_match.group(1).strip() if treatments_match else ""
        # The next three sections are parsed but not used when building the
        # module below.
        complications = complications_match.group(1).strip() if complications_match else ""
        progression = progression_match.group(1).strip() if progression_match else ""
        follow_up = follow_up_match.group(1).strip() if follow_up_match else ""

        print(f"Extracted: {len(description)} chars description, {len(symptoms)} chars symptoms, {len(treatments)} chars treatments")

        # Extract symptoms as a list
        symptom_list = []
        if symptoms_match:
            # For structured format with bullet points
            symptom_lines = re.findall(r'[-*]\s*(.*?)(?:\n|$)', symptoms, re.MULTILINE)
            if symptom_lines:
                symptom_list = [s.strip() for s in symptom_lines if s.strip()]
            else:
                # Try to split by newlines for a less structured format
                symptom_list = [s.strip() for s in symptoms.split('\n') if s.strip()]
            # If that doesn't give us anything, just add the whole text as one symptom
            if not symptom_list and symptoms.strip():
                symptom_list = [symptoms.strip()]

        print(f"Found {len(symptom_list)} symptoms")

        # Extract medications from treatments
        medications = []
        if treatments_match:
            # Find the "Medications:" section
            meds_section_match = re.search(r'Medications?:\s*(.*?)(?=(?:- Procedures:|$))', treatments, re.DOTALL)
            if meds_section_match:
                meds_section = meds_section_match.group(1).strip()
                # Extract main medication names from list items with explanations
                # Format is typically: "- Medication name (explanation)" or "- Medication name - explanation"
                med_items = re.findall(r'[-*]\s*(.*?)(?=[-*]|\n\n|\n- |$)', meds_section, re.DOTALL)

                for item in med_items:
                    item = item.strip()
                    if not item:
                        continue

                    # Extract just the medication name, not the explanation
                    med_match = re.match(r'([^(]+)(?:\(|-).*', item)
                    if med_match:
                        med_name = med_match.group(1).strip()
                    else:
                        med_name = item.split('(')[0].strip()

                    # Clean up common patterns
                    med_name = re.sub(r'\s*\([^)]*\)', '', med_name)  # Remove parentheticals
                    med_name = re.sub(r'\s*-.*', '', med_name)  # Remove after dash

                    # Skip descriptive phrases that aren't medications. "for"
                    # is matched as a whole word only: a plain substring test
                    # would also discard drug names such as "metformin".
                    lowered = med_name.lower()
                    is_description = (
                        any(phrase in lowered for phrase in ('reduce', 'block', 'prevent'))
                        or re.search(r'\bfor\b', lowered) is not None
                    )
                    if is_description:
                        continue

                    # A comma-separated name is treated as several medications
                    candidates = med_name.split(',') if ',' in med_name else [med_name]
                    for candidate in candidates:
                        candidate = candidate.strip()
                        if candidate and not any(candidate.lower() in existing.lower() for existing in medications):
                            medications.append(candidate)

        print(f"Found {len(medications)} medications: {', '.join(medications[:5])}")

        # Start with a basic module template
        module = {
            "name": DISEASE_NAME,
            "remarks": [],
            "states": {
                "Initial": {
                    "type": "Initial",
                    "direct_transition": "Age_Guard"
                },
                "Age_Guard": {
                    "type": "Guard",
                    "allow": {
                        "condition_type": "Age",
                        "operator": ">=",
                        "quantity": 18,
                        "unit": "years"
                    },
                    "direct_transition": "Delay_Until_Onset"
                },
                "Delay_Until_Onset": {
                    "type": "Delay",
                    "range": {
                        "low": 0,
                        "high": 20,
                        "unit": "years"
                    },
                    "direct_transition": "Onset"
                },
                "Onset": {
                    "type": "ConditionOnset",
                    "target_encounter": "Diagnosis_Encounter",
                    "codes": [
                        {
                            "system": "SNOMED-CT",
                            "code": "64109004",
                            "display": DISEASE_NAME
                        }
                    ],
                    "direct_transition": "Diagnosis_Encounter"
                },
                "Diagnosis_Encounter": {
                    "type": "Encounter",
                    "encounter_class": "ambulatory",
                    "reason": "Onset",
                    "codes": [
                        {
                            "system": "SNOMED-CT",
                            "code": "185347001",
                            "display": "Encounter for problem"
                        }
                    ],
                    "direct_transition": "End_Diagnosis_Encounter"
                },
                "End_Diagnosis_Encounter": {
                    "type": "EncounterEnd"
                    # direct_transition will be set later
                }
            }
        }
        states = module["states"]

        # Add description to remarks
        if description:
            for line in description.split('\n'):
                line = line.strip()
                if line:
                    module["remarks"].append(line)

        # Add ICD-10 code if available
        if icd10_code:
            module["remarks"].append(f"ICD-10 code: {icd10_code}")

        # Add category if available
        if disease_category:
            module["remarks"].append(f"Category: {disease_category}")

        # Make sure remarks is not empty
        if not module["remarks"]:
            module["remarks"].append(f"Generated module for {DISEASE_NAME}")

        # Add risk factors to remarks if available. The newline replacement is
        # done outside the f-string: a backslash inside an f-string expression
        # is a syntax error before Python 3.12.
        if risk_factors:
            flattened_risk_factors = risk_factors.replace('\n', ' ')
            module["remarks"].append(f"Risk factors: {flattened_risk_factors}")

        # Every state added from here on is chained after the previous one,
        # starting from the end of the diagnosis encounter.
        current_state = "End_Diagnosis_Encounter"

        # Add symptoms if available
        symptom_i = 0
        for symptom in symptom_list[:3]:  # Limit to 3 symptoms
            symptom_name = symptom.strip()
            if not symptom_name:
                continue

            state_name = f"Symptom_{symptom_i}"
            symptom_i += 1

            states[state_name] = {
                "type": "ConditionOnset",
                "target_encounter": "Diagnosis_Encounter",
                "codes": [
                    {
                        "system": "SNOMED-CT",
                        "code": "418107008",  # Default symptom code
                        "display": symptom_name
                    }
                ]
            }

            # Connect states
            states[current_state]["direct_transition"] = state_name
            current_state = state_name

        # Add diagnostic tests if available
        if diagnostic_tests:
            test_lines = re.findall(r'[-*]\s*(.*?)(?:\n|$)', diagnostic_tests, re.MULTILINE)
            tests = [t.strip() for t in test_lines if t.strip()]

            for i, test in enumerate(tests[:2]):  # Limit to 2 tests
                test_name = test.strip()
                if not test_name:
                    continue

                state_name = f"DiagnosticTest_{i}"

                states[state_name] = {
                    "type": "Procedure",
                    "target_encounter": "Diagnosis_Encounter",
                    "reason": "Onset",
                    "codes": [
                        {
                            "system": "SNOMED-CT",
                            "code": "386053000",  # Default diagnostic procedure code
                            "display": test_name
                        }
                    ]
                }

                # Connect states
                states[current_state]["direct_transition"] = state_name
                current_state = state_name

        # Add medications if found
        for i, med in enumerate(medications[:3]):  # Limit to 3 medications to keep module manageable
            med_name = med.strip()
            if not med_name:
                continue

            state_name = f"Prescribe_{i}"

            states[state_name] = {
                "type": "MedicationOrder",
                "target_encounter": "Diagnosis_Encounter",
                "reason": "Onset",
                "codes": [
                    {
                        "system": "RxNorm",
                        "code": "308047",  # Default code
                        "display": med_name
                    }
                ]
            }

            # Connect states
            states[current_state]["direct_transition"] = state_name
            current_state = state_name

        # Connect to Follow-up
        states[current_state]["direct_transition"] = "Follow_Up_Encounter"

        # Add follow-up encounter
        states["Follow_Up_Encounter"] = {
            "type": "Encounter",
            "encounter_class": "ambulatory",
            "reason": "Onset",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "390906007",
                    "display": "Follow-up encounter"
                }
            ],
            "direct_transition": "End_Follow_Up_Encounter"
        }

        states["End_Follow_Up_Encounter"] = {
            "type": "EncounterEnd",
            "direct_transition": "Terminal"
        }

        states["Terminal"] = {
            "type": "Terminal"
        }

        # Format the JSON for output
        formatted_json = json.dumps(module, indent=2)

        # Save the module
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            f.write(formatted_json)

        print(f"✅ Successfully generated module and saved to {OUTPUT_FILE}")
        return 0

    except Exception as e:
        # Top-level boundary for this step: report the failure and signal it
        # through the return code instead of a traceback.
        print(f"❌ Error: {str(e)}")

        # Write the error to a file; a failure to do so must not hide the
        # original error.
        try:
            with open(f"{OUTPUT_FILE}.error", "w", encoding="utf-8") as f:
                f.write(f"Error: {str(e)}")
        except OSError as write_error:
            print(f"Could not write error file: {write_error}")
        return 1
|
||||
|
||||
def main():
    """Parse the command line, resolve disease metadata and run the generator.

    Sets the module-level DISEASE_NAME, OUTPUT_FILE, icd10_code,
    disease_category, use_interactive and use_auto_fallback from the
    arguments. When no ICD-10 code is given, the code and category are
    looked up by disease name in src/main/resources/disease_list.csv.

    Returns:
        The process exit status: 0 when the module already exists (and
        --force was not given) or was generated, 1 when the API key is
        missing or generation failed.
    """
    global DISEASE_NAME, OUTPUT_FILE, use_interactive, use_auto_fallback
    global icd10_code, disease_category

    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Generate a Synthea module for a disease')
    parser.add_argument('--disease', type=str, required=True, help='Disease name')
    parser.add_argument('--icd10', type=str, help='ICD-10 code')
    parser.add_argument('--category', type=str, help='Disease category')
    parser.add_argument('--no-interactive', action='store_true', help='Skip interactive prompts, for batch processing')
    parser.add_argument('--auto-fallback', action='store_true', help='Automatically use fallback template if JSON parsing fails')
    parser.add_argument('--force', action='store_true', help='Force regeneration even if module already exists')
    args = parser.parse_args()

    # Set global variables
    DISEASE_NAME = args.disease
    normalized_name = normalize_disease_name(DISEASE_NAME)
    OUTPUT_FILE = f"src/main/resources/modules/{normalized_name}.json"
    use_interactive = not args.no_interactive
    use_auto_fallback = args.auto_fallback

    # Use provided ICD-10 code or category if specified
    icd10_code = args.icd10
    disease_category = args.category

    print(f"Disease name: {DISEASE_NAME}")
    print(f"Normalized filename: {normalized_name}.json")

    # Check if module already exists
    if os.path.exists(OUTPUT_FILE):
        print(f"✅ Module already exists at: {OUTPUT_FILE}")
        if not args.force:
            return 0
        print("Force flag set. Regenerating module.")
    else:
        print(f"❌ Module not found at: {OUTPUT_FILE}")

    # If no ICD-10 code provided, look up in disease list CSV
    if not icd10_code:
        csv_path = "src/main/resources/disease_list.csv"
        if os.path.exists(csv_path):
            print(f"Looking up disease information in {csv_path}...")
            try:
                wanted = DISEASE_NAME.lower()
                with open(csv_path, 'r', newline='') as f:
                    # "or ''" guards rows that are shorter than the header,
                    # for which DictReader fills in None.
                    match = next(
                        (row for row in csv.DictReader(f)
                         if (row.get("disease_name") or "").lower() == wanted),
                        None,
                    )
                if match is not None:
                    icd10_code = match.get("id", "")
                    disease_category = match.get("disease_category", "")
                    print(f"Found matching disease in CSV:")
                    print(f"- ICD-10 code: {icd10_code}")
                    print(f"- Category: {disease_category}")
                else:
                    print("No matching disease found in CSV. Continuing without ICD-10 code.")
            except (OSError, csv.Error, UnicodeError) as e:
                print(f"Error reading CSV: {str(e)}")
                print("Continuing without ICD-10 code.")

    # Ensure ANTHROPIC_API_KEY is set
    if not os.environ.get('ANTHROPIC_API_KEY'):
        print("Error: ANTHROPIC_API_KEY environment variable is not set")
        return 1

    # Ask Claude to generate the module and propagate its status, so that a
    # failed generation gives a non-zero exit code to callers.
    return generate_module()


if __name__ == "__main__":
    sys.exit(main())
|
||||
321
scripts/patient_analysis.py
Executable file
321
scripts/patient_analysis.py
Executable file
@@ -0,0 +1,321 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import glob
|
||||
import argparse
|
||||
from collections import Counter
|
||||
from datetime import datetime
|
||||
|
||||
def _collect_patient_demographics(patient, demographics):
    """Fold one FHIR Patient resource (a dict) into the demographics tallies."""
    if 'gender' in patient:
        demographics['gender'][patient['gender']] += 1

    if 'birthDate' in patient:
        # Age is the difference between calendar years only; month and day
        # are ignored.
        birth_year = int(patient['birthDate'][:4])
        demographics['age'].append(datetime.now().year - birth_year)

    # Race and ethnicity arrive as nested extensions, identified by URL suffix
    for ext in patient.get('extension', []):
        if 'url' not in ext or 'extension' not in ext:
            continue
        if ext['url'].endswith('us-core-race'):
            bucket = 'race'
        elif ext['url'].endswith('us-core-ethnicity'):
            bucket = 'ethnicity'
        else:
            continue
        for nested in ext['extension']:
            if 'valueCoding' in nested:
                label = nested['valueCoding'].get('display', 'Unknown')
                demographics[bucket][label] += 1


def analyze_patient_data(disease_name, input_dir, output_dir, format_type="html"):
    """Summarise the patient JSON files in a directory and write reports.

    Args:
        disease_name: Used in report titles and, lowercased with spaces
            replaced by underscores, in the report file names.
        input_dir: Directory whose "*.json" files are read. Files with
            "hospitalInformation" or "practitionerInformation" in their
            path are ignored.
        output_dir: Directory for the reports; created if missing.
        format_type: "html", "csv", "json" or "all" (case-insensitive).
            Any other value produces no report for a non-empty input.

    Returns:
        None. When no patient files are found, placeholder html, csv and
        json reports are written regardless of format_type.
    """
    print(f"Analyzing patient data for {disease_name}...")

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Find all patient JSON files
    patients_files = [
        path for path in glob.glob(f"{input_dir}/*.json")
        if 'hospitalInformation' not in path and 'practitionerInformation' not in path
    ]

    print(f"Found {len(patients_files)} patient records for analysis")

    if not patients_files:
        print("No patient files found to analyze.")
        report_base = os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report")
        with open(f"{report_base}.html", 'w', encoding='utf-8') as f:
            f.write(f"<html><body><h1>Analysis Report for {disease_name}</h1><p>No patient files found to analyze.</p></body></html>")
        # Create empty CSV and JSON files
        with open(f"{report_base}.csv", 'w', encoding='utf-8') as f:
            f.write("No patient files found to analyze.\n")
        with open(f"{report_base}.json", 'w', encoding='utf-8') as f:
            f.write('{"error": "No patient files found to analyze."}\n')
        return

    # Initialize data collectors
    demographics = {'gender': Counter(), 'age': [], 'race': Counter(), 'ethnicity': Counter()}
    condition_counts = Counter()
    medication_counts = Counter()

    # Process each patient file
    for patient_file in patients_files:
        try:
            # JSON interchange text is UTF-8; do not depend on the locale.
            with open(patient_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            resource_type = data.get('resourceType')

            # A file may hold a bare Patient resource...
            if resource_type == 'Patient':
                _collect_patient_demographics(data, demographics)

            # ...or a Bundle whose entries carry the individual resources
            if resource_type == 'Bundle' and 'entry' in data:
                for entry in data['entry']:
                    if 'resource' not in entry:
                        continue
                    resource = entry['resource']
                    entry_type = resource.get('resourceType')

                    if entry_type == 'Patient':
                        _collect_patient_demographics(resource, demographics)

                    # Check for conditions
                    if entry_type == 'Condition':
                        if 'code' in resource and 'coding' in resource['code']:
                            for code in resource['code']['coding']:
                                if 'display' in code:
                                    condition_counts[code['display']] += 1

                    # Check for medications
                    if entry_type == 'MedicationRequest':
                        if 'medicationCodeableConcept' in resource and 'coding' in resource['medicationCodeableConcept']:
                            for code in resource['medicationCodeableConcept']['coding']:
                                if 'display' in code:
                                    medication_counts[code['display']] += 1

        except Exception as e:
            # Best effort: one unreadable or malformed file must not stop the
            # analysis of the remaining ones.
            print(f"Error processing {patient_file}: {e}")

    # Total patients is the number of Patient resources that carried a gender
    total_patients = sum(demographics['gender'].values())
    if total_patients == 0:
        print("Warning: No patient demographics found. Setting total_patients to file count.")
        # The file list is non-empty here, so the percentages computed by the
        # report writers never divide by zero.
        total_patients = len(patients_files)

    print(f"Total patients found: {total_patients}")
    print(f"Gender distribution: {dict(demographics['gender'])}")

    requested = format_type.lower()

    # Generate HTML report
    if requested in ["html", "all"]:
        create_html_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients)

    # Generate CSV report
    if requested in ["csv", "all"]:
        create_csv_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients)

    # Generate JSON report
    if requested in ["json", "all"]:
        create_json_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients)

    print(f"Analysis complete. Reports generated in {output_dir}")
|
||||
|
||||
|
||||
def create_html_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients):
    """Write "<disease>_report.html" summarising the analysed patients.

    Args:
        disease_name: Shown in the title and heading; lowercased with spaces
            replaced by underscores it also forms the file name.
        output_dir: Existing directory that receives the report.
        demographics: Dict whose 'gender' entry maps gender to count and
            whose 'age' entry is a list of ages.
        condition_counts: Mapping of condition display name to count.
        medication_counts: Mapping of medication display name to count.
        total_patients: Denominator for every percentage; must be non-zero.

    All text taken from the data is HTML-escaped before being written.
    Only the 15 most frequent conditions and medications are listed.
    """
    # Imported here because html is not among this module's top-level imports.
    from html import escape

    def table_row(label, count):
        """Return one escaped <tr> holding label, count and its percentage."""
        percentage = (count / total_patients) * 100
        return f"<tr><td>{escape(str(label))}</td><td>{count}</td><td>{percentage:.1f}%</td></tr>\n"

    def top_fifteen(counts):
        """Return the 15 highest (name, count) pairs, ties in insertion order."""
        return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)[:15]

    title = escape(str(disease_name))
    report_path = os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report.html")

    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(f'''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Synthea Patient Analysis - {title}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
h1, h2, h3 {{ color: #333; }}
.container {{ max-width: 1000px; margin: 0 auto; }}
table {{ border-collapse: collapse; width: 100%; margin-bottom: 20px; }}
th, td {{ text-align: left; padding: 8px; border-bottom: 1px solid #ddd; }}
th {{ background-color: #f2f2f2; }}
tr:hover {{background-color: #f5f5f5;}}
</style>
</head>
<body>
<div class="container">
<h1>Synthea Patient Analysis - {title}</h1>
<p>Total patients analyzed: {total_patients}</p>

<h2>Demographics</h2>
<h3>Gender Distribution</h3>
<table>
<tr><th>Gender</th><th>Count</th><th>Percentage</th></tr>
''')

        for gender, count in demographics['gender'].items():
            f.write(table_row(gender, count))

        f.write('''
</table>

<h3>Age Statistics</h3>
<table>
''')

        ages = demographics['age']
        if ages:
            f.write(f"<tr><td>Minimum Age</td><td>{min(ages)}</td></tr>\n")
            f.write(f"<tr><td>Maximum Age</td><td>{max(ages)}</td></tr>\n")
            f.write(f"<tr><td>Average Age</td><td>{sum(ages) / len(ages):.1f}</td></tr>\n")
        else:
            f.write("<tr><td colspan='2'>No age data available</td></tr>\n")

        f.write('''
</table>

<h3>Top Conditions</h3>
<table>
<tr><th>Condition</th><th>Count</th><th>Percentage of Patients</th></tr>
''')

        for condition, count in top_fifteen(condition_counts):
            f.write(table_row(condition, count))

        f.write('''
</table>

<h3>Top Medications</h3>
<table>
<tr><th>Medication</th><th>Count</th><th>Percentage of Patients</th></tr>
''')

        for medication, count in top_fifteen(medication_counts):
            f.write(table_row(medication, count))

        f.write('''
</table>
</div>
</body>
</html>''')
|
||||
|
||||
|
||||
def create_csv_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients):
    """Write a CSV summary of the analyzed patients into ``output_dir``.

    The file is named after ``disease_name`` (lower-cased, spaces replaced by
    underscores) with the suffix ``_report.csv``. It contains four sections:
    gender distribution, age statistics, the 15 most frequent conditions and
    the 15 most frequent medications.

    Rows are emitted through ``csv.writer`` so that values containing commas,
    quotes or line breaks (for example "Hypertension, essential") are quoted
    and stay in one column instead of shifting the count/percentage columns.

    Args:
        disease_name: Disease name used in the title row and the file name.
        output_dir: Directory in which the report file is created.
        demographics: Mapping with a ``'gender'`` mapping (label -> count) and
            an ``'age'`` sequence of ages.
        condition_counts: Mapping of condition name -> count.
        medication_counts: Mapping of medication name -> count.
        total_patients: Denominator used for every percentage column.
    """
    import csv  # imported locally so the function does not rely on a module-level import

    def percent_of_patients(count):
        # A zero denominator yields 0.0% instead of raising ZeroDivisionError.
        share = (count / total_patients) * 100 if total_patients else 0.0
        return f"{share:.1f}%"

    def top_entries(counts, limit=15):
        # Highest counts first; sorted() is stable, so ties keep their input order.
        return sorted(counts.items(), key=lambda item: item[1], reverse=True)[:limit]

    report_path = os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report.csv")
    with open(report_path, 'w') as f:
        # "\n" as terminator keeps rows without special characters identical
        # to the previous hand-formatted output.
        writer = csv.writer(f, lineterminator="\n")

        # Write header
        writer.writerow([f"Synthea Patient Analysis - {disease_name}"])
        writer.writerow(["Total patients analyzed", total_patients])
        writer.writerow([])

        # Gender distribution
        writer.writerow(["Gender Distribution"])
        writer.writerow(["Gender", "Count", "Percentage"])
        for gender, count in demographics['gender'].items():
            writer.writerow([gender, count, percent_of_patients(count)])
        writer.writerow([])

        # Age statistics
        writer.writerow(["Age Statistics"])
        ages = demographics['age']
        if ages:
            writer.writerow(["Minimum Age", min(ages)])
            writer.writerow(["Maximum Age", max(ages)])
            writer.writerow(["Average Age", f"{sum(ages) / len(ages):.1f}"])
        else:
            writer.writerow(["No age data available"])
        writer.writerow([])

        # Top conditions
        writer.writerow(["Top Conditions"])
        writer.writerow(["Condition", "Count", "Percentage of Patients"])
        for condition, count in top_entries(condition_counts):
            writer.writerow([condition, count, percent_of_patients(count)])
        writer.writerow([])

        # Top medications
        writer.writerow(["Top Medications"])
        writer.writerow(["Medication", "Count", "Percentage of Patients"])
        for medication, count in top_entries(medication_counts):
            writer.writerow([medication, count, percent_of_patients(count)])
|
||||
|
||||
|
||||
def create_json_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients):
    """Write the analysis summary as ``<disease>_report.json`` into ``output_dir``.

    The JSON object holds the disease name, the patient total, the gender /
    race / ethnicity counts, min / max / average age (an empty object when no
    ages are available) and the 15 most frequent conditions and medications,
    each with its count and its percentage of ``total_patients``.
    """

    def ranked(counts):
        # Highest counts first, truncated to the top 15 entries.
        ordered = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        return [
            {"name": label, "count": hits, "percentage": (hits / total_patients) * 100}
            for label, hits in ordered[:15]
        ]

    # Copy the demographic counters into plain dictionaries
    breakdown = {
        "gender": dict(demographics['gender'].items()),
        "race": dict(demographics['race'].items()),
        "ethnicity": dict(demographics['ethnicity'].items()),
    }

    # Age figures are only reported when at least one age was collected
    ages = demographics['age']
    age_statistics = {}
    if ages:
        age_statistics = {
            "min_age": min(ages),
            "max_age": max(ages),
            "avg_age": sum(ages) / len(ages),
        }

    report_data = {
        "disease": disease_name,
        "total_patients": total_patients,
        "demographics": breakdown,
        "age_statistics": age_statistics,
    }
    report_data["top_conditions"] = ranked(condition_counts)
    report_data["top_medications"] = ranked(medication_counts)

    # Write to JSON file
    file_name = f"{disease_name.lower().replace(' ', '_')}_report.json"
    with open(os.path.join(output_dir, file_name), 'w') as f:
        json.dump(report_data, f, indent=2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: collect the options and pass them on to
    # analyze_patient_data() in the order disease, input dir, output dir, format.
    parser = argparse.ArgumentParser(description="Analyze Synthea patient data")

    # Mandatory options
    parser.add_argument("--disease", help="Disease name", required=True)
    parser.add_argument("--input_dir", help="Input directory with FHIR files", required=True)

    # Optional settings with their defaults
    parser.add_argument("--output_dir", help="Output directory for reports", default=".")
    parser.add_argument(
        "--format",
        choices=["html", "csv", "json", "all"],
        default="html",
        help="Output format (html, csv, json, or all)",
    )

    args = parser.parse_args()
    analyze_patient_data(args.disease, args.input_dir, args.output_dir, args.format)
|
||||
127
scripts/prepare_environment.sh
Executable file
127
scripts/prepare_environment.sh
Executable file
@@ -0,0 +1,127 @@
|
||||
#!/bin/bash
|
||||
# prepare_environment.sh
|
||||
#
|
||||
# This script prepares the environment for running the Synthea pipeline.
|
||||
# It rebuilds the Docker containers, ensures the directory structure is correct,
|
||||
# and starts the necessary services.
|
||||
|
||||
set -e  # Abort as soon as any command fails

# Display a header
rule="=================================================="
echo "$rule"
echo "Synthea All Diseases Pipeline - Environment Setup"
echo "$rule"

# Docker has to be available before anything else is attempted
if ! command -v docker > /dev/null 2>&1; then
    echo "ERROR: Docker is not installed or not in the PATH."
    echo "Please install Docker and try again."
    exit 1
fi

# ...and so does docker-compose
if ! command -v docker-compose > /dev/null 2>&1; then
    echo "ERROR: docker-compose is not installed or not in the PATH."
    echo "Please install docker-compose and try again."
    exit 1
fi

# Create the working directories (mkdir -p leaves existing ones untouched)
echo "Creating required directories..."
for dir in module_generator src/main/resources/modules src/main/resources/disease modules output; do
    mkdir -p "$dir"
done

# The generator scripts are expected in module_generator/; when they are
# missing there, fall back to moving them over from python/.
if ! [[ -f "module_generator/module_generator.py" && -f "module_generator/run_module_generator.py" ]]; then
    echo "ERROR: Required Python scripts not found in module_generator directory!"
    if ! [[ -d "python" && -f "python/module_generator.py" && -f "python/run_module_generator.py" ]]; then
        echo "Scripts not found in python directory either. Please ensure the scripts are in the module_generator directory."
        exit 1
    fi
    echo "Found scripts in python directory, moving them to module_generator directory..."
    mkdir -p module_generator
    mv python/module_generator.py module_generator/
    mv python/run_module_generator.py module_generator/
    # Take the README along when there is one
    if [[ -f "python/README_module_generator.md" ]]; then
        mv python/README_module_generator.md module_generator/
    fi
fi

# Make the Python scripts executable
for script in module_generator.py run_module_generator.py; do
    chmod +x "module_generator/$script"
done

# Create symlinks in the current directory for convenience
here="$(pwd)"
for script in run_module_generator.py module_generator.py; do
    ln -sf "$here/module_generator/$script" "$here/$script"
done

# Start from an empty disease list when none exists yet
if [[ ! -f "src/main/resources/disease_list.json" ]]; then
    echo "Creating empty disease_list.json..."
    echo "[]" > src/main/resources/disease_list.json
fi

# Make sure a .env file exists, preferring the example file as a template
if [[ ! -f ".env" ]]; then
    echo ".env file is missing. Creating from .env.example..."
    if [[ -f ".env.example" ]]; then
        cp .env.example .env
        echo " Created .env from .env.example. Please edit it with your API key."
    else
        echo "WARNING: .env.example not found. Creating minimal .env file..."
        echo "ANTHROPIC_API_KEY=your_api_key_here" > .env
    fi
fi

# Stop any running containers (a failure here is not fatal)
echo "Stopping any running containers..."
docker-compose down || true

# Rebuild the containers
echo "Building Docker containers..."
docker-compose build

# Start the Synthea container in the background
echo "Starting Synthea container..."
docker-compose up -d synthea

# Give the container a moment to start
echo "Waiting for Synthea container to be ready..."
sleep 10

# Without a container id the service did not come up at all
container_id=$(docker-compose ps -q synthea)
if [[ -z "$container_id" ]]; then
    echo "ERROR: Failed to start Synthea container."
    exit 1
fi

# Read the health status; "unknown" stands in when docker inspect fails
health_status=$(docker inspect --format='{{.State.Health.Status}}' $container_id 2>/dev/null || echo "unknown")
case "$health_status" in
    healthy|unknown)
        ;;
    *)
        echo "WARNING: Synthea container is not reporting as healthy (status: $health_status)."
        echo "It may take some time for the container to become fully operational."
        echo "You can proceed, but the pipeline may fail if the container is not ready."
        ;;
esac

# Optionally run the module generator when asked to on the command line
if [[ "$1" == "--generate-modules" ]]; then
    echo "Starting module generator..."
    docker-compose --profile generator up module-generator
fi

# Closing summary with example commands
cat << 'EOF'
==================================================
Environment is ready!

To generate patients for a disease module:
nextflow run main.nf --disease_name "Disease Name" --generate_patients true

To generate a new module and patients:
nextflow run main.nf --disease_name "Disease Name" --force_generate true --generate_patients true
==================================================
EOF
|
||||
77
scripts/run_module.py
Normal file
77
scripts/run_module.py
Normal file
@@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import anthropic
|
||||
|
||||
# Disease the module is generated for, and the file the finished module is written to
DISEASE_NAME = "Excessive frequent and irregular menstruation"
OUTPUT_FILE = "excessive_frequent_and_irregular_menstruation.json"

# Anthropic API client; the key is taken from the ANTHROPIC_API_KEY environment
# variable (None is passed on when the variable is not set).
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
||||
|
||||
# Define the prompt for generating the module
|
||||
def generate_module_prompt(disease_name):
    """Return the prompt text that asks for a Synthea JSON module for ``disease_name``."""
    prompt = f"""
You are a medical expert creating a disease module for the Synthea patient simulation system.
I need you to create a structured JSON module for {disease_name}.

The module will be used to simulate patients with this condition in the Synthea healthcare simulation system.
The JSON should follow the Synthea module format which includes:

1. Basic module information (name, remarks)
2. States representing the progression of the disease
3. Transitions between states
4. Guard conditions based on patient attributes where appropriate
5. Care plans and medications that would be prescribed

Your output should be valid JSON that follows the Synthea module structure precisely. Format it as a complete,
well-structured Synthea module. Make sure it's medically accurate and includes all relevant clinical details,
treatment options, and disease progression patterns.

Output only the JSON with no additional commentary or markdown formatting. The output will be directly saved as a file.
"""
    return prompt
|
||||
|
||||
print(f"Generating module for {DISEASE_NAME}...")

# Text returned by the model; stays None until the API call has succeeded, so
# the error handler can tell whether there is a response worth saving.
raw_response = None

try:
    # Send a message to Claude
    message = client.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=4000,
        temperature=0,
        messages=[
            {"role": "user", "content": generate_module_prompt(DISEASE_NAME)}
        ]
    )

    # Extract the JSON from the response
    raw_response = message.content[0].text
    module_json = raw_response

    # Find the first '{' and last '}' to extract just the JSON part
    start = module_json.find('{')
    end = module_json.rfind('}') + 1
    if start >= 0 and end > start:
        module_json = module_json[start:end]

    # Parse and format the JSON (raises if the extracted text is not valid JSON)
    parsed = json.loads(module_json)
    formatted_json = json.dumps(parsed, indent=2)

    # Write to file
    with open(OUTPUT_FILE, 'w') as f:
        f.write(formatted_json)

    print(f"✅ Successfully generated module and saved to {OUTPUT_FILE}")

except Exception as e:  # top-level boundary of the script: report, record, exit non-zero
    print(f"Error: {e}")
    # Save the error and, when the API call got that far, the raw response for
    # debugging; without it a JSON parse failure cannot be diagnosed afterwards.
    with open("error_output.txt", 'w') as f:
        f.write(str(e))
        if raw_response is not None:
            f.write("\n\nRaw response:\n")
            f.write(raw_response)
    print("Error details saved to error_output.txt")
    sys.exit(1)
|
||||
182
scripts/run_pipeline.sh
Executable file
182
scripts/run_pipeline.sh
Executable file
@@ -0,0 +1,182 @@
|
||||
#!/bin/bash
|
||||
# run_pipeline.sh
|
||||
#
|
||||
# This script runs the Nextflow pipeline for generating disease modules and synthetic patients
|
||||
|
||||
set -e  # Exit on error

# Default values
DISEASE_NAME=""
FORCE_GENERATE=false
GENERATE_PATIENTS=false
POPULATION=100
GENDER=0.5
MIN_AGE=0
MAX_AGE=90
SEED=""
ANALYZE_DATA=false
REPORT_FORMAT="html"
VERIFY_DOCKER=true

# Print the usage text and exit.
# $1 (optional): exit status; defaults to 1 so every error path keeps failing.
function show_help {
    echo "Usage: $0 [options] --disease \"Disease Name\""
    echo ""
    echo "Options:"
    echo " --disease, -d NAME Disease name to generate a module for (required)"
    echo " --force-generate, -f Force regeneration of disease module even if it exists"
    echo " --patients, -p Generate synthetic patients (default: false)"
    echo " --population, -n NUM Number of patients to generate (default: 100)"
    echo " --gender, -g VALUE Gender distribution (0-1 for % female, default: 0.5)"
    echo " --min-age, -a NUM Minimum patient age (default: 0)"
    echo " --max-age, -m NUM Maximum patient age (default: 90)"
    echo " --seed, -s SEED Random seed for reproducibility"
    echo " --analyze, -A Analyze patient data after generation"
    echo " --report-format, -r FMT Report format for analysis (html, json, csv, default: html)"
    echo " --skip-docker-check Skip Docker container verification"
    echo " --help, -h Show this help message"
    echo ""
    echo "Example:"
    echo " $0 --disease \"Multiple Sclerosis\" --patients --population 50"
    exit "${1:-1}"
}

# Abort with a usage error unless option $1 is followed by a value.
# $2 is the number of arguments still to be parsed, the option itself included.
# Without this check "shift 2" would fail under "set -e" and the script would
# stop without any message.
function require_value {
    if [ "$2" -lt 2 ]; then
        echo "ERROR: Option $1 requires a value."
        show_help
    fi
}

# Abort with a usage error unless value $2 of option $1 matches the extended regex $3.
function require_match {
    if ! [[ "$2" =~ $3 ]]; then
        echo "ERROR: Invalid value for $1: '$2'"
        show_help
    fi
}

# Escape backslashes and double quotes so $1 can be placed inside a JSON string literal.
function json_escape {
    printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
}

# Parse arguments
while [[ $# -gt 0 ]]; do
    case "$1" in
        --disease|-d)
            require_value "$1" "$#"
            DISEASE_NAME="$2"
            shift 2
            ;;
        --force-generate|-f)
            FORCE_GENERATE=true
            shift
            ;;
        --patients|-p)
            GENERATE_PATIENTS=true
            shift
            ;;
        --population|-n)
            require_value "$1" "$#"
            POPULATION="$2"
            shift 2
            ;;
        --gender|-g)
            require_value "$1" "$#"
            GENDER="$2"
            shift 2
            ;;
        --min-age|-a)
            require_value "$1" "$#"
            MIN_AGE="$2"
            shift 2
            ;;
        --max-age|-m)
            require_value "$1" "$#"
            MAX_AGE="$2"
            shift 2
            ;;
        --seed|-s)
            require_value "$1" "$#"
            SEED="$2"
            shift 2
            ;;
        --analyze|-A)
            ANALYZE_DATA=true
            shift
            ;;
        --report-format|-r)
            require_value "$1" "$#"
            REPORT_FORMAT="$2"
            shift 2
            ;;
        --skip-docker-check)
            VERIFY_DOCKER=false
            shift
            ;;
        --help|-h)
            # Asking for help is not an error
            show_help 0
            ;;
        *)
            echo "Unknown option: $1"
            show_help
            ;;
    esac
done

# Check if disease name is provided
if [ -z "$DISEASE_NAME" ]; then
    echo "ERROR: Disease name is required!"
    show_help
fi

# These values are written into params.json unquoted, so anything that is not
# a plain number would produce an invalid JSON file.
require_match "--population" "$POPULATION" '^[0-9]+$'
require_match "--gender" "$GENDER" '^[0-9]+(\.[0-9]+)?$'
require_match "--min-age" "$MIN_AGE" '^[0-9]+$'
require_match "--max-age" "$MAX_AGE" '^[0-9]+$'
if [ -n "$SEED" ]; then
    require_match "--seed" "$SEED" '^-?[0-9]+$'
fi

# Directory-style form of the disease name used in the closing messages.
# NOTE(review): assumes the pipeline publishes under a lower-case,
# underscore-separated name - confirm against main.nf.
DISEASE_NAME_NORMALIZED=$(printf '%s' "$DISEASE_NAME" | tr '[:upper:]' '[:lower:]' | sed -E -e 's/[^a-z0-9]+/_/g' -e 's/^_+//' -e 's/_+$//')

# Check if Docker container is running if we need to generate patients
if [ "$VERIFY_DOCKER" = true ] && [ "$GENERATE_PATIENTS" = true ]; then
    CONTAINER_RUNNING=$(docker ps | grep synthea | wc -l)
    if [ "$CONTAINER_RUNNING" -eq 0 ]; then
        echo "No Synthea Docker containers are running!"
        echo "Running environment setup script to start containers..."

        if [ -f "./scripts/prepare_environment.sh" ]; then
            ./scripts/prepare_environment.sh
        elif [ -f "./prepare_environment.sh" ]; then
            ./prepare_environment.sh
        else
            echo "ERROR: prepare_environment.sh not found!"
            exit 1
        fi
    else
        echo "Synthea Docker container is running. Proceeding with pipeline execution."
    fi
fi

# Create params.json file with our configuration
echo "Creating params.json file with pipeline configuration..."
DISEASE_NAME_JSON=$(json_escape "$DISEASE_NAME")
REPORT_FORMAT_JSON=$(json_escape "$REPORT_FORMAT")
cat > params.json << EOF
{
  "disease_name": "$DISEASE_NAME_JSON",
  "modules_dir": "modules",
  "output_dir": "output",
  "generate_patients": $GENERATE_PATIENTS,
  "population": $POPULATION,
  "gender": $GENDER,
  "min_age": $MIN_AGE,
  "max_age": $MAX_AGE,
  "analyze_patient_data": $ANALYZE_DATA,
  "report_format": "$REPORT_FORMAT_JSON",
  "force_generate": $FORCE_GENERATE,
  "publish_dir": "published_output"
EOF

# Add seed if provided
if [ -n "$SEED" ]; then
    echo ", \"seed\": $SEED" >> params.json
fi

# Close JSON object
echo "}" >> params.json

# Display execution details
echo "=================================================="
echo "Running Synthea Pipeline for: $DISEASE_NAME"
echo "Generate patients: $GENERATE_PATIENTS"
if [ "$GENERATE_PATIENTS" = true ]; then
    echo "Population: $POPULATION"
    echo "Gender ratio (proportion female): $GENDER"
    echo "Age range: $MIN_AGE-$MAX_AGE"
    if [ -n "$SEED" ]; then
        echo "Random seed: $SEED"
    fi
fi
echo "Force module generation: $FORCE_GENERATE"
echo "Analyze patient data: $ANALYZE_DATA"
echo "=================================================="

# Execute the Nextflow command. The generated params.json has to be passed
# explicitly; Nextflow does not pick it up from the working directory on its own.
echo "Starting Nextflow pipeline..."
nextflow run main.nf -params-file params.json

echo ""
echo "Pipeline execution complete!"
if [ "$GENERATE_PATIENTS" = true ]; then
    echo "Check the 'output' directory and 'published_output/$DISEASE_NAME_NORMALIZED' for generated patient data."
    if [ "$ANALYZE_DATA" = true ]; then
        echo "Analysis reports can be found in 'published_output/$DISEASE_NAME_NORMALIZED/analysis'."
    fi
fi
echo "Generated modules can be found in the 'modules' directory and 'published_output/modules'."
echo ""
|
||||
99
scripts/simple_module.py
Executable file
99
scripts/simple_module.py
Executable file
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import anthropic
|
||||
|
||||
import re  # used only by the JSON repair fallback further down

DISEASE_NAME = "Excessive frequent and irregular menstruation"
OUTPUT_FILE = "excessive_frequent_and_irregular_menstruation.json"

# Initialize the Anthropic client
client = anthropic.Anthropic()

print(f"Generating module for {DISEASE_NAME}...")

try:
    # Send a message to Claude
    message = client.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=4000,
        temperature=0,
        messages=[
            {"role": "user", "content": f"""Create a Synthea disease module for {DISEASE_NAME} in JSON format.

The module should follow this structure:
{{
"name": "Module Name",
"remarks": [
"Description of the module"
],
"states": {{
"Initial": {{
"type": "Initial",
"direct_transition": "Next State"
}},
"Terminal": {{
"type": "Terminal"
}}
// Additional states with appropriate transitions
}}
}}

Make sure the JSON is properly formatted with no syntax errors.
Do not include any markdown formatting, comments, or explanations outside the JSON.
Output only the valid JSON object."""}
        ]
    )

    # Extract the JSON from the response
    module_json = message.content[0].text

    # Save the raw response for debugging
    with open(f"{OUTPUT_FILE}.raw", "w") as f:
        f.write(module_json)

    print(f"Raw response saved to {OUTPUT_FILE}.raw")

    # Find the first { and last } to extract just the JSON part
    start = module_json.find("{")
    end = module_json.rfind("}") + 1
    if start >= 0 and end > start:
        module_json = module_json[start:end]

    # First cleaning pass: drop template markers and comment-only lines.
    # Only lines that *start* with "//" are treated as comments; a "//" further
    # into the line may belong to a string value (a URL, for instance) and
    # removing such a line would silently corrupt the module.
    cleaned_lines = []
    for line in module_json.split('\n'):
        if "{%" in line or "%}" in line or line.strip().startswith("//"):
            print(f"Removing invalid line: {line}")
            continue
        cleaned_lines.append(line)

    cleaned_json = '\n'.join(cleaned_lines)

    try:
        parsed = json.loads(cleaned_json)
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        print("Attempting secondary cleaning method...")

        # Second pass: remove trailing commas in front of a closing brace or bracket
        repaired_json = re.sub(r",(\s*[}\]])", r"\1", cleaned_json)
        try:
            parsed = json.loads(repaired_json)
        except json.JSONDecodeError as second_error:
            # Write the error details for debugging
            with open(f"{OUTPUT_FILE}.error", "w") as f:
                f.write(f"Error: {str(second_error)}\n\n")
                f.write("JSON that failed to parse:\n")
                f.write(module_json)
            print(f"Error details saved to {OUTPUT_FILE}.error")
            # No module was written, so the script must not report success
            sys.exit(1)

    formatted_json = json.dumps(parsed, indent=2)

    # Write to file
    with open(OUTPUT_FILE, "w") as f:
        f.write(formatted_json)

    print(f"Successfully generated module and saved to {OUTPUT_FILE}")

except Exception as e:  # top-level boundary of the script
    print(f"Error: {e}")
    sys.exit(1)
|
||||
1
scripts/test.py
Normal file
1
scripts/test.py
Normal file
@@ -0,0 +1 @@
|
||||
# Smoke test: send a single "Hello" message to the model and print the reply text.
import anthropic
import os  # kept from the original one-liner; not used below

client = anthropic.Anthropic()
reply = client.messages.create(
    model="claude-3-7-sonnet-20250219",
    max_tokens=1000,
    messages=[{"role": "user", "content": "Hello"}],
)
print(reply.content[0].text)
|
||||
123
scripts/test_module_exists.py
Executable file
123
scripts/test_module_exists.py
Executable file
@@ -0,0 +1,123 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple script to check if a module exists for a given disease and generate it if not.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
# Constants
|
||||
# Disease to check: the first command-line argument when one is given,
# otherwise the built-in example below.
DISEASE_NAME = "Excessive frequent and irregular menstruation"
if len(sys.argv) > 1:
    DISEASE_NAME = sys.argv[1]

# Locations of the module files, the generator script and the disease list
# (relative paths, resolved against the current working directory)
MODULES_DIR = "src/main/resources/modules"
RUN_MODULE_GENERATOR_PATH = "src/main/python/run_module_generator.py"
DISEASE_LIST_PATH = "src/main/resources/disease_list.json"
|
||||
|
||||
# Function to normalize disease name for filenames
|
||||
def normalize_filename(name):
    """Return ``name`` lower-cased with every run of characters other than
    ASCII letters and digits replaced by one underscore, and without leading
    or trailing underscores."""
    lowered = name.lower()
    # One pass is enough: a run of non-alphanumeric characters becomes a single "_"
    collapsed = re.sub(r'[^a-zA-Z0-9]+', '_', lowered)
    return collapsed.strip('_')
|
||||
|
||||
# Main function
|
||||
def main():
    """Make sure a module file exists for ``DISEASE_NAME``, generating it if needed.

    When ``<MODULES_DIR>/<normalized name>.json`` already exists nothing else
    happens. Otherwise a single-entry disease list is written to
    ``DISEASE_LIST_PATH`` and the module generator script is run with
    ``--batch-size 1``.

    The previous contents of the disease list are put back afterwards, because
    ``create_disease_list`` overwrites that file with just one entry.
    NOTE(review): this assumes the generator does not need its own changes to
    the list to persist - confirm against run_module_generator.py.

    Raises:
        SystemExit: with status 1 when the generator fails or the module file
            is still missing afterwards, so callers can detect the failure.
    """
    print(f"Checking if module exists for: {DISEASE_NAME}")

    # Normalize disease name for filename
    normalized_name = normalize_filename(DISEASE_NAME)
    module_path = os.path.join(MODULES_DIR, f"{normalized_name}.json")

    # Check if module already exists
    if os.path.exists(module_path):
        print(f"✅ Module already exists at: {module_path}")
        return

    print(f"❌ Module not found at: {module_path}")
    print(f"Creating disease list entry for {DISEASE_NAME}...")

    # Keep the current disease list (if any) so it can be restored later
    original_list = None
    if os.path.exists(DISEASE_LIST_PATH):
        with open(DISEASE_LIST_PATH, 'rb') as f:
            original_list = f.read()

    generated = False
    try:
        # Create temporary disease list with just this disease
        create_disease_list(DISEASE_NAME)

        # Run the module generator
        print(f"Running module generator for {DISEASE_NAME}...")
        try:
            result = subprocess.run(
                [sys.executable, RUN_MODULE_GENERATOR_PATH, "--batch-size", "1"],
                capture_output=True,
                text=True,
                check=True
            )
        except subprocess.CalledProcessError as e:
            print(f"Error running module generator: {e}")
            print("STDOUT:")
            print(e.stdout)
            print("STDERR:")
            print(e.stderr)
        else:
            print("Module generator output:")
            print(result.stdout)

            # Check if module was created
            if os.path.exists(module_path):
                print(f"✅ Module created successfully at: {module_path}")
                generated = True
            else:
                print(f"❌ Module generation failed, file not found at: {module_path}")
                print("Error output:")
                print(result.stderr)
    finally:
        # Put the original list back, whatever happened above
        if original_list is not None:
            with open(DISEASE_LIST_PATH, 'wb') as f:
                f.write(original_list)

    if not generated:
        sys.exit(1)
|
||||
|
||||
def create_disease_list(disease_name):
    """Overwrite ``DISEASE_LIST_PATH`` with a list holding one entry for ``disease_name``.

    When the file already exists and contains an entry whose ``disease_name``
    equals the requested one (compared case-insensitively), that entry is
    reused. In every other case, including a file that cannot be read or
    parsed, a fresh entry with an empty ``id`` is written.
    """

    def blank_entry():
        # "id" is the ICD-10 code; left empty because it is not known here
        return {
            "id": "",
            "disease_name": disease_name,
            "ICD-10_name": disease_name
        }

    disease_list = []

    if not os.path.exists(DISEASE_LIST_PATH):
        # No list on disk yet: start a new one
        disease_list.append(blank_entry())
        print(f"Created new disease list with '{disease_name}'")
    else:
        try:
            with open(DISEASE_LIST_PATH, 'r') as f:
                original_diseases = json.load(f)

            # Look for the first entry with the same (case-insensitive) name
            wanted = disease_name.lower()
            existing = next(
                (entry for entry in original_diseases
                 if entry.get("disease_name", "").lower() == wanted),
                None,
            )
            if existing is not None:
                disease_list.append(existing)
                print(f"Found existing disease entry for '{disease_name}'")
            else:
                disease_list.append(blank_entry())
                print(f"Created new disease entry for '{disease_name}'")
        except Exception as e:
            # Unreadable or malformed list: report it and fall back to a new entry
            print(f"Error reading disease list: {e}")
            disease_list.append(blank_entry())

    # Save disease list, creating the parent directory when necessary
    os.makedirs(os.path.dirname(DISEASE_LIST_PATH), exist_ok=True)
    with open(DISEASE_LIST_PATH, 'w') as f:
        json.dump(disease_list, f, indent=2)

    print(f"Saved disease list with {len(disease_list)} entries.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
305
scripts/validate_module.py
Executable file
305
scripts/validate_module.py
Executable file
@@ -0,0 +1,305 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Disease Module Validator for Synthea
|
||||
|
||||
This script validates a Synthea disease module for JSON correctness and
|
||||
checks for common issues in the module structure.
|
||||
|
||||
Usage:
|
||||
python validate_module.py <path_to_module.json>
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
|
||||
def color_text(text, color_code):
    """Return ``text`` wrapped in the ANSI escape sequence for ``color_code``,
    followed by the reset sequence."""
    prefix = f"\033[{color_code}m"
    reset = "\033[0m"
    return f"{prefix}{text}{reset}"


def red(text):
    """Return ``text`` colored with ANSI code 91."""
    return color_text(text, color_code="91")


def green(text):
    """Return ``text`` colored with ANSI code 92."""
    return color_text(text, color_code="92")


def yellow(text):
    """Return ``text`` colored with ANSI code 93."""
    return color_text(text, color_code="93")
|
||||
|
||||
def check_required_fields(module_json):
    """Return the required top-level fields ('name', 'states', 'gmf_version')
    that are absent from ``module_json``, in that order."""
    mandatory = ('name', 'states', 'gmf_version')
    return [field for field in mandatory if field not in module_json]
|
||||
|
||||
def check_transitions(module_json):
    """Validate the transitions of every non-Terminal state.

    Two kinds of problems are reported:

    * a transition whose target is not a key of ``module_json['states']``
    * a non-Terminal state that defines no transition at all

    Besides direct, distributed, conditional and complex transitions, the
    ``lookup_table_transition`` and ``type_of_care_transition`` keys are
    recognized as well, so states using them are no longer reported as having
    no transition.

    Args:
        module_json: Parsed module (a mapping with a ``'states'`` mapping).

    Returns:
        A list of error messages; empty when nothing was found. When there is
        no ``'states'`` key the list holds a single message saying so.
    """
    # Skip if no states
    if 'states' not in module_json:
        return ["No 'states' field found"]

    states = module_json['states']
    state_names = set(states.keys())
    errors = []

    # Transition kinds whose value is a list of {"transition": <state>, ...} entries
    list_transitions = (
        'distributed_transition',
        'conditional_transition',
        'complex_transition',
        'lookup_table_transition',
    )

    for state_name, state_config in states.items():
        # Skip Terminal states (they should not have transitions)
        if state_config.get('type') == 'Terminal':
            continue

        transition_found = False

        # Direct transition: the value itself is the target state
        if 'direct_transition' in state_config:
            transition_found = True
            target = state_config['direct_transition']
            if target not in state_names:
                errors.append(f"State '{state_name}' has invalid direct_transition to non-existent state '{target}'")

        # List-style transitions share the same entry layout
        for kind in list_transitions:
            if kind not in state_config:
                continue
            transition_found = True
            for entry in state_config[kind]:
                if 'transition' in entry and entry['transition'] not in state_names:
                    errors.append(f"State '{state_name}' has invalid {kind} to non-existent state '{entry['transition']}'")
                # Complex transitions may nest weighted targets under 'distributions'
                if kind == 'complex_transition' and 'distributions' in entry:
                    for dist in entry['distributions']:
                        if 'transition' in dist and dist['transition'] not in state_names:
                            errors.append(f"State '{state_name}' has invalid complex_transition distribution to non-existent state '{dist['transition']}'")

        # NOTE(review): assumes type_of_care_transition maps a care setting to
        # a state name - confirm against the Synthea module documentation.
        if 'type_of_care_transition' in state_config:
            transition_found = True
            by_care_type = state_config['type_of_care_transition']
            if isinstance(by_care_type, dict):
                for target in by_care_type.values():
                    if isinstance(target, str) and target not in state_names:
                        errors.append(f"State '{state_name}' has invalid type_of_care_transition to non-existent state '{target}'")

        if not transition_found:
            errors.append(f"State '{state_name}' has no transition defined")

    return errors
|
||||
|
||||
def check_codes(module_json):
    """Warn about code systems for which no well-formed code is present.

    The module is walked recursively; every mapping that has both a
    ``'system'`` and a ``'code'`` entry is checked against the pattern of its
    system. Working on the parsed structure (instead of matching a regex
    against the serialized JSON) makes the check independent of key order,
    and the ICD-10 pattern accepts codes with a decimal part such as "N92.0".

    Args:
        module_json: Parsed module (any nesting of dicts and lists).

    Returns:
        One warning per code system (SNOMED-CT, LOINC, RxNorm, ICD-10) for
        which no matching code was found, in that order.
    """
    # Full-match patterns for the code value of each supported system
    code_patterns = {
        'SNOMED-CT': r'[0-9]+',
        'LOINC': r'[0-9-]+',
        'RxNorm': r'[0-9]+',
        'ICD-10': r'[A-Z][0-9][0-9A-Z]*(?:\.[0-9A-Z]+)?',
    }
    systems_seen = set()

    def visit(node):
        if isinstance(node, dict):
            system = node.get('system')
            code = node.get('code')
            pattern = code_patterns.get(system) if isinstance(system, str) else None
            if pattern is not None and isinstance(code, str) and re.fullmatch(pattern, code):
                systems_seen.add(system)
            for child in node.values():
                visit(child)
        elif isinstance(node, list):
            for child in node:
                visit(child)

    visit(module_json)

    return [
        f"No {code_type} codes found. This may be normal depending on the module."
        for code_type in code_patterns
        if code_type not in systems_seen
    ]
|
||||
|
||||
def check_prevalence(module_json):
    """Return a one-element warning list when the serialized module contains
    none of the markers '"prevalence"', 'incidence' or 'probability';
    otherwise return an empty list."""
    serialized = json.dumps(module_json)

    # The first marker found anywhere in the serialized text settles it
    for marker in ('"prevalence"', 'incidence', 'probability'):
        if marker in serialized:
            return []

    return ["No prevalence, incidence, or probability data found. This may affect realism."]
|
||||
|
||||
def check_circular_references(module_json):
    """Report cycles in the state-transition graph of a module.

    An edge is added from a state to every target named by its
    ``direct_transition``, by the ``transition`` key of each entry in
    ``distributed_transition`` / ``conditional_transition`` /
    ``complex_transition``, and by the ``transition`` key of each item in an
    entry's ``distributions`` list. Targets that are not defined states are
    ignored here, so a dangling reference cannot crash the cycle search.

    Args:
        module_json: Parsed module; expected to map ``"states"`` to a dict of
            state name -> state configuration.

    Returns:
        A list of error strings of the form
        ``"Circular reference detected: A -> B -> A"``; empty when there is no
        ``"states"`` key or no cycle is found.
    """
    if 'states' not in module_json:
        return []

    states = module_json['states']
    graph = {state_name: [] for state_name in states}

    def add_edge(source, target):
        # Only keep edges to known states; isinstance also guards against
        # unhashable targets (e.g. a dict where a state name was expected).
        if isinstance(target, str) and target in graph:
            graph[source].append(target)

    list_transition_keys = (
        'distributed_transition',
        'conditional_transition',
        'complex_transition',
    )

    for state_name, state_config in states.items():
        if 'direct_transition' in state_config:
            add_edge(state_name, state_config['direct_transition'])

        for key in list_transition_keys:
            for transition in state_config.get(key) or ():
                if not isinstance(transition, dict):
                    continue
                if 'transition' in transition:
                    add_edge(state_name, transition['transition'])
                for dist in transition.get('distributions') or ():
                    if isinstance(dist, dict) and 'transition' in dist:
                        add_edge(state_name, dist['transition'])

    errors = []
    visited = set()

    # Depth-first search with an explicit stack (no recursion limit on long
    # state chains). `path` is the current DFS path in order; `on_path`
    # mirrors it as a set for constant-time membership tests.
    for root in graph:
        if root in visited:
            continue

        visited.add(root)
        path = [root]
        on_path = {root}
        stack = [iter(graph[root])]

        while stack:
            for neighbor in stack[-1]:
                if neighbor in on_path:
                    # Back edge: the cycle runs from neighbor's position in
                    # the current path back around to neighbor.
                    cycle_start = path.index(neighbor)
                    errors.append(
                        f"Circular reference detected: {' -> '.join(path[cycle_start:] + [neighbor])}"
                    )
                elif neighbor not in visited:
                    visited.add(neighbor)
                    path.append(neighbor)
                    on_path.add(neighbor)
                    stack.append(iter(graph[neighbor]))
                    break
            else:
                # All neighbours of the top node are exhausted: backtrack.
                stack.pop()
                on_path.discard(path.pop())

    return errors
|
||||
|
||||
def _repair_json_text(content):
    """Apply best-effort textual fixes for common JSON mistakes.

    Removes commas that directly precede a closing ``}`` or ``]`` and balances
    the number of curly braces by appending missing ``}`` characters or
    dropping excess ones from the end of the text.

    NOTE: the fixes are purely textual, so braces or ``,}`` sequences inside
    string values are counted/rewritten as well.

    Args:
        content: Raw file text that failed to parse as JSON.

    Returns:
        A ``(fixed_text, brace_delta)`` tuple where ``brace_delta`` is the
        number of ``{`` minus the number of ``}`` found in ``content``
        (positive: braces were appended, negative: braces were removed).
    """
    def strip_trailing_commas(text):
        text = re.sub(r',\s*}', '}', text)
        return re.sub(r',\s*]', ']', text)

    fixed = strip_trailing_commas(content)
    brace_delta = fixed.count('{') - fixed.count('}')

    if brace_delta > 0:
        fixed += '}' * brace_delta
        # Appending braces can create a new ",}" when the text was truncated
        # right after a comma, so strip once more.
        fixed = strip_trailing_commas(fixed)
    elif brace_delta < 0:
        # Drop exactly one trailing brace per excess brace; rstrip('}') would
        # remove every trailing brace of a nested document at once.
        for _ in range(-brace_delta):
            trimmed = fixed.rstrip()
            if not trimmed.endswith('}'):
                break
            fixed = trimmed[:-1]

    return fixed, brace_delta


def main():
    """Validate the module JSON file named on the command line.

    Expects exactly one argument, the path to a module JSON file. The file is
    parsed; when parsing fails, a textual repair is attempted and, if it
    succeeds, the re-serialized JSON is written back to the same path. The
    module is then checked for required fields, invalid transitions, circular
    references, medical codes and prevalence information, and a short
    statistics summary is printed.

    Exits with status 1 on wrong usage, a missing file, JSON that cannot be
    repaired, or any unexpected exception. Findings from the individual
    checks are printed only and do not change the exit status.
    """
    if len(sys.argv) != 2:
        print(f"Usage: python {sys.argv[0]} <path_to_module.json>")
        sys.exit(1)

    module_path = sys.argv[1]

    if not os.path.exists(module_path):
        print(red(f"Error: File {module_path} does not exist"))
        sys.exit(1)

    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(module_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # First check for valid JSON
        try:
            module_json = json.loads(content)
            print(green("✓ Valid JSON structure"))
        except json.JSONDecodeError as e:
            print(red(f"✗ Invalid JSON: {e}"))

            # Try to fix common issues
            print(yellow("Attempting to fix common JSON issues..."))
            fixed_content, brace_delta = _repair_json_text(content)

            if brace_delta > 0:
                print(yellow(f"Adding {brace_delta} missing closing braces"))
            elif brace_delta < 0:
                print(yellow(f"Removing {-brace_delta} excess closing braces"))

            try:
                module_json = json.loads(fixed_content)
            except json.JSONDecodeError as fix_error:
                print(red(f"✗ Could not fix JSON: {fix_error}"))
                sys.exit(1)

            print(green("✓ Fixed JSON issues successfully"))

            # Write the fixed content back to the file
            with open(module_path, 'w', encoding='utf-8') as f:
                f.write(json.dumps(module_json, indent=2))

        # Check required fields
        missing_fields = check_required_fields(module_json)
        if missing_fields:
            print(red(f"✗ Missing required fields: {', '.join(missing_fields)}"))
        else:
            print(green("✓ All required fields present"))

        # Check for invalid transitions
        transition_errors = check_transitions(module_json)
        if transition_errors:
            print(red("✗ Invalid transitions found:"))
            for error in transition_errors:
                print(red(f"  - {error}"))
        else:
            print(green("✓ All transitions valid"))

        # Check for circular references
        circular_errors = check_circular_references(module_json)
        if circular_errors:
            print(red("✗ Circular references found:"))
            for error in circular_errors:
                print(red(f"  - {error}"))
        else:
            print(green("✓ No circular references detected"))

        # Check for medical codes
        code_warnings = check_codes(module_json)
        if code_warnings:
            print(yellow("⚠ Possible code issues:"))
            for warning in code_warnings:
                print(yellow(f"  - {warning}"))
        else:
            print(green("✓ Medical codes look good"))

        # Check for prevalence information
        prevalence_warnings = check_prevalence(module_json)
        if prevalence_warnings:
            print(yellow("⚠ Possible prevalence issues:"))
            for warning in prevalence_warnings:
                print(yellow(f"  - {warning}"))
        else:
            print(green("✓ Prevalence information looks good"))

        # Display module stats
        print("\nModule Statistics:")
        print(f"- Name: {module_json.get('name', 'Unknown')}")
        print(f"- GMF Version: {module_json.get('gmf_version', 'Unknown')}")
        print(f"- States: {len(module_json.get('states', {}))}")
        print(f"- Remarks: {len(module_json.get('remarks', []))}")

    except Exception as e:
        # Top-level boundary: report anything unexpected and fail the run.
        # sys.exit() above raises SystemExit, which is not caught here.
        print(red(f"Error: {e}"))
        sys.exit(1)
|
||||
|
||||
# Run the validator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user