Trying to fix basic functionality again.

2025-03-23 11:53:47 -07:00
parent ebda48190a
commit 2141e81f42
406 changed files with 173963 additions and 69 deletions
--- a/scripts/.nextflow.log
+++ b/scripts/.nextflow.log
@@ -0,0 +1,14 @@
+Mar-20 17:36:01.178 [main] DEBUG nextflow.cli.Launcher - $> nextflow run main.nf --disease_name Migraine --generate_patients true --population 100 --gender 0.6 --modules_dir modules
+Mar-20 17:36:01.428 [main] DEBUG nextflow.cli.CmdRun - N E X T F L O W  ~  version 24.10.5
+Mar-20 17:36:01.492 [main] DEBUG nextflow.plugin.PluginsFacade - Setting up plugin manager > mode=prod; embedded=false; plugins-dir=/Users/richman/.nextflow/plugins; core-plugins: nf-amazon@2.9.2,nf-azure@1.10.2,nf-cloudcache@0.4.2,nf-codecommit@0.2.2,nf-console@1.1.4,nf-google@1.15.4,nf-tower@1.9.3,nf-wave@1.7.4
+Mar-20 17:36:01.528 [main] INFO  o.pf4j.DefaultPluginStatusProvider - Enabled plugins: []
+Mar-20 17:36:01.529 [main] INFO  o.pf4j.DefaultPluginStatusProvider - Disabled plugins: []
+Mar-20 17:36:01.538 [main] INFO  org.pf4j.DefaultPluginManager - PF4J version 3.12.0 in 'deployment' mode
+Mar-20 17:36:01.559 [main] INFO  org.pf4j.AbstractPluginManager - No plugins
+Mar-20 17:36:01.590 [main] DEBUG nextflow.scm.ProviderConfig - Using SCM config path: /Users/richman/.nextflow/scm
+Mar-20 17:36:01.624 [main] DEBUG nextflow.cli.Launcher - Operation aborted
+nextflow.exception.AbortOperationException: Cannot find script file: main.nf
+	at nextflow.cli.CmdRun.getScriptFile(CmdRun.groovy:536)
+	at nextflow.cli.CmdRun.run(CmdRun.groovy:325)
+	at nextflow.cli.Launcher.run(Launcher.groovy:503)
+	at nextflow.cli.Launcher.main(Launcher.groovy:658)
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -0,0 +1,12 @@
+# Utility Scripts
+
+This directory contains utility scripts for working with Synthea modules:
+
+- `analyze_patient_data.py` - Analyzes patient data generated by Synthea
+- `check_json.py` - Validates JSON structure of module files
+- `validate_module.py` - Performs comprehensive validation of modules
+- `test_module_exists.py` - Checks if a module exists for a given disease
+- `run_module.py` - Script to run a specific module with Synthea
+- Other utility scripts for module generation and testing
+
+These scripts complement the core module generator scripts found in the `module_generator/` directory.
--- a/scripts/analyze_patient_data.py
+++ b/scripts/analyze_patient_data.py
@@ -0,0 +1,304 @@
+#!/usr/bin/env python3
+import os
+import sys
+import json
+import glob
+import pandas as pd
+from collections import Counter
+from datetime import datetime
+
+def analyze_patient_data(input_dir, output_dir, report_format='html', disease_name=None):
+    """
+    Analyze Synthea-generated patient data and create reports.
+    
+    Args:
+        input_dir: Directory containing patient JSON files
+        output_dir: Directory to save analysis outputs
+        report_format: Format for the report (html or csv)
+        disease_name: Optional name of the disease being simulated
+    """
+    # Ensure output directory exists
+    os.makedirs(output_dir, exist_ok=True)
+    
+    # Find all patient JSON files
+    patients_files = glob.glob(f"{input_dir}/**/*.json", recursive=True)
+    print(f"Found {len(patients_files)} patient records for analysis")
+    
+    if len(patients_files) == 0:
+        print("No patient files found to analyze.")
+        with open(os.path.join(output_dir, 'patient_stats.json'), 'w') as f:
+            json.dump({"error": "No patient files found to analyze"}, f)
+        return
+    
+    # Initialize data collectors
+    patient_data = []
+    condition_counts = Counter()
+    medication_counts = Counter()
+    demographics = {'gender': Counter(), 'age': [], 'race': Counter(), 'ethnicity': Counter()}
+    
+    # Process each patient file
+    for patient_file in patients_files:
+        try:
+            with open(patient_file, 'r') as f:
+                data = json.load(f)
+                
+            # Basic patient info
+            if 'gender' in data:
+                demographics['gender'][data['gender']] += 1
+            
+            if 'birthDate' in data:
+                # Calculate age based on birth year
+                birth_year = int(data['birthDate'][:4])
+                current_year = datetime.now().year
+                age = current_year - birth_year
+                demographics['age'].append(age)
+            
+            # Process race and ethnicity extensions
+            if 'extension' in data:
+                for ext in data['extension']:
+                    if 'url' in ext and 'extension' in ext:
+                        if ext['url'].endswith('us-core-race'):
+                            for race_ext in ext['extension']:
+                                if 'valueCoding' in race_ext:
+                                    race = race_ext['valueCoding'].get('display', 'Unknown')
+                                    demographics['race'][race] += 1
+                        elif ext['url'].endswith('us-core-ethnicity'):
+                            for eth_ext in ext['extension']:
+                                if 'valueCoding' in eth_ext:
+                                    ethnicity = eth_ext['valueCoding'].get('display', 'Unknown')
+                                    demographics['ethnicity'][ethnicity] += 1
+            
+            # Collect conditions and medications
+            if 'entry' in data:
+                for entry in data['entry']:
+                    if 'resource' in entry:
+                        resource = entry['resource']
+                        
+                        # Check for conditions
+                        if resource.get('resourceType') == 'Condition':
+                            if 'code' in resource and 'coding' in resource['code']:
+                                for code in resource['code']['coding']:
+                                    if 'display' in code:
+                                        condition_counts[code['display']] += 1
+                        
+                        # Check for medications
+                        if resource.get('resourceType') == 'MedicationRequest':
+                            if 'medicationCodeableConcept' in resource and 'coding' in resource['medicationCodeableConcept']:
+                                for code in resource['medicationCodeableConcept']['coding']:
+                                    if 'display' in code:
+                                        medication_counts[code['display']] += 1
+            
+        except Exception as e:
+            print(f"Error processing {patient_file}: {e}")
+    
+    # Prepare statistics
+    stats = {
+        'total_patients': len(patients_files),
+        'disease_name': disease_name,
+        'demographics': {
+            'gender_distribution': {gender: count for gender, count in demographics['gender'].items()},
+            'age_distribution': {
+                'min': min(demographics['age']) if demographics['age'] else None,
+                'max': max(demographics['age']) if demographics['age'] else None,
+                'average': sum(demographics['age']) / len(demographics['age']) if demographics['age'] else None,
+                'distribution': {'0-18': 0, '19-44': 0, '45-64': 0, '65+': 0}
+            },
+            'race_distribution': {race: count for race, count in demographics['race'].items()},
+            'ethnicity_distribution': {ethnicity: count for ethnicity, count in demographics['ethnicity'].items()}
+        },
+        'disease_stats': {
+            'top_conditions': dict(condition_counts.most_common(15)),
+            'top_medications': dict(medication_counts.most_common(15))
+        }
+    }
+    
+    # Calculate age distribution
+    for age in demographics['age']:
+        if age <= 18:
+            stats['demographics']['age_distribution']['distribution']['0-18'] += 1
+        elif age <= 44:
+            stats['demographics']['age_distribution']['distribution']['19-44'] += 1
+        elif age <= 64:
+            stats['demographics']['age_distribution']['distribution']['45-64'] += 1
+        else:
+            stats['demographics']['age_distribution']['distribution']['65+'] += 1
+    
+    # Save statistics to file
+    with open(os.path.join(output_dir, 'patient_stats.json'), 'w') as f:
+        json.dump(stats, f, indent=2)
+    
+    # Generate report in requested format
+    if report_format == 'html':
+        generate_html_report(stats, output_dir)
+    elif report_format == 'csv':
+        generate_csv_reports(stats, output_dir)
+    else:
+        print(f"Unsupported report format: {report_format}. Only stats JSON file created.")
+    
+    print(f"Analysis complete. Reports generated in {os.path.abspath(output_dir)}")
+
+def generate_html_report(stats, output_dir):
+    """Generate an HTML report from the patient statistics."""
+    
+    disease_title = f" - {stats['disease_name']}" if stats['disease_name'] else ""
+    
+    html = f'''<!DOCTYPE html>
+<html>
+<head>
+    <title>Synthea Patient Analysis{disease_title}</title>
+    <style>
+        body {{ font-family: Arial, sans-serif; margin: 20px; }}
+        h1, h2, h3 {{ color: #333; }}
+        .container {{ max-width: 1000px; margin: 0 auto; }}
+        table {{ border-collapse: collapse; width: 100%; margin-bottom: 20px; }}
+        th, td {{ text-align: left; padding: 8px; border-bottom: 1px solid #ddd; }}
+        th {{ background-color: #f2f2f2; }}
+        tr:hover {{background-color: #f5f5f5;}}
+        .chart {{ margin: 20px 0; height: 300px; }}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>Synthea Patient Analysis{disease_title}</h1>
+        <p>Total patients: {stats['total_patients']}</p>
+        
+        <h2>Demographics</h2>
+        <h3>Gender Distribution</h3>
+        <table>
+            <tr><th>Gender</th><th>Count</th><th>Percentage</th></tr>
+'''
+    
+    for gender, count in stats['demographics']['gender_distribution'].items():
+        percentage = (count / stats['total_patients']) * 100 if stats['total_patients'] > 0 else 0
+        html += f"<tr><td>{gender}</td><td>{count}</td><td>{percentage:.1f}%</td></tr>\n"
+    
+    html += '''
+        </table>
+        
+        <h3>Age Distribution</h3>
+        <table>
+            <tr><th>Age Group</th><th>Count</th><th>Percentage</th></tr>
+'''
+    
+    for age_group, count in stats['demographics']['age_distribution']['distribution'].items():
+        percentage = (count / stats['total_patients']) * 100 if stats['total_patients'] > 0 else 0
+        html += f"<tr><td>{age_group}</td><td>{count}</td><td>{percentage:.1f}%</td></tr>\n"
+    
+    html += f'''
+        </table>
+        <p>Min Age: {stats['demographics']['age_distribution']['min']}</p>
+        <p>Max Age: {stats['demographics']['age_distribution']['max']}</p>
+        <p>Average Age: {stats['demographics']['age_distribution']['average']:.1f}</p>
+        
+        <h3>Race Distribution</h3>
+        <table>
+            <tr><th>Race</th><th>Count</th><th>Percentage</th></tr>
+'''
+    
+    for race, count in stats['demographics']['race_distribution'].items():
+        percentage = (count / stats['total_patients']) * 100 if stats['total_patients'] > 0 else 0
+        html += f"<tr><td>{race}</td><td>{count}</td><td>{percentage:.1f}%</td></tr>\n"
+    
+    html += '''
+        </table>
+        
+        <h3>Ethnicity Distribution</h3>
+        <table>
+            <tr><th>Ethnicity</th><th>Count</th><th>Percentage</th></tr>
+'''
+    
+    for ethnicity, count in stats['demographics']['ethnicity_distribution'].items():
+        percentage = (count / stats['total_patients']) * 100 if stats['total_patients'] > 0 else 0
+        html += f"<tr><td>{ethnicity}</td><td>{count}</td><td>{percentage:.1f}%</td></tr>\n"
+    
+    html += '''
+        </table>
+        
+        <h2>Disease Statistics</h2>
+        <h3>Top Conditions</h3>
+        <table>
+            <tr><th>Condition</th><th>Count</th><th>Percentage</th></tr>
+'''
+    
+    for condition, count in stats['disease_stats']['top_conditions'].items():
+        percentage = (count / stats['total_patients']) * 100 if stats['total_patients'] > 0 else 0
+        html += f"<tr><td>{condition}</td><td>{count}</td><td>{percentage:.1f}%</td></tr>\n"
+    
+    html += '''
+        </table>
+        
+        <h3>Top Medications</h3>
+        <table>
+            <tr><th>Medication</th><th>Count</th><th>Percentage of Patients</th></tr>
+'''
+    
+    for medication, count in stats['disease_stats']['top_medications'].items():
+        percentage = (count / stats['total_patients']) * 100 if stats['total_patients'] > 0 else 0
+        html += f"<tr><td>{medication}</td><td>{count}</td><td>{percentage:.1f}%</td></tr>\n"
+    
+    html += '''
+        </table>
+    </div>
+</body>
+</html>'''
+    
+    with open(os.path.join(output_dir, 'patient_analysis.html'), 'w') as f:
+        f.write(html)
+
+def generate_csv_reports(stats, output_dir):
+    """Generate CSV reports from the patient statistics."""
+    import csv
+    
+    # Demographics CSV
+    with open(os.path.join(output_dir, 'demographics.csv'), 'w', newline='') as f:
+        writer = csv.writer(f)
+        writer.writerow(['Category', 'Type', 'Count', 'Percentage'])
+        
+        # Gender
+        for gender, count in stats['demographics']['gender_distribution'].items():
+            percentage = (count / stats['total_patients']) * 100 if stats['total_patients'] > 0 else 0
+            writer.writerow(['Gender', gender, count, f"{percentage:.1f}%"])
+        
+        # Age
+        for age_group, count in stats['demographics']['age_distribution']['distribution'].items():
+            percentage = (count / stats['total_patients']) * 100 if stats['total_patients'] > 0 else 0
+            writer.writerow(['Age', age_group, count, f"{percentage:.1f}%"])
+        
+        # Race
+        for race, count in stats['demographics']['race_distribution'].items():
+            percentage = (count / stats['total_patients']) * 100 if stats['total_patients'] > 0 else 0
+            writer.writerow(['Race', race, count, f"{percentage:.1f}%"])
+        
+        # Ethnicity
+        for ethnicity, count in stats['demographics']['ethnicity_distribution'].items():
+            percentage = (count / stats['total_patients']) * 100 if stats['total_patients'] > 0 else 0
+            writer.writerow(['Ethnicity', ethnicity, count, f"{percentage:.1f}%"])
+    
+    # Conditions CSV
+    with open(os.path.join(output_dir, 'conditions.csv'), 'w', newline='') as f:
+        writer = csv.writer(f)
+        writer.writerow(['Condition', 'Count', 'Percentage'])
+        for condition, count in stats['disease_stats']['top_conditions'].items():
+            percentage = (count / stats['total_patients']) * 100 if stats['total_patients'] > 0 else 0
+            writer.writerow([condition, count, f"{percentage:.1f}%"])
+    
+    # Medications CSV
+    with open(os.path.join(output_dir, 'medications.csv'), 'w', newline='') as f:
+        writer = csv.writer(f)
+        writer.writerow(['Medication', 'Count', 'Percentage'])
+        for medication, count in stats['disease_stats']['top_medications'].items():
+            percentage = (count / stats['total_patients']) * 100 if stats['total_patients'] > 0 else 0
+            writer.writerow([medication, count, f"{percentage:.1f}%"])
+
+if __name__ == "__main__":
+    import argparse
+    
+    parser = argparse.ArgumentParser(description="Analyze Synthea patient data and generate reports")
+    parser.add_argument("--input_dir", required=True, help="Directory containing patient JSON files")
+    parser.add_argument("--output_dir", default="analysis_output", help="Directory to save analysis outputs")
+    parser.add_argument("--report_format", default="html", choices=["html", "csv"], help="Format for the reports")
+    parser.add_argument("--disease_name", help="Name of the disease being simulated")
+    
+    args = parser.parse_args()
+    
+    analyze_patient_data(args.input_dir, args.output_dir, args.report_format, args.disease_name) 
--- a/scripts/check_condition_structure.py
+++ b/scripts/check_condition_structure.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+"""
+Synthea Module Condition Structure Validator
+
+This script scans all existing Synthea disease modules to check if any violate
+the expected condition structure format, particularly looking for nested condition_type
+objects that can cause errors during simulation.
+
+Usage:
+python check_condition_structure.py [--modules_dir DIRECTORY] [--verbose]
+
+Arguments:
+  --modules_dir DIRECTORY   Path to the modules directory (default: src/main/resources/modules)
+  --verbose                 Enable verbose output with detailed issue descriptions
+  --fix                     Attempt to automatically fix simple structure issues (experimental)
+  
+Example:
+python check_condition_structure.py --modules_dir ../modules --verbose
+"""
+
+import os
+import sys
+import json
+import glob
+import argparse
+import logging
+from typing import Dict, List, Any, Tuple
+
+# Configure logging: records at INFO level and above go to stdout, each line
+# prefixed with the timestamp, logger name and level.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+# Module-level logger shared by the functions in this script.
+logger = logging.getLogger(__name__)
+
+def validate_condition_format(module_json):
+    """Validate that conditions in the module follow Synthea's expected format"""
+    try:
+        module_dict = json.loads(module_json) if isinstance(module_json, str) else module_json
+        
+        # Function to recursively check objects for improper condition structure
+        def check_conditions(obj, path=[]):
+            issues = []
+            
+            if isinstance(obj, dict):
+                # Check if this is a condition object with nested condition_type
+                if "condition" in obj and isinstance(obj["condition"], dict):
+                    condition = obj["condition"]
+                    current_path = path + ["condition"]
+                    
+                    # Look for the improper nested structure
+                    if "condition_type" in condition and isinstance(condition["condition_type"], dict):
+                        issue_path = '.'.join(current_path + ["condition_type"])
+                        issues.append(f"Found nested condition_type in a condition object at path: {issue_path}")
+                
+                # Recursively check all dictionary values
+                for key, value in obj.items():
+                    child_issues = check_conditions(value, path + [key])
+                    issues.extend(child_issues)
+            
+            elif isinstance(obj, list):
+                # Recursively check all list items
+                for i, item in enumerate(obj):
+                    child_issues = check_conditions(item, path + [f"[{i}]"])
+                    issues.extend(child_issues)
+            
+            return issues
+        
+        # Check the entire module
+        issues = check_conditions(module_dict)
+        return len(issues) == 0, issues
+    
+    except Exception as e:
+        return False, [f"Validation error: {str(e)}"]
+
+def fix_condition_structure(module_json):
+    """Attempt to fix common condition structure issues"""
+    try:
+        # Parse the module if it's a string
+        module_dict = json.loads(module_json) if isinstance(module_json, str) else module_json
+        fixed = False
+
+        def fix_conditions(obj):
+            nonlocal fixed
+
+            if isinstance(obj, dict):
+                # Check if this is a condition object with nested condition_type
+                if "condition" in obj and isinstance(obj["condition"], dict):
+                    condition = obj["condition"]
+
+                    # Fix the improper nested structure:
+                    if "condition_type" in condition and isinstance(condition["condition_type"], dict):
+                        # Replace nested dict with its first key as a string
+                        first_key = list(condition["condition_type"].keys())[0]
+                        condition["condition_type"] = first_key
+                        fixed = True
+
+                # Recursively fix all dictionary values
+                for key, value in list(obj.items()):
+                    if isinstance(value, (dict, list)):
+                        fix_conditions(value)
+
+            elif isinstance(obj, list):
+                # Recursively fix all list items
+                for item in obj:
+                    if isinstance(item, (dict, list)):
+                        fix_conditions(item)
+
+        # Fix the entire module
+        fix_conditions(module_dict)
+
+        # Return the fixed module and whether changes were made
+        return module_dict, fixed
+
+    except Exception as e:
+        logger.error(f"Error fixing module structure: {e}")
+        return module_json, False
+    
+def main():
+    parser = argparse.ArgumentParser(description='Validate condition structure in Synthea modules')
+    parser.add_argument('--modules_dir', type=str, default='src/main/resources/modules',
+                       help='Path to the modules directory')
+    parser.add_argument('--verbose', action='store_true',
+                       help='Enable verbose output with detailed issue descriptions')
+    parser.add_argument('--fix', action='store_true',
+                       help='Attempt to automatically fix simple structure issues (experimental)')
+    args = parser.parse_args()
+    
+    # Check if modules directory exists
+    if not os.path.exists(args.modules_dir):
+        logger.error(f"Modules directory not found: {args.modules_dir}")
+        sys.exit(1)
+    
+    # Get all module files
+    module_files = glob.glob(os.path.join(args.modules_dir, "*.json"))
+    logger.info(f"Found {len(module_files)} module files to check")
+    
+    # Track statistics
+    valid_modules = 0
+    invalid_modules = 0
+    fixed_modules = 0
+    
+    # Check each module
+    for module_path in module_files:
+        module_name = os.path.basename(module_path)
+        
+        try:
+            # Load module content
+            with open(module_path, 'r') as f:
+                module_content = f.read()
+            
+            # Validate condition structure
+            try:
+                module_json = json.loads(module_content)
+                valid, issues = validate_condition_format(module_json)
+                
+                if valid:
+                    valid_modules += 1
+                    if args.verbose:
+                        logger.info(f"✅ {module_name}: Valid condition structure")
+                else:
+                    invalid_modules += 1
+                    
+                    if args.fix:
+                        # Attempt to fix the module
+                        fixed_module, was_fixed = fix_condition_structure(module_json)
+                        
+                        if was_fixed:
+                            # Write the fixed module back
+                            with open(module_path, 'w') as f:
+                                json.dump(fixed_module, f, indent=2)
+                            
+                            # Validate again to confirm
+                            valid_after_fix, remaining_issues = validate_condition_format(fixed_module)
+                            
+                            if valid_after_fix:
+                                fixed_modules += 1
+                                logger.info(f"🔧 {module_name}: Fixed condition structure issues")
+                            else:
+                                logger.warning(f"⚠️ {module_name}: Could not fix all condition structure issues")
+                                if args.verbose:
+                                    for issue in remaining_issues:
+                                        logger.warning(f"  - {issue}")
+                        else:
+                            logger.warning(f"⚠️ {module_name}: Could not fix condition structure issues")
+                            if args.verbose:
+                                for issue in issues:
+                                    logger.warning(f"  - {issue}")
+                    else:
+                        logger.warning(f"⚠️ {module_name}: Invalid condition structure")
+                        if args.verbose:
+                            for issue in issues:
+                                logger.warning(f"  - {issue}")
+            
+            except json.JSONDecodeError as e:
+                invalid_modules += 1
+                logger.error(f"❌ {module_name}: Invalid JSON format - {str(e)}")
+                
+        except Exception as e:
+            invalid_modules += 1
+            logger.error(f"❌ {module_name}: Error processing file - {str(e)}")
+    
+    # Print summary
+    logger.info("\nSummary:")
+    logger.info(f"Total modules checked: {len(module_files)}")
+    logger.info(f"Valid modules: {valid_modules}")
+    logger.info(f"Invalid modules: {invalid_modules}")
+    
+    if args.fix:
+        logger.info(f"Modules fixed: {fixed_modules}")
+    
+    if invalid_modules > 0:
+        logger.warning("Some modules have condition structure issues that may cause problems in Synthea")
+        if not args.fix:
+            logger.info("Run with --fix to attempt automatic fixes for the issues")
+    else:
+        logger.info("All modules have valid condition structure")
+
+if __name__ == "__main__":
+    main() 
--- a/scripts/check_json.py
+++ b/scripts/check_json.py
@@ -0,0 +1,14 @@
+import json
+import sys
+
+path = sys.argv[1]
+with open(path, 'r') as f:
+    content = f.read()
+
+print(f"File has {content.count('{')} opening braces and {content.count('}')} closing braces")
+
+try:
+    json.loads(content)
+    print('Valid JSON')
+except Exception as e:
+    print(f'Invalid JSON: {e}')
--- a/scripts/cleanup.sh
+++ b/scripts/cleanup.sh
@@ -0,0 +1,262 @@
+#!/bin/bash
+
+# Synthea-All-Diseases Repository Cleanup Script
+# This script removes unnecessary files and keeps only the essential ones for the Synthea module generator
+
+echo "Starting repository cleanup..."
+
+# Create backup directory
+mkdir -p backup
+mkdir -p backup/scripts  # Create a directory for Python scripts
+mkdir -p backup/modules  # Create a directory for modules
+mkdir -p backup/module_generator  # Create a directory for module generator scripts
+
+# 1. Keep essential Nextflow files, remove others
+echo "Handling Nextflow files..."
+cp main.nf backup/
+rm -f build_synthea.nf synthea_module_generator_old.nf minimal_test.nf fresh_synthea.nf simple_synthea.nf test_workflow.nf new_synthea.nf minimal_working.nf
+# Keep synthea_module_generator.nf around for reference but not used
+mv synthea_module_generator.nf backup/
+
+# 2. Keep essential Python scripts, Docker files, and configuration files
+echo "Keeping essential files..."
+if [ -d "src/main/python" ]; then
+  cp -r src/main/python backup/module_generator/
+fi
+if [ -d "src/main/resources/modules" ]; then
+  cp -r src/main/resources/modules/* backup/modules/
+fi
+cp Dockerfile docker-compose.yml CLAUDE.md .env.example nextflow.config README.md backup/
+cp DOCKER_README.md SYNTHEA_GUIDE.md backup/ 2>/dev/null || :  # Keep additional README files
+# Backup .env file if it exists
+if [ -f ".env" ]; then
+  cp .env backup/
+fi
+
+# 3. Back up all Python scripts in the root directory
+echo "Backing up Python scripts..."
+# Key scripts - these will be kept in the scripts directory after cleanup
+# The only key script we'll keep in the root is check_condition_structure.py for convenience
+key_scripts=("check_condition_structure.py")
+for script in "${key_scripts[@]}"; do
+  if [ -f "$script" ]; then
+    cp "$script" backup/
+    echo "  Backed up key script for root: $script"
+  fi
+done
+
+# All other Python scripts - these will be moved to scripts/ directory
+for script in *.py; do
+  if [ -f "$script" ] && [[ ! " ${key_scripts[@]} " =~ " ${script} " ]]; then
+    cp "$script" backup/scripts/
+    echo "  Backed up utility script: $script"
+  fi
+done
+
+# 4. Remove ALL Synthea code - we'll clone it in Docker
+echo "Removing ALL Synthea source code..."
+echo "This will be cloned during Docker build based on Dockerfile..."
+# Remove the entire src directory - we'll recreate what we need
+rm -rf src/
+# Remove any Synthea build files/directories
+rm -rf build/ output/ simulator/ lib/ logs/
+# Remove any Synthea run files
+rm -f run_synthea run_synthea.bat synthea *.jar
+
+# 5. Remove other experimental or temporary files
+echo "Removing experimental and temporary files..."
+rm -f *.log *.txt trace.txt
+rm -f error_output.txt generate_module.sh run_synthetic_data_generation.sh
+rm -f build_docker.sh entrypoint.sh run_module_generator.sh generate_samples.sh test_run.sh
+rm -f run_flexporter .DS_Store
+
+# 6. Remove gradle files and other unnecessary files
+echo "Removing additional unnecessary files..."
+rm -f build.gradle gradlew gradlew.bat settings.gradle
+rm -rf gradle/ .gradle/
+# Note: Keeping .git, .gitignore, .github for version control
+rm -rf .nextflow/ .nextflow.log* nextflow-*.zip
+# Keep LICENSE files but remove other documentation that will be included from Synthea
+rm -f CODE_OF_CONDUCT.md NOTICE
+rm -f *.bak *~
+
+# 7. Create clean directory structure for the repository
+echo "Creating clean directory structure..."
+mkdir -p modules  # Top-level modules directory
+mkdir -p module_generator  # Module generator directory (renamed from python)
+mkdir -p scripts  # Utility scripts directory
+
+# 8. Restore files to the cleaned structure
+echo "Restoring files to cleaned structure..."
+
+# Restore modules
+if [ -d "backup/modules" ]; then
+  cp -r backup/modules/* modules/ 2>/dev/null || :
+  echo "  Restored modules to modules/ directory"
+fi
+
+# Restore Module generator core scripts
+if [ -d "backup/module_generator" ]; then
+  cp -r backup/module_generator/* module_generator/ 2>/dev/null || :
+  echo "  Restored core generator scripts to module_generator/ directory"
+fi
+
+# Restore key Python scripts to root directory
+for script in "${key_scripts[@]}"; do
+  if [ -f "backup/$script" ]; then
+    cp "backup/$script" ./
+    echo "  Restored key script to root: $script"
+  fi
+done
+
+# Restore utility Python scripts to scripts/ directory
+if [ -d "backup/scripts" ]; then
+  cp backup/scripts/* scripts/ 2>/dev/null || :
+  echo "  Restored utility scripts to scripts/ directory"
+fi
+
+# Restore root files
+cp backup/main.nf backup/Dockerfile backup/docker-compose.yml backup/CLAUDE.md backup/nextflow.config backup/.env.example backup/README.md ./ 2>/dev/null || :
+if [ -f "backup/DOCKER_README.md" ]; then
+  cp backup/DOCKER_README.md ./
+fi
+if [ -f "backup/SYNTHEA_GUIDE.md" ]; then
+  cp backup/SYNTHEA_GUIDE.md ./
+fi
+# Restore .env if it existed
+if [ -f "backup/.env" ]; then
+  cp backup/.env ./
+  echo "  Restored .env file"
+fi
+
+# 9. Update the Dockerfile to reference the new directory structure
+echo "Updating Dockerfile references if needed..."
+if [ -f "Dockerfile" ]; then
+  # Rewrite the old source-tree paths in a single in-place pass. The
+  # expressions are applied to every line in the order listed; the last one
+  # covers absolute /app script paths.
+  sed -i.bak \
+    -e 's|COPY src/main/python/|COPY module_generator/|g' \
+    -e 's|COPY python/|COPY module_generator/|g' \
+    -e 's|COPY src/main/resources/modules/|COPY modules/|g' \
+    -e 's|src/main/python/|module_generator/|g' \
+    -e 's|src/main/resources/modules|modules|g' \
+    -e 's|/app/src/main/python/|/app/module_generator/|g' \
+    Dockerfile
+  # -i.bak leaves a backup copy behind; it is not wanted.
+  rm -f Dockerfile.bak
+fi
+
+# 10. Update the main.nf file to reference the new directory structure
+echo "Updating main.nf references if needed..."
+if [ -f "main.nf" ]; then
+  # Same approach for the workflow file: one pass, expressions in order.
+  sed -i.bak \
+    -e 's|src/main/resources/modules|modules|g' \
+    -e 's|src/main/python|module_generator|g' \
+    -e 's|python/|module_generator/|g' \
+    main.nf
+  rm -f main.nf.bak
+fi
+
+# 11. Create a simple README for the scripts directory
+# The heredoc delimiter is quoted ('EOF'), so the text below is written
+# verbatim with no variable or command expansion.
+echo "Creating README for scripts directory..."
+cat > scripts/README.md << 'EOF'
+# Utility Scripts
+
+This directory contains utility scripts for working with Synthea modules:
+
+- `analyze_patient_data.py` - Analyzes patient data generated by Synthea
+- `check_json.py` - Validates JSON structure of module files
+- `validate_module.py` - Performs comprehensive validation of modules
+- `test_module_exists.py` - Checks if a module exists for a given disease
+- `run_module.py` - Script to run a specific module with Synthea
+- Other utility scripts for module generation and testing
+
+These scripts complement the core module generator scripts found in the `module_generator/` directory.
+EOF
+
+# Update README.md with instructions on using the cleaned repo
+# This overwrites any README.md in the current directory, including the copy
+# restored from backup/README.md by the "Restore root files" step.
+echo "Updating README..."
+cat > README.md << 'EOF'
+# Synthea All Diseases
+
+This repository contains a workflow for generating Synthea disease modules using Claude AI and generating synthetic patient data.
+
+## Repository Structure
+
+- `module_generator/` - Core module generation scripts
+- `modules/` - Generated disease modules
+- `scripts/` - Utility Python scripts and tools
+- `main.nf` - Nextflow workflow for module generation and patient data generation
+- `Dockerfile` and `docker-compose.yml` - Docker configuration
+
+## Key Files
+
+- `check_condition_structure.py` - Validates condition structure in modules
+- `main.nf` - Main workflow file for generating modules and patient data
+
+## Running the Pipeline
+
+### Prerequisites
+
+- Docker and Docker Compose
+- Nextflow
+- Anthropic API key (for Claude AI)
+
+### Setup
+
+1. Clone this repository
+2. Copy `.env.example` to `.env` and add your Anthropic API key
+3. Run with Docker Compose:
+
+```
+docker-compose up
+```
+
+### Running the Workflow
+
+```
+nextflow run main.nf --disease_name "Disease Name" [OPTIONS]
+```
+
+For a full list of options, run:
+
+```
+nextflow run main.nf --help
+```
+
+## Key Features
+
+- Generate modules for diseases that don't exist in Synthea
+- Validate condition structure in generated modules
+- Generate synthetic patient data using the modules
+- Analyze generated patient data
+
+## Documentation
+
+- See `CLAUDE.md` for additional development guidelines
+- See `DOCKER_README.md` for Docker setup and usage
+- See `SYNTHEA_GUIDE.md` for detailed usage examples and scenarios
+- See `scripts/README.md` for information about utility scripts
+
+## License
+
+This project uses the same license as Synthea.
+EOF
+
+# 12. Update docker-compose.yml to reference the new directory structure
+# This step only reports; the file itself is never modified.
+echo "Updating docker-compose.yml references if needed..."
+if [ -f "docker-compose.yml" ]; then
+  # No changes needed as it uses relative paths and volume mounts
+  echo "  No changes needed for docker-compose.yml"
+fi
+
+# Final summary for the operator. The backup directory is deliberately left
+# in place; removing it is suggested as a manual follow-up.
+echo "Cleanup complete! All essential files have been kept, and unnecessary files have been removed."
+echo "Repository structure has been simplified to:"
+echo "  - module_generator/ (core module generation scripts)"
+echo "  - modules/ (disease modules)"
+echo "  - scripts/ (utility scripts)"
+echo "  - check_condition_structure.py (in root for easy access)"
+echo ""
+echo "The following important files were preserved:"
+echo "  - .git/ (version control repository)"
+echo "  - .gitignore (version control configuration)" 
+echo "  - .env (environment configuration if it existed)"
+echo "  - .github/ (GitHub configuration if it existed)"
+echo ""
+echo "Optional: Remove the backup directory after verifying everything works: rm -rf backup" 
--- a/scripts/generate_batch.py
+++ b/scripts/generate_batch.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import csv
+import time
+import argparse
+import subprocess
+import concurrent.futures
+import re
+
+# Global variables
+# Parsed command-line namespace: assigned once in main() and read by
+# process_disease() (its force and auto_fallback attributes).
+ARGS = None
+
+def normalize_disease_name(name):
+    """Convert a disease name to a normalized module filename stem.
+
+    The name is lower-cased, every run of non-alphanumeric characters
+    (underscores included) collapses to one underscore, and leading or
+    trailing underscores are dropped.
+
+    "Alphanumeric" is decided with ``str.isalnum`` rather than an ASCII-only
+    character class, so accented letters are kept. generate_module.py, which
+    actually writes the module file, normalizes names the same way; an
+    ASCII-only rule here would make the existence check and the reported
+    path point at a file that is never created (e.g. for "Sjögren Syndrome").
+
+    Returns:
+        The normalized name, or "unknown_disease" when nothing usable
+        remains.
+    """
+    lowered = name.lower()
+
+    # Replace every non-alphanumeric character with an underscore
+    cleaned = ''.join(ch if ch.isalnum() else '_' for ch in lowered)
+
+    # Collapse underscore runs and trim them from both ends
+    normalized = '_'.join(part for part in cleaned.split('_') if part)
+
+    # Ensure the name is not empty
+    return normalized or "unknown_disease"
+
+def process_disease(disease_entry):
+    """Process a single disease from the CSV by running generate_module.py.
+
+    Args:
+        disease_entry: Mapping for one CSV row. The keys read are
+            "disease_name", "id" (used as the ICD-10 code) and
+            "disease_category"; missing or empty cells are treated as "".
+
+    Returns:
+        dict with "name", "status" ("skipped", "success" or "error") and
+        "path" (None on error); error results also carry "error".
+    """
+    # csv.DictReader fills cells missing from short rows with None, so
+    # coerce falsy values to "" before calling string methods on them.
+    disease_name = disease_entry.get("disease_name") or ""
+    normalized_name = normalize_disease_name(disease_name)
+    icd10 = disease_entry.get("id") or ""
+    category = disease_entry.get("disease_category") or ""
+
+    print(f"\n{'='*80}")
+    print(f"Processing disease: {disease_name}")
+    print(f"ICD-10 code: {icd10}")
+    print(f"Category: {category}")
+
+    # Skip if module already exists (unless --force flag is used)
+    module_path = f"src/main/resources/modules/{normalized_name}.json"
+    if os.path.exists(module_path) and not ARGS.force:
+        print(f"✅ Module already exists at {module_path}, skipping")
+        return {"name": disease_name, "status": "skipped", "path": module_path}
+
+    # Create the command
+    cmd = ["python3", "generate_module.py", "--disease", disease_name, "--no-interactive"]
+
+    if icd10:
+        cmd.extend(["--icd10", icd10])
+
+    if category:
+        cmd.extend(["--category", category])
+
+    # Add auto-fallback option if requested
+    if ARGS.auto_fallback:
+        cmd.append("--auto-fallback")
+
+    # generate_module.py has its own "module already exists" early exit.
+    # Without forwarding --force it would return 0 without regenerating
+    # anything, and this function would report a bogus success.
+    if ARGS.force:
+        cmd.append("--force")
+
+    # Run the generator
+    try:
+        print(f"Executing: {' '.join(cmd)}")
+        process = subprocess.run(cmd, check=True, text=True, capture_output=True)
+        print(f"✅ Successfully generated module for {disease_name}")
+        print(process.stdout.strip())
+        return {"name": disease_name, "status": "success", "path": module_path}
+    except subprocess.CalledProcessError as e:
+        print(f"❌ Failed to generate module for {disease_name}")
+        print(f"Error: {str(e)}")
+        print(f"STDOUT: {e.stdout}")
+        print(f"STDERR: {e.stderr}")
+        return {"name": disease_name, "status": "error", "error": str(e), "path": None}
+
+def main():
+    """Main function to process diseases from the CSV.
+
+    Parses the command line, loads the disease list, applies the optional
+    --category / --disease / --limit filters, runs process_disease() for
+    each remaining row (concurrently when --parallel > 1) and prints a
+    summary. Exits with status 1 when the CSV is missing or unreadable, when
+    --disease matches nothing, or when any disease failed.
+    """
+    global ARGS
+    parser = argparse.ArgumentParser(description='Generate Synthea modules for diseases')
+    parser.add_argument('--category', help='Only process diseases in this category')
+    parser.add_argument('--disease', help='Only process a specific disease (by name)')
+    parser.add_argument('--limit', type=int, help='Limit number of diseases to process')
+    parser.add_argument('--parallel', type=int, default=1, help='Number of parallel processes')
+    # --skip-existing is accepted but not consulted anywhere: existing
+    # modules are skipped by default and only --force overrides that.
+    parser.add_argument('--skip-existing', action='store_true', help='Skip diseases that already have modules')
+    parser.add_argument('--csv-path', default='src/main/resources/disease_list.csv', help='Path to disease list CSV')
+    parser.add_argument('--force', action='store_true', help='Force generation even if module already exists')
+    parser.add_argument('--auto-fallback', action='store_true', help='Enable auto-fallback option')
+    args = parser.parse_args()
+
+    ARGS = args
+
+    # Read the disease list CSV
+    if not os.path.exists(args.csv_path):
+        print(f"Error: Disease list CSV not found at {args.csv_path}")
+        sys.exit(1)
+
+    print(f"Reading disease list from {args.csv_path}")
+    try:
+        # newline='' lets the csv module handle line endings itself, so
+        # quoted fields containing embedded newlines are parsed correctly.
+        with open(args.csv_path, 'r', newline='') as f:
+            reader = csv.DictReader(f)
+            diseases = list(reader)
+    except Exception as e:
+        print(f"Error reading CSV: {str(e)}")
+        sys.exit(1)
+
+    print(f"Found {len(diseases)} diseases in the CSV")
+
+    # Filter diseases
+    if args.category:
+        diseases = [d for d in diseases if (d.get('disease_category', '').lower() == args.category.lower())]
+        print(f"Filtered to {len(diseases)} diseases in category '{args.category}'")
+
+    if args.disease:
+        # Try to find an exact match first
+        disease_name_lower = args.disease.lower()
+        exact_match = [d for d in diseases if d.get("disease_name", "").lower() == disease_name_lower]
+
+        if exact_match:
+            diseases = exact_match
+        else:
+            # Try to find a disease that contains the specified name
+            partial_matches = [d for d in diseases if disease_name_lower in d.get("disease_name", "").lower()]
+
+            if partial_matches:
+                diseases = partial_matches
+                print(f"Found {len(diseases)} partial matches for '{args.disease}'")
+            else:
+                print(f"No matches found for disease '{args.disease}'")
+                sys.exit(1)
+
+    if args.limit and args.limit > 0:
+        diseases = diseases[:args.limit]
+        print(f"Limited to {args.limit} diseases")
+
+    # Process the diseases
+    total = len(diseases)
+    print(f"\nProcessing {total} diseases with {args.parallel} parallel workers")
+
+    start_time = time.time()
+
+    if args.parallel > 1:
+        # Threads rather than worker processes: each task only waits on a
+        # generate_module.py subprocess, and threads share the ARGS global.
+        # Worker processes created with the "spawn" start method re-import
+        # this module and would see ARGS = None, failing in process_disease().
+        with concurrent.futures.ThreadPoolExecutor(max_workers=args.parallel) as executor:
+            results = list(executor.map(process_disease, diseases))
+    else:
+        results = [process_disease(disease) for disease in diseases]
+
+    end_time = time.time()
+    elapsed = end_time - start_time
+
+    # Summarize results
+    success_count = sum(1 for r in results if r["status"] == "success")
+    skipped_count = sum(1 for r in results if r["status"] == "skipped")
+    error_count = sum(1 for r in results if r["status"] == "error")
+
+    print("\n" + "="*80)
+    print(f"SUMMARY: Processed {total} diseases in {elapsed:.2f} seconds")
+    print(f"- Successfully generated: {success_count}")
+    print(f"- Skipped (already exist): {skipped_count}")
+    print(f"- Failed: {error_count}")
+
+    # List errors if any
+    if error_count > 0:
+        print("\nFAILED DISEASES:")
+        for result in results:
+            if result["status"] == "error":
+                print(f"- {result['name']}: {result.get('error', 'Unknown error')}")
+
+        # Exit with error code if any failures
+        sys.exit(1)
+
+    print("\nAll done! 🎉")
+
+if __name__ == "__main__":
+    main() 
--- a/scripts/generate_module.py
+++ b/scripts/generate_module.py
@@ -0,0 +1,468 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import json
+import re
+import anthropic
+import csv
+import argparse
+
+# Define global variables
+# All of these are assigned by main() from the command line (and, for the
+# ICD-10 code and category, from the disease list CSV). generate_module()
+# reads DISEASE_NAME, OUTPUT_FILE, icd10_code and disease_category.
+DISEASE_NAME = ""
+OUTPUT_FILE = ""
+icd10_code = ""
+disease_category = ""
+use_interactive = True
+use_auto_fallback = False
+
+def normalize_disease_name(name):
+    """Return the filename stem for a disease name.
+
+    Lower-cases the name, turns every character that is neither alphanumeric
+    nor an underscore into an underscore, then collapses underscore runs and
+    trims them from both ends. An empty string is returned when no
+    alphanumeric character remains.
+    """
+    lowered = name.lower()
+    # Spaces and hyphens are non-alphanumeric, so one pass handles them
+    # together with every other separator or punctuation character.
+    cleaned = ''.join(ch if (ch.isalnum() or ch == '_') else '_' for ch in lowered)
+    tokens = [token for token in cleaned.split('_') if token]
+    return '_'.join(tokens)
+
+def _extract_section(text, number, name, next_name=None):
+    """Return the stripped body of one numbered section of Claude's reply.
+
+    The prompt asks for plain "1. DESCRIPTION: ..." headings, but a reply
+    may decorate them as Markdown ("## 1. DESCRIPTION",
+    "**1. DESCRIPTION:**") or write "RISK FACTORS" for "RISK_FACTORS". The
+    heading match therefore tolerates leading '#', surrounding '*', an
+    optional colon, '_', ' ' or '-' between words, and any letter case.
+
+    Args:
+        text: The full reply text.
+        number: 1-based section number.
+        name: Section name as written in the prompt, e.g. "RISK_FACTORS".
+        next_name: Name of the following section, or None for the last
+            section (which then runs to the end of the text).
+
+    Returns:
+        The section text, or None when the heading (or, for a non-final
+        section, the heading of the following section) is not found.
+    """
+    def heading(num, title):
+        title_re = title.replace('_', '[_ -]')
+        return r'(?:#+\s*)?\**\s*' + str(num) + r'\.\s*\**\s*' + title_re
+
+    # After the name, swallow closing bold markers and a colon on the same
+    # line only, so bold text starting the section body is left intact.
+    pattern = heading(number, name) + r'\**[ \t]*:?[ \t]*\**(.*?)'
+    if next_name:
+        pattern += '(?=' + heading(number + 1, next_name) + ')'
+    else:
+        pattern += r'(?=$)'
+    match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
+    return match.group(1).strip() if match else None
+
+def generate_module():
+    """Ask Claude for clinical details and build a Synthea module from them.
+
+    Reads the module globals DISEASE_NAME, OUTPUT_FILE, icd10_code and
+    disease_category. Writes the raw reply to "<OUTPUT_FILE>.details" and
+    the generated module JSON to OUTPUT_FILE. On any failure the message is
+    printed and, when possible, written to "<OUTPUT_FILE>.error".
+
+    Exits the process with status 1 when ANTHROPIC_API_KEY is not set.
+
+    Returns:
+        0 on success, 1 on failure.
+    """
+    # Initialize the Anthropic client
+    api_key = os.environ.get('ANTHROPIC_API_KEY')
+    if not api_key:
+        print("Error: ANTHROPIC_API_KEY environment variable is not set")
+        sys.exit(1)
+
+    client = anthropic.Anthropic(api_key=api_key)
+
+    # Step 1: Ask Claude for clinical details in a structured format, not JSON
+    print("Step 1: Getting clinical details from Claude...")
+
+    # Prepare disease information for the prompt
+    disease_info = f"Disease name: {DISEASE_NAME}"
+    if icd10_code:
+        disease_info += f"\nICD-10 code: {icd10_code}"
+    if disease_category:
+        disease_info += f"\nCategory: {disease_category}"
+
+    # Construct a prompt that asks for structured data, not JSON
+    prompt = f"""I need information to create a Synthea disease module for {DISEASE_NAME}. 
+    
+{disease_info}
+
+Please provide the following information in a structured format (NOT JSON):
+
+1. DESCRIPTION: A brief description of the disease (3-5 sentences)
+
+2. RISK_FACTORS: Age ranges, gender factors, and/or other risk factors for this disease
+   
+3. SYMPTOMS: List the main symptoms of the disease
+   
+4. DIAGNOSTIC_TESTS: Tests typically used to diagnose this condition
+   
+5. TREATMENTS: List treatments for this disease, including:
+   - Medications (with their class and purpose)
+   - Procedures
+   - Other interventions
+   
+6. COMPLICATIONS: Possible complications of this disease
+   
+7. PROGRESSION: Typical progression of the disease, including:
+   - How it starts
+   - How it typically develops
+   - Possible outcomes
+
+8. FOLLOW_UP: Typical follow-up care needed
+
+For medications, just provide generic names - I'll handle the RxNorm codes.
+For conditions and procedures, just provide plain English names - I'll handle the coding.
+
+Please provide this information in a straightforward, structured text format. Do NOT use JSON.
+"""
+
+    # Get clinical details
+    try:
+        response = client.messages.create(
+            model="claude-3-7-sonnet-20250219",
+            max_tokens=4000,
+            temperature=0.2,
+            messages=[
+                {"role": "user", "content": prompt}
+            ]
+        )
+
+        # Extract the response text
+        clinical_details = response.content[0].text
+
+        # Save the raw clinical details for reference
+        os.makedirs(os.path.dirname(OUTPUT_FILE) or ".", exist_ok=True)
+        with open(f"{OUTPUT_FILE}.details", "w") as f:
+            f.write(clinical_details)
+
+        print(f"Clinical details saved to {OUTPUT_FILE}.details")
+
+        # Step 2: Now build a valid module with the clinical details
+        print("Step 2: Building module with clinical details...")
+
+        # Extract information from the structured response. Every section
+        # is looked up (each one delimits the section before it), even
+        # though only the first five feed the module below.
+        section_names = [
+            "DESCRIPTION", "RISK_FACTORS", "SYMPTOMS", "DIAGNOSTIC_TESTS",
+            "TREATMENTS", "COMPLICATIONS", "PROGRESSION", "FOLLOW_UP",
+        ]
+        sections = {}
+        for idx, section_name in enumerate(section_names):
+            following = section_names[idx + 1] if idx + 1 < len(section_names) else None
+            sections[section_name] = _extract_section(clinical_details, idx + 1, section_name, following)
+
+        # Extract text from matches
+        description = sections["DESCRIPTION"]
+        if description is None:
+            description = f"A module for {DISEASE_NAME}"
+        risk_factors = sections["RISK_FACTORS"] or ""
+        symptoms = sections["SYMPTOMS"] or ""
+        diagnostic_tests = sections["DIAGNOSTIC_TESTS"] or ""
+        treatments = sections["TREATMENTS"] or ""
+
+        print(f"Extracted: {len(description)} chars description, {len(symptoms)} chars symptoms, {len(treatments)} chars treatments")
+
+        # Extract symptoms as a list
+        symptom_list = []
+        if symptoms:
+            # For structured format with bullet points
+            symptom_lines = re.findall(r'[-*]\s*(.*?)(?:\n|$)', symptoms, re.MULTILINE)
+            if symptom_lines:
+                symptom_list = [s.strip() for s in symptom_lines if s.strip()]
+            else:
+                # Try to split by newlines for a less structured format
+                symptom_list = [s.strip() for s in symptoms.split('\n') if s.strip()]
+                # If that doesn't give us anything, just add the whole text as one symptom
+                if not symptom_list and symptoms.strip():
+                    symptom_list = [symptoms.strip()]
+
+        print(f"Found {len(symptom_list)} symptoms")
+
+        # Extract medications from treatments
+        medications = []
+        if treatments:
+            # Find the "Medications:" section
+            meds_section_match = re.search(r'Medications?:\s*(.*?)(?=(?:- Procedures:|$))', treatments, re.DOTALL)
+            if meds_section_match:
+                meds_section = meds_section_match.group(1).strip()
+                # Extract main medication names from list items with explanations
+                # Format is typically: "- Medication name (explanation)" or "- Medication name - explanation"
+                med_items = re.findall(r'[-*]\s*(.*?)(?=[-*]|\n\n|\n- |$)', meds_section, re.DOTALL)
+
+                for item in med_items:
+                    item = item.strip()
+                    if not item:
+                        continue
+
+                    # Extract just the medication name, not the explanation
+                    med_match = re.match(r'([^(]+)(?:\(|-).*', item)
+                    if med_match:
+                        med_name = med_match.group(1).strip()
+                    else:
+                        med_name = item.split('(')[0].strip()
+
+                    # Clean up common patterns
+                    med_name = re.sub(r'\s*\([^)]*\)', '', med_name)  # Remove parentheticals
+                    med_name = re.sub(r'\s*-.*', '', med_name)  # Remove after dash
+
+                    # Skip descriptive phrases that aren't medications.
+                    # "for" must match as a whole word: a plain substring
+                    # test also discards real drugs such as "metformin".
+                    lowered_name = med_name.lower()
+                    if (any(phrase in lowered_name for phrase in ['reduce', 'block', 'prevent'])
+                            or re.search(r'\bfor\b', lowered_name)):
+                        continue
+
+                    # Look for multiple medications in parentheses
+                    if ',' in med_name:
+                        for m in med_name.split(','):
+                            m = m.strip()
+                            if m and not any(m.lower() in existing.lower() for existing in medications):
+                                medications.append(m)
+                    else:
+                        if med_name and not any(med_name.lower() in existing.lower() for existing in medications):
+                            medications.append(med_name)
+
+        print(f"Found {len(medications)} medications: {', '.join(medications[:5])}")
+
+        # Start with a basic module template
+        module = {
+            "name": DISEASE_NAME,
+            "remarks": [],
+            "states": {
+                "Initial": {
+                    "type": "Initial",
+                    "direct_transition": "Age_Guard"
+                },
+                "Age_Guard": {
+                    "type": "Guard",
+                    "allow": {
+                        "condition_type": "Age",
+                        "operator": ">=",
+                        "quantity": 18,
+                        "unit": "years"
+                    },
+                    "direct_transition": "Delay_Until_Onset"
+                },
+                "Delay_Until_Onset": {
+                    "type": "Delay",
+                    "range": {
+                        "low": 0,
+                        "high": 20,
+                        "unit": "years"
+                    },
+                    "direct_transition": "Onset"
+                },
+                "Onset": {
+                    "type": "ConditionOnset",
+                    "target_encounter": "Diagnosis_Encounter",
+                    "codes": [
+                        {
+                            "system": "SNOMED-CT",
+                            "code": "64109004",
+                            "display": DISEASE_NAME
+                        }
+                    ],
+                    "direct_transition": "Diagnosis_Encounter"
+                },
+                "Diagnosis_Encounter": {
+                    "type": "Encounter",
+                    "encounter_class": "ambulatory",
+                    "reason": "Onset",
+                    "codes": [
+                        {
+                            "system": "SNOMED-CT",
+                            "code": "185347001",
+                            "display": "Encounter for problem"
+                        }
+                    ],
+                    "direct_transition": "End_Diagnosis_Encounter"
+                },
+                "End_Diagnosis_Encounter": {
+                    "type": "EncounterEnd"
+                    # direct_transition will be set later
+                }
+            }
+        }
+
+        # Add description to remarks
+        if description:
+            for line in description.split('\n'):
+                line = line.strip()
+                if line:
+                    module["remarks"].append(line)
+
+        # Add ICD-10 code if available
+        if icd10_code:
+            module["remarks"].append(f"ICD-10 code: {icd10_code}")
+
+        # Add category if available
+        if disease_category:
+            module["remarks"].append(f"Category: {disease_category}")
+
+        # Make sure remarks is not empty
+        if not module["remarks"]:
+            module["remarks"].append(f"Generated module for {DISEASE_NAME}")
+
+        # Add risk factors to remarks if available
+        if risk_factors:
+            # Flatten outside the f-string: a backslash inside an f-string
+            # expression is a SyntaxError before Python 3.12.
+            flattened_risk_factors = risk_factors.replace('\n', ' ')
+            module["remarks"].append(f"Risk factors: {flattened_risk_factors}")
+
+        # Add symptoms if available. current_state tracks the tail of the
+        # chain; each new state is linked from it and becomes the new tail.
+        symptom_i = 0
+        current_state = "End_Diagnosis_Encounter"
+
+        if symptom_list:
+            for symptom in symptom_list[:3]:  # Limit to 3 symptoms
+                symptom_name = symptom.strip()
+                if not symptom_name:
+                    continue
+
+                state_name = f"Symptom_{symptom_i}"
+                symptom_i += 1
+
+                module["states"][state_name] = {
+                    "type": "ConditionOnset",
+                    "target_encounter": "Diagnosis_Encounter",
+                    "codes": [
+                        {
+                            "system": "SNOMED-CT",
+                            "code": "418107008",  # Default symptom code
+                            "display": symptom_name
+                        }
+                    ]
+                }
+
+                # Connect states
+                module["states"][current_state]["direct_transition"] = state_name
+                current_state = state_name
+
+        # Add diagnostic tests if available
+        if diagnostic_tests:
+            test_lines = re.findall(r'[-*]\s*(.*?)(?:\n|$)', diagnostic_tests, re.MULTILINE)
+            tests = [t.strip() for t in test_lines if t.strip()]
+
+            if tests:
+                for i, test in enumerate(tests[:2]):  # Limit to 2 tests
+                    test_name = test.strip()
+                    if not test_name:
+                        continue
+
+                    state_name = f"DiagnosticTest_{i}"
+
+                    module["states"][state_name] = {
+                        "type": "Procedure",
+                        "target_encounter": "Diagnosis_Encounter",
+                        "reason": "Onset",
+                        "codes": [
+                            {
+                                "system": "SNOMED-CT",
+                                "code": "386053000",  # Default diagnostic procedure code
+                                "display": test_name
+                            }
+                        ]
+                    }
+
+                    # Connect states
+                    module["states"][current_state]["direct_transition"] = state_name
+                    current_state = state_name
+
+        # Add medications if found
+        if medications:
+            for i, med in enumerate(medications[:3]):  # Limit to 3 medications to keep module manageable
+                med_name = med.strip()
+                if not med_name:
+                    continue
+
+                state_name = f"Prescribe_{i}"
+
+                module["states"][state_name] = {
+                    "type": "MedicationOrder",
+                    "target_encounter": "Diagnosis_Encounter",
+                    "reason": "Onset",
+                    "codes": [
+                        {
+                            "system": "RxNorm",
+                            "code": "308047",  # Default code
+                            "display": med_name
+                        }
+                    ]
+                }
+
+                # Connect states
+                module["states"][current_state]["direct_transition"] = state_name
+                current_state = state_name
+
+        # Connect to Follow-up
+        module["states"][current_state]["direct_transition"] = "Follow_Up_Encounter"
+
+        # Add follow-up encounter
+        module["states"]["Follow_Up_Encounter"] = {
+            "type": "Encounter",
+            "encounter_class": "ambulatory",
+            "reason": "Onset",
+            "codes": [
+                {
+                    "system": "SNOMED-CT",
+                    "code": "390906007",
+                    "display": "Follow-up encounter"
+                }
+            ],
+            "direct_transition": "End_Follow_Up_Encounter"
+        }
+
+        module["states"]["End_Follow_Up_Encounter"] = {
+            "type": "EncounterEnd",
+            "direct_transition": "Terminal"
+        }
+
+        module["states"]["Terminal"] = {
+            "type": "Terminal"
+        }
+
+        # Format the JSON for output
+        formatted_json = json.dumps(module, indent=2)
+
+        # Save the module
+        with open(OUTPUT_FILE, "w") as f:
+            f.write(formatted_json)
+
+        print(f"✅ Successfully generated module and saved to {OUTPUT_FILE}")
+        return 0
+
+    except Exception as e:
+        print(f"❌ Error: {str(e)}")
+
+        # Write the error to a file. The failure may have happened before
+        # the output directory was created (e.g. the API call itself
+        # failed), so create it here, and never let a failed write replace
+        # the error being reported.
+        try:
+            os.makedirs(os.path.dirname(OUTPUT_FILE) or ".", exist_ok=True)
+            with open(f"{OUTPUT_FILE}.error", "w") as f:
+                f.write(f"Error: {str(e)}")
+        except OSError as write_error:
+            print(f"Could not write error file: {write_error}")
+        return 1
+
+def main():
+    """Command-line entry point: generate the module for one disease.
+
+    Sets the module-level configuration from the arguments, returns early
+    when the module file already exists (unless --force is given), fills in
+    a missing ICD-10 code and category from the disease list CSV when the
+    disease is listed there, and then runs generate_module().
+
+    Returns:
+        Process exit status: 0 when the module already exists or was
+        generated, 1 when the API key is missing or generation failed.
+    """
+    global DISEASE_NAME, OUTPUT_FILE, use_interactive, use_auto_fallback
+    global icd10_code, disease_category
+
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(description='Generate a Synthea module for a disease')
+    parser.add_argument('--disease', type=str, required=True, help='Disease name')
+    parser.add_argument('--icd10', type=str, help='ICD-10 code')
+    parser.add_argument('--category', type=str, help='Disease category')
+    parser.add_argument('--no-interactive', action='store_true', help='Skip interactive prompts, for batch processing')
+    parser.add_argument('--auto-fallback', action='store_true', help='Automatically use fallback template if JSON parsing fails')
+    parser.add_argument('--force', action='store_true', help='Force regeneration even if module already exists')
+    args = parser.parse_args()
+
+    # Set global variables
+    DISEASE_NAME = args.disease
+    normalized_name = normalize_disease_name(DISEASE_NAME)
+    OUTPUT_FILE = f"src/main/resources/modules/{normalized_name}.json"
+    use_interactive = not args.no_interactive
+    use_auto_fallback = args.auto_fallback
+
+    # Use provided ICD-10 code or category if specified
+    icd10_code = args.icd10
+    disease_category = args.category
+
+    print(f"Disease name: {DISEASE_NAME}")
+    print(f"Normalized filename: {normalized_name}.json")
+
+    # Check if module already exists
+    if os.path.exists(OUTPUT_FILE):
+        print(f"✅ Module already exists at: {OUTPUT_FILE}")
+        if args.force:
+            print("Force flag set. Regenerating module.")
+        else:
+            return 0
+    else:
+        print(f"❌ Module not found at: {OUTPUT_FILE}")
+
+    # If no ICD-10 code provided, look up in disease list CSV
+    if not icd10_code:
+        csv_path = "src/main/resources/disease_list.csv"
+        if os.path.exists(csv_path):
+            print(f"Looking up disease information in {csv_path}...")
+            try:
+                with open(csv_path, 'r') as f:
+                    reader = csv.DictReader(f)
+                    for row in reader:
+                        if row.get("disease_name", "").lower() == DISEASE_NAME.lower():
+                            icd10_code = row.get("id", "")
+                            disease_category = row.get("disease_category", "")
+                            print(f"Found matching disease in CSV:")
+                            print(f"- ICD-10 code: {icd10_code}")
+                            print(f"- Category: {disease_category}")
+                            break
+                    else:
+                        # for/else: runs only when no row matched
+                        print("No matching disease found in CSV. Continuing without ICD-10 code.")
+            except Exception as e:
+                print(f"Error reading CSV: {str(e)}")
+                print("Continuing without ICD-10 code.")
+
+    # Ensure ANTHROPIC_API_KEY is set
+    if not os.environ.get('ANTHROPIC_API_KEY'):
+        print("Error: ANTHROPIC_API_KEY environment variable is not set")
+        return 1
+
+    # Ask Claude to generate the module and propagate its status, so a
+    # failed generation gives a non-zero exit code that callers running
+    # this script as a subprocess can detect.
+    return generate_module()
+
+if __name__ == "__main__":
+    sys.exit(main()) 
--- a/scripts/patient_analysis.py
+++ b/scripts/patient_analysis.py
@@ -0,0 +1,321 @@
+#!/usr/bin/env python3
+import os
+import sys
+import json
+import glob
+import argparse
+from collections import Counter
+from datetime import datetime
+
+def analyze_patient_data(disease_name, input_dir, output_dir, format_type="html"):
+    print(f"Analyzing patient data for {disease_name}...")
+    
+    # Create the output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+    
+    # Find all patient JSON files
+    patients_files = glob.glob(f"{input_dir}/*.json")
+    patients_files = [f for f in patients_files if not 'hospitalInformation' in f and not 'practitionerInformation' in f]
+    
+    print(f"Found {len(patients_files)} patient records for analysis")
+    
+    if len(patients_files) == 0:
+        print("No patient files found to analyze.")
+        with open(os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report.html"), 'w') as f:
+            f.write(f"<html><body><h1>Analysis Report for {disease_name}</h1><p>No patient files found to analyze.</p></body></html>")
+        # Create empty CSV and JSON files
+        with open(os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report.csv"), 'w') as f:
+            f.write("No patient files found to analyze.\n")
+        with open(os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report.json"), 'w') as f:
+            f.write('{"error": "No patient files found to analyze."}\n')
+        return
+    
+    # Initialize data collectors
+    demographics = {'gender': Counter(), 'age': [], 'race': Counter(), 'ethnicity': Counter()}
+    condition_counts = Counter()
+    medication_counts = Counter()
+    
+    # Process each patient file
+    for patient_file in patients_files:
+        try:
+            with open(patient_file, 'r') as f:
+                data = json.load(f)
+            
+            # Skip non-patient resources
+            if 'resourceType' in data and data['resourceType'] == 'Patient':
+                # Basic patient info
+                if 'gender' in data:
+                    demographics['gender'][data['gender']] += 1
+                
+                if 'birthDate' in data:
+                    # Calculate age based on birth year
+                    birth_year = int(data['birthDate'][:4])
+                    current_year = datetime.now().year
+                    age = current_year - birth_year
+                    demographics['age'].append(age)
+                
+                # Process race and ethnicity extensions
+                if 'extension' in data:
+                    for ext in data.get('extension', []):
+                        if 'url' in ext and 'extension' in ext:
+                            if ext['url'].endswith('us-core-race'):
+                                for race_ext in ext['extension']:
+                                    if 'valueCoding' in race_ext:
+                                        race = race_ext['valueCoding'].get('display', 'Unknown')
+                                        demographics['race'][race] += 1
+                            elif ext['url'].endswith('us-core-ethnicity'):
+                                for eth_ext in ext['extension']:
+                                    if 'valueCoding' in eth_ext:
+                                        ethnicity = eth_ext['valueCoding'].get('display', 'Unknown')
+                                        demographics['ethnicity'][ethnicity] += 1
+            
+            # Check for Bundle resources with entries
+            if 'resourceType' in data and data['resourceType'] == 'Bundle' and 'entry' in data:
+                bundle_has_patient = False
+                for entry in data['entry']:
+                    if 'resource' in entry:
+                        resource = entry['resource']
+                        
+                        # Check if this bundle contains a patient
+                        if resource.get('resourceType') == 'Patient':
+                            bundle_has_patient = True
+                            # Basic patient info
+                            if 'gender' in resource:
+                                demographics['gender'][resource['gender']] += 1
+                            
+                            if 'birthDate' in resource:
+                                # Calculate age based on birth year
+                                birth_year = int(resource['birthDate'][:4])
+                                current_year = datetime.now().year
+                                age = current_year - birth_year
+                                demographics['age'].append(age)
+                            
+                            # Process race and ethnicity extensions
+                            if 'extension' in resource:
+                                for ext in resource.get('extension', []):
+                                    if 'url' in ext and 'extension' in ext:
+                                        if ext['url'].endswith('us-core-race'):
+                                            for race_ext in ext['extension']:
+                                                if 'valueCoding' in race_ext:
+                                                    race = race_ext['valueCoding'].get('display', 'Unknown')
+                                                    demographics['race'][race] += 1
+                                        elif ext['url'].endswith('us-core-ethnicity'):
+                                            for eth_ext in ext['extension']:
+                                                if 'valueCoding' in eth_ext:
+                                                    ethnicity = eth_ext['valueCoding'].get('display', 'Unknown')
+                                                    demographics['ethnicity'][ethnicity] += 1
+                        
+                        # Check for conditions
+                        if resource.get('resourceType') == 'Condition':
+                            if 'code' in resource and 'coding' in resource['code']:
+                                for code in resource['code']['coding']:
+                                    if 'display' in code:
+                                        condition_counts[code['display']] += 1
+                        
+                        # Check for medications
+                        if resource.get('resourceType') == 'MedicationRequest':
+                            if 'medicationCodeableConcept' in resource and 'coding' in resource['medicationCodeableConcept']:
+                                for code in resource['medicationCodeableConcept']['coding']:
+                                    if 'display' in code:
+                                        medication_counts[code['display']] += 1
+        
+        except Exception as e:
+            print(f"Error processing {patient_file}: {e}")
+    
+    # Calculate total patients (count unique patient files)
+    total_patients = sum(demographics['gender'].values())
+    if total_patients == 0:
+        print("Warning: No patient demographics found. Setting total_patients to file count.")
+        total_patients = len(patients_files)
+    
+    print(f"Total patients found: {total_patients}")
+    print(f"Gender distribution: {dict(demographics['gender'])}")
+    
+    if total_patients == 0:
+        total_patients = 1  # Avoid division by zero
+    
+    # Generate HTML report
+    if format_type.lower() in ["html", "all"]:
+        create_html_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients)
+    
+    # Generate CSV report
+    if format_type.lower() in ["csv", "all"]:
+        create_csv_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients)
+    
+    # Generate JSON report
+    if format_type.lower() in ["json", "all"]:
+        create_json_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients)
+    
+    print(f"Analysis complete. Reports generated in {output_dir}")
+
+
+def create_html_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients):
+    with open(os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report.html"), 'w') as f:
+        f.write(f'''<!DOCTYPE html>
+<html>
+<head>
+    <title>Synthea Patient Analysis - {disease_name}</title>
+    <style>
+        body {{ font-family: Arial, sans-serif; margin: 20px; }}
+        h1, h2, h3 {{ color: #333; }}
+        .container {{ max-width: 1000px; margin: 0 auto; }}
+        table {{ border-collapse: collapse; width: 100%; margin-bottom: 20px; }}
+        th, td {{ text-align: left; padding: 8px; border-bottom: 1px solid #ddd; }}
+        th {{ background-color: #f2f2f2; }}
+        tr:hover {{background-color: #f5f5f5;}}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>Synthea Patient Analysis - {disease_name}</h1>
+        <p>Total patients analyzed: {total_patients}</p>
+        
+        <h2>Demographics</h2>
+        <h3>Gender Distribution</h3>
+        <table>
+            <tr><th>Gender</th><th>Count</th><th>Percentage</th></tr>
+''')
+        
+        for gender, count in demographics['gender'].items():
+            percentage = (count / total_patients) * 100
+            f.write(f"<tr><td>{gender}</td><td>{count}</td><td>{percentage:.1f}%</td></tr>\n")
+        
+        f.write('''
+        </table>
+        
+        <h3>Age Statistics</h3>
+        <table>
+''')
+        
+        if demographics['age']:
+            min_age = min(demographics['age'])
+            max_age = max(demographics['age'])
+            avg_age = sum(demographics['age']) / len(demographics['age'])
+            f.write(f"<tr><td>Minimum Age</td><td>{min_age}</td></tr>\n")
+            f.write(f"<tr><td>Maximum Age</td><td>{max_age}</td></tr>\n")
+            f.write(f"<tr><td>Average Age</td><td>{avg_age:.1f}</td></tr>\n")
+        else:
+            f.write("<tr><td colspan='2'>No age data available</td></tr>\n")
+        
+        f.write('''
+        </table>
+        
+        <h3>Top Conditions</h3>
+        <table>
+            <tr><th>Condition</th><th>Count</th><th>Percentage of Patients</th></tr>
+''')
+        
+        for condition, count in sorted(condition_counts.items(), key=lambda x: x[1], reverse=True)[:15]:
+            percentage = (count / total_patients) * 100
+            f.write(f"<tr><td>{condition}</td><td>{count}</td><td>{percentage:.1f}%</td></tr>\n")
+        
+        f.write('''
+        </table>
+        
+        <h3>Top Medications</h3>
+        <table>
+            <tr><th>Medication</th><th>Count</th><th>Percentage of Patients</th></tr>
+''')
+        
+        for medication, count in sorted(medication_counts.items(), key=lambda x: x[1], reverse=True)[:15]:
+            percentage = (count / total_patients) * 100
+            f.write(f"<tr><td>{medication}</td><td>{count}</td><td>{percentage:.1f}%</td></tr>\n")
+        
+        f.write('''
+        </table>
+    </div>
+</body>
+</html>''')
+
+
+def create_csv_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients):
+    with open(os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report.csv"), 'w') as f:
+        # Write header
+        f.write(f"Synthea Patient Analysis - {disease_name}\n")
+        f.write(f"Total patients analyzed,{total_patients}\n\n")
+        
+        # Gender distribution
+        f.write("Gender Distribution\n")
+        f.write("Gender,Count,Percentage\n")
+        for gender, count in demographics['gender'].items():
+            percentage = (count / total_patients) * 100
+            f.write(f"{gender},{count},{percentage:.1f}%\n")
+        f.write("\n")
+        
+        # Age statistics
+        f.write("Age Statistics\n")
+        if demographics['age']:
+            min_age = min(demographics['age'])
+            max_age = max(demographics['age'])
+            avg_age = sum(demographics['age']) / len(demographics['age'])
+            f.write(f"Minimum Age,{min_age}\n")
+            f.write(f"Maximum Age,{max_age}\n")
+            f.write(f"Average Age,{avg_age:.1f}\n")
+        else:
+            f.write("No age data available\n")
+        f.write("\n")
+        
+        # Top conditions
+        f.write("Top Conditions\n")
+        f.write("Condition,Count,Percentage of Patients\n")
+        for condition, count in sorted(condition_counts.items(), key=lambda x: x[1], reverse=True)[:15]:
+            percentage = (count / total_patients) * 100
+            f.write(f"{condition},{count},{percentage:.1f}%\n")
+        f.write("\n")
+        
+        # Top medications
+        f.write("Top Medications\n")
+        f.write("Medication,Count,Percentage of Patients\n")
+        for medication, count in sorted(medication_counts.items(), key=lambda x: x[1], reverse=True)[:15]:
+            percentage = (count / total_patients) * 100
+            f.write(f"{medication},{count},{percentage:.1f}%\n")
+
+
+def create_json_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients):
+    # Prepare the report data
+    report_data = {
+        "disease": disease_name,
+        "total_patients": total_patients,
+        "demographics": {
+            "gender": {k: v for k, v in demographics['gender'].items()},
+            "race": {k: v for k, v in demographics['race'].items()},
+            "ethnicity": {k: v for k, v in demographics['ethnicity'].items()}
+        },
+        "age_statistics": {}
+    }
+    
+    if demographics['age']:
+        report_data["age_statistics"] = {
+            "min_age": min(demographics['age']),
+            "max_age": max(demographics['age']),
+            "avg_age": sum(demographics['age']) / len(demographics['age'])
+        }
+    
+    # Add top conditions
+    report_data["top_conditions"] = [
+        {"name": condition, "count": count, "percentage": (count / total_patients) * 100}
+        for condition, count in sorted(condition_counts.items(), key=lambda x: x[1], reverse=True)[:15]
+    ]
+    
+    # Add top medications
+    report_data["top_medications"] = [
+        {"name": medication, "count": count, "percentage": (count / total_patients) * 100}
+        for medication, count in sorted(medication_counts.items(), key=lambda x: x[1], reverse=True)[:15]
+    ]
+    
+    # Write to JSON file
+    with open(os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report.json"), 'w') as f:
+        json.dump(report_data, f, indent=2)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Analyze Synthea patient data")
+    parser.add_argument("--disease", required=True, help="Disease name")
+    parser.add_argument("--input_dir", required=True, help="Input directory with FHIR files")
+    parser.add_argument("--output_dir", default=".", help="Output directory for reports")
+    parser.add_argument("--format", default="html", choices=["html", "csv", "json", "all"], 
+                       help="Output format (html, csv, json, or all)")
+    
+    args = parser.parse_args()
+    
+    analyze_patient_data(args.disease, args.input_dir, args.output_dir, args.format) 
--- a/scripts/prepare_environment.sh
+++ b/scripts/prepare_environment.sh
@@ -0,0 +1,127 @@
+#!/bin/bash
+# prepare_environment.sh
+#
+# This script prepares the environment for running the Synthea pipeline.
+# It rebuilds the Docker containers, ensures the directory structure is correct,
+# and starts the necessary services.
+#
+# Usage: prepare_environment.sh [--generate-modules]
+# All paths are relative to the current working directory.
+
+set -e  # Exit on error
+
+# Display a header
+echo "=================================================="
+echo "Synthea All Diseases Pipeline - Environment Setup"
+echo "=================================================="
+
+# Check for Docker
+if ! command -v docker &> /dev/null; then
+    echo "ERROR: Docker is not installed or not in the PATH."
+    echo "Please install Docker and try again."
+    exit 1
+fi
+
+# Check for Docker Compose: prefer the standalone docker-compose binary and
+# fall back to the "docker compose" CLI plugin when only that is installed.
+if command -v docker-compose &> /dev/null; then
+    COMPOSE=(docker-compose)
+elif docker compose version &> /dev/null; then
+    COMPOSE=(docker compose)
+else
+    echo "ERROR: docker-compose is not installed or not in the PATH."
+    echo "Please install docker-compose and try again."
+    exit 1
+fi
+
+# Create necessary directories if they don't exist
+echo "Creating required directories..."
+mkdir -p module_generator
+mkdir -p src/main/resources/modules
+mkdir -p src/main/resources/disease
+mkdir -p modules
+mkdir -p output
+
+# Check if the module_generator directory has the required scripts
+if [ ! -f "module_generator/module_generator.py" ] || [ ! -f "module_generator/run_module_generator.py" ]; then
+    # Check if they're in the python directory and we need to move them
+    if [ -d "python" ] && [ -f "python/module_generator.py" ] && [ -f "python/run_module_generator.py" ]; then
+        echo "Found scripts in python directory, moving them to module_generator directory..."
+        mv python/module_generator.py module_generator/
+        mv python/run_module_generator.py module_generator/
+        # Move the README along with the scripts if it exists
+        if [ -f "python/README_module_generator.md" ]; then
+            mv python/README_module_generator.md module_generator/
+        fi
+    else
+        # Only report an error when the fallback location is unusable too.
+        echo "ERROR: Required Python scripts not found in module_generator directory!"
+        echo "Scripts not found in python directory either. Please ensure the scripts are in the module_generator directory."
+        exit 1
+    fi
+fi
+
+# Make the Python scripts executable
+chmod +x module_generator/module_generator.py
+chmod +x module_generator/run_module_generator.py
+
+# Create symlinks for convenience
+ln -sf "$(pwd)/module_generator/run_module_generator.py" "$(pwd)/run_module_generator.py"
+ln -sf "$(pwd)/module_generator/module_generator.py" "$(pwd)/module_generator.py"
+
+# Create an empty disease_list.json if it doesn't exist
+if [ ! -f "src/main/resources/disease_list.json" ]; then
+    echo "Creating empty disease_list.json..."
+    echo "[]" > src/main/resources/disease_list.json
+fi
+
+# Check if .env file exists
+if [ ! -f ".env" ]; then
+    echo ".env file is missing. Creating from .env.example..."
+    if [ -f ".env.example" ]; then
+        cp .env.example .env
+        echo "   Created .env from .env.example. Please edit it with your API key."
+    else
+        echo "WARNING: .env.example not found. Creating minimal .env file..."
+        echo "ANTHROPIC_API_KEY=your_api_key_here" > .env
+    fi
+fi
+
+# Stop any running containers (a failure here is not fatal)
+echo "Stopping any running containers..."
+"${COMPOSE[@]}" down || true
+
+# Rebuild the containers
+echo "Building Docker containers..."
+"${COMPOSE[@]}" build
+
+# Start the Synthea container
+echo "Starting Synthea container..."
+"${COMPOSE[@]}" up -d synthea
+
+# Give the container a moment to start
+echo "Waiting for Synthea container to be ready..."
+sleep 10
+
+# Check that the container exists
+CONTAINER_ID=$("${COMPOSE[@]}" ps -q synthea)
+if [ -z "$CONTAINER_ID" ]; then
+    echo "ERROR: Failed to start Synthea container."
+    exit 1
+fi
+
+# Read the health status; "unknown" is used when docker inspect fails
+# (for example when the container defines no health check).
+HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' "$CONTAINER_ID" 2>/dev/null || echo "unknown")
+if [ "$HEALTH_STATUS" != "healthy" ] && [ "$HEALTH_STATUS" != "unknown" ]; then
+    echo "WARNING: Synthea container is not reporting as healthy (status: $HEALTH_STATUS)."
+    echo "It may take some time for the container to become fully operational."
+    echo "You can proceed, but the pipeline may fail if the container is not ready."
+fi
+
+# Check if we should generate modules
+if [ "${1:-}" == "--generate-modules" ]; then
+    echo "Starting module generator..."
+    "${COMPOSE[@]}" --profile generator up module-generator
+fi
+
+echo "=================================================="
+echo "Environment is ready!"
+echo ""
+echo "To generate patients for a disease module:"
+echo "nextflow run main.nf --disease_name \"Disease Name\" --generate_patients true"
+echo ""
+echo "To generate a new module and patients:"
+echo "nextflow run main.nf --disease_name \"Disease Name\" --force_generate true --generate_patients true"
+echo "=================================================="
--- a/scripts/run_module.py
+++ b/scripts/run_module.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import json
+import anthropic
+
+DISEASE_NAME = "Excessive frequent and irregular menstruation"
+OUTPUT_FILE = "excessive_frequent_and_irregular_menstruation.json"
+
+# Initialize the Anthropic client with your API key
+client = anthropic.Anthropic(
+    # This is the default and can be omitted
+    api_key=os.environ.get("ANTHROPIC_API_KEY"),
+)
+
+# Define the prompt for generating the module
+def generate_module_prompt(disease_name):
+    return f"""
+You are a medical expert creating a disease module for the Synthea patient simulation system. 
+I need you to create a structured JSON module for {disease_name}.
+
+The module will be used to simulate patients with this condition in the Synthea healthcare simulation system. 
+The JSON should follow the Synthea module format which includes:
+
+1. Basic module information (name, remarks)
+2. States representing the progression of the disease
+3. Transitions between states
+4. Guard conditions based on patient attributes where appropriate
+5. Care plans and medications that would be prescribed
+
+Your output should be valid JSON that follows the Synthea module structure precisely. Format it as a complete,
+well-structured Synthea module. Make sure it's medically accurate and includes all relevant clinical details,
+treatment options, and disease progression patterns.
+
+Output only the JSON with no additional commentary or markdown formatting. The output will be directly saved as a file.
+"""
+
+print(f"Generating module for {DISEASE_NAME}...")
+
+try:
+    # Send a message to Claude
+    message = client.messages.create(
+        model="claude-3-7-sonnet-20250219",
+        max_tokens=4000,
+        temperature=0,
+        messages=[
+            {"role": "user", "content": generate_module_prompt(DISEASE_NAME)}
+        ]
+    )
+
+    # Extract the JSON from the response
+    module_json = message.content[0].text
+    
+    # Find the first '{' and last '}' to extract just the JSON part
+    start = module_json.find('{')
+    end = module_json.rfind('}') + 1
+    if start >= 0 and end > start:
+        module_json = module_json[start:end]
+    
+    # Parse and format the JSON
+    parsed = json.loads(module_json)
+    formatted_json = json.dumps(parsed, indent=2)
+    
+    # Write to file
+    with open(OUTPUT_FILE, 'w') as f:
+        f.write(formatted_json)
+    
+    print(f"✅ Successfully generated module and saved to {OUTPUT_FILE}")
+    
+except Exception as e:
+    print(f"Error: {e}")
+    # Save the raw response for debugging
+    with open("error_output.txt", 'w') as f:
+        f.write(str(e))
+    print("Error details saved to error_output.txt")
+    sys.exit(1)
--- a/scripts/run_pipeline.sh
+++ b/scripts/run_pipeline.sh
@@ -0,0 +1,182 @@
+#!/bin/bash
+# run_pipeline.sh
+#
+# This script runs the Nextflow pipeline for generating disease modules and synthetic patients
+
+set -e  # Exit on error
+
+# Default values
+DISEASE_NAME=""
+FORCE_GENERATE=false
+GENERATE_PATIENTS=false
+POPULATION=100
+GENDER=0.5
+MIN_AGE=0
+MAX_AGE=90
+SEED=""
+ANALYZE_DATA=false
+REPORT_FORMAT="html"
+VERIFY_DOCKER=true
+
+# Parse command line arguments
+function show_help {
+    # Print the usage text, then terminate the script with exit status 1.
+    cat << EOF
+Usage: $0 [options] --disease "Disease Name"
+
+Options:
+  --disease, -d NAME     Disease name to generate a module for (required)
+  --force-generate, -f   Force regeneration of disease module even if it exists
+  --patients, -p         Generate synthetic patients (default: false)
+  --population, -n NUM   Number of patients to generate (default: 100)
+  --gender, -g VALUE     Gender distribution (0-1 for % female, default: 0.5)
+  --min-age, -a NUM      Minimum patient age (default: 0)
+  --max-age, -m NUM      Maximum patient age (default: 90)
+  --seed, -s SEED        Random seed for reproducibility
+  --analyze, -A          Analyze patient data after generation
+  --report-format, -r FMT Report format for analysis (html, json, csv, default: html)
+  --skip-docker-check     Skip Docker container verification
+  --help, -h             Show this help message
+
+Example:
+  $0 --disease "Multiple Sclerosis" --patients --population 50
+EOF
+    exit 1
+}
+
+# Parse arguments
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --disease|-d)
+            DISEASE_NAME="$2"
+            shift 2
+            ;;
+        --force-generate|-f)
+            FORCE_GENERATE=true
+            shift
+            ;;
+        --patients|-p)
+            GENERATE_PATIENTS=true
+            shift
+            ;;
+        --population|-n)
+            POPULATION="$2"
+            shift 2
+            ;;
+        --gender|-g)
+            GENDER="$2"
+            shift 2
+            ;;
+        --min-age|-a)
+            MIN_AGE="$2"
+            shift 2
+            ;;
+        --max-age|-m)
+            MAX_AGE="$2"
+            shift 2
+            ;;
+        --seed|-s)
+            SEED="$2"
+            shift 2
+            ;;
+        --analyze|-A)
+            ANALYZE_DATA=true
+            shift
+            ;;
+        --report-format|-r)
+            REPORT_FORMAT="$2"
+            shift 2
+            ;;
+        --skip-docker-check)
+            VERIFY_DOCKER=false
+            shift
+            ;;
+        --help|-h)
+            show_help
+            ;;
+        *)
+            echo "Unknown option: $1"
+            show_help
+            ;;
+    esac
+done
+
+# Check if disease name is provided
+if [ -z "$DISEASE_NAME" ]; then
+    echo "ERROR: Disease name is required!"
+    show_help
+fi
+
+# Check if Docker container is running if we need to generate patients
+if [ "$VERIFY_DOCKER" = true ] && [ "$GENERATE_PATIENTS" = true ]; then
+    CONTAINER_RUNNING=$(docker ps | grep synthea | wc -l)
+    if [ "$CONTAINER_RUNNING" -eq 0 ]; then
+        echo "No Synthea Docker containers are running!"
+        echo "Running environment setup script to start containers..."
+        
+        if [ -f "./scripts/prepare_environment.sh" ]; then
+            ./scripts/prepare_environment.sh
+        elif [ -f "./prepare_environment.sh" ]; then
+            ./prepare_environment.sh
+        else
+            echo "ERROR: prepare_environment.sh not found!"
+            exit 1
+        fi
+    else
+        echo "Synthea Docker container is running. Proceeding with pipeline execution."
+    fi
+fi
+
+# Create params.json file with our configuration
+echo "Creating params.json file with pipeline configuration..."
+cat > params.json << EOF
+{
+  "disease_name": "$DISEASE_NAME",
+  "modules_dir": "modules",
+  "output_dir": "output",
+  "generate_patients": $GENERATE_PATIENTS,
+  "population": $POPULATION,
+  "gender": $GENDER,
+  "min_age": $MIN_AGE,
+  "max_age": $MAX_AGE,
+  "analyze_patient_data": $ANALYZE_DATA,
+  "report_format": "$REPORT_FORMAT",
+  "force_generate": $FORCE_GENERATE,
+  "publish_dir": "published_output"
+EOF
+
+# Add seed if provided
+if [ ! -z "$SEED" ]; then
+    echo ",  \"seed\": $SEED" >> params.json
+fi
+
+# Close JSON object
+echo "}" >> params.json
+
+# Display execution details
+echo "=================================================="
+echo "Running Synthea Pipeline for: $DISEASE_NAME"
+echo "Generate patients: $GENERATE_PATIENTS"
+if [ "$GENERATE_PATIENTS" = true ]; then
+    echo "Population: $POPULATION"
+    echo "Gender ratio (proportion female): $GENDER"
+    echo "Age range: $MIN_AGE-$MAX_AGE"
+    if [ ! -z "$SEED" ]; then
+        echo "Random seed: $SEED"
+    fi
+fi
+echo "Force module generation: $FORCE_GENERATE"
+echo "Analyze patient data: $ANALYZE_DATA"
+echo "=================================================="
+
+# Execute the Nextflow command
+echo "Starting Nextflow pipeline..."
+nextflow run main.nf
+
+echo ""
+echo "Pipeline execution complete!"
+if [ "$GENERATE_PATIENTS" = true ]; then
+    echo "Check the 'output' directory and 'published_output/$DISEASE_NAME_NORMALIZED' for generated patient data."
+    if [ "$ANALYZE_DATA" = true ]; then
+        echo "Analysis reports can be found in 'published_output/$DISEASE_NAME_NORMALIZED/analysis'."
+    fi
+fi
+echo "Generated modules can be found in the 'modules' directory and 'published_output/modules'."
+echo "" 
--- a/scripts/simple_module.py
+++ b/scripts/simple_module.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import json
+import anthropic
+
+DISEASE_NAME = "Excessive frequent and irregular menstruation"
+OUTPUT_FILE = "excessive_frequent_and_irregular_menstruation.json"
+
+# Initialize the Anthropic client
+client = anthropic.Anthropic()
+
+print(f"Generating module for {DISEASE_NAME}...")
+
+try:
+    # Send a message to Claude
+    message = client.messages.create(
+        model="claude-3-7-sonnet-20250219",
+        max_tokens=4000,
+        temperature=0,
+        messages=[
+            {"role": "user", "content": f"""Create a Synthea disease module for {DISEASE_NAME} in JSON format. 
+
+The module should follow this structure:
+{{
+  "name": "Module Name",
+  "remarks": [
+    "Description of the module"
+  ],
+  "states": {{
+    "Initial": {{
+      "type": "Initial",
+      "direct_transition": "Next State"
+    }},
+    "Terminal": {{
+      "type": "Terminal"
+    }}
+    // Additional states with appropriate transitions
+  }}
+}}
+
+Make sure the JSON is properly formatted with no syntax errors. 
+Do not include any markdown formatting, comments, or explanations outside the JSON.
+Output only the valid JSON object."""}
+        ]
+    )
+
+    # Extract the JSON from the response
+    module_json = message.content[0].text
+    
+    # Save the raw response for debugging
+    with open(f"{OUTPUT_FILE}.raw", "w") as f:
+        f.write(module_json)
+        
+    print(f"Raw response saved to {OUTPUT_FILE}.raw")
+    
+    # Find the first { and last } to extract just the JSON part
+    start = module_json.find("{")
+    end = module_json.rfind("}") + 1
+    if start >= 0 and end > start:
+        module_json = module_json[start:end]
+        
+    # Fix common JSON issues
+    try:
+        # Manual cleaning of known JSON issues
+        # Find and remove lines with invalid syntax
+        cleaned_lines = []
+        for line in module_json.split('\n'):
+            # Skip lines with "{%" or any other invalid JSON syntax
+            if "{%" in line or "%}" in line or "//" in line:
+                print(f"Removing invalid line: {line}")
+                continue
+            cleaned_lines.append(line)
+        
+        cleaned_json = '\n'.join(cleaned_lines)
+        
+        # Try to parse and fix the JSON
+        parsed = json.loads(cleaned_json)
+        formatted_json = json.dumps(parsed, indent=2)
+        
+        # Write to file
+        with open(OUTPUT_FILE, "w") as f:
+            f.write(formatted_json)
+        
+        print(f"Successfully generated module and saved to {OUTPUT_FILE}")
+    except json.JSONDecodeError as e:
+        print(f"JSON parsing error: {e}")
+        print("Attempting secondary cleaning method...")
+        
+        # Write the error details for debugging
+        with open(f"{OUTPUT_FILE}.error", "w") as f:
+            f.write(f"Error: {str(e)}\n\n")
+            f.write("JSON that failed to parse:\n")
+            f.write(module_json)
+    
+except Exception as e:
+    print(f"Error: {e}")
+    sys.exit(1) 
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -0,0 +1 @@
+import anthropic; import os; client = anthropic.Anthropic(); print(client.messages.create(model="claude-3-7-sonnet-20250219", max_tokens=1000, messages=[{"role": "user", "content": "Hello"}]).content[0].text)
--- a/scripts/test_module_exists.py
+++ b/scripts/test_module_exists.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+"""
+Simple script to check if a module exists for a given disease and generate it if not.
+"""
+
+import os
+import sys
+import json
+import subprocess
+import re
+
+# Constants
+# Disease to look up: the first command-line argument, or this default when none is given.
+DISEASE_NAME = sys.argv[1] if len(sys.argv) > 1 else "Excessive frequent and irregular menstruation"
+# Directory searched for "<normalized disease name>.json" module files (relative path).
+MODULES_DIR = "src/main/resources/modules"
+# Script run with the current interpreter when the module file is missing.
+RUN_MODULE_GENERATOR_PATH = "src/main/python/run_module_generator.py"
+# JSON file that create_disease_list() overwrites with the single requested entry.
+DISEASE_LIST_PATH = "src/main/resources/disease_list.json"
+
+# Function to normalize disease name for filenames
+def normalize_filename(name):
+    """Convert disease name to normalized filename format."""
+    filename = name.lower()
+    filename = re.sub(r'[^a-zA-Z0-9]', '_', filename)
+    filename = re.sub(r'_+', '_', filename)
+    filename = filename.strip('_')
+    return filename
+
+# Main function
+def main():
+    print(f"Checking if module exists for: {DISEASE_NAME}")
+    
+    # Normalize disease name for filename
+    normalized_name = normalize_filename(DISEASE_NAME)
+    module_path = os.path.join(MODULES_DIR, f"{normalized_name}.json")
+    
+    # Check if module already exists
+    if os.path.exists(module_path):
+        print(f"✅ Module already exists at: {module_path}")
+        return
+    
+    print(f"❌ Module not found at: {module_path}")
+    print(f"Creating disease list entry for {DISEASE_NAME}...")
+    
+    # Create temporary disease list with just this disease
+    create_disease_list(DISEASE_NAME)
+    
+    # Run the module generator
+    print(f"Running module generator for {DISEASE_NAME}...")
+    try:
+        result = subprocess.run(
+            [sys.executable, RUN_MODULE_GENERATOR_PATH, "--batch-size", "1"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        
+        print("Module generator output:")
+        print(result.stdout)
+        
+        # Check if module was created
+        if os.path.exists(module_path):
+            print(f"✅ Module created successfully at: {module_path}")
+        else:
+            print(f"❌ Module generation failed, file not found at: {module_path}")
+            print("Error output:")
+            print(result.stderr)
+    except subprocess.CalledProcessError as e:
+        print(f"Error running module generator: {e}")
+        print("STDOUT:")
+        print(e.stdout)
+        print("STDERR:")
+        print(e.stderr)
+
+def create_disease_list(disease_name):
+    """Create a temporary disease list with just the requested disease."""
+    disease_list = []
+    
+    # Try to read original disease list if it exists
+    if os.path.exists(DISEASE_LIST_PATH):
+        try:
+            with open(DISEASE_LIST_PATH, 'r') as f:
+                original_diseases = json.load(f)
+            
+            # Check if our disease already exists
+            name_lower = disease_name.lower()
+            for disease in original_diseases:
+                if disease.get("disease_name", "").lower() == name_lower:
+                    disease_list.append(disease)
+                    print(f"Found existing disease entry for '{disease_name}'")
+                    break
+            else:
+                # Disease not found in list
+                disease_list.append({
+                    "id": "",  # ICD-10 code (empty as we don't have it)
+                    "disease_name": disease_name,
+                    "ICD-10_name": disease_name
+                })
+                print(f"Created new disease entry for '{disease_name}'")
+        except Exception as e:
+            print(f"Error reading disease list: {e}")
+            # Create new entry
+            disease_list.append({
+                "id": "",
+                "disease_name": disease_name,
+                "ICD-10_name": disease_name
+            })
+    else:
+        # Create new entry
+        disease_list.append({
+            "id": "",
+            "disease_name": disease_name,
+            "ICD-10_name": disease_name
+        })
+        print(f"Created new disease list with '{disease_name}'")
+    
+    # Save disease list
+    os.makedirs(os.path.dirname(DISEASE_LIST_PATH), exist_ok=True)
+    with open(DISEASE_LIST_PATH, 'w') as f:
+        json.dump(disease_list, f, indent=2)
+    
+    print(f"Saved disease list with {len(disease_list)} entries.")
+
+# Run the check only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main() 
--- a/scripts/validate_module.py
+++ b/scripts/validate_module.py
@@ -0,0 +1,305 @@
+#!/usr/bin/env python3
+"""
+Disease Module Validator for Synthea
+
+This script validates a Synthea disease module for JSON correctness and
+checks for common issues in the module structure.
+
+Usage:
+python validate_module.py <path_to_module.json>
+"""
+
+import json
+import sys
+import os
+import re
+
+def color_text(text, color_code):
+    """Add color to terminal output"""
+    return f"\033[{color_code}m{text}\033[0m"
+
+def red(text):
+    return color_text(text, "91")
+
+def green(text):
+    return color_text(text, "92")
+
+def yellow(text):
+    return color_text(text, "93")
+
+def check_required_fields(module_json):
+    """Check if the module has all required fields"""
+    required_fields = ['name', 'states', 'gmf_version']
+    missing_fields = []
+    
+    for field in required_fields:
+        if field not in module_json:
+            missing_fields.append(field)
+    
+    return missing_fields
+
+def check_transitions(module_json):
+    """Check if all transitions are valid"""
+    errors = []
+    
+    # Skip if no states
+    if 'states' not in module_json:
+        return ["No 'states' field found"]
+    
+    states = module_json['states']
+    state_names = set(states.keys())
+    
+    # Check each state
+    for state_name, state_config in states.items():
+        # Skip Terminal states (they should not have transitions)
+        if state_config.get('type') == 'Terminal':
+            continue
+        
+        # Check if the state has any transition
+        transition_found = False
+        transition_types = ['direct_transition', 'distributed_transition', 
+                           'conditional_transition', 'complex_transition']
+        
+        for transition_type in transition_types:
+            if transition_type in state_config:
+                transition_found = True
+                
+                # Check direct transitions
+                if transition_type == 'direct_transition':
+                    target = state_config[transition_type]
+                    if target not in state_names:
+                        errors.append(f"State '{state_name}' has invalid direct_transition to non-existent state '{target}'")
+                
+                # Check distributed transitions
+                elif transition_type == 'distributed_transition':
+                    for transition in state_config[transition_type]:
+                        if 'transition' in transition and transition['transition'] not in state_names:
+                            errors.append(f"State '{state_name}' has invalid distributed_transition to non-existent state '{transition['transition']}'")
+                
+                # Check conditional transitions
+                elif transition_type == 'conditional_transition':
+                    for transition in state_config[transition_type]:
+                        if 'transition' in transition and transition['transition'] not in state_names:
+                            errors.append(f"State '{state_name}' has invalid conditional_transition to non-existent state '{transition['transition']}'")
+                
+                # Check complex transitions
+                elif transition_type == 'complex_transition':
+                    for transition in state_config[transition_type]:
+                        if 'transition' in transition and transition['transition'] not in state_names:
+                            errors.append(f"State '{state_name}' has invalid complex_transition to non-existent state '{transition['transition']}'")
+                        if 'distributions' in transition:
+                            for dist in transition['distributions']:
+                                if 'transition' in dist and dist['transition'] not in state_names:
+                                    errors.append(f"State '{state_name}' has invalid complex_transition distribution to non-existent state '{dist['transition']}'")
+        
+        if not transition_found and state_config.get('type') != 'Terminal':
+            errors.append(f"State '{state_name}' has no transition defined")
+    
+    return errors
+
+def check_codes(module_json):
+    """Check if medical codes are properly formatted"""
+    warnings = []
+    
+    json_str = json.dumps(module_json)
+    
+    # Check for codes in common formats
+    code_patterns = {
+        'SNOMED-CT': r'"system":\s*"SNOMED-CT",\s*"code":\s*"[0-9]+"',
+        'LOINC': r'"system":\s*"LOINC",\s*"code":\s*"[0-9\\-]+"',
+        'RxNorm': r'"system":\s*"RxNorm",\s*"code":\s*"[0-9]+"',
+        'ICD-10': r'"system":\s*"ICD-10",\s*"code":\s*"[A-Z][0-9]+"'
+    }
+    
+    for code_type, pattern in code_patterns.items():
+        if not re.search(pattern, json_str):
+            warnings.append(f"No {code_type} codes found. This may be normal depending on the module.")
+    
+    return warnings
+
+def check_prevalence(module_json):
+    """Check if prevalence information exists"""
+    warnings = []
+    
+    json_str = json.dumps(module_json)
+    
+    # Check for prevalence patterns
+    if not any(pat in json_str for pat in ['"prevalence"', 'incidence', 'probability']):
+        warnings.append("No prevalence, incidence, or probability data found. This may affect realism.")
+    
+    return warnings
+
+def check_circular_references(module_json):
+    """Check for circular references between states"""
+    errors = []
+    
+    # Skip if no states
+    if 'states' not in module_json:
+        return []
+    
+    states = module_json['states']
+    
+    # Build a directed graph representation
+    graph = {}
+    for state_name in states:
+        graph[state_name] = []
+    
+    # Add edges to the graph
+    for state_name, state_config in states.items():
+        # Add direct transitions
+        if 'direct_transition' in state_config:
+            target = state_config['direct_transition']
+            graph[state_name].append(target)
+        
+        # Add distributed transitions
+        if 'distributed_transition' in state_config:
+            for transition in state_config['distributed_transition']:
+                if 'transition' in transition:
+                    graph[state_name].append(transition['transition'])
+        
+        # Add conditional transitions
+        if 'conditional_transition' in state_config:
+            for transition in state_config['conditional_transition']:
+                if 'transition' in transition:
+                    graph[state_name].append(transition['transition'])
+        
+        # Add complex transitions
+        if 'complex_transition' in state_config:
+            for transition in state_config['complex_transition']:
+                if 'transition' in transition:
+                    graph[state_name].append(transition['transition'])
+    
+    # Check for cycles in the graph (simplified check)
+    visited = set()
+    path = []
+    
+    def dfs(node):
+        visited.add(node)
+        path.append(node)
+        
+        for neighbor in graph[node]:
+            if neighbor in path:
+                # Found a cycle
+                cycle_start = path.index(neighbor)
+                errors.append(f"Circular reference detected: {' -> '.join(path[cycle_start:] + [neighbor])}")
+            elif neighbor not in visited:
+                dfs(neighbor)
+        
+        path.pop()
+    
+    # Run DFS from all states to find cycles
+    for state_name in states:
+        if state_name not in visited:
+            dfs(state_name)
+    
+    return errors
+
+def main():
+    if len(sys.argv) != 2:
+        print(f"Usage: python {sys.argv[0]} <path_to_module.json>")
+        sys.exit(1)
+    
+    module_path = sys.argv[1]
+    
+    if not os.path.exists(module_path):
+        print(red(f"Error: File {module_path} does not exist"))
+        sys.exit(1)
+    
+    try:
+        with open(module_path, 'r') as f:
+            content = f.read()
+            
+        # First check for valid JSON
+        try:
+            module_json = json.loads(content)
+            print(green("✓ Valid JSON structure"))
+        except json.JSONDecodeError as e:
+            print(red(f"✗ Invalid JSON: {e}"))
+            
+            # Try to fix common issues
+            print(yellow("Attempting to fix common JSON issues..."))
+            
+            # Fix trailing commas
+            fixed_content = re.sub(r',\s*}', '}', content)
+            fixed_content = re.sub(r',\s*]', ']', fixed_content)
+            
+            # Count braces
+            open_braces = fixed_content.count('{')
+            close_braces = fixed_content.count('}')
+            
+            if open_braces > close_braces:
+                print(yellow(f"Adding {open_braces - close_braces} missing closing braces"))
+                fixed_content += '}' * (open_braces - close_braces)
+            elif close_braces > open_braces:
+                print(yellow(f"Removing {close_braces - open_braces} excess closing braces"))
+                for _ in range(close_braces - open_braces):
+                    fixed_content = fixed_content.rstrip().rstrip('}') + '}'
+            
+            try:
+                module_json = json.loads(fixed_content)
+                print(green("✓ Fixed JSON issues successfully"))
+                
+                # Write the fixed content back to the file
+                with open(module_path, 'w') as f:
+                    f.write(json.dumps(module_json, indent=2))
+                    
+            except json.JSONDecodeError as e:
+                print(red(f"✗ Could not fix JSON: {e}"))
+                sys.exit(1)
+        
+        # Check required fields
+        missing_fields = check_required_fields(module_json)
+        if missing_fields:
+            print(red(f"✗ Missing required fields: {', '.join(missing_fields)}"))
+        else:
+            print(green("✓ All required fields present"))
+        
+        # Check for invalid transitions
+        transition_errors = check_transitions(module_json)
+        if transition_errors:
+            print(red("✗ Invalid transitions found:"))
+            for error in transition_errors:
+                print(red(f"  - {error}"))
+        else:
+            print(green("✓ All transitions valid"))
+        
+        # Check for circular references
+        circular_errors = check_circular_references(module_json)
+        if circular_errors:
+            print(red("✗ Circular references found:"))
+            for error in circular_errors:
+                print(red(f"  - {error}"))
+        else:
+            print(green("✓ No circular references detected"))
+        
+        # Check for medical codes
+        code_warnings = check_codes(module_json)
+        if code_warnings:
+            print(yellow("⚠ Possible code issues:"))
+            for warning in code_warnings:
+                print(yellow(f"  - {warning}"))
+        else:
+            print(green("✓ Medical codes look good"))
+        
+        # Check for prevalence information
+        prevalence_warnings = check_prevalence(module_json)
+        if prevalence_warnings:
+            print(yellow("⚠ Possible prevalence issues:"))
+            for warning in prevalence_warnings:
+                print(yellow(f"  - {warning}"))
+        else:
+            print(green("✓ Prevalence information looks good"))
+        
+        # Display module stats
+        print("\nModule Statistics:")
+        print(f"- Name: {module_json.get('name', 'Unknown')}")
+        print(f"- GMF Version: {module_json.get('gmf_version', 'Unknown')}")
+        print(f"- States: {len(module_json.get('states', {}))}")
+        print(f"- Remarks: {len(module_json.get('remarks', []))}")
+        
+    except Exception as e:
+        print(red(f"Error: {e}"))
+        sys.exit(1)
+
+# Run the validator only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
				`@@ -0,0 +1 @@`
				`import anthropic; import os; client = anthropic.Anthropic(); print(client.messages.create(model="claude-3-7-sonnet-20250219", max_tokens=1000, messages=[{"role": "user", "content": "Hello"}]).content[0].text)`