synthea-alldiseases/scripts/generate_module.py

#!/usr/bin/env python3

import os
import sys
import json
import re
import anthropic
import csv
import argparse

# Module-level configuration. main() fills these in from the command line
# (and, for the ICD-10 code and category, optionally from the disease CSV);
# generate_module() reads them.
DISEASE_NAME = ""  # disease name as passed via --disease
OUTPUT_FILE = ""  # path of the module JSON file to write
icd10_code = ""  # ICD-10 code from --icd10 or the CSV lookup
disease_category = ""  # category from --category or the CSV lookup
use_interactive = True  # set from --no-interactive; not read elsewhere in the visible code
use_auto_fallback = False  # set from --auto-fallback; not read elsewhere in the visible code

def normalize_disease_name(name):
    """Turn a disease name into a lowercase, underscore-separated file stem.

    Every character that is not alphanumeric (spaces, hyphens and other
    punctuation included) becomes an underscore; runs of underscores are
    collapsed and leading/trailing underscores are dropped.
    """
    sanitized = ''.join(
        ch if (ch.isalnum() or ch == '_') else '_'
        for ch in name.lower()
    )
    # Splitting on '_' and discarding empty pieces collapses repeated
    # underscores and trims them from both ends in one step.
    pieces = [piece for piece in sanitized.split('_') if piece]
    return '_'.join(pieces)

# Section headers requested in the prompt, in order: (number, header name).
_SECTION_HEADERS = (
    (1, "DESCRIPTION"),
    (2, "RISK_FACTORS"),
    (3, "SYMPTOMS"),
    (4, "DIAGNOSTIC_TESTS"),
    (5, "TREATMENTS"),
    (6, "COMPLICATIONS"),
    (7, "PROGRESSION"),
    (8, "FOLLOW_UP"),
)

# Optional markdown decoration ("##", "**") that may precede a section number
# at the start of a line.
_HEADER_PREFIX = r'^[ \t]*(?:#+[ \t]*)?\**[ \t]*'

# Start of any known section header at the beginning of a line. Restricting
# the match to the known names keeps ordinary numbered list items inside a
# section from being mistaken for the next header.
_ANY_SECTION_HEADER = (
    _HEADER_PREFIX
    + r'[1-8]\.[ \t]*\**[ \t]*(?:'
    + '|'.join(name.replace('_', '[_ -]') for _, name in _SECTION_HEADERS)
    + r')\b'
)

# A markdown bullet at the start of a line ("- item" / "* item"). Requiring
# whitespace after the marker keeps bold text ("**Heading**") and hyphenated
# words from being treated as list items.
_BULLET_LINE = re.compile(r'^[ \t]*[-*][ \t]+(.*?)[ \t]*$', re.MULTILINE)


def _extract_section(text, number, name):
    """Return the stripped body of one numbered section, or None if absent.

    The "## N. NAME" layout is tried first. If that fails, a more tolerant
    pattern accepts the plain "N. NAME:" layout the prompt itself uses, with
    optional markdown decoration, and lets the section run to the next known
    header or to the end of the text.
    """
    last_number = _SECTION_HEADERS[-1][0]
    if number == last_number:
        terminator = r'(?=$)'
    else:
        terminator = rf'(?=##\s*{number + 1}\.)'
    strict = re.search(rf'##\s*{number}\.\s*{name}(.*?){terminator}', text, re.DOTALL)
    if strict:
        return strict.group(1).strip()

    loose_name = name.replace('_', '[_ -]')
    loose = re.search(
        rf'{_HEADER_PREFIX}{number}\.[ \t]*\**[ \t]*{loose_name}\b[ \t]*[:*]*'
        rf'(.*?)(?={_ANY_SECTION_HEADER}|\Z)',
        text,
        re.DOTALL | re.MULTILINE,
    )
    return loose.group(1).strip() if loose else None


def _extract_bullets(text):
    """Return the non-empty bullet items ("- x" / "* x" lines) found in text."""
    return [item.strip() for item in _BULLET_LINE.findall(text) if item.strip()]


def _extract_medications(treatments):
    """Return medication names parsed from the TREATMENTS section text."""
    medications = []

    # Find the "Medications:" part of the section
    meds_section_match = re.search(
        r'Medications?:\s*(.*?)(?=(?:- Procedures:|$))', treatments, re.DOTALL
    )
    if not meds_section_match:
        return medications

    meds_section = meds_section_match.group(1).strip()
    # Items look like "- Medication name (explanation)" or
    # "- Medication name - explanation"; splitting at every dash means the
    # explanation fragments show up as items of their own and are filtered
    # out further down.
    med_items = re.findall(r'[-*]\s*(.*?)(?=[-*]|\n\n|\n- |$)', meds_section, re.DOTALL)

    for item in med_items:
        item = item.strip()
        if not item:
            continue

        # Keep just the medication name, not the explanation
        med_match = re.match(r'([^(]+)(?:\(|-).*', item)
        if med_match:
            med_name = med_match.group(1).strip()
        else:
            med_name = item.split('(')[0].strip()

        med_name = re.sub(r'\s*\([^)]*\)', '', med_name)  # Remove parentheticals
        med_name = re.sub(r'\s*-.*', '', med_name)  # Remove after dash

        # Skip descriptive phrases that aren't medications. "for" is matched
        # as a whole word only, so drug names that merely contain those
        # letters (e.g. "metformin") are kept.
        lowered = med_name.lower()
        if any(phrase in lowered for phrase in ('reduce', 'block', 'prevent')):
            continue
        if re.search(r'\bfor\b', lowered):
            continue

        # A single item may list several medications separated by commas
        candidates = med_name.split(',') if ',' in med_name else [med_name]
        for candidate in candidates:
            candidate = candidate.strip()
            if candidate and not any(
                candidate.lower() in existing.lower() for existing in medications
            ):
                medications.append(candidate)

    return medications


def _append_state(module, previous_state, state_name, state):
    """Add state under state_name, link it after previous_state, return its name."""
    module["states"][state_name] = state
    module["states"][previous_state]["direct_transition"] = state_name
    return state_name


def generate_module():
    """Ask Claude for clinical details and write a Synthea module JSON file.

    Reads the module globals DISEASE_NAME, OUTPUT_FILE, icd10_code and
    disease_category. The raw model response is saved next to the module as
    "<OUTPUT_FILE>.details"; on failure the error text is written to
    "<OUTPUT_FILE>.error".

    Returns:
        0 on success, 1 if any step after client creation raised.

    Exits the process with status 1 if ANTHROPIC_API_KEY is not set.
    """
    # Initialize the Anthropic client
    api_key = os.environ.get('ANTHROPIC_API_KEY')
    if not api_key:
        print("Error: ANTHROPIC_API_KEY environment variable is not set")
        sys.exit(1)

    client = anthropic.Anthropic(api_key=api_key)

    # Step 1: Ask Claude for clinical details in a structured format, not JSON
    print("Step 1: Getting clinical details from Claude...")

    # Prepare disease information for the prompt
    disease_info = f"Disease name: {DISEASE_NAME}"
    if icd10_code:
        disease_info += f"\nICD-10 code: {icd10_code}"
    if disease_category:
        disease_info += f"\nCategory: {disease_category}"

    # Construct a prompt that asks for structured data, not JSON
    prompt = f"""I need information to create a Synthea disease module for {DISEASE_NAME}.

{disease_info}

Please provide the following information in a structured format (NOT JSON):

1. DESCRIPTION: A brief description of the disease (3-5 sentences)

2. RISK_FACTORS: Age ranges, gender factors, and/or other risk factors for this disease

3. SYMPTOMS: List the main symptoms of the disease

4. DIAGNOSTIC_TESTS: Tests typically used to diagnose this condition

5. TREATMENTS: List treatments for this disease, including:
   - Medications (with their class and purpose)
   - Procedures
   - Other interventions

6. COMPLICATIONS: Possible complications of this disease

7. PROGRESSION: Typical progression of the disease, including:
   - How it starts
   - How it typically develops
   - Possible outcomes

8. FOLLOW_UP: Typical follow-up care needed

For medications, just provide generic names - I'll handle the RxNorm codes.
For conditions and procedures, just provide plain English names - I'll handle the coding.

Please provide this information in a straightforward, structured text format. Do NOT use JSON.
"""

    # os.makedirs('') raises, so the directory is only created when the
    # output path actually has a directory component.
    output_dir = os.path.dirname(OUTPUT_FILE)

    try:
        response = client.messages.create(
            model="claude-3-7-sonnet-20250219",
            max_tokens=4000,
            temperature=0.2,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )

        # Extract the response text
        clinical_details = response.content[0].text

        # Save the raw clinical details for reference. The model output may
        # contain non-ASCII characters, so the encoding is fixed to UTF-8
        # rather than left to the platform default.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(f"{OUTPUT_FILE}.details", "w", encoding="utf-8") as f:
            f.write(clinical_details)

        print(f"Clinical details saved to {OUTPUT_FILE}.details")

        # Step 2: Now build a valid module with the clinical details
        print("Step 2: Building module with clinical details...")

        sections = {
            name: _extract_section(clinical_details, number, name)
            for number, name in _SECTION_HEADERS
        }

        description = sections["DESCRIPTION"]
        if description is None:
            description = f"A module for {DISEASE_NAME}"
        risk_factors = sections["RISK_FACTORS"] or ""
        symptoms = sections["SYMPTOMS"] or ""
        diagnostic_tests = sections["DIAGNOSTIC_TESTS"] or ""
        treatments = sections["TREATMENTS"] or ""

        print(f"Extracted: {len(description)} chars description, {len(symptoms)} chars symptoms, {len(treatments)} chars treatments")

        # Symptoms: prefer bullet items, otherwise fall back to one per line
        symptom_list = []
        if symptoms:
            symptom_list = _extract_bullets(symptoms)
            if not symptom_list:
                symptom_list = [s.strip() for s in symptoms.split('\n') if s.strip()]

        print(f"Found {len(symptom_list)} symptoms")

        medications = _extract_medications(treatments) if treatments else []

        print(f"Found {len(medications)} medications: {', '.join(medications[:5])}")

        # Start with a basic module template
        module = {
            "name": DISEASE_NAME,
            "remarks": [],
            "states": {
                "Initial": {
                    "type": "Initial",
                    "direct_transition": "Age_Guard"
                },
                "Age_Guard": {
                    "type": "Guard",
                    "allow": {
                        "condition_type": "Age",
                        "operator": ">=",
                        "quantity": 18,
                        "unit": "years"
                    },
                    "direct_transition": "Delay_Until_Onset"
                },
                "Delay_Until_Onset": {
                    "type": "Delay",
                    "range": {
                        "low": 0,
                        "high": 20,
                        "unit": "years"
                    },
                    "direct_transition": "Onset"
                },
                "Onset": {
                    "type": "ConditionOnset",
                    "target_encounter": "Diagnosis_Encounter",
                    "codes": [
                        {
                            "system": "SNOMED-CT",
                            "code": "64109004",
                            "display": DISEASE_NAME
                        }
                    ],
                    "direct_transition": "Diagnosis_Encounter"
                },
                "Diagnosis_Encounter": {
                    "type": "Encounter",
                    "encounter_class": "ambulatory",
                    "reason": "Onset",
                    "codes": [
                        {
                            "system": "SNOMED-CT",
                            "code": "185347001",
                            "display": "Encounter for problem"
                        }
                    ],
                    "direct_transition": "End_Diagnosis_Encounter"
                },
                "End_Diagnosis_Encounter": {
                    "type": "EncounterEnd"
                    # direct_transition is set when the next state is chained on
                }
            }
        }

        # Add description to remarks, one entry per non-blank line
        if description:
            module["remarks"].extend(
                line.strip() for line in description.split('\n') if line.strip()
            )

        # Add ICD-10 code if available
        if icd10_code:
            module["remarks"].append(f"ICD-10 code: {icd10_code}")

        # Add category if available
        if disease_category:
            module["remarks"].append(f"Category: {disease_category}")

        # Make sure remarks is not empty
        if not module["remarks"]:
            module["remarks"].append(f"Generated module for {DISEASE_NAME}")

        # Add risk factors to remarks if available. The newline replacement is
        # done outside the f-string: a backslash inside an f-string expression
        # is a SyntaxError on Python versions before 3.12.
        if risk_factors:
            flattened_risk_factors = risk_factors.replace('\n', ' ')
            module["remarks"].append(f"Risk factors: {flattened_risk_factors}")

        # NOTE(review): the states below are chained after
        # End_Diagnosis_Encounter while still naming Diagnosis_Encounter as
        # their target_encounter - confirm this ordering against Synthea's
        # Generic Module Framework.
        current_state = "End_Diagnosis_Encounter"

        # Add symptoms if available (limit to 3)
        for index, symptom_name in enumerate(symptom_list[:3]):
            current_state = _append_state(module, current_state, f"Symptom_{index}", {
                "type": "ConditionOnset",
                "target_encounter": "Diagnosis_Encounter",
                "codes": [
                    {
                        "system": "SNOMED-CT",
                        "code": "418107008",  # Default symptom code
                        "display": symptom_name
                    }
                ]
            })

        # Add diagnostic tests if available (limit to 2); only bullet items
        # are used for tests.
        tests = _extract_bullets(diagnostic_tests) if diagnostic_tests else []
        for index, test_name in enumerate(tests[:2]):
            current_state = _append_state(module, current_state, f"DiagnosticTest_{index}", {
                "type": "Procedure",
                "target_encounter": "Diagnosis_Encounter",
                "reason": "Onset",
                "codes": [
                    {
                        "system": "SNOMED-CT",
                        "code": "386053000",  # Default diagnostic procedure code
                        "display": test_name
                    }
                ]
            })

        # Add medications if found (limit to 3 to keep the module manageable)
        for index, med_name in enumerate(medications[:3]):
            current_state = _append_state(module, current_state, f"Prescribe_{index}", {
                "type": "MedicationOrder",
                "target_encounter": "Diagnosis_Encounter",
                "reason": "Onset",
                "codes": [
                    {
                        "system": "RxNorm",
                        "code": "308047",  # Default code
                        "display": med_name
                    }
                ]
            })

        # Connect to Follow-up
        module["states"][current_state]["direct_transition"] = "Follow_Up_Encounter"

        # Add follow-up encounter
        module["states"]["Follow_Up_Encounter"] = {
            "type": "Encounter",
            "encounter_class": "ambulatory",
            "reason": "Onset",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "390906007",
                    "display": "Follow-up encounter"
                }
            ],
            "direct_transition": "End_Follow_Up_Encounter"
        }

        module["states"]["End_Follow_Up_Encounter"] = {
            "type": "EncounterEnd",
            "direct_transition": "Terminal"
        }

        module["states"]["Terminal"] = {
            "type": "Terminal"
        }

        # Format the JSON for output
        formatted_json = json.dumps(module, indent=2)

        # Save the module
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            f.write(formatted_json)

        print(f"✅ Successfully generated module and saved to {OUTPUT_FILE}")
        return 0

    except Exception as e:
        print(f"❌ Error: {str(e)}")

        # Write the error to a file. The failure may have happened before the
        # output directory was created (e.g. the API call itself failed), so
        # create it here too and never let the report mask the original error.
        try:
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            with open(f"{OUTPUT_FILE}.error", "w", encoding="utf-8") as f:
                f.write(f"Error: {str(e)}")
        except OSError as write_error:
            print(f"Could not write error file: {write_error}")
        return 1

def main():
    """Parse the command line, resolve disease metadata and generate the module.

    Sets the module globals that generate_module() reads. If the target
    module file already exists and --force is not given, nothing is
    generated. When no --icd10 is supplied, the ICD-10 code and category are
    looked up by disease name in src/main/resources/disease_list.csv if that
    file exists.

    Returns:
        The process exit status: 0 on success or when the module already
        exists, 1 if the API key is missing or generation failed.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Generate a Synthea module for a disease')
    parser.add_argument('--disease', type=str, required=True, help='Disease name')
    parser.add_argument('--icd10', type=str, help='ICD-10 code')
    parser.add_argument('--category', type=str, help='Disease category')
    parser.add_argument('--no-interactive', action='store_true', help='Skip interactive prompts, for batch processing')
    parser.add_argument('--auto-fallback', action='store_true', help='Automatically use fallback template if JSON parsing fails')
    parser.add_argument('--force', action='store_true', help='Force regeneration even if module already exists')
    args = parser.parse_args()

    # Set global variables
    global DISEASE_NAME, OUTPUT_FILE, use_interactive, use_auto_fallback
    global icd10_code, disease_category
    DISEASE_NAME = args.disease
    normalized_name = normalize_disease_name(DISEASE_NAME)
    OUTPUT_FILE = f"src/main/resources/modules/{normalized_name}.json"
    use_interactive = not args.no_interactive
    use_auto_fallback = args.auto_fallback

    # Use provided ICD-10 code or category if specified
    icd10_code = args.icd10
    disease_category = args.category

    print(f"Disease name: {DISEASE_NAME}")
    print(f"Normalized filename: {normalized_name}.json")

    # Check if module already exists
    if os.path.exists(OUTPUT_FILE):
        print(f"✅ Module already exists at: {OUTPUT_FILE}")
        if not args.force:
            return 0
        print("Force flag set. Regenerating module.")
    else:
        print(f"❌ Module not found at: {OUTPUT_FILE}")

    # If no ICD-10 code provided, look up in disease list CSV
    if not icd10_code:
        csv_path = "src/main/resources/disease_list.csv"
        if os.path.exists(csv_path):
            print(f"Looking up disease information in {csv_path}...")
            try:
                # newline='' is what the csv module expects for files it reads
                with open(csv_path, 'r', newline='') as f:
                    reader = csv.DictReader(f)
                    wanted = DISEASE_NAME.lower()
                    for row in reader:
                        # DictReader fills missing columns of short rows with
                        # None, so guard before calling .lower().
                        if (row.get("disease_name") or "").lower() == wanted:
                            icd10_code = row.get("id") or ""
                            disease_category = row.get("disease_category") or ""
                            print("Found matching disease in CSV:")
                            print(f"- ICD-10 code: {icd10_code}")
                            print(f"- Category: {disease_category}")
                            break
                    else:
                        print("No matching disease found in CSV. Continuing without ICD-10 code.")
            except Exception as e:
                print(f"Error reading CSV: {str(e)}")
                print("Continuing without ICD-10 code.")

    # Ensure ANTHROPIC_API_KEY is set
    if not os.environ.get('ANTHROPIC_API_KEY'):
        print("Error: ANTHROPIC_API_KEY environment variable is not set")
        return 1

    # Ask Claude to generate the module and propagate its status so a failed
    # generation is visible in the process exit code.
    return generate_module()

if __name__ == "__main__":
    sys.exit(main())