Trying to fix basic functionality again.

2025-03-23 11:53:47 -07:00
parent ebda48190a
commit 2141e81f42
406 changed files with 173963 additions and 69 deletions
--- a/scripts/generate_module.py
+++ b/scripts/generate_module.py
@@ -0,0 +1,468 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import json
+import re
+import anthropic
+import csv
+import argparse
+
+# Define global variables
+# Module-level configuration shared between main() and generate_module():
+# main() assigns these (from the command line and, for the ICD-10 code and
+# category, optionally from the disease list CSV) before generate_module()
+# reads them.
+DISEASE_NAME = ""  # disease name as passed via --disease
+OUTPUT_FILE = ""  # path of the module JSON file that gets written
+icd10_code = ""  # from --icd10 or the CSV "id" column; None when neither supplies one
+disease_category = ""  # from --category or the CSV "disease_category" column
+use_interactive = True  # cleared by --no-interactive; not read anywhere in the visible code
+use_auto_fallback = False  # set by --auto-fallback; not read anywhere in the visible code
+
+def normalize_disease_name(name):
+    # Normalize disease name for filename
+    normalized_name = name.lower().replace(' ', '_').replace('-', '_')
+    normalized_name = ''.join(c if c.isalnum() or c == '_' else '_' for c in normalized_name)
+    normalized_name = '_'.join(filter(None, normalized_name.split('_')))
+    return normalized_name
+
+def generate_module():
+    # Initialize the Anthropic client
+    api_key = os.environ.get('ANTHROPIC_API_KEY')
+    if not api_key:
+        print("Error: ANTHROPIC_API_KEY environment variable is not set")
+        sys.exit(1)
+
+    client = anthropic.Anthropic(api_key=api_key)
+    
+    # Step 1: Ask Claude for clinical details in a structured format, not JSON
+    print("Step 1: Getting clinical details from Claude...")
+    
+    # Prepare disease information for the prompt
+    disease_info = f"Disease name: {DISEASE_NAME}"
+    if icd10_code:
+        disease_info += f"\nICD-10 code: {icd10_code}"
+    if disease_category:
+        disease_info += f"\nCategory: {disease_category}"
+
+    # Construct a prompt that asks for structured data, not JSON
+    prompt = f"""I need information to create a Synthea disease module for {DISEASE_NAME}. 
+    
+{disease_info}
+
+Please provide the following information in a structured format (NOT JSON):
+
+1. DESCRIPTION: A brief description of the disease (3-5 sentences)
+
+2. RISK_FACTORS: Age ranges, gender factors, and/or other risk factors for this disease
+   
+3. SYMPTOMS: List the main symptoms of the disease
+   
+4. DIAGNOSTIC_TESTS: Tests typically used to diagnose this condition
+   
+5. TREATMENTS: List treatments for this disease, including:
+   - Medications (with their class and purpose)
+   - Procedures
+   - Other interventions
+   
+6. COMPLICATIONS: Possible complications of this disease
+   
+7. PROGRESSION: Typical progression of the disease, including:
+   - How it starts
+   - How it typically develops
+   - Possible outcomes
+
+8. FOLLOW_UP: Typical follow-up care needed
+
+For medications, just provide generic names - I'll handle the RxNorm codes.
+For conditions and procedures, just provide plain English names - I'll handle the coding.
+
+Please provide this information in a straightforward, structured text format. Do NOT use JSON.
+"""
+
+    # Get clinical details
+    try:
+        response = client.messages.create(
+            model="claude-3-7-sonnet-20250219",
+            max_tokens=4000,
+            temperature=0.2,
+            messages=[
+                {"role": "user", "content": prompt}
+            ]
+        )
+        
+        # Extract the response text
+        clinical_details = response.content[0].text
+        
+        # Save the raw clinical details for reference
+        os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
+        with open(f"{OUTPUT_FILE}.details", "w") as f:
+            f.write(clinical_details)
+        
+        print(f"Clinical details saved to {OUTPUT_FILE}.details")
+        
+        # Step 2: Now build a valid module with the clinical details
+        print("Step 2: Building module with clinical details...")
+        
+        # Extract information from the structured response
+        description_match = re.search(r'##\s*1\.\s*DESCRIPTION(.*?)(?=##\s*2\.)', clinical_details, re.DOTALL)
+        risk_factors_match = re.search(r'##\s*2\.\s*RISK_FACTORS(.*?)(?=##\s*3\.)', clinical_details, re.DOTALL)
+        symptoms_match = re.search(r'##\s*3\.\s*SYMPTOMS(.*?)(?=##\s*4\.)', clinical_details, re.DOTALL)
+        diagnostic_tests_match = re.search(r'##\s*4\.\s*DIAGNOSTIC_TESTS(.*?)(?=##\s*5\.)', clinical_details, re.DOTALL)
+        treatments_match = re.search(r'##\s*5\.\s*TREATMENTS(.*?)(?=##\s*6\.)', clinical_details, re.DOTALL)
+        complications_match = re.search(r'##\s*6\.\s*COMPLICATIONS(.*?)(?=##\s*7\.)', clinical_details, re.DOTALL)
+        progression_match = re.search(r'##\s*7\.\s*PROGRESSION(.*?)(?=##\s*8\.)', clinical_details, re.DOTALL)
+        follow_up_match = re.search(r'##\s*8\.\s*FOLLOW_UP(.*?)(?=$)', clinical_details, re.DOTALL)
+        
+        # Extract text from matches
+        description = description_match.group(1).strip() if description_match else f"A module for {DISEASE_NAME}"
+        risk_factors = risk_factors_match.group(1).strip() if risk_factors_match else ""
+        symptoms = symptoms_match.group(1).strip() if symptoms_match else ""
+        diagnostic_tests = diagnostic_tests_match.group(1).strip() if diagnostic_tests_match else ""
+        treatments = treatments_match.group(1).strip() if treatments_match else ""
+        complications = complications_match.group(1).strip() if complications_match else ""
+        progression = progression_match.group(1).strip() if progression_match else ""
+        follow_up = follow_up_match.group(1).strip() if follow_up_match else ""
+        
+        print(f"Extracted: {len(description)} chars description, {len(symptoms)} chars symptoms, {len(treatments)} chars treatments")
+        
+        # Extract symptoms as a list
+        symptom_list = []
+        if symptoms_match:
+            # For structured format with bullet points
+            symptom_lines = re.findall(r'[-*]\s*(.*?)(?:\n|$)', symptoms, re.MULTILINE)
+            if symptom_lines:
+                symptom_list = [s.strip() for s in symptom_lines if s.strip()]
+            else:
+                # Try to split by newlines for a less structured format
+                symptom_list = [s.strip() for s in symptoms.split('\n') if s.strip()]
+                # If that doesn't give us anything, just add the whole text as one symptom
+                if not symptom_list and symptoms.strip():
+                    symptom_list = [symptoms.strip()]
+                    
+        print(f"Found {len(symptom_list)} symptoms")
+        
+        # Extract medications from treatments
+        medications = []
+        if treatments_match:
+            # Find the "Medications:" section
+            meds_section_match = re.search(r'Medications?:\s*(.*?)(?=(?:- Procedures:|$))', treatments, re.DOTALL)
+            if meds_section_match:
+                meds_section = meds_section_match.group(1).strip()
+                # Extract main medication names from list items with explanations
+                # Format is typically: "- Medication name (explanation)" or "- Medication name - explanation"
+                med_items = re.findall(r'[-*]\s*(.*?)(?=[-*]|\n\n|\n- |$)', meds_section, re.DOTALL)
+                
+                for item in med_items:
+                    item = item.strip()
+                    if not item:
+                        continue
+                    
+                    # Extract just the medication name, not the explanation
+                    med_match = re.match(r'([^(]+)(?:\(|-).*', item)
+                    if med_match:
+                        med_name = med_match.group(1).strip()
+                    else:
+                        med_name = item.split('(')[0].strip()
+                    
+                    # Clean up common patterns
+                    med_name = re.sub(r'\s*\([^)]*\)', '', med_name)  # Remove parentheticals
+                    med_name = re.sub(r'\s*-.*', '', med_name)  # Remove after dash
+                    
+                    # Skip descriptive phrases that aren't medications
+                    if any(phrase in med_name.lower() for phrase in ['reduce', 'block', 'prevent', 'for']):
+                        continue
+                    
+                    # Look for multiple medications in parentheses
+                    if ',' in med_name:
+                        for m in med_name.split(','):
+                            m = m.strip()
+                            if m and not any(m.lower() in existing.lower() for existing in medications):
+                                medications.append(m)
+                    else:
+                        if med_name and not any(med_name.lower() in existing.lower() for existing in medications):
+                            medications.append(med_name)
+            
+        print(f"Found {len(medications)} medications: {', '.join(medications[:5])}")
+        
+        # Start with a basic module template
+        module = {
+            "name": DISEASE_NAME,
+            "remarks": [],
+            "states": {
+                "Initial": {
+                    "type": "Initial",
+                    "direct_transition": "Age_Guard"
+                },
+                "Age_Guard": {
+                    "type": "Guard",
+                    "allow": {
+                        "condition_type": "Age",
+                        "operator": ">=",
+                        "quantity": 18,
+                        "unit": "years"
+                    },
+                    "direct_transition": "Delay_Until_Onset"
+                },
+                "Delay_Until_Onset": {
+                    "type": "Delay",
+                    "range": {
+                        "low": 0,
+                        "high": 20,
+                        "unit": "years"
+                    },
+                    "direct_transition": "Onset"
+                },
+                "Onset": {
+                    "type": "ConditionOnset",
+                    "target_encounter": "Diagnosis_Encounter",
+                    "codes": [
+                        {
+                            "system": "SNOMED-CT",
+                            "code": "64109004",
+                            "display": DISEASE_NAME
+                        }
+                    ],
+                    "direct_transition": "Diagnosis_Encounter"
+                },
+                "Diagnosis_Encounter": {
+                    "type": "Encounter",
+                    "encounter_class": "ambulatory",
+                    "reason": "Onset",
+                    "codes": [
+                        {
+                            "system": "SNOMED-CT",
+                            "code": "185347001",
+                            "display": "Encounter for problem"
+                        }
+                    ],
+                    "direct_transition": "End_Diagnosis_Encounter"
+                },
+                "End_Diagnosis_Encounter": {
+                    "type": "EncounterEnd"
+                    # direct_transition will be set later
+                }
+            }
+        }
+        
+        # Add description to remarks
+        if description:
+            for line in description.split('\n'):
+                line = line.strip()
+                if line:
+                    module["remarks"].append(line)
+        
+        # Add ICD-10 code if available
+        if icd10_code:
+            module["remarks"].append(f"ICD-10 code: {icd10_code}")
+        
+        # Add category if available
+        if disease_category:
+            module["remarks"].append(f"Category: {disease_category}")
+        
+        # Make sure remarks is not empty
+        if not module["remarks"]:
+            module["remarks"].append(f"Generated module for {DISEASE_NAME}")
+            
+        # Add risk factors to remarks if available
+        if risk_factors:
+            module["remarks"].append(f"Risk factors: {risk_factors.replace('\n', ' ')}")
+        
+        # Add symptoms if available
+        symptom_i = 0
+        current_state = "End_Diagnosis_Encounter"
+        
+        if symptom_list:
+            for i, symptom in enumerate(symptom_list[:3]):  # Limit to 3 symptoms
+                symptom_name = symptom.strip()
+                if not symptom_name:
+                    continue
+                    
+                state_name = f"Symptom_{symptom_i}"
+                symptom_i += 1
+                
+                module["states"][state_name] = {
+                    "type": "ConditionOnset",
+                    "target_encounter": "Diagnosis_Encounter",
+                    "codes": [
+                        {
+                            "system": "SNOMED-CT",
+                            "code": "418107008",  # Default symptom code
+                            "display": symptom_name
+                        }
+                    ]
+                }
+                
+                # Connect states
+                module["states"][current_state]["direct_transition"] = state_name
+                current_state = state_name
+            
+        # Add diagnostic tests if available
+        if diagnostic_tests:
+            test_lines = re.findall(r'[-*]\s*(.*?)(?:\n|$)', diagnostic_tests, re.MULTILINE)
+            tests = [t.strip() for t in test_lines if t.strip()]
+            
+            if tests:
+                for i, test in enumerate(tests[:2]):  # Limit to 2 tests
+                    test_name = test.strip()
+                    if not test_name:
+                        continue
+                        
+                    state_name = f"DiagnosticTest_{i}"
+                    
+                    module["states"][state_name] = {
+                        "type": "Procedure",
+                        "target_encounter": "Diagnosis_Encounter",
+                        "reason": "Onset",
+                        "codes": [
+                            {
+                                "system": "SNOMED-CT",
+                                "code": "386053000",  # Default diagnostic procedure code
+                                "display": test_name
+                            }
+                        ]
+                    }
+                    
+                    # Connect states
+                    module["states"][current_state]["direct_transition"] = state_name
+                    current_state = state_name
+        
+        # Add medications if found
+        if medications:
+            for i, med in enumerate(medications[:3]):  # Limit to 3 medications to keep module manageable
+                med_name = med.strip()
+                if not med_name:
+                    continue
+                    
+                state_name = f"Prescribe_{i}"
+                
+                module["states"][state_name] = {
+                    "type": "MedicationOrder",
+                    "target_encounter": "Diagnosis_Encounter",
+                    "reason": "Onset",
+                    "codes": [
+                        {
+                            "system": "RxNorm",
+                            "code": "308047",  # Default code
+                            "display": med_name
+                        }
+                    ]
+                }
+                
+                # Connect states
+                module["states"][current_state]["direct_transition"] = state_name
+                current_state = state_name
+                
+        # Connect to Follow-up
+        module["states"][current_state]["direct_transition"] = "Follow_Up_Encounter"
+        
+        # Add follow-up encounter
+        module["states"]["Follow_Up_Encounter"] = {
+            "type": "Encounter",
+            "encounter_class": "ambulatory",
+            "reason": "Onset",
+            "codes": [
+                {
+                    "system": "SNOMED-CT",
+                    "code": "390906007",
+                    "display": "Follow-up encounter"
+                }
+            ],
+            "direct_transition": "End_Follow_Up_Encounter"
+        }
+        
+        module["states"]["End_Follow_Up_Encounter"] = {
+            "type": "EncounterEnd",
+            "direct_transition": "Terminal"
+        }
+        
+        module["states"]["Terminal"] = {
+            "type": "Terminal"
+        }
+        
+        # Format the JSON for output
+        formatted_json = json.dumps(module, indent=2)
+        
+        # Save the module
+        with open(OUTPUT_FILE, "w") as f:
+            f.write(formatted_json)
+        
+        print(f"✅ Successfully generated module and saved to {OUTPUT_FILE}")
+        return 0
+            
+    except Exception as e:
+        print(f"❌ Error: {str(e)}")
+        
+        # Write the error to a file
+        with open(f"{OUTPUT_FILE}.error", "w") as f:
+            f.write(f"Error: {str(e)}")
+        return 1
+
+def main():
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(description='Generate a Synthea module for a disease')
+    parser.add_argument('--disease', type=str, required=True, help='Disease name')
+    parser.add_argument('--icd10', type=str, help='ICD-10 code')
+    parser.add_argument('--category', type=str, help='Disease category')
+    parser.add_argument('--no-interactive', action='store_true', help='Skip interactive prompts, for batch processing')
+    parser.add_argument('--auto-fallback', action='store_true', help='Automatically use fallback template if JSON parsing fails')
+    parser.add_argument('--force', action='store_true', help='Force regeneration even if module already exists')
+    args = parser.parse_args()
+    
+    # Set global variables
+    global DISEASE_NAME, OUTPUT_FILE, use_interactive, use_auto_fallback
+    DISEASE_NAME = args.disease
+    normalized_name = normalize_disease_name(DISEASE_NAME)
+    OUTPUT_FILE = f"src/main/resources/modules/{normalized_name}.json"
+    use_interactive = not args.no_interactive
+    use_auto_fallback = args.auto_fallback
+    
+    # Use provided ICD-10 code or category if specified
+    global icd10_code, disease_category
+    icd10_code = args.icd10
+    disease_category = args.category
+    
+    print(f"Disease name: {DISEASE_NAME}")
+    print(f"Normalized filename: {normalized_name}.json")
+    
+    # Check if module already exists
+    if os.path.exists(OUTPUT_FILE):
+        print(f"✅ Module already exists at: {OUTPUT_FILE}")
+        if args.force:
+            print("Force flag set. Regenerating module.")
+        else:
+            return 0
+    else:
+        print(f"❌ Module not found at: {OUTPUT_FILE}")
+    
+    # If no ICD-10 code provided, look up in disease list CSV
+    if not icd10_code:
+        csv_path = "src/main/resources/disease_list.csv"
+        if os.path.exists(csv_path):
+            print(f"Looking up disease information in {csv_path}...")
+            try:
+                with open(csv_path, 'r') as f:
+                    reader = csv.DictReader(f)
+                    for row in reader:
+                        if row.get("disease_name", "").lower() == DISEASE_NAME.lower():
+                            icd10_code = row.get("id", "")
+                            disease_category = row.get("disease_category", "")
+                            print(f"Found matching disease in CSV:")
+                            print(f"- ICD-10 code: {icd10_code}")
+                            print(f"- Category: {disease_category}")
+                            break
+                    else:
+                        print("No matching disease found in CSV. Continuing without ICD-10 code.")
+            except Exception as e:
+                print(f"Error reading CSV: {str(e)}")
+                print("Continuing without ICD-10 code.")
+    
+    # Ensure ANTHROPIC_API_KEY is set
+    if not os.environ.get('ANTHROPIC_API_KEY'):
+        print("Error: ANTHROPIC_API_KEY environment variable is not set")
+        return 1
+    
+    # Ask Claude to generate the module
+    generate_module()
+    
+    return 0
+
+# When run as a script, call main() and use its return value as the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())