#!/usr/bin/env python3
"""Generate a basic Synthea disease module from clinical details written by Claude.

The script asks Claude for a structured, plain-text description of a disease,
parses the numbered sections of the answer, and writes a linear Synthea module
(onset -> diagnosis encounter -> symptoms -> tests -> prescriptions ->
follow-up) as JSON. All clinical codes in the generated module are fixed
placeholder codes; only the display names come from the model's answer.
"""
import argparse
import csv
import json
import os
import re
import sys

try:
    import anthropic
except ImportError:
    # The SDK is only needed to call the API. generate_module() reports its
    # absence, so --help and the "module already exists" path keep working.
    anthropic = None

# Define global variables
DISEASE_NAME = ""
OUTPUT_FILE = ""
icd10_code = ""
disease_category = ""
# The next two flags are set from the command line by main(); nothing in this
# file reads them.
use_interactive = True
use_auto_fallback = False

MODEL_NAME = "claude-3-7-sonnet-20250219"
MODULES_DIR = "src/main/resources/modules"
DISEASE_LIST_CSV = "src/main/resources/disease_list.csv"

# Caps that keep the generated module small.
MAX_SYMPTOMS = 3
MAX_DIAGNOSTIC_TESTS = 2
MAX_MEDICATIONS = 3

# Section names in the order (and with the numbers 1..8) requested in the prompt.
SECTION_NAMES = (
    "DESCRIPTION",
    "RISK_FACTORS",
    "SYMPTOMS",
    "DIAGNOSTIC_TESTS",
    "TREATMENTS",
    "COMPLICATIONS",
    "PROGRESSION",
    "FOLLOW_UP",
)

# A section heading at the start of a line. Accepts the plain form the prompt
# asks for ("1. DESCRIPTION: ...") as well as markdown decorations the model
# may add ("## 1. DESCRIPTION", "**1. DESCRIPTION:**").
_SECTION_HEADING = re.compile(
    r'^[ \t]*(?:#+[ \t]*)?(?:\*\*)?[ \t]*(?P<num>\d+)\.[ \t]*(?:\*\*)?[ \t]*'
    r'(?P<name>'
    + '|'.join(name.replace('_', '[ _-]') for name in SECTION_NAMES)
    + r')\b[ \t]*(?:\*\*)?[ \t]*:?[ \t]*(?:\*\*)?',
    re.MULTILINE,
)

# A list item: the bullet must be the first non-blank character of the line and
# be followed by whitespace, so hyphens inside words ("Low-grade") and markdown
# bold markers ("**text**") are not mistaken for bullets.
_BULLET_LINE = re.compile(r'^[ \t]*[-*\u2022][ \t]+(.+?)[ \t]*$', re.MULTILINE)

# Text between a "Medication(s):" label and "- Procedures:" (or the end).
_MEDICATION_SECTION = re.compile(
    r'Medications?:\s*(.*?)(?=- Procedures:|$)', re.DOTALL
)

# Where a medication name ends and its explanation starts:
# "Name (explanation)", "Name - explanation" or "Name: explanation".
_MEDICATION_NAME_END = re.compile(r'\(|\s[-\u2013\u2014]\s|:')

# Descriptive phrases that are not medication names. "for" must be a whole
# word; a plain substring test would also reject names such as "metformin".
_NOT_A_MEDICATION = re.compile(r'reduce|block|prevent|\bfor\b', re.IGNORECASE)


def normalize_disease_name(name: str) -> str:
    """Return *name* as a lowercase, underscore-separated file name stem.

    Every non-alphanumeric character becomes an underscore; runs of
    underscores are collapsed and leading/trailing ones removed.
    """
    replaced = ''.join(c if c.isalnum() else '_' for c in name.lower())
    return '_'.join(part for part in replaced.split('_') if part)


def extract_sections(clinical_details: str) -> dict:
    """Split the model's answer into its numbered sections.

    Returns a dict mapping each section name that was found (see
    SECTION_NAMES) to its stripped body text. Sections that are absent are
    simply missing from the dict, and a missing section does not affect the
    parsing of its neighbours.
    """
    headings = []
    seen = set()
    for match in _SECTION_HEADING.finditer(clinical_details or ""):
        name = re.sub(r'[ -]', '_', match.group('name'))
        # The number must be the one the prompt assigned to this section;
        # this keeps numbered list items inside a section from being read
        # as headings. Only the first occurrence of a section counts.
        if name in seen or int(match.group('num')) != SECTION_NAMES.index(name) + 1:
            continue
        seen.add(name)
        headings.append((name, match.start(), match.end()))

    sections = {}
    for position, (name, _, body_start) in enumerate(headings):
        if position + 1 < len(headings):
            body_end = headings[position + 1][1]
        else:
            body_end = len(clinical_details)
        sections[name] = clinical_details[body_start:body_end].strip()
    return sections


def extract_bullet_items(text: str) -> list:
    """Return the non-empty text of every bulleted ("-", "*", "•") line in *text*."""
    found = (item.strip() for item in _BULLET_LINE.findall(text or ""))
    return [item for item in found if item]


def extract_symptoms(symptoms_text: str) -> list:
    """Return the symptoms listed in *symptoms_text*.

    Bulleted lines are used when there are any; otherwise every non-blank
    line is treated as one symptom.
    """
    items = extract_bullet_items(symptoms_text)
    if items:
        return items
    return [line.strip() for line in (symptoms_text or "").splitlines() if line.strip()]


def extract_medications(treatments_text: str) -> list:
    """Return medication names found under "Medications:" in *treatments_text*.

    For each bulleted item only the text before the explanation is kept.
    Comma-separated names are split, descriptive phrases are skipped, and a
    name is dropped when it is already contained in a previously kept name.
    """
    section = _MEDICATION_SECTION.search(treatments_text or "")
    if not section:
        return []

    medications = []
    for item in extract_bullet_items(section.group(1)):
        # Keep the part before the explanation and drop markdown bold markers.
        name = _MEDICATION_NAME_END.split(item, maxsplit=1)[0].strip(" \t*")
        if not name or _NOT_A_MEDICATION.search(name):
            continue
        for candidate in name.split(','):
            candidate = candidate.strip()
            if candidate and not any(
                candidate.lower() in known.lower() for known in medications
            ):
                medications.append(candidate)
    return medications


def _build_prompt(disease_name: str, icd10: str, category: str) -> str:
    """Return the prompt asking Claude for structured (non-JSON) clinical details."""
    disease_info = f"Disease name: {disease_name}"
    if icd10:
        disease_info += f"\nICD-10 code: {icd10}"
    if category:
        disease_info += f"\nCategory: {category}"

    return f"""I need information to create a Synthea disease module for {disease_name}.

{disease_info}

Please provide the following information in a structured format (NOT JSON):

1. DESCRIPTION: A brief description of the disease (3-5 sentences)

2. RISK_FACTORS: Age ranges, gender factors, and/or other risk factors for this disease

3. SYMPTOMS: List the main symptoms of the disease

4. DIAGNOSTIC_TESTS: Tests typically used to diagnose this condition

5. TREATMENTS: List treatments for this disease, including:
   - Medications (with their class and purpose)
   - Procedures
   - Other interventions

6. COMPLICATIONS: Possible complications of this disease

7. PROGRESSION: Typical progression of the disease, including:
   - How it starts
   - How it typically develops
   - Possible outcomes

8. FOLLOW_UP: Typical follow-up care needed

For medications, just provide generic names - I'll handle the RxNorm codes.
For conditions and procedures, just provide plain English names - I'll handle the coding.

Please provide this information in a straightforward, structured text format. Do NOT use JSON.
"""


def _chain_state(states: dict, previous: str, name: str, state: dict) -> str:
    """Add *state* as *name*, point *previous* at it, and return *name*."""
    states[name] = state
    states[previous]["direct_transition"] = name
    return name


def build_module(disease_name, description="", risk_factors="", symptoms=(),
                 diagnostic_tests=(), medications=(), icd10="", category=""):
    """Build the Synthea module dict for one disease.

    The module is a single linear chain of states. At most MAX_SYMPTOMS
    symptoms, MAX_DIAGNOSTIC_TESTS tests and MAX_MEDICATIONS medications are
    included. Every clinical code is a fixed placeholder; the supplied names
    are used as display text only.

    Args:
        disease_name: Name used for the module and the onset condition.
        description: Free text; each non-blank line becomes one remark.
        risk_factors: Free text, added as a single-line remark when non-empty.
        symptoms: Symptom names, in order.
        diagnostic_tests: Diagnostic test names, in order.
        medications: Medication names, in order.
        icd10: Optional ICD-10 code, recorded in the remarks.
        category: Optional disease category, recorded in the remarks.

    Returns:
        A JSON-serializable dict with "name", "remarks" and "states".
    """
    states = {
        "Initial": {
            "type": "Initial",
            "direct_transition": "Age_Guard",
        },
        "Age_Guard": {
            "type": "Guard",
            "allow": {
                "condition_type": "Age",
                "operator": ">=",
                "quantity": 18,
                "unit": "years",
            },
            "direct_transition": "Delay_Until_Onset",
        },
        "Delay_Until_Onset": {
            "type": "Delay",
            "range": {"low": 0, "high": 20, "unit": "years"},
            "direct_transition": "Onset",
        },
        "Onset": {
            "type": "ConditionOnset",
            "target_encounter": "Diagnosis_Encounter",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "64109004",
                    "display": disease_name,
                }
            ],
            "direct_transition": "Diagnosis_Encounter",
        },
        "Diagnosis_Encounter": {
            "type": "Encounter",
            "encounter_class": "ambulatory",
            "reason": "Onset",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "185347001",
                    "display": "Encounter for problem",
                }
            ],
            "direct_transition": "End_Diagnosis_Encounter",
        },
        "End_Diagnosis_Encounter": {
            "type": "EncounterEnd",
            # direct_transition is set when the next state is chained on.
        },
    }

    remarks = [line.strip() for line in description.split('\n') if line.strip()]
    if icd10:
        remarks.append(f"ICD-10 code: {icd10}")
    if category:
        remarks.append(f"Category: {category}")
    if not remarks:
        remarks.append(f"Generated module for {disease_name}")
    if risk_factors:
        # Computed outside the f-string: a backslash inside an f-string
        # expression is a syntax error before Python 3.12.
        single_line = risk_factors.replace('\n', ' ')
        remarks.append(f"Risk factors: {single_line}")

    current = "End_Diagnosis_Encounter"

    symptom_index = 0
    for symptom in list(symptoms)[:MAX_SYMPTOMS]:
        display = symptom.strip()
        if not display:
            continue
        current = _chain_state(states, current, f"Symptom_{symptom_index}", {
            "type": "ConditionOnset",
            "target_encounter": "Diagnosis_Encounter",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "418107008",  # Default symptom code
                    "display": display,
                }
            ],
        })
        symptom_index += 1

    for index, test in enumerate(list(diagnostic_tests)[:MAX_DIAGNOSTIC_TESTS]):
        display = test.strip()
        if not display:
            continue
        current = _chain_state(states, current, f"DiagnosticTest_{index}", {
            "type": "Procedure",
            "target_encounter": "Diagnosis_Encounter",
            "reason": "Onset",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "386053000",  # Default diagnostic procedure code
                    "display": display,
                }
            ],
        })

    for index, medication in enumerate(list(medications)[:MAX_MEDICATIONS]):
        display = medication.strip()
        if not display:
            continue
        current = _chain_state(states, current, f"Prescribe_{index}", {
            "type": "MedicationOrder",
            "target_encounter": "Diagnosis_Encounter",
            "reason": "Onset",
            "codes": [
                {
                    "system": "RxNorm",
                    "code": "308047",  # Default code
                    "display": display,
                }
            ],
        })

    _chain_state(states, current, "Follow_Up_Encounter", {
        "type": "Encounter",
        "encounter_class": "ambulatory",
        "reason": "Onset",
        "codes": [
            {
                "system": "SNOMED-CT",
                "code": "390906007",
                "display": "Follow-up encounter",
            }
        ],
        "direct_transition": "End_Follow_Up_Encounter",
    })
    states["End_Follow_Up_Encounter"] = {
        "type": "EncounterEnd",
        "direct_transition": "Terminal",
    }
    states["Terminal"] = {"type": "Terminal"}

    return {"name": disease_name, "remarks": remarks, "states": states}


def generate_module():
    """Ask Claude for clinical details and write the module to OUTPUT_FILE.

    Reads the globals DISEASE_NAME, OUTPUT_FILE, icd10_code and
    disease_category. The raw answer is saved to "<OUTPUT_FILE>.details"; on
    failure the error text is saved to "<OUTPUT_FILE>.error".

    Returns:
        0 on success, 1 on failure. Exits the process with status 1 when
        ANTHROPIC_API_KEY is not set.
    """
    api_key = os.environ.get('ANTHROPIC_API_KEY')
    if not api_key:
        print("Error: ANTHROPIC_API_KEY environment variable is not set")
        sys.exit(1)
    if anthropic is None:
        print("Error: the 'anthropic' package is not installed")
        return 1

    try:
        # Create the output directory before anything can fail, so that the
        # ".error" file below always has somewhere to go.
        output_dir = os.path.dirname(OUTPUT_FILE)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        client = anthropic.Anthropic(api_key=api_key)

        # Step 1: Ask Claude for clinical details in a structured format, not JSON
        print("Step 1: Getting clinical details from Claude...")
        response = client.messages.create(
            model=MODEL_NAME,
            max_tokens=4000,
            temperature=0.2,
            messages=[
                {
                    "role": "user",
                    "content": _build_prompt(DISEASE_NAME, icd10_code, disease_category),
                }
            ],
        )
        clinical_details = response.content[0].text

        # Save the raw clinical details for reference
        with open(f"{OUTPUT_FILE}.details", "w", encoding="utf-8") as f:
            f.write(clinical_details)
        print(f"Clinical details saved to {OUTPUT_FILE}.details")

        # Step 2: Now build a valid module with the clinical details
        print("Step 2: Building module with clinical details...")
        sections = extract_sections(clinical_details)
        description = sections.get("DESCRIPTION", f"A module for {DISEASE_NAME}")
        symptoms = sections.get("SYMPTOMS", "")
        treatments = sections.get("TREATMENTS", "")
        print(f"Extracted: {len(description)} chars description, {len(symptoms)} chars symptoms, {len(treatments)} chars treatments")

        symptom_list = extract_symptoms(symptoms)
        print(f"Found {len(symptom_list)} symptoms")

        medications = extract_medications(treatments)
        print(f"Found {len(medications)} medications: {', '.join(medications[:5])}")

        module = build_module(
            DISEASE_NAME,
            description=description,
            risk_factors=sections.get("RISK_FACTORS", ""),
            symptoms=symptom_list,
            diagnostic_tests=extract_bullet_items(sections.get("DIAGNOSTIC_TESTS", "")),
            medications=medications,
            icd10=icd10_code,
            category=disease_category,
        )

        # Save the module
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            f.write(json.dumps(module, indent=2))

        print(f"✅ Successfully generated module and saved to {OUTPUT_FILE}")
        return 0

    except Exception as e:  # Top-level boundary: report any failure and return 1.
        print(f"❌ Error: {str(e)}")
        # Write the error to a file; never let that write mask the real error.
        try:
            with open(f"{OUTPUT_FILE}.error", "w", encoding="utf-8") as f:
                f.write(f"Error: {str(e)}")
        except OSError as write_error:
            print(f"Could not write error file: {write_error}")
        return 1


def main():
    """Parse the command line, resolve disease metadata and generate the module.

    Returns:
        The process exit status: 0 when the module already exists (and
        --force is not given) or was generated successfully, 1 otherwise.
    """
    global DISEASE_NAME, OUTPUT_FILE, use_interactive, use_auto_fallback
    global icd10_code, disease_category

    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Generate a Synthea module for a disease')
    parser.add_argument('--disease', type=str, required=True, help='Disease name')
    parser.add_argument('--icd10', type=str, help='ICD-10 code')
    parser.add_argument('--category', type=str, help='Disease category')
    parser.add_argument('--no-interactive', action='store_true', help='Skip interactive prompts, for batch processing')
    parser.add_argument('--auto-fallback', action='store_true', help='Automatically use fallback template if JSON parsing fails')
    parser.add_argument('--force', action='store_true', help='Force regeneration even if module already exists')
    args = parser.parse_args()

    # Set global variables
    DISEASE_NAME = args.disease
    normalized_name = normalize_disease_name(DISEASE_NAME)
    OUTPUT_FILE = f"{MODULES_DIR}/{normalized_name}.json"
    use_interactive = not args.no_interactive
    use_auto_fallback = args.auto_fallback

    # Use provided ICD-10 code or category if specified
    icd10_code = args.icd10 or ""
    disease_category = args.category or ""

    print(f"Disease name: {DISEASE_NAME}")
    print(f"Normalized filename: {normalized_name}.json")

    # Check if module already exists
    if os.path.exists(OUTPUT_FILE):
        print(f"✅ Module already exists at: {OUTPUT_FILE}")
        if not args.force:
            return 0
        print("Force flag set. Regenerating module.")
    else:
        print(f"❌ Module not found at: {OUTPUT_FILE}")

    # If no ICD-10 code provided, look up in disease list CSV
    if not icd10_code and os.path.exists(DISEASE_LIST_CSV):
        print(f"Looking up disease information in {DISEASE_LIST_CSV}...")
        try:
            with open(DISEASE_LIST_CSV, 'r', newline='', encoding='utf-8') as f:
                for row in csv.DictReader(f):
                    if (row.get("disease_name") or "").lower() == DISEASE_NAME.lower():
                        icd10_code = row.get("id", "")
                        disease_category = row.get("disease_category", "")
                        print("Found matching disease in CSV:")
                        print(f"- ICD-10 code: {icd10_code}")
                        print(f"- Category: {disease_category}")
                        break
                else:
                    print("No matching disease found in CSV. Continuing without ICD-10 code.")
        except (OSError, UnicodeDecodeError, csv.Error) as e:
            # The lookup is best-effort; generation continues without it.
            print(f"Error reading CSV: {str(e)}")
            print("Continuing without ICD-10 code.")

    # Ensure ANTHROPIC_API_KEY is set
    if not os.environ.get('ANTHROPIC_API_KEY'):
        print("Error: ANTHROPIC_API_KEY environment variable is not set")
        return 1

    # Ask Claude to generate the module and report its outcome to the caller
    return generate_module()


if __name__ == "__main__":
    sys.exit(main())