Files
synthea-alldiseases/scripts/generate_module.py

468 lines
19 KiB
Python
Executable File

#!/usr/bin/env python3
import os
import sys
import json
import re
import anthropic
import csv
import argparse
# Module-level settings shared by main() and generate_module().
# main() assigns all of them from the command line (and, for the ICD-10 code
# and category, possibly from the disease list CSV) before generate_module()
# reads them.
DISEASE_NAME = ""  # disease name exactly as passed via --disease
OUTPUT_FILE = ""  # path of the module JSON file to write
icd10_code = ""  # ICD-10 code from --icd10 or the CSV lookup; empty/None when unknown
disease_category = ""  # category from --category or the CSV lookup; empty/None when unknown
use_interactive = True  # cleared by --no-interactive; not read anywhere in the code visible here
use_auto_fallback = False  # set by --auto-fallback; not read anywhere in the code visible here
def normalize_disease_name(name: str) -> str:
    """Turn a disease name into a lowercase, underscore-separated file stem.

    Every character that is not alphanumeric (per ``str.isalnum``) acts as a
    separator. Runs of separators collapse into a single underscore and
    leading/trailing separators are dropped.

    Args:
        name: Human-readable disease name, e.g. ``"Type-2 Diabetes"``.

    Returns:
        The normalized name, e.g. ``"type_2_diabetes"``. The result is empty
        when *name* contains no alphanumeric character.
    """
    # Spaces, hyphens and underscores are all non-alphanumeric, so a single
    # pass covers them; no separate replace() calls are needed.
    separated = ''.join(ch if ch.isalnum() else '_' for ch in name.lower())
    # Dropping the empty pieces collapses repeated and edge underscores
    return '_'.join(part for part in separated.split('_') if part)
# Section keywords requested in the prompt, in the order they are numbered (1-8)
_SECTION_NAMES = (
    "DESCRIPTION",
    "RISK_FACTORS",
    "SYMPTOMS",
    "DIAGNOSTIC_TESTS",
    "TREATMENTS",
    "COMPLICATIONS",
    "PROGRESSION",
    "FOLLOW_UP",
)

# Words that mark a list item as an explanation rather than a drug name.
# "for" has to be a whole word: a plain substring test also rejects real drug
# names such as "metformin". The other stems only need to start a word.
_NON_MEDICATION_RE = re.compile(r'\b(?:reduce|block|prevent)|\bfor\b')


def _build_prompt(disease_name, icd10, category):
    """Compose the request for structured (non-JSON) clinical details."""
    disease_info = f"Disease name: {disease_name}"
    if icd10:
        disease_info += f"\nICD-10 code: {icd10}"
    if category:
        disease_info += f"\nCategory: {category}"
    return f"""I need information to create a Synthea disease module for {disease_name}.
{disease_info}
Please provide the following information in a structured format (NOT JSON):
1. DESCRIPTION: A brief description of the disease (3-5 sentences)
2. RISK_FACTORS: Age ranges, gender factors, and/or other risk factors for this disease
3. SYMPTOMS: List the main symptoms of the disease
4. DIAGNOSTIC_TESTS: Tests typically used to diagnose this condition
5. TREATMENTS: List treatments for this disease, including:
- Medications (with their class and purpose)
- Procedures
- Other interventions
6. COMPLICATIONS: Possible complications of this disease
7. PROGRESSION: Typical progression of the disease, including:
- How it starts
- How it typically develops
- Possible outcomes
8. FOLLOW_UP: Typical follow-up care needed
For medications, just provide generic names - I'll handle the RxNorm codes.
For conditions and procedures, just provide plain English names - I'll handle the coding.
Please provide this information in a straightforward, structured text format. Do NOT use JSON.
"""


def _ensure_parent_dir(path):
    """Create the directory that will contain *path*, if it has one."""
    parent = os.path.dirname(path)
    # os.makedirs('') raises, so a bare file name needs no directory at all
    if parent:
        os.makedirs(parent, exist_ok=True)


def _loose_header(number, name):
    """Return a regex for section header *number*/*name* that does not need ``##``.

    The header must start a line. Markdown ``#`` or ``**`` decoration is
    optional and the underscore in the keyword may be written as a space.
    """
    keyword = name.replace('_', '[_ ]')
    return (r'(?:^|\n)[ \t]*(?:#+[ \t]*)?(?:\*\*)?[ \t]*%d\.[ \t]*(?:\*\*)?[ \t]*%s\b'
            % (number, keyword))


def _extract_sections(text):
    """Split the model reply into its eight numbered sections.

    Each section is first looked up with the strict ``## N. NAME`` pattern.
    Only when that finds nothing is a tolerant pattern tried, because the
    prompt itself never asks for markdown headers.

    Returns:
        Dict mapping every name in ``_SECTION_NAMES`` to the stripped section
        text, or to ``None`` when the section could not be located.
    """
    sections = {}
    last = len(_SECTION_NAMES)
    for number, name in enumerate(_SECTION_NAMES, start=1):
        strict_end = r'$' if number == last else r'##\s*%d\.' % (number + 1)
        strict = r'##\s*%d\.\s*%s(.*?)(?=%s)' % (number, name, strict_end)
        match = re.search(strict, text, re.DOTALL)
        if match is None:
            if number == last:
                loose_end = r'\Z'
            else:
                # _SECTION_NAMES is 0-based, so index `number` is the next section
                loose_end = _loose_header(number + 1, _SECTION_NAMES[number]) + r'|\Z'
            # Swallow a trailing ":" / "**" so it does not end up in the text
            loose = (_loose_header(number, name)
                     + r'[ \t]*(?:\*\*)?[ \t]*:?[ \t]*(?:\*\*)?(.*?)(?=' + loose_end + r')')
            match = re.search(loose, text, re.DOTALL | re.IGNORECASE)
        sections[name] = match.group(1).strip() if match else None
    return sections


def _extract_symptoms(symptoms):
    """Return the symptoms named in the SYMPTOMS section text as a list."""
    # Preferred shape: one bullet ("-" or "*") per symptom
    bullet_items = re.findall(r'[-*]\s*(.*?)(?:\n|$)', symptoms, re.MULTILINE)
    if bullet_items:
        return [entry.strip() for entry in bullet_items if entry.strip()]
    # Less structured reply: one symptom per non-empty line
    found = [entry.strip() for entry in symptoms.split('\n') if entry.strip()]
    # Last resort: keep the whole text as a single symptom
    if not found and symptoms.strip():
        found = [symptoms.strip()]
    return found


def _extract_medications(treatments):
    """Return the distinct medication names found in the TREATMENTS section text."""
    medications = []
    section = re.search(r'Medications?:\s*(.*?)(?=(?:- Procedures:|$))', treatments, re.DOTALL)
    if not section:
        return medications
    meds_section = section.group(1).strip()
    # Items typically look like "- Name (explanation)" or "- Name - explanation"
    med_items = re.findall(r'[-*]\s*(.*?)(?=[-*]|\n\n|\n- |$)', meds_section, re.DOTALL)

    def remember(candidate):
        # Skip names already contained in one that was kept earlier
        if candidate and not any(candidate.lower() in known.lower() for known in medications):
            medications.append(candidate)

    for item in med_items:
        item = item.strip()
        if not item:
            continue
        # Keep only the name, not the explanation that follows "(" or "-"
        med_match = re.match(r'([^(]+)(?:\(|-).*', item)
        if med_match:
            med_name = med_match.group(1).strip()
        else:
            med_name = item.split('(')[0].strip()
        med_name = re.sub(r'\s*\([^)]*\)', '', med_name)  # Remove parentheticals
        med_name = re.sub(r'\s*-.*', '', med_name)  # Remove after dash
        # Skip descriptive phrases that aren't medications
        if _NON_MEDICATION_RE.search(med_name.lower()):
            continue
        # One item may list several medications separated by commas
        if ',' in med_name:
            for part in med_name.split(','):
                remember(part.strip())
        else:
            remember(med_name)
    return medications


def _link_state(states, previous, name, definition):
    """Add *definition* as state *name*, point *previous* at it and return *name*."""
    states[name] = definition
    states[previous]["direct_transition"] = name
    return name


def _build_module(disease_name, icd10, category, description, risk_factors,
                  symptom_list, diagnostic_tests, medications):
    """Assemble the Synthea module dictionary from the parsed clinical details.

    The module is one linear chain of states. At most 3 symptoms, 2 diagnostic
    tests and 3 medications are included, all with default placeholder codes.
    """
    states = {
        "Initial": {
            "type": "Initial",
            "direct_transition": "Age_Guard"
        },
        "Age_Guard": {
            "type": "Guard",
            "allow": {
                "condition_type": "Age",
                "operator": ">=",
                "quantity": 18,
                "unit": "years"
            },
            "direct_transition": "Delay_Until_Onset"
        },
        "Delay_Until_Onset": {
            "type": "Delay",
            "range": {
                "low": 0,
                "high": 20,
                "unit": "years"
            },
            "direct_transition": "Onset"
        },
        "Onset": {
            "type": "ConditionOnset",
            "target_encounter": "Diagnosis_Encounter",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "64109004",
                    "display": disease_name
                }
            ],
            "direct_transition": "Diagnosis_Encounter"
        },
        "Diagnosis_Encounter": {
            "type": "Encounter",
            "encounter_class": "ambulatory",
            "reason": "Onset",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "185347001",
                    "display": "Encounter for problem"
                }
            ],
            "direct_transition": "End_Diagnosis_Encounter"
        },
        "End_Diagnosis_Encounter": {
            "type": "EncounterEnd"
            # direct_transition is filled in when the next state is linked
        },
    }
    module = {"name": disease_name, "remarks": [], "states": states}

    # Remarks: description lines, then codes, then risk factors
    remarks = module["remarks"]
    remarks.extend(line.strip() for line in description.split('\n') if line.strip())
    if icd10:
        remarks.append(f"ICD-10 code: {icd10}")
    if category:
        remarks.append(f"Category: {category}")
    # Make sure remarks is not empty
    if not remarks:
        remarks.append(f"Generated module for {disease_name}")
    if risk_factors:
        # Flatten outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12
        flattened = risk_factors.replace('\n', ' ')
        remarks.append(f"Risk factors: {flattened}")

    # NOTE(review): the states below are chained after End_Diagnosis_Encounter,
    # i.e. after the encounter they refer to has ended. Confirm against the
    # Synthea Generic Module Framework whether they must precede EncounterEnd.
    current_state = "End_Diagnosis_Encounter"

    symptom_names = [s.strip() for s in symptom_list[:3] if s.strip()]  # Limit to 3 symptoms
    for index, symptom_name in enumerate(symptom_names):
        current_state = _link_state(states, current_state, f"Symptom_{index}", {
            "type": "ConditionOnset",
            "target_encounter": "Diagnosis_Encounter",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "418107008",  # Default symptom code
                    "display": symptom_name
                }
            ]
        })

    test_names = []
    if diagnostic_tests:
        test_lines = re.findall(r'[-*]\s*(.*?)(?:\n|$)', diagnostic_tests, re.MULTILINE)
        test_names = [t.strip() for t in test_lines if t.strip()]
    for index, test_name in enumerate(test_names[:2]):  # Limit to 2 tests
        current_state = _link_state(states, current_state, f"DiagnosticTest_{index}", {
            "type": "Procedure",
            "target_encounter": "Diagnosis_Encounter",
            "reason": "Onset",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "386053000",  # Default diagnostic procedure code
                    "display": test_name
                }
            ]
        })

    # Limit to 3 medications to keep module manageable
    med_names = [m.strip() for m in medications[:3] if m.strip()]
    for index, med_name in enumerate(med_names):
        current_state = _link_state(states, current_state, f"Prescribe_{index}", {
            "type": "MedicationOrder",
            "target_encounter": "Diagnosis_Encounter",
            "reason": "Onset",
            "codes": [
                {
                    "system": "RxNorm",
                    "code": "308047",  # Default code
                    "display": med_name
                }
            ]
        })

    # Close the chain with a follow-up encounter and the terminal state
    _link_state(states, current_state, "Follow_Up_Encounter", {
        "type": "Encounter",
        "encounter_class": "ambulatory",
        "reason": "Onset",
        "codes": [
            {
                "system": "SNOMED-CT",
                "code": "390906007",
                "display": "Follow-up encounter"
            }
        ],
        "direct_transition": "End_Follow_Up_Encounter"
    })
    states["End_Follow_Up_Encounter"] = {
        "type": "EncounterEnd",
        "direct_transition": "Terminal"
    }
    states["Terminal"] = {
        "type": "Terminal"
    }
    return module


def generate_module():
    """Generate a Synthea module for ``DISEASE_NAME`` and write it to ``OUTPUT_FILE``.

    Asks Claude for structured clinical details, saves the raw reply to
    ``OUTPUT_FILE + ".details"``, parses it heuristically and writes a linear
    module as JSON to ``OUTPUT_FILE``.

    Reads the module-level ``DISEASE_NAME``, ``OUTPUT_FILE``, ``icd10_code``
    and ``disease_category``.

    Returns:
        0 on success. 1 on any failure after the client was created; the
        message is then also written to ``OUTPUT_FILE + ".error"`` when that
        file can be written.

    Raises:
        SystemExit: with status 1 if ``ANTHROPIC_API_KEY`` is not set.
    """
    # Initialize the Anthropic client
    api_key = os.environ.get('ANTHROPIC_API_KEY')
    if not api_key:
        print("Error: ANTHROPIC_API_KEY environment variable is not set")
        sys.exit(1)
    client = anthropic.Anthropic(api_key=api_key)

    # Step 1: Ask Claude for clinical details in a structured format, not JSON
    print("Step 1: Getting clinical details from Claude...")
    prompt = _build_prompt(DISEASE_NAME, icd10_code, disease_category)
    try:
        response = client.messages.create(
            model="claude-3-7-sonnet-20250219",
            max_tokens=4000,
            temperature=0.2,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        clinical_details = response.content[0].text

        # Save the raw clinical details for reference
        _ensure_parent_dir(OUTPUT_FILE)
        with open(f"{OUTPUT_FILE}.details", "w", encoding="utf-8") as f:
            f.write(clinical_details)
        print(f"Clinical details saved to {OUTPUT_FILE}.details")

        # Step 2: Now build a valid module with the clinical details
        print("Step 2: Building module with clinical details...")
        sections = _extract_sections(clinical_details)
        description = sections["DESCRIPTION"]
        if description is None:
            description = f"A module for {DISEASE_NAME}"
        risk_factors = sections["RISK_FACTORS"] or ""
        symptoms = sections["SYMPTOMS"] or ""
        diagnostic_tests = sections["DIAGNOSTIC_TESTS"] or ""
        treatments = sections["TREATMENTS"] or ""
        print(f"Extracted: {len(description)} chars description, {len(symptoms)} chars symptoms, {len(treatments)} chars treatments")

        symptom_list = _extract_symptoms(symptoms)
        print(f"Found {len(symptom_list)} symptoms")
        medications = _extract_medications(treatments)
        print(f"Found {len(medications)} medications: {', '.join(medications[:5])}")

        module = _build_module(
            DISEASE_NAME, icd10_code, disease_category, description,
            risk_factors, symptom_list, diagnostic_tests, medications,
        )

        # Format the JSON for output and save the module
        formatted_json = json.dumps(module, indent=2)
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            f.write(formatted_json)
        print(f"✅ Successfully generated module and saved to {OUTPUT_FILE}")
        return 0
    except Exception as e:
        print(f"❌ Error: {str(e)}")
        # Recording the error is best effort: the failure may have happened
        # before the output directory existed, and a second exception raised
        # here would hide the original one.
        try:
            _ensure_parent_dir(OUTPUT_FILE)
            with open(f"{OUTPUT_FILE}.error", "w", encoding="utf-8") as f:
                f.write(f"Error: {str(e)}")
        except OSError as write_error:
            print(f"Could not write error file: {write_error}")
        return 1
def main():
    """Parse the command line and generate the requested disease module.

    Populates the module-level settings (``DISEASE_NAME``, ``OUTPUT_FILE``,
    ``icd10_code``, ``disease_category``, ``use_interactive``,
    ``use_auto_fallback``). Generation is skipped when the module file already
    exists, unless ``--force`` is given. A missing ICD-10 code (and the
    category) is looked up in ``src/main/resources/disease_list.csv`` when that
    file exists. Finally ``generate_module()`` is called.

    Returns:
        Process exit status: 0 when the module already exists or was
        generated, 1 when ``ANTHROPIC_API_KEY`` is unset or generation failed.
    """
    global DISEASE_NAME, OUTPUT_FILE, use_interactive, use_auto_fallback
    global icd10_code, disease_category

    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Generate a Synthea module for a disease')
    parser.add_argument('--disease', type=str, required=True, help='Disease name')
    parser.add_argument('--icd10', type=str, help='ICD-10 code')
    parser.add_argument('--category', type=str, help='Disease category')
    parser.add_argument('--no-interactive', action='store_true', help='Skip interactive prompts, for batch processing')
    parser.add_argument('--auto-fallback', action='store_true', help='Automatically use fallback template if JSON parsing fails')
    parser.add_argument('--force', action='store_true', help='Force regeneration even if module already exists')
    args = parser.parse_args()

    # Set global variables
    DISEASE_NAME = args.disease
    normalized_name = normalize_disease_name(DISEASE_NAME)
    OUTPUT_FILE = f"src/main/resources/modules/{normalized_name}.json"
    use_interactive = not args.no_interactive
    use_auto_fallback = args.auto_fallback
    # Use provided ICD-10 code or category if specified
    icd10_code = args.icd10
    disease_category = args.category
    print(f"Disease name: {DISEASE_NAME}")
    print(f"Normalized filename: {normalized_name}.json")

    # Check if module already exists
    if os.path.exists(OUTPUT_FILE):
        print(f"✅ Module already exists at: {OUTPUT_FILE}")
        if not args.force:
            return 0
        print("Force flag set. Regenerating module.")
    else:
        print(f"❌ Module not found at: {OUTPUT_FILE}")

    # If no ICD-10 code provided, look up in disease list CSV
    if not icd10_code:
        csv_path = "src/main/resources/disease_list.csv"
        if os.path.exists(csv_path):
            print(f"Looking up disease information in {csv_path}...")
            try:
                # newline='' is what the csv module expects for its file objects
                with open(csv_path, 'r', newline='') as f:
                    reader = csv.DictReader(f)
                    for row in reader:
                        if row.get("disease_name", "").lower() == DISEASE_NAME.lower():
                            icd10_code = row.get("id", "")
                            disease_category = row.get("disease_category", "")
                            print("Found matching disease in CSV:")
                            print(f"- ICD-10 code: {icd10_code}")
                            print(f"- Category: {disease_category}")
                            break
                    else:
                        # Loop finished without a break: no row matched
                        print("No matching disease found in CSV. Continuing without ICD-10 code.")
            except Exception as e:
                # The lookup is optional; any failure just means no code
                print(f"Error reading CSV: {str(e)}")
                print("Continuing without ICD-10 code.")

    # Ensure ANTHROPIC_API_KEY is set
    if not os.environ.get('ANTHROPIC_API_KEY'):
        print("Error: ANTHROPIC_API_KEY environment variable is not set")
        return 1

    # Ask Claude to generate the module and report its status to the caller,
    # so a failed generation yields a non-zero exit code
    return generate_module()
# Script entry point: exit with the status code returned by main()
if __name__ == "__main__":
    sys.exit(main())