Trying to fix basic functionality again.
This commit is contained in:
468
scripts/generate_module.py
Executable file
468
scripts/generate_module.py
Executable file
@@ -0,0 +1,468 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
import anthropic
|
||||
import csv
|
||||
import argparse
|
||||
|
||||
# Module-level settings shared between main() and generate_module().
# main() overwrites all of them from the parsed command line before
# generate_module() reads them.
DISEASE_NAME = ""  # disease name as given with --disease
OUTPUT_FILE = ""  # path of the module JSON file; derived from the normalized disease name
icd10_code = ""  # ICD-10 code from --icd10, or from the disease CSV lookup in main()
disease_category = ""  # category from --category, or from the disease CSV lookup in main()
use_interactive = True  # main() sets this to False when --no-interactive is passed
use_auto_fallback = False  # main() sets this to True when --auto-fallback is passed
|
||||
|
||||
def normalize_disease_name(name):
    """Turn a disease name into a lowercase, underscore-separated file stem.

    The name is lowercased, every run of alphanumeric characters is kept as
    one word, and the words are joined with single underscores. Any other
    character (spaces, hyphens, punctuation, underscores) only acts as a
    separator, so the result never starts or ends with an underscore and
    never contains two in a row.
    """
    words = []
    pending = []
    for char in name.lower():
        if char.isalnum():
            pending.append(char)
        elif pending:
            # A separator closes the word collected so far.
            words.append(''.join(pending))
            pending = []
    if pending:
        words.append(''.join(pending))
    return '_'.join(words)
|
||||
|
||||
# Headings the prompt asks Claude to use, in the order they are numbered (1-8).
_SECTION_NAMES = (
    "DESCRIPTION",
    "RISK_FACTORS",
    "SYMPTOMS",
    "DIAGNOSTIC_TESTS",
    "TREATMENTS",
    "COMPLICATIONS",
    "PROGRESSION",
    "FOLLOW_UP",
)

# One list item per line: optional indent, a bullet ("-", "*", "•") or a
# "1." / "1)" marker, whitespace, then the item text. Anchoring at the start
# of the line keeps hyphens inside words ("Non-healing") and Markdown bold
# markers ("**text**") from being mistaken for bullets.
_LIST_ITEM_RE = re.compile(
    r'^[ \t]*(?:[-*\u2022]|\d+[.)])[ \t]+(.*\S)[ \t]*$', re.MULTILINE
)

# Whole words that mark a list item as a description rather than a drug name.
# Word boundaries matter: a plain substring test for "for" rejects "Metformin".
_NON_DRUG_PHRASE_RE = re.compile(
    r'\b(?:reduc\w*|block(?:s|ed|ing)?|prevent\w*|for)\b', re.IGNORECASE
)


def _ensure_parent_dir(path):
    """Create the directory that will contain *path*, if it names one."""
    parent = os.path.dirname(path)
    if parent:  # os.makedirs("") raises, so skip bare file names
        os.makedirs(parent, exist_ok=True)


def _split_sections(text):
    """Split Claude's reply into ``{SECTION_NAME: body}``.

    A section starts at a line of the form ``<n>. <NAME>``, optionally
    preceded by Markdown ``#``/``*`` markers and followed by ``:`` or ``**``,
    matched case-insensitively. Its body runs to the start of the next
    recognised heading or to the end of the text. Sections whose heading is
    not found are simply absent from the result.
    """
    headings = []
    for number, name in enumerate(_SECTION_NAMES, start=1):
        # Accept "RISK_FACTORS", "Risk Factors", "risk-factors", ...
        name_pattern = name.replace('_', '[_ -]?')
        pattern = (
            rf'^[ \t]*[#*]*[ \t]*{number}\.[ \t]*\**[ \t]*'
            rf'{name_pattern}\b[ \t]*[*:]*[ \t]*'
        )
        match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
        if match:
            headings.append((match.start(), match.end(), name))

    headings.sort()
    sections = {}
    for index, (_, body_start, name) in enumerate(headings):
        body_end = headings[index + 1][0] if index + 1 < len(headings) else len(text)
        sections[name] = text[body_start:body_end].strip()
    return sections


def _extract_list_items(text, fallback_to_lines=False):
    """Return the bullet/numbered items found in *text*.

    Markdown bold markers are removed from each item. When no list items are
    found and *fallback_to_lines* is true, every non-empty line is returned
    instead.
    """
    items = [item.replace('**', '').strip() for item in _LIST_ITEM_RE.findall(text)]
    items = [item for item in items if item]
    if not items and fallback_to_lines:
        items = [line.strip() for line in text.split('\n') if line.strip()]
    return items


def _extract_medications(treatments):
    """Return medication names listed under the "Medications" heading.

    The medication block ends at a "Procedures" or "Other interventions"
    heading, or at the end of *treatments*. For each list item only the
    leading name is kept: anything after "(", ":" or a spaced dash is treated
    as explanation and dropped.
    """
    heading = re.search(
        r'^[ \t#*\-\u2022]*Medications?\b[ \t]*[*:]*',
        treatments,
        re.IGNORECASE | re.MULTILINE,
    )
    if not heading:
        # Fall back to an inline "Medications:" label anywhere in the text.
        heading = re.search(r'Medications?:', treatments)
    if not heading:
        return []

    remainder = treatments[heading.end():]
    next_heading = re.search(
        r'^[ \t#*\-\u2022]*(?:Procedures?|Other[ \t]+interventions?)\b',
        remainder,
        re.IGNORECASE | re.MULTILINE,
    )
    meds_section = remainder[:next_heading.start()] if next_heading else remainder

    medications = []
    for item in _extract_list_items(meds_section):
        name = re.split(r'\(|:|\s[-\u2013\u2014]\s', item, maxsplit=1)[0]
        name = name.strip(' \t*_`')
        if not name or _NON_DRUG_PHRASE_RE.search(name):
            continue
        # One item may name several drugs separated by commas.
        for candidate in name.split(','):
            candidate = candidate.strip()
            if candidate and not any(
                candidate.lower() in existing.lower() for existing in medications
            ):
                medications.append(candidate)
    return medications


def _chain_state(states, previous, name, definition):
    """Add *definition* as state *name*, link *previous* to it, return *name*."""
    states[name] = definition
    states[previous]["direct_transition"] = name
    return name


def _build_module(description, risk_factors, symptom_list, tests, medications):
    """Assemble the module dictionary from the extracted clinical details.

    Reads the module-level DISEASE_NAME, icd10_code and disease_category.
    At most 3 symptoms, 2 diagnostic tests and 3 medications are turned into
    states so the generated module stays small.
    """
    states = {
        "Initial": {
            "type": "Initial",
            "direct_transition": "Age_Guard"
        },
        "Age_Guard": {
            "type": "Guard",
            "allow": {
                "condition_type": "Age",
                "operator": ">=",
                "quantity": 18,
                "unit": "years"
            },
            "direct_transition": "Delay_Until_Onset"
        },
        "Delay_Until_Onset": {
            "type": "Delay",
            "range": {
                "low": 0,
                "high": 20,
                "unit": "years"
            },
            "direct_transition": "Onset"
        },
        "Onset": {
            "type": "ConditionOnset",
            "target_encounter": "Diagnosis_Encounter",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    # The same fixed code is emitted for every disease; only
                    # the display text changes. NOTE(review): presumably a
                    # placeholder until real coding is added - confirm.
                    "code": "64109004",
                    "display": DISEASE_NAME
                }
            ],
            "direct_transition": "Diagnosis_Encounter"
        },
        "Diagnosis_Encounter": {
            "type": "Encounter",
            "encounter_class": "ambulatory",
            "reason": "Onset",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "185347001",
                    "display": "Encounter for problem"
                }
            ],
            "direct_transition": "End_Diagnosis_Encounter"
        },
        "End_Diagnosis_Encounter": {
            "type": "EncounterEnd"
            # direct_transition is filled in as later states are chained on
        }
    }

    # Remarks: description lines first, then ICD-10 code and category.
    remarks = [line.strip() for line in description.split('\n') if line.strip()]
    if icd10_code:
        remarks.append(f"ICD-10 code: {icd10_code}")
    if disease_category:
        remarks.append(f"Category: {disease_category}")
    if not remarks:
        remarks.append(f"Generated module for {DISEASE_NAME}")
    if risk_factors:
        # Flatten outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        flattened = risk_factors.replace('\n', ' ')
        remarks.append(f"Risk factors: {flattened}")

    # NOTE(review): the states below are chained after
    # End_Diagnosis_Encounter while naming Diagnosis_Encounter as their
    # target_encounter. Confirm against the Synthea Generic Module Framework
    # that this ordering records them as intended.
    current_state = "End_Diagnosis_Encounter"

    symptom_names = [s.strip() for s in symptom_list[:3] if s.strip()]
    for index, symptom_name in enumerate(symptom_names):
        current_state = _chain_state(states, current_state, f"Symptom_{index}", {
            "type": "ConditionOnset",
            "target_encounter": "Diagnosis_Encounter",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "418107008",  # Default symptom code
                    "display": symptom_name
                }
            ]
        })

    for index, test_name in enumerate(tests[:2]):
        test_name = test_name.strip()
        if not test_name:
            continue
        current_state = _chain_state(states, current_state, f"DiagnosticTest_{index}", {
            "type": "Procedure",
            "target_encounter": "Diagnosis_Encounter",
            "reason": "Onset",
            "codes": [
                {
                    "system": "SNOMED-CT",
                    "code": "386053000",  # Default diagnostic procedure code
                    "display": test_name
                }
            ]
        })

    for index, med_name in enumerate(medications[:3]):
        med_name = med_name.strip()
        if not med_name:
            continue
        current_state = _chain_state(states, current_state, f"Prescribe_{index}", {
            "type": "MedicationOrder",
            "target_encounter": "Diagnosis_Encounter",
            "reason": "Onset",
            "codes": [
                {
                    "system": "RxNorm",
                    "code": "308047",  # Default code
                    "display": med_name
                }
            ]
        })

    # Every path ends with one follow-up encounter and the terminal state.
    _chain_state(states, current_state, "Follow_Up_Encounter", {
        "type": "Encounter",
        "encounter_class": "ambulatory",
        "reason": "Onset",
        "codes": [
            {
                "system": "SNOMED-CT",
                "code": "390906007",
                "display": "Follow-up encounter"
            }
        ],
        "direct_transition": "End_Follow_Up_Encounter"
    })
    states["End_Follow_Up_Encounter"] = {
        "type": "EncounterEnd",
        "direct_transition": "Terminal"
    }
    states["Terminal"] = {
        "type": "Terminal"
    }

    return {
        "name": DISEASE_NAME,
        "remarks": remarks,
        "states": states
    }


def generate_module():
    """Ask Claude for clinical details and write a Synthea module for the disease.

    Reads the module-level DISEASE_NAME, OUTPUT_FILE, icd10_code and
    disease_category. The raw reply is saved to ``OUTPUT_FILE + ".details"``
    and the generated module JSON to OUTPUT_FILE.

    Returns:
        0 on success. 1 if the request, the parsing or the file output
        failed; the error text is then also written to
        ``OUTPUT_FILE + ".error"`` when that file can be created.

    Raises:
        SystemExit: with status 1 when ANTHROPIC_API_KEY is not set.
    """
    # Initialize the Anthropic client
    api_key = os.environ.get('ANTHROPIC_API_KEY')
    if not api_key:
        print("Error: ANTHROPIC_API_KEY environment variable is not set")
        sys.exit(1)

    client = anthropic.Anthropic(api_key=api_key)

    # Step 1: Ask Claude for clinical details in a structured format, not JSON
    print("Step 1: Getting clinical details from Claude...")

    # Prepare disease information for the prompt
    disease_info = f"Disease name: {DISEASE_NAME}"
    if icd10_code:
        disease_info += f"\nICD-10 code: {icd10_code}"
    if disease_category:
        disease_info += f"\nCategory: {disease_category}"

    # Construct a prompt that asks for structured data, not JSON
    prompt = f"""I need information to create a Synthea disease module for {DISEASE_NAME}.

{disease_info}

Please provide the following information in a structured format (NOT JSON):

1. DESCRIPTION: A brief description of the disease (3-5 sentences)

2. RISK_FACTORS: Age ranges, gender factors, and/or other risk factors for this disease

3. SYMPTOMS: List the main symptoms of the disease

4. DIAGNOSTIC_TESTS: Tests typically used to diagnose this condition

5. TREATMENTS: List treatments for this disease, including:
- Medications (with their class and purpose)
- Procedures
- Other interventions

6. COMPLICATIONS: Possible complications of this disease

7. PROGRESSION: Typical progression of the disease, including:
- How it starts
- How it typically develops
- Possible outcomes

8. FOLLOW_UP: Typical follow-up care needed

For medications, just provide generic names - I'll handle the RxNorm codes.
For conditions and procedures, just provide plain English names - I'll handle the coding.

Please provide this information in a straightforward, structured text format. Do NOT use JSON.
"""

    # The broad except is deliberate: this is the command-line boundary and
    # any failure (API, parsing, disk) is reported as exit status 1.
    try:
        response = client.messages.create(
            model="claude-3-7-sonnet-20250219",
            max_tokens=4000,
            temperature=0.2,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )

        # Extract the response text
        clinical_details = response.content[0].text

        # Save the raw clinical details for reference
        _ensure_parent_dir(OUTPUT_FILE)
        with open(f"{OUTPUT_FILE}.details", "w", encoding="utf-8") as f:
            f.write(clinical_details)

        print(f"Clinical details saved to {OUTPUT_FILE}.details")

        # Step 2: Now build a valid module with the clinical details
        print("Step 2: Building module with clinical details...")

        sections = _split_sections(clinical_details)
        description = sections.get("DESCRIPTION", f"A module for {DISEASE_NAME}")
        risk_factors = sections.get("RISK_FACTORS", "")
        symptoms = sections.get("SYMPTOMS", "")
        diagnostic_tests = sections.get("DIAGNOSTIC_TESTS", "")
        treatments = sections.get("TREATMENTS", "")

        print(f"Extracted: {len(description)} chars description, {len(symptoms)} chars symptoms, {len(treatments)} chars treatments")

        # Symptoms may come as a bullet list or as plain lines.
        symptom_list = _extract_list_items(symptoms, fallback_to_lines=True)
        print(f"Found {len(symptom_list)} symptoms")

        # Diagnostic tests are only taken from explicit list items.
        tests = _extract_list_items(diagnostic_tests)

        medications = _extract_medications(treatments)
        print(f"Found {len(medications)} medications: {', '.join(medications[:5])}")

        module = _build_module(description, risk_factors, symptom_list, tests, medications)

        # Format the JSON for output
        formatted_json = json.dumps(module, indent=2)

        # Save the module
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            f.write(formatted_json)

        print(f"✅ Successfully generated module and saved to {OUTPUT_FILE}")
        return 0

    except Exception as e:
        print(f"❌ Error: {str(e)}")

        # Write the error to a file. The output directory may not exist yet
        # (for example when the API call itself failed), and a failure here
        # must not hide the original error.
        try:
            _ensure_parent_dir(OUTPUT_FILE)
            with open(f"{OUTPUT_FILE}.error", "w", encoding="utf-8") as f:
                f.write(f"Error: {str(e)}")
        except OSError as write_error:
            print(f"Could not write error file: {write_error}")
        return 1
|
||||
|
||||
def main():
    """Command-line entry point: generate a Synthea module for one disease.

    Parses the arguments, stores them in the module-level settings, skips
    work when the module file already exists (unless --force is given),
    looks up a missing ICD-10 code in the disease list CSV, and then calls
    generate_module().

    Returns:
        0 when the module already exists or was generated successfully;
        1 when ANTHROPIC_API_KEY is not set or generate_module() failed.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Generate a Synthea module for a disease')
    parser.add_argument('--disease', type=str, required=True, help='Disease name')
    parser.add_argument('--icd10', type=str, help='ICD-10 code')
    parser.add_argument('--category', type=str, help='Disease category')
    parser.add_argument('--no-interactive', action='store_true', help='Skip interactive prompts, for batch processing')
    parser.add_argument('--auto-fallback', action='store_true', help='Automatically use fallback template if JSON parsing fails')
    parser.add_argument('--force', action='store_true', help='Force regeneration even if module already exists')
    args = parser.parse_args()

    # Set global variables
    global DISEASE_NAME, OUTPUT_FILE, use_interactive, use_auto_fallback
    global icd10_code, disease_category
    DISEASE_NAME = args.disease
    normalized_name = normalize_disease_name(DISEASE_NAME)
    OUTPUT_FILE = f"src/main/resources/modules/{normalized_name}.json"
    use_interactive = not args.no_interactive
    use_auto_fallback = args.auto_fallback

    # Use provided ICD-10 code or category if specified; keep them strings
    # so they match the module-level defaults when the option is omitted.
    icd10_code = args.icd10 or ""
    disease_category = args.category or ""

    print(f"Disease name: {DISEASE_NAME}")
    print(f"Normalized filename: {normalized_name}.json")

    # Check if module already exists
    if os.path.exists(OUTPUT_FILE):
        print(f"✅ Module already exists at: {OUTPUT_FILE}")
        if not args.force:
            return 0
        print("Force flag set. Regenerating module.")
    else:
        print(f"❌ Module not found at: {OUTPUT_FILE}")

    # If no ICD-10 code provided, look up in disease list CSV
    if not icd10_code:
        csv_path = "src/main/resources/disease_list.csv"
        if os.path.exists(csv_path):
            print(f"Looking up disease information in {csv_path}...")
            wanted = DISEASE_NAME.strip().lower()
            try:
                # newline='' is what the csv module expects; utf-8-sig also
                # tolerates a byte-order mark in front of the header row.
                with open(csv_path, 'r', newline='', encoding='utf-8-sig') as f:
                    for row in csv.DictReader(f):
                        # Missing cells come back as None, hence the "or".
                        if (row.get("disease_name") or "").strip().lower() != wanted:
                            continue
                        icd10_code = row.get("id") or ""
                        # A category given on the command line wins over the CSV.
                        disease_category = disease_category or (row.get("disease_category") or "")
                        print("Found matching disease in CSV:")
                        print(f"- ICD-10 code: {icd10_code}")
                        print(f"- Category: {disease_category}")
                        break
                    else:
                        print("No matching disease found in CSV. Continuing without ICD-10 code.")
            except (OSError, csv.Error, UnicodeError) as e:
                # The lookup is best-effort; generation continues without it.
                print(f"Error reading CSV: {str(e)}")
                print("Continuing without ICD-10 code.")

    # Ensure ANTHROPIC_API_KEY is set
    if not os.environ.get('ANTHROPIC_API_KEY'):
        print("Error: ANTHROPIC_API_KEY environment variable is not set")
        return 1

    # Ask Claude to generate the module and pass its status on as the exit
    # code, so a failed generation is visible to the caller.
    return generate_module()
|
||||
|
||||
# Run the command-line entry point only when executed as a script;
# the value returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user