Trying to fix basic functionality again.

2025-03-23 11:53:47 -07:00
parent ebda48190a
commit 2141e81f42
406 changed files with 173963 additions and 69 deletions
--- a/scripts/patient_analysis.py
+++ b/scripts/patient_analysis.py
@@ -0,0 +1,321 @@
+#!/usr/bin/env python3
+import os
+import sys
+import json
+import glob
+import argparse
+from collections import Counter
+from datetime import datetime
+
+def _collect_patient_demographics(patient, demographics):
+    """Fold one FHIR Patient resource into the demographics collectors.
+
+    Args:
+        patient: Parsed Patient resource (a dict).
+        demographics: Dict holding 'gender', 'race' and 'ethnicity' Counters
+            plus an 'age' list; updated in place.
+
+    Raises:
+        ValueError: If 'birthDate' does not start with a four-digit year.
+    """
+    if 'gender' in patient:
+        demographics['gender'][patient['gender']] += 1
+
+    if 'birthDate' in patient:
+        # Year-granularity approximation: only the leading YYYY is used, so
+        # the value can be one year high before this year's birthday.
+        birth_year = int(patient['birthDate'][:4])
+        demographics['age'].append(datetime.now().year - birth_year)
+
+    for ext in patient.get('extension', []):
+        if 'url' not in ext or 'extension' not in ext:
+            continue
+        if ext['url'].endswith('us-core-race'):
+            bucket = demographics['race']
+        elif ext['url'].endswith('us-core-ethnicity'):
+            bucket = demographics['ethnicity']
+        else:
+            continue
+        # Every nested valueCoding is tallied; sub-extensions without a
+        # valueCoding are ignored.
+        for sub_ext in ext['extension']:
+            if 'valueCoding' in sub_ext:
+                bucket[sub_ext['valueCoding'].get('display', 'Unknown')] += 1
+
+
+def _count_coding_displays(resource, concept_key, counter):
+    """Tally the 'display' text of every coding under resource[concept_key]."""
+    if concept_key in resource and 'coding' in resource[concept_key]:
+        for coding in resource[concept_key]['coding']:
+            if 'display' in coding:
+                counter[coding['display']] += 1
+
+
+def analyze_patient_data(disease_name, input_dir, output_dir, format_type="html"):
+    """Summarise the FHIR patient JSON files in a directory and write reports.
+
+    Each ``*.json`` file directly inside ``input_dir`` is read, except files
+    whose *name* contains 'hospitalInformation' or 'practitionerInformation'.
+    A file may hold a bare Patient resource or a Bundle; from Bundles the
+    Patient, Condition and MedicationRequest entries are tallied.
+
+    Args:
+        disease_name: Label used in report titles and in the report file name
+            (lower-cased, spaces replaced by underscores).
+        input_dir: Directory containing the JSON files.
+        output_dir: Directory for the reports; created if missing.
+        format_type: "html", "csv", "json" or "all" (case-insensitive). Any
+            other value produces no report.
+
+    Returns:
+        None. Progress and per-file errors are printed to stdout. When no
+        input files are found, placeholder HTML, CSV and JSON reports are
+        written regardless of ``format_type``.
+    """
+    import html  # function-scope: not part of the file's import block
+
+    print(f"Analyzing patient data for {disease_name}...")
+
+    # Create the output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    report_stem = os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report")
+
+    # Find all patient JSON files. The directory is glob-escaped so that
+    # characters such as '[' in its name are taken literally, and the
+    # exclusion markers are matched against the file name only, so a
+    # directory whose path happens to contain a marker does not hide every
+    # file in it.
+    excluded_markers = ('hospitalInformation', 'practitionerInformation')
+    patients_files = sorted(
+        path
+        for path in glob.glob(os.path.join(glob.escape(input_dir), "*.json"))
+        if not any(marker in os.path.basename(path) for marker in excluded_markers)
+    )
+
+    print(f"Found {len(patients_files)} patient records for analysis")
+
+    if not patients_files:
+        print("No patient files found to analyze.")
+        with open(report_stem + ".html", 'w', encoding="utf-8") as f:
+            f.write(f"<html><body><h1>Analysis Report for {html.escape(disease_name)}</h1><p>No patient files found to analyze.</p></body></html>")
+        # Create empty CSV and JSON files
+        with open(report_stem + ".csv", 'w', encoding="utf-8") as f:
+            f.write("No patient files found to analyze.\n")
+        with open(report_stem + ".json", 'w', encoding="utf-8") as f:
+            f.write('{"error": "No patient files found to analyze."}\n')
+        return
+
+    # Initialize data collectors
+    demographics = {'gender': Counter(), 'age': [], 'race': Counter(), 'ethnicity': Counter()}
+    condition_counts = Counter()
+    medication_counts = Counter()
+    patient_count = 0
+
+    # Process each patient file
+    for patient_file in patients_files:
+        try:
+            with open(patient_file, 'r', encoding="utf-8") as f:
+                data = json.load(f)
+
+            # Anything that is not a JSON object cannot be a FHIR resource.
+            if not isinstance(data, dict):
+                continue
+
+            resource_type = data.get('resourceType')
+            if resource_type == 'Patient':
+                patient_count += 1
+                _collect_patient_demographics(data, demographics)
+            elif resource_type == 'Bundle':
+                for entry in data.get('entry', []):
+                    if 'resource' not in entry:
+                        continue
+                    resource = entry['resource']
+                    entry_type = resource.get('resourceType')
+                    if entry_type == 'Patient':
+                        patient_count += 1
+                        _collect_patient_demographics(resource, demographics)
+                    elif entry_type == 'Condition':
+                        _count_coding_displays(resource, 'code', condition_counts)
+                    elif entry_type == 'MedicationRequest':
+                        _count_coding_displays(resource, 'medicationCodeableConcept', medication_counts)
+
+        except Exception as e:
+            # Best effort per file: one malformed record must not abort the
+            # whole analysis, so report it and move on.
+            print(f"Error processing {patient_file}: {e}")
+
+    # Count Patient resources directly rather than summing gender counts, so
+    # that patients without a recorded gender still contribute to the
+    # denominator used for percentages.
+    total_patients = patient_count
+    if total_patients == 0:
+        print("Warning: No patient demographics found. Setting total_patients to file count.")
+        # patients_files is non-empty here, so the total is never zero.
+        total_patients = len(patients_files)
+
+    print(f"Total patients found: {total_patients}")
+    print(f"Gender distribution: {dict(demographics['gender'])}")
+
+    requested_format = format_type.lower()
+
+    # Generate HTML report
+    if requested_format in ("html", "all"):
+        create_html_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients)
+
+    # Generate CSV report
+    if requested_format in ("csv", "all"):
+        create_csv_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients)
+
+    # Generate JSON report
+    if requested_format in ("json", "all"):
+        create_json_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients)
+
+    print(f"Analysis complete. Reports generated in {output_dir}")
+
+
+def create_html_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients):
+    """Write the HTML analysis report to ``<output_dir>/<disease>_report.html``.
+
+    The file name is ``disease_name`` lower-cased with spaces replaced by
+    underscores. All text taken from the input data (disease name, gender,
+    condition and medication names) is HTML-escaped before being written.
+
+    Args:
+        disease_name: Label shown in the page title and heading.
+        output_dir: Existing directory to write the report into.
+        demographics: Mapping with a 'gender' mapping (label -> count) and an
+            'age' list of numbers.
+        condition_counts: Mapping of condition name -> occurrence count.
+        medication_counts: Mapping of medication name -> occurrence count.
+        total_patients: Denominator for every percentage; when it is 0 the
+            percentages are reported as 0.0 instead of raising.
+    """
+    import html  # function-scope: not part of the file's import block
+
+    def esc(value):
+        # Data comes from external JSON files, so never emit it as raw markup.
+        return html.escape(str(value))
+
+    def percent_of_patients(count):
+        return (count / total_patients) * 100 if total_patients else 0.0
+
+    def write_top_rows(handle, counts):
+        # Highest counts first, capped at 15 rows.
+        ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)[:15]
+        for label, count in ranked:
+            handle.write(f"<tr><td>{esc(label)}</td><td>{count}</td><td>{percent_of_patients(count):.1f}%</td></tr>\n")
+
+    title = esc(disease_name)
+    report_path = os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report.html")
+
+    # Explicit UTF-8 (matched by the <meta charset>) so non-ASCII names are
+    # written and rendered the same way on every platform.
+    with open(report_path, 'w', encoding="utf-8") as f:
+        f.write(f'''<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>Synthea Patient Analysis - {title}</title>
+    <style>
+        body {{ font-family: Arial, sans-serif; margin: 20px; }}
+        h1, h2, h3 {{ color: #333; }}
+        .container {{ max-width: 1000px; margin: 0 auto; }}
+        table {{ border-collapse: collapse; width: 100%; margin-bottom: 20px; }}
+        th, td {{ text-align: left; padding: 8px; border-bottom: 1px solid #ddd; }}
+        th {{ background-color: #f2f2f2; }}
+        tr:hover {{background-color: #f5f5f5;}}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>Synthea Patient Analysis - {title}</h1>
+        <p>Total patients analyzed: {total_patients}</p>
+        
+        <h2>Demographics</h2>
+        <h3>Gender Distribution</h3>
+        <table>
+            <tr><th>Gender</th><th>Count</th><th>Percentage</th></tr>
+''')
+
+        for gender, count in demographics['gender'].items():
+            f.write(f"<tr><td>{esc(gender)}</td><td>{count}</td><td>{percent_of_patients(count):.1f}%</td></tr>\n")
+
+        f.write('''
+        </table>
+        
+        <h3>Age Statistics</h3>
+        <table>
+''')
+
+        ages = demographics['age']
+        if ages:
+            f.write(f"<tr><td>Minimum Age</td><td>{min(ages)}</td></tr>\n")
+            f.write(f"<tr><td>Maximum Age</td><td>{max(ages)}</td></tr>\n")
+            f.write(f"<tr><td>Average Age</td><td>{sum(ages) / len(ages):.1f}</td></tr>\n")
+        else:
+            f.write("<tr><td colspan='2'>No age data available</td></tr>\n")
+
+        f.write('''
+        </table>
+        
+        <h3>Top Conditions</h3>
+        <table>
+            <tr><th>Condition</th><th>Count</th><th>Percentage of Patients</th></tr>
+''')
+
+        write_top_rows(f, condition_counts)
+
+        f.write('''
+        </table>
+        
+        <h3>Top Medications</h3>
+        <table>
+            <tr><th>Medication</th><th>Count</th><th>Percentage of Patients</th></tr>
+''')
+
+        write_top_rows(f, medication_counts)
+
+        f.write('''
+        </table>
+    </div>
+</body>
+</html>''')
+
+
+def create_csv_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients):
+    """Write the CSV analysis report to ``<output_dir>/<disease>_report.csv``.
+
+    The file is a sequence of titled sections (gender, age, top 15
+    conditions, top 15 medications) separated by blank lines. Rows are
+    emitted through ``csv.writer`` so that names containing commas, quotes
+    or newlines are quoted instead of silently shifting columns.
+
+    Args:
+        disease_name: Label for the title row; also used (lower-cased, spaces
+            replaced by underscores) as the file name stem.
+        output_dir: Existing directory to write the report into.
+        demographics: Mapping with a 'gender' mapping (label -> count) and an
+            'age' list of numbers.
+        condition_counts: Mapping of condition name -> occurrence count.
+        medication_counts: Mapping of medication name -> occurrence count.
+        total_patients: Denominator for every percentage; when it is 0 the
+            percentages are reported as 0.0 instead of raising.
+    """
+    import csv  # function-scope: not part of the file's import block
+
+    def percent_of_patients(count):
+        share = (count / total_patients) * 100 if total_patients else 0.0
+        return f"{share:.1f}%"
+
+    def top_rows(counts):
+        # Highest counts first, capped at 15 rows.
+        ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)[:15]
+        return [[label, count, percent_of_patients(count)] for label, count in ranked]
+
+    report_path = os.path.join(output_dir, f"{disease_name.lower().replace(' ', '_')}_report.csv")
+
+    # newline='' lets the csv module control line endings; "\n" keeps the
+    # same line terminator the hand-written rows used.
+    with open(report_path, 'w', newline='', encoding="utf-8") as f:
+        writer = csv.writer(f, lineterminator="\n")
+
+        # Write header
+        writer.writerow([f"Synthea Patient Analysis - {disease_name}"])
+        writer.writerow(["Total patients analyzed", total_patients])
+        writer.writerow([])
+
+        # Gender distribution
+        writer.writerow(["Gender Distribution"])
+        writer.writerow(["Gender", "Count", "Percentage"])
+        for gender, count in demographics['gender'].items():
+            writer.writerow([gender, count, percent_of_patients(count)])
+        writer.writerow([])
+
+        # Age statistics
+        writer.writerow(["Age Statistics"])
+        ages = demographics['age']
+        if ages:
+            writer.writerow(["Minimum Age", min(ages)])
+            writer.writerow(["Maximum Age", max(ages)])
+            writer.writerow(["Average Age", f"{sum(ages) / len(ages):.1f}"])
+        else:
+            writer.writerow(["No age data available"])
+        writer.writerow([])
+
+        # Top conditions
+        writer.writerow(["Top Conditions"])
+        writer.writerow(["Condition", "Count", "Percentage of Patients"])
+        writer.writerows(top_rows(condition_counts))
+        writer.writerow([])
+
+        # Top medications
+        writer.writerow(["Top Medications"])
+        writer.writerow(["Medication", "Count", "Percentage of Patients"])
+        writer.writerows(top_rows(medication_counts))
+
+
+def create_json_report(disease_name, output_dir, demographics, condition_counts, medication_counts, total_patients):
+    """Serialise the analysis results to ``<output_dir>/<disease>_report.json``.
+
+    The document holds the disease label, the patient total, the gender /
+    race / ethnicity tallies, min / max / average age (an empty object when
+    no ages were collected) and the 15 most frequent conditions and
+    medications, each with its count and its percentage of
+    ``total_patients``.
+    """
+    # Plain-dict copies of the three demographic tallies.
+    breakdown = {
+        field: dict(demographics[field])
+        for field in ("gender", "race", "ethnicity")
+    }
+
+    ages = demographics['age']
+    age_summary = {}
+    if ages:
+        age_summary = {
+            "min_age": min(ages),
+            "max_age": max(ages),
+            "avg_age": sum(ages) / len(ages),
+        }
+
+    def ranked(counts):
+        # Highest counts first, capped at 15 entries.
+        ordered = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
+        return [
+            {"name": label, "count": hits, "percentage": (hits / total_patients) * 100}
+            for label, hits in ordered[:15]
+        ]
+
+    report_data = {
+        "disease": disease_name,
+        "total_patients": total_patients,
+        "demographics": breakdown,
+        "age_statistics": age_summary,
+        "top_conditions": ranked(condition_counts),
+        "top_medications": ranked(medication_counts),
+    }
+
+    file_name = f"{disease_name.lower().replace(' ', '_')}_report.json"
+    with open(os.path.join(output_dir, file_name), 'w') as f:
+        json.dump(report_data, f, indent=2)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the options and run one analysis.
+    cli = argparse.ArgumentParser(description="Analyze Synthea patient data")
+    cli.add_argument("--disease", help="Disease name", required=True)
+    cli.add_argument("--input_dir", help="Input directory with FHIR files", required=True)
+    cli.add_argument("--output_dir", help="Output directory for reports", default=".")
+    cli.add_argument(
+        "--format",
+        choices=["html", "csv", "json", "all"],
+        default="html",
+        help="Output format (html, csv, json, or all)",
+    )
+
+    options = cli.parse_args()
+
+    analyze_patient_data(
+        options.disease,
+        options.input_dir,
+        options.output_dir,
+        options.format,
+    )