Trying to fix basic functionality again.
This commit is contained in:
175
scripts/generate_batch.py
Executable file
175
scripts/generate_batch.py
Executable file
@@ -0,0 +1,175 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import sys
|
||||
import csv
|
||||
import time
|
||||
import argparse
|
||||
import subprocess
|
||||
import concurrent.futures
|
||||
import re
|
||||
|
||||
# Global variables
|
||||
# Parsed command-line options. Starts as None; main() assigns the argparse
# namespace here, and process_disease() reads its `force` and `auto_fallback`
# attributes.
ARGS = None
|
||||
|
||||
def normalize_disease_name(name):
    """Convert a disease name into a normalized, filename-safe identifier.

    The name is lower-cased, every run of characters outside ``a-z0-9`` is
    collapsed into a single underscore, and leading/trailing underscores are
    removed.

    Args:
        name: Disease name. ``None`` is treated like an empty name.

    Returns:
        The normalized name, or ``"unknown_disease"`` when nothing usable
        remains after normalization.
    """
    # csv.DictReader fills columns missing from a short row with None, so a
    # malformed CSV line can hand us None instead of a string.
    if name is None:
        return "unknown_disease"

    normalized = re.sub(r'[^a-z0-9]+', '_', name.lower()).strip('_')

    # Names made up only of punctuation/whitespace normalize to nothing.
    return normalized or "unknown_disease"
|
||||
|
||||
def process_disease(disease_entry):
    """Generate the module for a single disease row from the CSV.

    Skips the disease when its module file already exists (unless the
    ``force`` option is set); otherwise runs ``generate_module.py`` as a
    subprocess and reports the outcome.

    Args:
        disease_entry: Mapping for one CSV row. The keys ``disease_name``,
            ``id`` (passed on as the ICD-10 code) and ``disease_category``
            are read; missing or ``None`` values are treated as empty.

    Returns:
        A dict with ``name``, ``status`` (``"success"``, ``"skipped"`` or
        ``"error"``) and ``path`` (``None`` on error). Error results also
        carry an ``error`` message.
    """
    # csv.DictReader stores None for columns missing from a short row, so
    # ``.get(key, "")`` alone does not guarantee a string.
    disease_name = disease_entry.get("disease_name") or ""
    icd10 = disease_entry.get("id") or ""
    category = disease_entry.get("disease_category") or ""
    normalized_name = normalize_disease_name(disease_name)

    # ARGS is still None when this function runs before main() has stored
    # the parsed options (e.g. in a freshly spawned worker process); fall
    # back to the options' defaults instead of raising AttributeError.
    force = bool(getattr(ARGS, "force", False))
    auto_fallback = bool(getattr(ARGS, "auto_fallback", False))

    print(f"\n{'='*80}")
    print(f"Processing disease: {disease_name}")
    print(f"ICD-10 code: {icd10}")
    print(f"Category: {category}")

    # Skip if module already exists (unless --force flag is used)
    module_path = f"src/main/resources/modules/{normalized_name}.json"
    if os.path.exists(module_path) and not force:
        print(f"✅ Module already exists at {module_path}, skipping")
        return {"name": disease_name, "status": "skipped", "path": module_path}

    # Build the generator command; optional flags are only added when set.
    cmd = ["python3", "generate_module.py", "--disease", disease_name, "--no-interactive"]
    if icd10:
        cmd.extend(["--icd10", icd10])
    if category:
        cmd.extend(["--category", category])
    if auto_fallback:
        cmd.append("--auto-fallback")

    print(f"Executing: {' '.join(cmd)}")
    try:
        process = subprocess.run(cmd, check=True, text=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # The generator ran but exited with a non-zero status.
        print(f"❌ Failed to generate module for {disease_name}")
        print(f"Error: {str(e)}")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        return {"name": disease_name, "status": "error", "error": str(e), "path": None}
    except OSError as e:
        # The generator could not be started at all (e.g. interpreter not
        # found); report it as a failure rather than aborting the batch.
        print(f"❌ Failed to generate module for {disease_name}")
        print(f"Error: {str(e)}")
        return {"name": disease_name, "status": "error", "error": str(e), "path": None}

    print(f"✅ Successfully generated module for {disease_name}")
    print(process.stdout.strip())
    return {"name": disease_name, "status": "success", "path": module_path}
|
||||
|
||||
def main():
    """Parse command-line options and generate modules for the selected diseases.

    Reads the disease list CSV, applies the ``--category``, ``--disease`` and
    ``--limit`` filters in that order, runs ``process_disease`` for every
    remaining row (concurrently when ``--parallel`` is greater than 1) and
    prints a summary.

    Exits with status 1 when the CSV is missing or unreadable, when
    ``--disease`` matches nothing, or when at least one disease failed.
    """
    global ARGS

    parser = argparse.ArgumentParser(description='Generate Synthea modules for diseases')
    parser.add_argument('--category', help='Only process diseases in this category')
    parser.add_argument('--disease', help='Only process a specific disease (by name)')
    parser.add_argument('--limit', type=int, help='Limit number of diseases to process')
    parser.add_argument('--parallel', type=int, default=1, help='Number of parallel processes')
    # NOTE(review): --skip-existing is accepted but not read in this function
    # or in process_disease(), which already skips existing modules unless
    # --force is given.
    parser.add_argument('--skip-existing', action='store_true', help='Skip diseases that already have modules')
    parser.add_argument('--csv-path', default='src/main/resources/disease_list.csv', help='Path to disease list CSV')
    parser.add_argument('--force', action='store_true', help='Force generation even if module already exists')
    parser.add_argument('--auto-fallback', action='store_true', help='Enable auto-fallback option')
    args = parser.parse_args()

    # process_disease() reads its options from the module-level ARGS.
    ARGS = args

    # Read the disease list CSV
    if not os.path.exists(args.csv_path):
        print(f"Error: Disease list CSV not found at {args.csv_path}")
        sys.exit(1)

    print(f"Reading disease list from {args.csv_path}")
    try:
        # newline='' lets the csv module do its own newline handling, which
        # keeps quoted fields containing line breaks intact.
        with open(args.csv_path, 'r', newline='') as f:
            diseases = list(csv.DictReader(f))
    except (OSError, csv.Error, UnicodeError) as e:
        print(f"Error reading CSV: {str(e)}")
        sys.exit(1)

    print(f"Found {len(diseases)} diseases in the CSV")

    def lowered(row, key):
        # DictReader stores None for columns missing from a short row.
        return (row.get(key) or "").lower()

    # Filter diseases
    if args.category:
        wanted_category = args.category.lower()
        diseases = [d for d in diseases if lowered(d, 'disease_category') == wanted_category]
        print(f"Filtered to {len(diseases)} diseases in category '{args.category}'")

    if args.disease:
        disease_name_lower = args.disease.lower()

        # Prefer exact (case-insensitive) matches; fall back to substring
        # matches only when there is no exact one.
        exact_match = [d for d in diseases if lowered(d, "disease_name") == disease_name_lower]
        if exact_match:
            diseases = exact_match
        else:
            partial_matches = [d for d in diseases if disease_name_lower in lowered(d, "disease_name")]
            if not partial_matches:
                print(f"No matches found for disease '{args.disease}'")
                sys.exit(1)
            diseases = partial_matches
            print(f"Found {len(diseases)} partial matches for '{args.disease}'")

    if args.limit and args.limit > 0:
        diseases = diseases[:args.limit]
        print(f"Limited to {args.limit} diseases")

    # Process the diseases
    total = len(diseases)
    print(f"\nProcessing {total} diseases with {args.parallel} parallel workers")

    start_time = time.time()

    if args.parallel > 1:
        # Each task only waits on a generator subprocess, so threads give
        # the same parallelism as worker processes. Unlike worker processes
        # started with the spawn/forkserver methods, threads also see the
        # ARGS value assigned above.
        with concurrent.futures.ThreadPoolExecutor(max_workers=args.parallel) as executor:
            results = list(executor.map(process_disease, diseases))
    else:
        results = [process_disease(disease) for disease in diseases]

    elapsed = time.time() - start_time

    # Summarize results
    failures = [r for r in results if r["status"] == "error"]
    success_count = sum(1 for r in results if r["status"] == "success")
    skipped_count = sum(1 for r in results if r["status"] == "skipped")
    error_count = len(failures)

    print("\n" + "="*80)
    print(f"SUMMARY: Processed {total} diseases in {elapsed:.2f} seconds")
    print(f"- Successfully generated: {success_count}")
    print(f"- Skipped (already exist): {skipped_count}")
    print(f"- Failed: {error_count}")

    # List errors if any
    if failures:
        print("\nFAILED DISEASES:")
        for result in failures:
            print(f"- {result['name']}: {result.get('error', 'Unknown error')}")

        # Exit with error code if any failures
        sys.exit(1)

    print("\nAll done! 🎉")
|
||||
|
||||
# Run the batch generator only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user