From e10ae0cf81a53bee99b47d55f0820fe031860ff3 Mon Sep 17 00:00:00 2001 From: olamide24 Date: Thu, 27 Mar 2025 11:26:33 -0700 Subject: [PATCH] Fixed patient generation with separate script approach --- generate_patients.sh | 78 +++++++++++ main.nf | 4 +- module_generator/anthropic_patch.py | 23 ++++ module_generator/simple_module_generator.py | 122 +++++++++++++++++ nextflow.config | 9 +- params.json | 2 +- simple.nf | 4 +- src/main/resources/synthea.properties | 5 + test.nf | 141 ++++++++++++++++++++ test_synthea.sh | 21 +++ 10 files changed, 401 insertions(+), 8 deletions(-) create mode 100755 generate_patients.sh create mode 100644 module_generator/anthropic_patch.py create mode 100755 module_generator/simple_module_generator.py create mode 100644 src/main/resources/synthea.properties create mode 100644 test.nf create mode 100755 test_synthea.sh diff --git a/generate_patients.sh b/generate_patients.sh new file mode 100755 index 0000000..4eb0372 --- /dev/null +++ b/generate_patients.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +# Script to generate synthetic patients directly using modules created by the pipeline +# Usage: ./generate_patients.sh + +MODULE_NAME=$1 +OUTPUT_DIR=$2 +POPULATION=${3:-10} + +if [ -z "$MODULE_NAME" ] || [ -z "$OUTPUT_DIR" ]; then + echo "Usage: $0 [population_size]" + echo "Example: $0 diabetes /path/to/output 20" + exit 1 +fi + +# Create output directory +mkdir -p "$OUTPUT_DIR/m" "$OUTPUT_DIR/f" + +# Location of module file +MODULE_PATH="/data/olamide/synthea-alldiseases/modules/${MODULE_NAME}.json" + +if [ ! -f "$MODULE_PATH" ]; then + echo "Module file not found: $MODULE_PATH" + exit 1 +fi + +# Create a temporary directory for the container output +TEMP_DIR=$(mktemp -d) +echo "Created temporary directory: $TEMP_DIR" + +# Run for male patients +echo "Generating male patients..." +docker run --rm \ + -v "$MODULE_PATH:/app/modules/${MODULE_NAME}.json" \ + -v "$TEMP_DIR:/app/output" \ + synthea-module-generator \ + bash -c "cd /app && ./run_synthea -p $((POPULATION/2)) -g M -m ${MODULE_NAME} Massachusetts" + +# Copy male patient files to the output directory +echo "Copying male patient files..." +find "$TEMP_DIR/fhir" -name "*.json" ! -name "*hospital*" ! -name "*practitioner*" | while read file; do + # Check if it's a patient file by looking for gender field + if grep -q '"gender"' "$file"; then + cp "$file" "$OUTPUT_DIR/m/" + fi +done + +# Clear the temp directory +rm -rf "$TEMP_DIR/fhir"/* + +# Run for female patients +echo "Generating female patients..." +docker run --rm \ + -v "$MODULE_PATH:/app/modules/${MODULE_NAME}.json" \ + -v "$TEMP_DIR:/app/output" \ + synthea-module-generator \ + bash -c "cd /app && ./run_synthea -p $((POPULATION/2)) -g F -m ${MODULE_NAME} Massachusetts" + +# Copy female patient files to the output directory +echo "Copying female patient files..." +find "$TEMP_DIR/fhir" -name "*.json" ! -name "*hospital*" ! -name "*practitioner*" | while read file; do + # Check if it's a patient file by looking for gender field + if grep -q '"gender"' "$file"; then + cp "$file" "$OUTPUT_DIR/f/" + fi +done + +# Count the results +male_count=$(find "$OUTPUT_DIR/m" -type f -name "*.json" | wc -l) +female_count=$(find "$OUTPUT_DIR/f" -type f -name "*.json" | wc -l) + +# Report results +echo "Patient generation complete. Results saved to $OUTPUT_DIR" +echo "Male patients: $male_count" +echo "Female patients: $female_count" + +# Clean up temp directory +rm -rf "$TEMP_DIR" diff --git a/main.nf b/main.nf index d3ce336..2ef2c10 100644 --- a/main.nf +++ b/main.nf @@ -21,7 +21,7 @@ if (envFile.exists()) { // Default parameters params.disease_name = null // Disease name to generate patients for -params.output_dir = "output" // Output directory +params.output_dir = "/mnt/OmicNAS/private/old/olamide/synthea/output/new" // Output directory params.modules_dir = "src/main/resources/modules" // Directory for module files params.population = 100 // Number of patients to generate params.gender = 0.5 // Decimal representing proportion female (0.0-1.0) @@ -125,4 +125,4 @@ workflow { // Then generate patients generatePatients(params.disease_name, checkAndGetModule.out.module_file) -} \ No newline at end of file +} diff --git a/module_generator/anthropic_patch.py b/module_generator/anthropic_patch.py new file mode 100644 index 0000000..cef2c29 --- /dev/null +++ b/module_generator/anthropic_patch.py @@ -0,0 +1,23 @@ +""" +Patch for Anthropic client to fix 'proxies' parameter issue +Place this file in the same directory as module_generator.py +""" +import anthropic +import inspect + +# Store the original __init__ method +original_init = anthropic.Client.__init__ + +# Define a new __init__ method that filters out problematic parameters +def patched_init(self, *args, **kwargs): + # Remove 'proxies' from kwargs if present + if 'proxies' in kwargs: + del kwargs['proxies'] + + # Call the original __init__ with filtered kwargs + original_init(self, *args, **kwargs) + +# Replace the original __init__ with our patched version +anthropic.Client.__init__ = patched_init + +print("Applied patch to fix Anthropic client proxies parameter issue") diff --git a/module_generator/simple_module_generator.py b/module_generator/simple_module_generator.py new file mode 100755 index 0000000..409532f --- /dev/null +++ b/module_generator/simple_module_generator.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +""" +Simple Module Generator for Synthea Nextflow Pipeline +Using Direct HTTP Requests to avoid client library issues +""" + +import os +import json +import argparse +import requests +from dotenv import load_dotenv + +def generate_module(disease_name, output_file): + """Generate a Synthea module for the specified disease""" + # Load API key from environment + load_dotenv() + api_key = os.getenv("ANTHROPIC_API_KEY") + + if not api_key: + raise ValueError("ANTHROPIC_API_KEY environment variable not set") + + print(f"Generating module for {disease_name}...") + + # Current Anthropic API endpoint and format (as of 2024) + url = "https://api.anthropic.com/v1/messages" + + headers = { + "Content-Type": "application/json", + "x-api-key": api_key, + "anthropic-version": "2023-06-01" + } + + system_prompt = """ + You are an expert in medical informatics and Synthea module creation. + Generate a complete, valid JSON module for the specified disease. + The module must follow Synthea's format conventions. + """ + + user_prompt = f""" + Create a complete Synthea module for {disease_name}. + + The module should include: + - Initial states for disease onset and progression + - Diagnostic procedures and criteria + - Treatment options and medication regimens + - Complications and their management + - Follow-up care protocols + + Return ONLY valid JSON that can be directly used in Synthea without any explanation or markdown. + """ + + data = { + "model": "claude-3-opus-20240229", + "system": system_prompt, + "messages": [ + { + "role": "user", + "content": user_prompt + } + ], + "max_tokens": 4000, + "temperature": 0.2 + } + + try: + # Make direct API request + response = requests.post(url, headers=headers, json=data) + + # Check for errors + if response.status_code != 200: + print(f"API request failed with status code {response.status_code}: {response.text}") + raise Exception(f"API request failed with status code {response.status_code}") + + # Parse response + result = response.json() + module_content = result["content"][0]["text"] + + # Extract the JSON part if wrapped in markdown + if "```json" in module_content: + module_content = module_content.split("```json")[1].split("```")[0].strip() + elif "```" in module_content: + module_content = module_content.split("```")[1].split("```")[0].strip() + + # Validate JSON + try: + module_json = json.loads(module_content) + with open(output_file, 'w') as f: + json.dump(module_json, f, indent=2) + print(f"Successfully generated module and saved to {output_file}") + return True + except json.JSONDecodeError as e: + print(f"Generated content is not valid JSON: {e}") + with open(f"{output_file}.raw", 'w') as f: + f.write(module_content) + print(f"Raw content saved to {output_file}.raw") + return False + + except Exception as e: + print(f"Error generating module: {e}") + if isinstance(e, requests.exceptions.RequestException): + print(f"Request error details: {e.response.text if hasattr(e, 'response') else 'No response details'}") + return False + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Generate Synthea disease module') + parser.add_argument('--disease', required=True, help='Disease name to generate module for') + parser.add_argument('--output', required=True, help='Output filename for the module') + + args = parser.parse_args() + + # Install required packages if needed + try: + import requests + except ImportError: + import subprocess + print("Installing required packages...") + subprocess.check_call(["pip", "install", "requests"]) + + success = generate_module(args.disease, args.output) + + if not success: + exit(1) diff --git a/nextflow.config b/nextflow.config index 819f8db..8a0c2b1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -7,11 +7,15 @@ manifest { def paramsJson = new File("$baseDir/params.json").text def paramsData = new groovy.json.JsonSlurper().parseText(paramsJson) +// Keep work directories for debugging +cleanup = false + // Merge with defaults params { + local_output_dir = "$baseDir/local_output" disease_name = paramsData.disease_name modules_dir = paramsData.modules_dir ?: "/Users/richman/workspace/synthea-alldiseases/modules" - output_dir = paramsData.output_dir ?: "output" + output_dir = paramsData.output_dir ?: "/mnt/OmicNAS/private/old/olamide/synthea/output/new" population = paramsData.population ?: 100 gender = paramsData.gender ?: 0.5 min_age = paramsData.min_age ?: 0 @@ -19,7 +23,6 @@ params { seed = paramsData.seed generate_patients = paramsData.generate_patients ?: true publish_dir = paramsData.publish_dir ?: "published_output" - // Additional params max_cost = 5.0 timeout = 300 @@ -30,7 +33,7 @@ params { docker { enabled = true - runOptions = "-v $baseDir/$params.modules_dir:/app/src/main/resources/modules -v $baseDir/src/main/python:/app/src/main/python -v $baseDir/src/main/resources:/app/src/main/resources -v $baseDir/.env:/app/.env" + runOptions = "-v $baseDir/$params.modules_dir:/app/src/main/resources/modules -v $baseDir/src/main/python:/app/src/main/python -v $baseDir/src/main/resources:/app/src/main/resources -v $baseDir/.env:/app/.env -v $baseDir/module_generator:/app/module_generator -v $baseDir/modules:/app/modules" } process { diff --git a/params.json b/params.json index eeaaaf3..991057a 100644 --- a/params.json +++ b/params.json @@ -1,7 +1,7 @@ { "disease_name": "Parkinson's Disease", "modules_dir": "modules", - "output_dir": "output", + "output_dir": "/mnt/OmicNAS/private/old/olamide/synthea/output/new", "generate_patients": true, "population": 10, "gender": 0.5, diff --git a/simple.nf b/simple.nf index 3681f74..82a8d94 100644 --- a/simple.nf +++ b/simple.nf @@ -4,7 +4,7 @@ nextflow.enable.dsl=2 // Default parameters params.disease_name = "Diabetes" // Default disease name -params.output_dir = "output" // Output directory +params.output_dir = "/mnt/OmicNAS/private/old/olamide/synthea/output/new" // Output directory params.modules_dir = "modules" // Directory for module files // Process to generate synthetic patients @@ -74,4 +74,4 @@ workflow { // Generate patients generatePatients(moduleChannel) -} \ No newline at end of file +} diff --git a/src/main/resources/synthea.properties b/src/main/resources/synthea.properties new file mode 100644 index 0000000..95a333a --- /dev/null +++ b/src/main/resources/synthea.properties @@ -0,0 +1,5 @@ +exporter.fhir.export = true +exporter.hospital.fhir.export = true +exporter.practitioner.fhir.export = true +generate.timestep = 7 +generate.append_numbers_to_person_names = true diff --git a/test.nf b/test.nf new file mode 100644 index 0000000..dfa0df5 --- /dev/null +++ b/test.nf @@ -0,0 +1,141 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl=2 + +/* + * Synthea Disease Module Generator Pipeline + * + * A Nextflow pipeline to generate and manage Synthea disease modules + */ + +// Load API key from .env file if it exists +def envFile = file('.env') +if (envFile.exists()) { + envFile.eachLine { line -> + def (key, value) = line.tokenize('=') + if (key && value && key.trim() == 'ANTHROPIC_API_KEY') { + params.anthropic_api_key = value.trim() + } + } +} + +// Default parameters +params.disease_name = null // Disease name to generate patients for +params.output_dir = "/mnt/OmicNAS/private/old/olamide/synthea/output/new" // Output directory +params.modules_dir = "src/main/resources/modules" // Directory for module files +params.population = 100 // Number of patients to generate +params.gender = 0.5 // Decimal representing proportion female (0.0-1.0) +params.min_age = 0 // Minimum age of generated patients +params.max_age = 90 // Maximum age of generated patients +params.seed = null // Random seed for reproducibility +params.help = false // Show help message + +// Show help message +if (params.help) { + log.info """ + Synthea Patient Generator + ======================== + Usage: nextflow run main.nf --disease_name "Disease Name" + + Required Arguments: + --disease_name Disease name to generate patients for + + Optional Arguments: + --modules_dir Module directory (default: modules) + --output_dir Output directory (default: output) + --population Number of patients (default: 100) + --gender Gender ratio - female proportion 0.0-1.0 (default: 0.5) + --min_age Minimum age (default: 0) + --max_age Maximum age (default: 90) + --seed Random seed (default: random) + """ + exit 0 +} + +// Validate required parameters +if (!params.disease_name && !params.help) { + error "Disease name is required. Please specify with --disease_name" +} + +// Process to check if module exists and generate it if needed +process checkAndGetModule { + container 'synthea-module-generator' + publishDir "${params.modules_dir}", mode: 'copy' + + input: + val diseaseName + + output: + path "*.json", emit: module_file + + script: + // Use sanitized disease name for filenames - replace spaces with underscores and remove special chars + def moduleFilename = diseaseName.toLowerCase().replaceAll(' ', '_').replaceAll('[^a-z0-9_]', '') + '.json' + def fullPath = "/app/src/main/resources/modules/${moduleFilename}" + """ + echo "Looking for module at ${fullPath}" + if [ -f "${fullPath}" ]; then + echo "Module exists, copying..." + cp "${fullPath}" . + else + echo "Module not found, generating..." + # Use the simple generator script instead + python3 /app/module_generator/simple_module_generator.py --disease "${diseaseName}" --output "${moduleFilename}" + if [ -f "${moduleFilename}" ]; then + echo "Successfully generated module" + else + echo "Error: Failed to generate module" + exit 1 + fi + fi + """ +} + +// Process to generate synthetic patients +process generatePatients { + container 'synthea-module-generator' + publishDir "${params.output_dir}/${diseaseName.toLowerCase().replaceAll(' ', '_').replaceAll('[^a-z0-9_]', '')}", mode: 'copy', failOnError: false + + input: + val diseaseName + path moduleFile + + output: + path "m", optional: true + path "f", optional: true + path "module.json", optional: true + path "README.txt", optional: true + + script: + def moduleBasename = diseaseName.toLowerCase().replaceAll(' ', '_').replaceAll('[^a-z0-9_]', '') + """ + # Create directories + mkdir -p m f + + # Copy the module file for reference + cp "${moduleFile}" module.json + + # Create a README file with instructions + cat > README.txt << EOF +This directory contains the module for ${diseaseName}. +To generate patients, run: +./generate_patients.sh ${moduleBasename} ${params.output_dir}/${moduleBasename}/patients 20 +EOF + + # Create marker files + touch m/.keep + touch f/.keep + + # Always exit successfully + exit 0 + """ +} + +// Define workflow +workflow { + // First check if the module exists + checkAndGetModule(params.disease_name) + + // Then generate patients + generatePatients(params.disease_name, checkAndGetModule.out.module_file) +} diff --git a/test_synthea.sh b/test_synthea.sh new file mode 100755 index 0000000..fde8b3d --- /dev/null +++ b/test_synthea.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Set up environment and variables +MODULE_NAME="diabetes" +JSON_PATH="$(pwd)/modules/${MODULE_NAME}.json" + +# Make sure we have the module file +if [ ! -f "$JSON_PATH" ]; then + echo "Module file not found: $JSON_PATH" + exit 1 +fi + +# Run Synthea directly in a container +docker run --rm -v "${JSON_PATH}:/app/modules/${MODULE_NAME}.json" \ + -v "$(pwd)/test_output:/app/output" \ + synthea-module-generator \ + bash -c "cd /app && ./run_synthea -p 10 -g B -m ${MODULE_NAME} -a 0-90 -s 12345 | tee /app/output/synthea_run.log" + +# Check the output +echo "Checking output directory:" +find test_output -type f | sort