#!/bin/bash
# Synthea-All-Diseases Repository Cleanup Script
#
# This script removes unnecessary files and keeps only the essential ones
# for the Synthea module generator.
#
# Phases implemented in this part of the script:
#   1-3. copy everything worth keeping into ./backup
#   4-6. delete Synthea sources, build output and experimental files
#   7.   create the clean directory skeleton (modules/, module_generator/,
#        scripts/)
#
# The script operates on the current working directory and is destructive
# (rm -rf on src/, build/, ...).

# Print an error message to stderr and abort the script.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Return 0 when $1 is exactly one of the entries of the key_scripts array.
# Globals:   key_scripts (read)
# Arguments: $1 - file name to look up
is_key_script() {
  local candidate=$1
  local key
  for key in "${key_scripts[@]}"; do
    if [[ "$candidate" == "$key" ]]; then
      return 0
    fi
  done
  return 1
}

echo "Starting repository cleanup..."

# Create backup directory tree:
#   backup/scripts           - Python utility scripts
#   backup/modules           - disease modules
#   backup/module_generator  - module generator scripts
mkdir -p backup backup/scripts backup/modules backup/module_generator \
  || die "could not create the backup directory tree"

# 1. Keep essential Nextflow files, remove others
echo "Handling Nextflow files..."
cp main.nf backup/
rm -f build_synthea.nf synthea_module_generator_old.nf minimal_test.nf \
  fresh_synthea.nf simple_synthea.nf test_workflow.nf new_synthea.nf \
  minimal_working.nf
# Keep synthea_module_generator.nf around for reference but not used
mv synthea_module_generator.nf backup/

# 2. Keep essential Python scripts, Docker files, and configuration files
echo "Keeping essential files..."
if [ -d "src/main/python" ]; then
  # Copy the *contents* of src/main/python (note the trailing "/."), not the
  # directory itself: backup/module_generator already exists, so copying the
  # directory would nest everything under module_generator/python/.
  # src/ is deleted in step 4, so a failed copy must stop the script here.
  cp -r src/main/python/. backup/module_generator/ \
    || die "backup of src/main/python failed; nothing has been deleted from src/"
fi
if [ -d "src/main/resources/modules" ]; then
  # "/." also succeeds when the directory is empty, unlike an unmatched "/*".
  cp -r src/main/resources/modules/. backup/modules/ \
    || die "backup of src/main/resources/modules failed; nothing has been deleted from src/"
fi
cp Dockerfile docker-compose.yml CLAUDE.md .env.example nextflow.config README.md backup/
# Keep additional README files (optional, so errors are deliberately ignored)
cp DOCKER_README.md SYNTHEA_GUIDE.md backup/ 2>/dev/null || :

# Backup .env file if it exists
if [ -f ".env" ]; then
  cp .env backup/
fi

# 3. Back up all Python scripts in the root directory
echo "Backing up Python scripts..."
# Key scripts - the only key script we'll keep in the root is
# check_condition_structure.py for convenience
key_scripts=("check_condition_structure.py")
for script in "${key_scripts[@]}"; do
  if [ -f "$script" ]; then
    cp -- "$script" backup/
    echo " Backed up key script for root: $script"
  fi
done

# All other Python scripts are copied to backup/scripts/.
# NOTE(review): nothing in this part of the script removes the originals
# from the root directory - confirm whether they should be deleted.
for script in *.py; do
  # The -f test also skips the literal "*.py" left behind by an unmatched glob.
  if [ -f "$script" ] && ! is_key_script "$script"; then
    cp -- "$script" backup/scripts/
    echo " Backed up utility script: $script"
  fi
done

# 4. Remove ALL Synthea code - we'll clone it in Docker
echo "Removing ALL Synthea source code..."
echo "This will be cloned during Docker build based on Dockerfile..."

# Remove the entire src directory - we'll recreate what we need.
# Only reached when the src/ backups in step 2 succeeded (or were not needed).
rm -rf src/

# Remove any Synthea build files/directories
rm -rf build/ output/ simulator/ lib/ logs/

# Remove any Synthea run files
rm -f -- run_synthea run_synthea.bat synthea *.jar

# 5. Remove other experimental or temporary files
echo "Removing experimental and temporary files..."
rm -f -- *.log *.txt trace.txt
rm -f error_output.txt generate_module.sh run_synthetic_data_generation.sh
rm -f build_docker.sh entrypoint.sh run_module_generator.sh generate_samples.sh test_run.sh
rm -f run_flexporter .DS_Store

# 6. Remove gradle files and other unnecessary files
echo "Removing additional unnecessary files..."
rm -f build.gradle gradlew gradlew.bat settings.gradle
rm -rf gradle/ .gradle/
# Note: Keeping .git, .gitignore, .github for version control
rm -rf -- .nextflow/ .nextflow.log* nextflow-*.zip
# Keep LICENSE files but remove other documentation that will be included from Synthea
rm -f CODE_OF_CONDUCT.md NOTICE
rm -f -- *.bak *~

# 7. Create clean directory structure for the repository
echo "Creating clean directory structure..."
mkdir -p modules           # Top-level modules directory
mkdir -p module_generator  # Module generator directory (renamed from python)
mkdir -p scripts           # Utility scripts directory

# 8. Restore files to the cleaned structure
echo "Restoring files to cleaned structure..."
# --- Restore phase: copy the saved files from backup/ into the new layout ---

# Disease modules -> modules/ (copy errors are deliberately ignored)
if [[ -d backup/modules ]]; then
  cp -r backup/modules/* modules/ 2>/dev/null || true
  echo " Restored modules to modules/ directory"
fi

# Core generator scripts -> module_generator/
if [[ -d backup/module_generator ]]; then
  cp -r backup/module_generator/* module_generator/ 2>/dev/null || true
  echo " Restored core generator scripts to module_generator/ directory"
fi

# Key Python scripts go back to the repository root
for script in "${key_scripts[@]}"; do
  [[ -f "backup/$script" ]] || continue
  cp "backup/$script" ./
  echo " Restored key script to root: $script"
done

# Utility Python scripts -> scripts/
if [[ -d backup/scripts ]]; then
  cp backup/scripts/* scripts/ 2>/dev/null || true
  echo " Restored utility scripts to scripts/ directory"
fi

# Root-level files; any that are missing from the backup are silently skipped
root_files=(main.nf Dockerfile docker-compose.yml CLAUDE.md nextflow.config .env.example README.md)
cp "${root_files[@]/#/backup/}" ./ 2>/dev/null || true

# Optional documentation files, restored only when they were backed up
for doc in DOCKER_README.md SYNTHEA_GUIDE.md; do
  if [[ -f "backup/$doc" ]]; then
    cp "backup/$doc" ./
  fi
done

# .env comes back only if one existed before the cleanup
if [[ -f backup/.env ]]; then
  cp backup/.env ./
  echo " Restored .env file"
fi

# 9. Point the Dockerfile at the new directory layout
echo "Updating Dockerfile references if needed..."
if [[ -f Dockerfile ]]; then
  # All rewrites run in one sed pass, in the same order as before.
  # -i.bak keeps the in-place edit portable; the backup file is removed after.
  sed -i.bak \
    -e 's|COPY src/main/python/|COPY module_generator/|g' \
    -e 's|COPY python/|COPY module_generator/|g' \
    -e 's|COPY src/main/resources/modules/|COPY modules/|g' \
    -e 's|src/main/python/|module_generator/|g' \
    -e 's|src/main/resources/modules|modules|g' \
    -e 's|/app/src/main/python/|/app/module_generator/|g' \
    Dockerfile
  rm -f Dockerfile.bak
fi

# 10. Point main.nf at the new directory layout
echo "Updating main.nf references if needed..."
# Rewrite old source-tree paths inside main.nf so they match the new layout.
if [[ -f main.nf ]]; then
  # Expressions are applied in order on every line; -i.bak keeps the in-place
  # edit portable and the temporary backup is removed afterwards.
  sed -i.bak \
    -e 's|src/main/resources/modules|modules|g' \
    -e 's|src/main/python|module_generator|g' \
    -e 's|python/|module_generator/|g' \
    main.nf
  rm -f main.nf.bak
fi

# 11. Write a short README describing the scripts/ directory
echo "Creating README for scripts directory..."
cat << 'EOF' > scripts/README.md
# Utility Scripts

This directory contains utility scripts for working with Synthea modules:

- `analyze_patient_data.py` - Analyzes patient data generated by Synthea
- `check_json.py` - Validates JSON structure of module files
- `validate_module.py` - Performs comprehensive validation of modules
- `test_module_exists.py` - Checks if a module exists for a given disease
- `run_module.py` - Script to run a specific module with Synthea
- Other utility scripts for module generation and testing

These scripts complement the core module generator scripts found in the `module_generator/` directory.
EOF

# Replace the top-level README.md with instructions for the cleaned repo
echo "Updating README..."
cat << 'EOF' > README.md
# Synthea All Diseases

This repository contains a workflow for generating Synthea disease modules using Claude AI and generating synthetic patient data.

## Repository Structure

- `module_generator/` - Core module generation scripts
- `modules/` - Generated disease modules
- `scripts/` - Utility Python scripts and tools
- `main.nf` - Nextflow workflow for module generation and patient data generation
- `Dockerfile` and `docker-compose.yml` - Docker configuration

## Key Files

- `check_condition_structure.py` - Validates condition structure in modules
- `main.nf` - Main workflow file for generating modules and patient data

## Running the Pipeline

### Prerequisites

- Docker and Docker Compose
- Nextflow
- Anthropic API key (for Claude AI)

### Setup

1. Clone this repository
2. Copy `.env.example` to `.env` and add your Anthropic API key
3. Run with Docker Compose:
   ```
   docker-compose up
   ```

### Running the Workflow

```
nextflow run main.nf --disease_name "Disease Name" [OPTIONS]
```

For a full list of options, run:

```
nextflow run main.nf --help
```

## Key Features

- Generate modules for diseases that don't exist in Synthea
- Validate condition structure in generated modules
- Generate synthetic patient data using the modules
- Analyze generated patient data

## Documentation

- See `CLAUDE.md` for additional development guidelines
- See `DOCKER_README.md` for Docker setup and usage
- See `SYNTHEA_GUIDE.md` for detailed usage examples and scenarios
- See `scripts/README.md` for information about utility scripts

## License

This project uses the same license as Synthea.
EOF

# 12. docker-compose.yml uses relative paths and volume mounts, so it is
# left untouched; only a notice is printed when the file exists.
echo "Updating docker-compose.yml references if needed..."
[[ -f docker-compose.yml ]] && echo " No changes needed for docker-compose.yml"

# Final summary for the operator (quoted delimiter: printed verbatim)
cat << 'EOF'
Cleanup complete! All essential files have been kept, and unnecessary files have been removed.
Repository structure has been simplified to:
 - module_generator/ (core module generation scripts)
 - modules/ (disease modules)
 - scripts/ (utility scripts)
 - check_condition_structure.py (in root for easy access)

The following important files were preserved:
 - .git/ (version control repository)
 - .gitignore (version control configuration)
 - .env (environment configuration if it existed)
 - .github/ (GitHub configuration if it existed)

Optional: Remove the backup directory after verifying everything works: rm -rf backup
EOF