Files

262 lines
9.2 KiB
Bash
Executable File

#!/bin/bash
# Synthea-All-Diseases Repository Cleanup Script
#
# Strips the working tree down to the files the Synthea module generator
# needs. Everything worth keeping is first staged under backup/ and later
# copied back into a simplified layout.
echo "Starting repository cleanup..."

# Staging area used by the rest of the script:
#   backup/                   root-level files
#   backup/scripts/           utility Python scripts
#   backup/modules/           disease modules
#   backup/module_generator/  module generator scripts
for staging_dir in backup backup/scripts backup/modules backup/module_generator; do
  mkdir -p "$staging_dir"
done
# 1. Keep essential Nextflow files, remove others
echo "Handling Nextflow files..."

#######################################
# Copy or move a file into backup/ when it exists; otherwise report and skip.
# Guarding the call keeps a second run of this script (after the file has
# already been moved away) from printing "No such file" errors.
# Arguments: $1 - command to apply (cp or mv)
#            $2 - file name, relative to the current directory
# Outputs:   a "skipping" note on stdout when the file is absent
# Returns:   status of the cp/mv, or 0 when the file is absent
#######################################
_stash_if_present() {
  local action=$1 file=$2
  if [ -f "$file" ]; then
    "$action" "$file" backup/
  else
    echo " Not found, skipping: $file"
  fi
}

# main.nf stays in place; only a copy is staged.
_stash_if_present cp main.nf

# Superseded / experimental workflow files.
rm -f build_synthea.nf synthea_module_generator_old.nf minimal_test.nf fresh_synthea.nf simple_synthea.nf test_workflow.nf new_synthea.nf minimal_working.nf

# Keep synthea_module_generator.nf around for reference but not used
_stash_if_present mv synthea_module_generator.nf
# 2. Keep essential Python scripts, Docker files, and configuration files
echo "Keeping essential files..."

#######################################
# Copy the CONTENTS of a directory into a destination directory.
# Using "$src/." (rather than "$src") avoids nesting the source directory
# inside an already-existing destination: src/main/python must end up as
# backup/module_generator/*, not backup/module_generator/python/*, because
# the Dockerfile/main.nf rewrites later in this script map src/main/python/
# straight onto module_generator/. It also succeeds on an empty directory,
# unlike a "$src"/* glob.
# Arguments: $1 - source directory (skipped silently when it does not exist)
#            $2 - destination directory (created if needed)
# Returns:   0 when the source is absent, otherwise the status of cp
#######################################
_backup_dir_contents() {
  local src_dir=$1 dest_dir=$2
  [ -d "$src_dir" ] || return 0
  mkdir -p "$dest_dir"
  cp -r "$src_dir"/. "$dest_dir"/
}

_backup_dir_contents src/main/python backup/module_generator
_backup_dir_contents src/main/resources/modules backup/modules

# Root-level files the cleaned repository is expected to have.
cp Dockerfile docker-compose.yml CLAUDE.md .env.example nextflow.config README.md backup/
# Optional extra documentation: absence is not an error.
cp DOCKER_README.md SYNTHEA_GUIDE.md backup/ 2>/dev/null || : # Keep additional README files

# Backup .env file if it exists
if [ -f ".env" ]; then
  cp .env backup/
fi
# 3. Back up all Python scripts in the root directory
echo "Backing up Python scripts..."

# Key scripts are restored to the repository root after cleanup; every other
# root-level *.py is restored into scripts/.
# The only key script we'll keep in the root is check_condition_structure.py for convenience
key_scripts=("check_condition_structure.py")

#######################################
# Tell whether a file name is one of the key scripts.
# Compares element by element so the match is exact; a substring test on the
# space-joined array would misclassify names when an entry contains spaces.
# Globals:   key_scripts (read)
# Arguments: $1 - file name to look up
# Returns:   0 if the name is in key_scripts, 1 otherwise
#######################################
_is_key_script() {
  local candidate=$1 key
  for key in "${key_scripts[@]}"; do
    if [ "$key" = "$candidate" ]; then
      return 0
    fi
  done
  return 1
}

# Key scripts are staged directly under backup/.
for script in "${key_scripts[@]}"; do
  if [ -f "$script" ]; then
    cp "$script" backup/
    echo " Backed up key script for root: $script"
  fi
done

# All other Python scripts - these will be moved to scripts/ directory.
# The -f test also covers the case where the glob matches nothing and the
# literal pattern "*.py" is left in place.
for script in *.py; do
  if [ -f "$script" ] && ! _is_key_script "$script"; then
    cp "$script" backup/scripts/
    echo " Backed up utility script: $script"
  fi
done
# 4. Drop every piece of Synthea itself - the Docker build clones it fresh
echo "Removing ALL Synthea source code..."
echo "This will be cloned during Docker build based on Dockerfile..."

# The whole src/ tree goes (what is still needed was staged under backup/),
# together with Synthea's build and run-time output directories.
synthea_dirs=(src/ build/ output/ simulator/ lib/ logs/)
rm -rf "${synthea_dirs[@]}"

# Synthea launchers and any jar files in the repository root.
rm -f run_synthea run_synthea.bat synthea *.jar
# 5. Clear out experimental and temporary files from the repository root
echo "Removing experimental and temporary files..."

# One rm call: log/text output first, then one-off helper scripts, then
# stray launcher and Finder metadata. -f keeps missing names silent.
rm -f \
  *.log *.txt trace.txt \
  error_output.txt generate_module.sh run_synthetic_data_generation.sh \
  build_docker.sh entrypoint.sh run_module_generator.sh generate_samples.sh test_run.sh \
  run_flexporter .DS_Store
# 6. Remove Gradle tooling, Nextflow run state and other leftovers
echo "Removing additional unnecessary files..."

# Note: .git, .gitignore and .github are deliberately left alone so version
# control keeps working.

# Directories and caches (Gradle wrapper data, Nextflow work state, archives).
rm -rf gradle/ .gradle/ .nextflow/ .nextflow.log* nextflow-*.zip

# Individual files: Gradle entry points, documentation that ships with
# Synthea anyway (LICENSE files are kept), and editor/backup leftovers.
rm -f \
  build.gradle gradlew gradlew.bat settings.gradle \
  CODE_OF_CONDUCT.md NOTICE \
  *.bak *~
# 7. Lay out the simplified top-level directory structure
echo "Creating clean directory structure..."

# modules/           disease modules
# module_generator/  module generator scripts (renamed from python)
# scripts/           utility scripts
mkdir -p modules module_generator scripts
# 8. Copy the staged content back into the new layout
echo "Restoring files to cleaned structure..."

# In the three directory restores below, cp errors are silenced and ignored
# on purpose (for instance when a staging directory is empty and the glob
# matches nothing); the progress line is printed either way.

# Disease modules -> modules/
if [[ -d backup/modules ]]; then
  cp -r backup/modules/* modules/ 2>/dev/null || true
  echo " Restored modules to modules/ directory"
fi

# Module generator scripts -> module_generator/
if [[ -d backup/module_generator ]]; then
  cp -r backup/module_generator/* module_generator/ 2>/dev/null || true
  echo " Restored core generator scripts to module_generator/ directory"
fi

# Key Python scripts -> repository root (only those that were staged)
for key_script in "${key_scripts[@]}"; do
  [[ -f "backup/$key_script" ]] || continue
  cp "backup/$key_script" ./
  echo " Restored key script to root: $key_script"
done

# Utility Python scripts -> scripts/
if [[ -d backup/scripts ]]; then
  cp backup/scripts/* scripts/ 2>/dev/null || true
  echo " Restored utility scripts to scripts/ directory"
fi
# Put the staged root-level files back; a file that was never staged is
# skipped quietly (errors suppressed, failure ignored).
for root_file in main.nf Dockerfile docker-compose.yml CLAUDE.md nextflow.config .env.example README.md; do
  cp "backup/$root_file" ./ 2>/dev/null || true
done

# Optional documentation is restored only when a staged copy exists.
for optional_doc in DOCKER_README.md SYNTHEA_GUIDE.md; do
  if [[ -f "backup/$optional_doc" ]]; then
    cp "backup/$optional_doc" ./
  fi
done

# Same for the local environment file, with a progress note.
if [[ -f backup/.env ]]; then
  cp backup/.env ./
  echo " Restored .env file"
fi
# 9. Point the Dockerfile at the new directory layout
echo "Updating Dockerfile references if needed..."
if [[ -f Dockerfile ]]; then
  # All rewrites run in a single sed call; the expressions are applied to
  # each line in the order listed, so the more specific COPY patterns run
  # before the generic path patterns. -i.bak leaves a Dockerfile.bak copy,
  # which is deleted once the edit is done.
  sed -i.bak \
    -e 's|COPY src/main/python/|COPY module_generator/|g' \
    -e 's|COPY python/|COPY module_generator/|g' \
    -e 's|COPY src/main/resources/modules/|COPY modules/|g' \
    -e 's|src/main/python/|module_generator/|g' \
    -e 's|src/main/resources/modules|modules|g' \
    -e 's|/app/src/main/python/|/app/module_generator/|g' \
    Dockerfile
  rm -f Dockerfile.bak
fi
# 10. Point main.nf at the new directory layout
echo "Updating main.nf references if needed..."
if [[ -f main.nf ]]; then
  # One sed call, expressions applied per line in the order given: the two
  # src/main/... paths are rewritten first, then any remaining "python/".
  # NOTE(review): the last pattern matches every "python/" substring in the
  # file, not only the old generator directory - confirm that is intended.
  # -i.bak leaves a main.nf.bak copy, which is deleted afterwards.
  sed -i.bak \
    -e 's|src/main/resources/modules|modules|g' \
    -e 's|src/main/python|module_generator|g' \
    -e 's|python/|module_generator/|g' \
    main.nf
  rm -f main.nf.bak
fi
# 11. Write a short README into the scripts directory
echo "Creating README for scripts directory..."
# The quoted delimiter makes the text literal: backticks and any $ are
# written as-is. An existing scripts/README.md is overwritten.
cat << 'SCRIPTS_README_EOF' > scripts/README.md
# Utility Scripts
This directory contains utility scripts for working with Synthea modules:
- `analyze_patient_data.py` - Analyzes patient data generated by Synthea
- `check_json.py` - Validates JSON structure of module files
- `validate_module.py` - Performs comprehensive validation of modules
- `test_module_exists.py` - Checks if a module exists for a given disease
- `run_module.py` - Script to run a specific module with Synthea
- Other utility scripts for module generation and testing
These scripts complement the core module generator scripts found in the `module_generator/` directory.
SCRIPTS_README_EOF
# Replace the top-level README.md with instructions for the cleaned repository
echo "Updating README..."
# The quoted delimiter makes the text literal (no expansion of backticks
# or $). The previous README.md content is overwritten.
cat << 'ROOT_README_EOF' > README.md
# Synthea All Diseases
This repository contains a workflow for generating Synthea disease modules using Claude AI and generating synthetic patient data.
## Repository Structure
- `module_generator/` - Core module generation scripts
- `modules/` - Generated disease modules
- `scripts/` - Utility Python scripts and tools
- `main.nf` - Nextflow workflow for module generation and patient data generation
- `Dockerfile` and `docker-compose.yml` - Docker configuration
## Key Files
- `check_condition_structure.py` - Validates condition structure in modules
- `main.nf` - Main workflow file for generating modules and patient data
## Running the Pipeline
### Prerequisites
- Docker and Docker Compose
- Nextflow
- Anthropic API key (for Claude AI)
### Setup
1. Clone this repository
2. Copy `.env.example` to `.env` and add your Anthropic API key
3. Run with Docker Compose:
```
docker-compose up
```
### Running the Workflow
```
nextflow run main.nf --disease_name "Disease Name" [OPTIONS]
```
For a full list of options, run:
```
nextflow run main.nf --help
```
## Key Features
- Generate modules for diseases that don't exist in Synthea
- Validate condition structure in generated modules
- Generate synthetic patient data using the modules
- Analyze generated patient data
## Documentation
- See `CLAUDE.md` for additional development guidelines
- See `DOCKER_README.md` for Docker setup and usage
- See `SYNTHEA_GUIDE.md` for detailed usage examples and scenarios
- See `scripts/README.md` for information about utility scripts
## License
This project uses the same license as Synthea.
ROOT_README_EOF
# 12. docker-compose.yml: nothing to rewrite, just report that
echo "Updating docker-compose.yml references if needed..."
if [[ -f docker-compose.yml ]]; then
  # It relies on relative paths and volume mounts, so the new layout does
  # not require any edit.
  echo " No changes needed for docker-compose.yml"
fi

# Closing summary, printed verbatim (quoted delimiter: no expansion).
cat << 'SUMMARY_EOF'
Cleanup complete! All essential files have been kept, and unnecessary files have been removed.
Repository structure has been simplified to:
 - module_generator/ (core module generation scripts)
 - modules/ (disease modules)
 - scripts/ (utility scripts)
 - check_condition_structure.py (in root for easy access)

The following important files were preserved:
 - .git/ (version control repository)
 - .gitignore (version control configuration)
 - .env (environment configuration if it existed)
 - .github/ (GitHub configuration if it existed)

Optional: Remove the backup directory after verifying everything works: rm -rf backup
SUMMARY_EOF