Initial commit: FlowDock pipeline configured for WES execution
Some checks failed
Code Quality Main / code-quality (push) Has been cancelled
Release Drafter / update_release_draft (push) Has been cancelled
Tests / run_tests_ubuntu (ubuntu-latest, 3.10) (push) Has been cancelled
Tests / run_tests_ubuntu (ubuntu-latest, 3.8) (push) Has been cancelled
Tests / run_tests_ubuntu (ubuntu-latest, 3.9) (push) Has been cancelled
Tests / run_tests_macos (macos-latest, 3.10) (push) Has been cancelled
Tests / run_tests_macos (macos-latest, 3.8) (push) Has been cancelled
Tests / run_tests_macos (macos-latest, 3.9) (push) Has been cancelled
Tests / run_tests_windows (windows-latest, 3.10) (push) Has been cancelled
Tests / run_tests_windows (windows-latest, 3.8) (push) Has been cancelled
Tests / run_tests_windows (windows-latest, 3.9) (push) Has been cancelled
Tests / code-coverage (push) Has been cancelled
Some checks failed
Code Quality Main / code-quality (push) Has been cancelled
Release Drafter / update_release_draft (push) Has been cancelled
Tests / run_tests_ubuntu (ubuntu-latest, 3.10) (push) Has been cancelled
Tests / run_tests_ubuntu (ubuntu-latest, 3.8) (push) Has been cancelled
Tests / run_tests_ubuntu (ubuntu-latest, 3.9) (push) Has been cancelled
Tests / run_tests_macos (macos-latest, 3.10) (push) Has been cancelled
Tests / run_tests_macos (macos-latest, 3.8) (push) Has been cancelled
Tests / run_tests_macos (macos-latest, 3.9) (push) Has been cancelled
Tests / run_tests_windows (windows-latest, 3.10) (push) Has been cancelled
Tests / run_tests_windows (windows-latest, 3.8) (push) Has been cancelled
Tests / run_tests_windows (windows-latest, 3.9) (push) Has been cancelled
Tests / code-coverage (push) Has been cancelled
This commit is contained in:
64
scripts/esmfold_prior_plinder_finetuning.sh
Normal file
64
scripts/esmfold_prior_plinder_finetuning.sh
Normal file
@@ -0,0 +1,64 @@
|
||||
#!/bin/bash -l
# Fine-tune the ESMFold-prior FlowDock model on the PLINDER dataset,
# resuming from an existing checkpoint, as a single-GPU SLURM job.
######################### Batch Headers #########################
#SBATCH --partition chengji-lab-gpu                                          # NOTE: use reserved partition `chengji-lab-gpu` to use reserved A100 or H100 GPUs
#SBATCH --account chengji-lab                                                # NOTE: this must be specified to use the reserved partition above
#SBATCH --nodes=1                                                            # NOTE: this needs to match Lightning's `Trainer(num_nodes=...)`
#SBATCH --gres gpu:1                                                         # request A100/H100 GPU resource(s)
#SBATCH --ntasks-per-node=1                                                  # NOTE: this needs to be `1` on SLURM clusters when using Lightning's `ddp_spawn` strategy`; otherwise, set to match Lightning's quantity of `Trainer(devices=...)`
#SBATCH --mem=59G                                                            # NOTE: use `--mem=0` to request all memory "available" on the assigned node
#SBATCH -t 2-00:00:00                                                        # time limit for the job (up to 2 days: `2-00:00:00`)
#SBATCH -J esmfold_prior_plinder_finetuning                                  # job name
#SBATCH --output=R-%x.%j.out                                                 # output log file
#SBATCH --error=R-%x.%j.err                                                  # error log file

# Abort immediately if environment setup fails (e.g., `module load` or
# `conda activate` errors), instead of silently launching training in a
# broken environment.
set -euo pipefail

module purge
module load cuda/11.8.0_gcc_9.5.0

# determine location of the project directory
use_private_project_dir=false # NOTE: customize as needed
if [ "$use_private_project_dir" = true ]; then
    project_dir="/home/acmwhb/data/Repositories/Lab_Repositories/FlowDock"
else
    project_dir="/cluster/pixstor/chengji-lab/acmwhb/Repositories/Lab_Repositories/FlowDock"
fi

# shellcheck source=/dev/null
source /cluster/pixstor/chengji-lab/acmwhb/miniforge3/etc/profile.d/conda.sh
conda activate "$project_dir"/FlowDock/

# Reference Conda system libraries
# NOTE: `${LD_LIBRARY_PATH:-}` keeps this safe under `set -u` when the
# variable is not already defined on the compute node.
export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:${LD_LIBRARY_PATH:-}"

echo "Calling flowdock/train.py!"
cd "$project_dir" || exit 1
srun python3 flowdock/train.py \
    callbacks.last_model_checkpoint.filename=null \
    callbacks.last_model_checkpoint.every_n_train_steps=200 \
    callbacks.last_model_checkpoint.every_n_epochs=null \
    ckpt_path="$(realpath 'logs/train/runs/2025-03-17_17-39-39/checkpoints/169-562000.ckpt')" \
    data=plinder \
    experiment='flowdock_fm' \
    environment=slurm \
    logger=wandb \
    logger.wandb.entity='bml-lab' \
    logger.wandb.group='FlowDock-FM' \
    +logger.wandb.name='2025-03-17_17:00:00-ESMFold-Prior-PLINDER-Finetuning' \
    +logger.wandb.id='1x2k5a79' \
    model.cfg.prior_type=esmfold \
    model.cfg.task.freeze_score_head=false \
    model.cfg.task.freeze_affinity=true \
    paths.output_dir="$(realpath 'logs/train/runs/2025-03-17_17-39-39')" \
    strategy=ddp \
    trainer=ddp \
    +trainer.accumulate_grad_batches=4 \
    trainer.devices=1 \
    trainer.num_nodes=1
echo "Finished calling flowdock/train.py!"

# NOTE: the following commands must be used to resume training from a checkpoint
# ckpt_path="$(realpath 'logs/train/runs/2025-03-17_17-39-39/checkpoints/169-562000.ckpt')" \
# paths.output_dir="$(realpath 'logs/train/runs/2025-03-17_17-39-39')" \

# NOTE: the following commands may be used to speed up training
# model.compile=false \
# +trainer.precision=bf16-mixed
64
scripts/esmfold_prior_tiered_training.sh
Normal file
64
scripts/esmfold_prior_tiered_training.sh
Normal file
@@ -0,0 +1,64 @@
|
||||
#!/bin/bash -l
# Tiered training of the ESMFold-prior FlowDock model (score head first,
# then affinity head) as a 4-GPU single-node SLURM job.
######################### Batch Headers #########################
#SBATCH --partition chengji-lab-gpu                                          # NOTE: use reserved partition `chengji-lab-gpu` to use reserved A100 or H100 GPUs
#SBATCH --account chengji-lab                                                # NOTE: this must be specified to use the reserved partition above
#SBATCH --nodes=1                                                            # NOTE: this needs to match Lightning's `Trainer(num_nodes=...)`
#SBATCH --gres gpu:H100:4                                                    # request H100 GPU resource(s)
#SBATCH --ntasks-per-node=4                                                  # NOTE: this needs to be `1` on SLURM clusters when using Lightning's `ddp_spawn` strategy`; otherwise, set to match Lightning's quantity of `Trainer(devices=...)`
#SBATCH --mem=0                                                              # NOTE: use `--mem=0` to request all memory "available" on the assigned node
#SBATCH -t 7-00:00:00                                                        # time limit for the job (up to 7 days: `7-00:00:00`)
#SBATCH -J esmfold_prior_tiered_training                                     # job name
#SBATCH --output=R-%x.%j.out                                                 # output log file
#SBATCH --error=R-%x.%j.err                                                  # error log file

# Abort immediately if environment setup fails, instead of silently
# launching training in a broken environment.
set -euo pipefail

# Stagger concurrently-submitted runs to avoid filesystem/W&B contention.
random_seconds=$(( (RANDOM % 100) + 1 ))
echo "Sleeping for $random_seconds seconds before starting run"
sleep "$random_seconds"

module purge
module load cuda/11.8.0_gcc_9.5.0

# determine location of the project directory
use_private_project_dir=false # NOTE: customize as needed
if [ "$use_private_project_dir" = true ]; then
    project_dir="/home/acmwhb/data/Repositories/Lab_Repositories/FlowDock"
else
    project_dir="/cluster/pixstor/chengji-lab/acmwhb/Repositories/Lab_Repositories/FlowDock"
fi

# shellcheck source=/dev/null
source /cluster/pixstor/chengji-lab/acmwhb/miniforge3/etc/profile.d/conda.sh
conda activate "$project_dir"/FlowDock/

# Reference Conda system libraries
# NOTE: `${LD_LIBRARY_PATH:-}` keeps this safe under `set -u`.
export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:${LD_LIBRARY_PATH:-}"

# NOTE: for tiered training, start by setting `model.cfg.task.freeze_score_head=false` and `model.cfg.task.freeze_affinity=true`,
# and once the model's score head has been trained to convergence, resume training with `model.cfg.task.freeze_score_head=true` and `model.cfg.task.freeze_affinity=false`

echo "Calling flowdock/train.py!"
cd "$project_dir" || exit 1
srun python3 flowdock/train.py \
    experiment='flowdock_fm' \
    environment=slurm \
    logger=wandb \
    logger.wandb.entity='bml-lab' \
    logger.wandb.group='FlowDock-FM' \
    +logger.wandb.name='2024-12-06_18:00:00-ESMFold-Prior-Tiered-Training' \
    +logger.wandb.id='z1u52tvj' \
    model.cfg.prior_type=esmfold \
    model.cfg.task.freeze_score_head=false \
    model.cfg.task.freeze_affinity=true \
    strategy=ddp \
    trainer=ddp \
    trainer.devices=4 \
    trainer.num_nodes=1
echo "Finished calling flowdock/train.py!"

# NOTE: the following commands must be used to resume training from a checkpoint
# ckpt_path="$(realpath 'logs/train/runs/2024-05-17_13-45-06/checkpoints/last.ckpt')" \
# paths.output_dir="$(realpath 'logs/train/runs/2024-05-17_13-45-06')" \

# NOTE: the following commands may be used to speed up training
# model.compile=false \
# +trainer.precision=bf16-mixed
61
scripts/esmfold_prior_training.sh
Normal file
61
scripts/esmfold_prior_training.sh
Normal file
@@ -0,0 +1,61 @@
|
||||
#!/bin/bash -l
# Joint (non-tiered) training of the ESMFold-prior FlowDock model as a
# 4-GPU single-node SLURM job.
######################### Batch Headers #########################
#SBATCH --partition chengji-lab-gpu                                          # NOTE: use reserved partition `chengji-lab-gpu` to use reserved A100 or H100 GPUs
#SBATCH --account chengji-lab                                                # NOTE: this must be specified to use the reserved partition above
#SBATCH --nodes=1                                                            # NOTE: this needs to match Lightning's `Trainer(num_nodes=...)`
#SBATCH --gres gpu:H100:4                                                    # request H100 GPU resource(s)
#SBATCH --ntasks-per-node=4                                                  # NOTE: this needs to be `1` on SLURM clusters when using Lightning's `ddp_spawn` strategy`; otherwise, set to match Lightning's quantity of `Trainer(devices=...)`
#SBATCH --mem=0                                                              # NOTE: use `--mem=0` to request all memory "available" on the assigned node
#SBATCH -t 7-00:00:00                                                        # time limit for the job (up to 7 days: `7-00:00:00`)
#SBATCH -J esmfold_prior_training                                            # job name
#SBATCH --output=R-%x.%j.out                                                 # output log file
#SBATCH --error=R-%x.%j.err                                                  # error log file

# Abort immediately if environment setup fails, instead of silently
# launching training in a broken environment.
set -euo pipefail

# Stagger concurrently-submitted runs to avoid filesystem/W&B contention.
random_seconds=$(( (RANDOM % 100) + 1 ))
echo "Sleeping for $random_seconds seconds before starting run"
sleep "$random_seconds"

module purge
module load cuda/11.8.0_gcc_9.5.0

# determine location of the project directory
use_private_project_dir=false # NOTE: customize as needed
if [ "$use_private_project_dir" = true ]; then
    project_dir="/home/acmwhb/data/Repositories/Lab_Repositories/FlowDock"
else
    project_dir="/cluster/pixstor/chengji-lab/acmwhb/Repositories/Lab_Repositories/FlowDock"
fi

# shellcheck source=/dev/null
source /cluster/pixstor/chengji-lab/acmwhb/miniforge3/etc/profile.d/conda.sh
conda activate "$project_dir"/FlowDock/

# Reference Conda system libraries
# NOTE: `${LD_LIBRARY_PATH:-}` keeps this safe under `set -u`.
export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:${LD_LIBRARY_PATH:-}"

echo "Calling flowdock/train.py!"
cd "$project_dir" || exit 1
srun python3 flowdock/train.py \
    experiment='flowdock_fm' \
    environment=slurm \
    logger=wandb \
    logger.wandb.entity='bml-lab' \
    logger.wandb.group='FlowDock-FM' \
    +logger.wandb.name='2024-12-06_18:00:00-ESMFold-Prior-Training' \
    +logger.wandb.id='z0u52tvj' \
    model.cfg.prior_type=esmfold \
    model.cfg.task.freeze_score_head=false \
    model.cfg.task.freeze_affinity=false \
    strategy=ddp \
    trainer=ddp \
    trainer.devices=4 \
    trainer.num_nodes=1
echo "Finished calling flowdock/train.py!"

# NOTE: the following commands must be used to resume training from a checkpoint
# ckpt_path="$(realpath 'logs/train/runs/2024-05-17_13-45-06/checkpoints/last.ckpt')" \
# paths.output_dir="$(realpath 'logs/train/runs/2024-05-17_13-45-06')" \

# NOTE: the following commands may be used to speed up training
# model.compile=false \
# +trainer.precision=bf16-mixed
61
scripts/harmonic_prior_training.sh
Normal file
61
scripts/harmonic_prior_training.sh
Normal file
@@ -0,0 +1,61 @@
|
||||
#!/bin/bash -l
# Train the harmonic-prior FlowDock model as a 4-GPU single-node SLURM job.
######################### Batch Headers #########################
#SBATCH --partition chengji-lab-gpu                                          # NOTE: use reserved partition `chengji-lab-gpu` to use reserved A100 or H100 GPUs
#SBATCH --account chengji-lab                                                # NOTE: this must be specified to use the reserved partition above
#SBATCH --nodes=1                                                            # NOTE: this needs to match Lightning's `Trainer(num_nodes=...)`
#SBATCH --gres gpu:H100:4                                                    # request H100 GPU resource(s)
#SBATCH --ntasks-per-node=4                                                  # NOTE: this needs to be `1` on SLURM clusters when using Lightning's `ddp_spawn` strategy`; otherwise, set to match Lightning's quantity of `Trainer(devices=...)`
#SBATCH --mem=0                                                              # NOTE: use `--mem=0` to request all memory "available" on the assigned node
#SBATCH -t 7-00:00:00                                                        # time limit for the job (up to 7 days: `7-00:00:00`)
#SBATCH -J harmonic_prior_training                                           # job name
#SBATCH --output=R-%x.%j.out                                                 # output log file
#SBATCH --error=R-%x.%j.err                                                  # error log file

# Abort immediately if environment setup fails, instead of silently
# launching training in a broken environment.
set -euo pipefail

# Stagger concurrently-submitted runs to avoid filesystem/W&B contention.
random_seconds=$(( (RANDOM % 100) + 1 ))
echo "Sleeping for $random_seconds seconds before starting run"
sleep "$random_seconds"

module purge
module load cuda/11.8.0_gcc_9.5.0

# determine location of the project directory
use_private_project_dir=false # NOTE: customize as needed
if [ "$use_private_project_dir" = true ]; then
    project_dir="/home/acmwhb/data/Repositories/Lab_Repositories/FlowDock"
else
    project_dir="/cluster/pixstor/chengji-lab/acmwhb/Repositories/Lab_Repositories/FlowDock"
fi

# shellcheck source=/dev/null
source /cluster/pixstor/chengji-lab/acmwhb/miniforge3/etc/profile.d/conda.sh
conda activate "$project_dir"/FlowDock/

# Reference Conda system libraries
# NOTE: `${LD_LIBRARY_PATH:-}` keeps this safe under `set -u`.
export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:${LD_LIBRARY_PATH:-}"

echo "Calling flowdock/train.py!"
cd "$project_dir" || exit 1
srun python3 flowdock/train.py \
    experiment='flowdock_fm' \
    environment=slurm \
    logger=wandb \
    logger.wandb.entity='bml-lab' \
    logger.wandb.group='FlowDock-FM' \
    +logger.wandb.name='2024-12-06_18:00:00-Harmonic-Prior-Training' \
    +logger.wandb.id='z2u52tvj' \
    model.cfg.prior_type=harmonic \
    model.cfg.task.freeze_score_head=false \
    model.cfg.task.freeze_affinity=false \
    strategy=ddp \
    trainer=ddp \
    trainer.devices=4 \
    trainer.num_nodes=1
echo "Finished calling flowdock/train.py!"

# NOTE: the following commands must be used to resume training from a checkpoint
# ckpt_path="$(realpath 'logs/train/runs/2024-05-17_13-45-06/checkpoints/last.ckpt')" \
# paths.output_dir="$(realpath 'logs/train/runs/2024-05-17_13-45-06')" \

# NOTE: the following commands may be used to speed up training
# model.compile=false \
# +trainer.precision=bf16-mixed
38
scripts/plinder_download.sh
Normal file
38
scripts/plinder_download.sh
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/bin/bash -l
# Download the PLINDER dataset into the project's data directory as a
# CPU-only SLURM job on the general partition.
######################### Batch Headers #########################
#SBATCH --partition general                                                  # NOTE: use reserved partition `chengji-lab-gpu` to use reserved A100 or H100 GPUs
#SBATCH --account chengji-lab                                                # NOTE: this must be specified to use the reserved partition above
#SBATCH --nodes=1                                                            # NOTE: this needs to match Lightning's `Trainer(num_nodes=...)`
#SBATCH --ntasks-per-node=1                                                  # NOTE: this needs to be `1` on SLURM clusters when using Lightning's `ddp_spawn` strategy`; otherwise, set to match Lightning's quantity of `Trainer(devices=...)`
#SBATCH --mem=59G                                                            # NOTE: use `--mem=0` to request all memory "available" on the assigned node
#SBATCH -t 0-02:00:00                                                        # time limit for the job (up to 2 days: `2-00:00:00`)
#SBATCH -J plinder_download                                                  # job name
#SBATCH --output=R-%x.%j.out                                                 # output log file
#SBATCH --error=R-%x.%j.err                                                  # error log file

# Abort immediately if environment setup or the download itself fails,
# so a partial download is not silently reported as success.
set -euo pipefail

module purge
# NOTE(review): a CUDA module is likely unnecessary for a pure download
# job, but it is kept for environment parity with the training scripts.
module load cuda/11.8.0_gcc_9.5.0

# determine location of the project directory
use_private_project_dir=false # NOTE: customize as needed
if [ "$use_private_project_dir" = true ]; then
    project_dir="/home/acmwhb/data/Repositories/Lab_Repositories/FlowDock"
else
    project_dir="/cluster/pixstor/chengji-lab/acmwhb/Repositories/Lab_Repositories/FlowDock"
fi

# shellcheck source=/dev/null
source /cluster/pixstor/chengji-lab/acmwhb/miniforge3/etc/profile.d/conda.sh
conda activate "$project_dir"/FlowDock/

# Reference Conda system libraries
# NOTE: `${LD_LIBRARY_PATH:-}` keeps this safe under `set -u`.
export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:${LD_LIBRARY_PATH:-}"

# determine location of PLINDER dataset
export PLINDER_MOUNT="$project_dir/data/PLINDER" # NOTE: customize as needed
mkdir -p "$PLINDER_MOUNT" # create the directory if it doesn't exist

echo "Downloading PLINDER to $PLINDER_MOUNT!"
cd "$project_dir" || exit 1
plinder_download -y
echo "Finished downloading PLINDER to $PLINDER_MOUNT!"
Reference in New Issue
Block a user