#!/bin/bash -l
#
# SLURM batch script for the `esmfold_prior_plinder_finetuning` job.
# This first part declares the sbatch resource request and prepares the
# software environment (CUDA module, Conda environment, library path) and
# sets `project_dir` before flowdock/train.py is launched.
# The `-l` flag makes bash behave as a login shell.
#
# NOTE: every `#SBATCH` directive must start its own line and appear before
# the first executable command, otherwise sbatch does not parse it.

######################### Batch Headers #########################
#SBATCH --partition chengji-lab-gpu                           # NOTE: use reserved partition `chengji-lab-gpu` to use reserved A100 or H100 GPUs
#SBATCH --account chengji-lab                                 # NOTE: this must be specified to use the reserved partition above
#SBATCH --nodes=1                                             # NOTE: this needs to match Lightning's `Trainer(num_nodes=...)`
#SBATCH --gres gpu:1                                          # request A100/H100 GPU resource(s)
#SBATCH --ntasks-per-node=1                                   # NOTE: this needs to be `1` on SLURM clusters when using Lightning's `ddp_spawn` strategy; otherwise, set to match Lightning's quantity of `Trainer(devices=...)`
#SBATCH --mem=59G                                             # NOTE: use `--mem=0` to request all memory "available" on the assigned node
#SBATCH -t 2-00:00:00                                         # time limit for the job (up to 2 days: `2-00:00:00`)
#SBATCH -J esmfold_prior_plinder_finetuning                   # job name
#SBATCH --output=R-%x.%j.out                                  # output log file
#SBATCH --error=R-%x.%j.err                                   # error log file

# Start from a clean module environment, then load the CUDA toolchain.
module purge
module load cuda/11.8.0_gcc_9.5.0

# Determine the location of the project directory.
use_private_project_dir=false # NOTE: customize as needed
if [[ "$use_private_project_dir" == true ]]; then
    project_dir="/home/acmwhb/data/Repositories/Lab_Repositories/FlowDock"
else
    project_dir="/cluster/pixstor/chengji-lab/acmwhb/Repositories/Lab_Repositories/FlowDock"
fi

# Make the `conda` shell function available, then activate the environment
# stored inside the project directory.
# shellcheck source=/dev/null
source /cluster/pixstor/chengji-lab/acmwhb/miniforge3/etc/profile.d/conda.sh
conda activate "$project_dir"/FlowDock/

# Reference Conda system libraries.
# The `${VAR:+:$VAR}` form appends the previous value only when it is
# non-empty. A plain "...:$LD_LIBRARY_PATH" would leave a trailing ':' when
# the variable is empty, and the dynamic loader interprets an empty entry as
# the current working directory.
export LD_LIBRARY_PATH="$CONDA_PREFIX/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

echo "Calling flowdock/train.py!"
# Resume training with flowdock/train.py from an existing run directory.
# Relative paths below are resolved after changing into the project
# directory. `${project_dir:?}` aborts with a message if the variable is
# unset or empty instead of silently staying in the submission directory.
cd "${project_dir:?project_dir must be set before launching training}" || exit

# Run directory being resumed; used for both the checkpoint and the output dir.
run_dir='logs/train/runs/2025-03-17_17-39-39'

srun python3 flowdock/train.py \
    callbacks.last_model_checkpoint.filename=null \
    callbacks.last_model_checkpoint.every_n_train_steps=200 \
    callbacks.last_model_checkpoint.every_n_epochs=null \
    ckpt_path="$(realpath "$run_dir/checkpoints/169-562000.ckpt")" \
    data=plinder \
    experiment='flowdock_fm' \
    environment=slurm \
    logger=wandb \
    logger.wandb.entity='bml-lab' \
    logger.wandb.group='FlowDock-FM' \
    +logger.wandb.name='2025-03-17_17:00:00-ESMFold-Prior-PLINDER-Finetuning' \
    +logger.wandb.id='1x2k5a79' \
    model.cfg.prior_type=esmfold \
    model.cfg.task.freeze_score_head=false \
    model.cfg.task.freeze_affinity=true \
    paths.output_dir="$(realpath "$run_dir")" \
    strategy=ddp \
    trainer=ddp \
    +trainer.accumulate_grad_batches=4 \
    trainer.devices=1 \
    trainer.num_nodes=1
# Capture the training status immediately: the `echo` below would otherwise
# become the script's last command and make the script exit 0 even when
# training failed.
train_status=$?

echo "Finished calling flowdock/train.py!"

if [[ "$train_status" -ne 0 ]]; then
    echo "flowdock/train.py exited with status $train_status" >&2
fi

# NOTE: the following arguments must be used to resume training from a checkpoint
# ckpt_path="$(realpath 'logs/train/runs/2025-03-17_17-39-39/checkpoints/169-562000.ckpt')" \
# paths.output_dir="$(realpath 'logs/train/runs/2025-03-17_17-39-39')" \

# NOTE: the following arguments may be used to speed up training
# model.compile=false \
# +trainer.precision=bf16-mixed

# Propagate the training result as the batch script's exit status.
exit "$train_status"