finetune_provpath.sh

#!/bin/bash

# For an interactive session: salloc -N 1 -n 2 -p gpu --gres=gpu:2 --gres-flags=enforce-binding --mem=70G -t 00:05:00 --exclude=gpu2603,gpu2250,gpu2102,gpu2101,gpu2115,gpu2116
# ./finetune_provpath.sh --data TCGA
# sbatch -A carney-tserre-condo --mem=60G -t 30:00:00 finetune_provpath.sh --data TCGA --seed 0 --featlayer 11

#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 2
#SBATCH --account=carney-tserre-condo
# choices: 2x speed (all ampere): geforce3090, a5500, a5000. 1x speed (older architectures): titanrtx, quadrortx
#SBATCH --exclude=gpu2115,gpu2112,gpu2116
#SBATCH -C ampere
#SBATCH -p gpu --gres=gpu:1 --gres-flags=enforce-binding  # keep the number of GPUs even or 1 so that the validation-score calculation works
#SBATCH -J ft
#SBATCH --mem=45G
#SBATCH -t 15:00:00  # 2 folds per job of 75 epochs each
#SBATCH -o finetune_output/logs/longnet-%j.out
#SBATCH -e finetune_output/logs/longnet-%j.err
#SBATCH --mail-type=ALL
#SBATCH --mail-user=vipul_sharma@brown.edu


# Count the GPUs allocated to this job. SLURM_JOB_GPUS is treated as a
# comma-separated list, so the number of comma-separated fields is the count.
# awk reports 0 fields for an empty line, so the count is only computed when
# the variable is non-empty; otherwise it stays at 0 and the guard below fires.
num_gpus=0
if [[ -n "${SLURM_JOB_GPUS:-}" ]]; then
    num_gpus=$(printf '%s\n' "$SLURM_JOB_GPUS" | awk -F ',' '{print NF}')
fi
echo "Number of GPUs: $num_gpus"

# Refuse to continue without at least one GPU: writing "num_processes: 0"
# into the config would only fail later, inside the launcher.
if [[ -z "$num_gpus" || "$num_gpus" -lt 1 ]]; then
    echo "The number of gpus/num_gpus cannot be inferred. Exiting."
    exit 1
fi

# Update the fsdp config to use num gpus mentioned in sbatch script.
# NOTE(review): this edits the shared fsdp_config.yaml in place; jobs started
# concurrently from the same directory would overwrite each other's values.
sed -i "s/^num_processes:.*/num_processes: $num_gpus/" fsdp_config.yaml

# Use NO_SHARD when there is a single process (world size 1), FULL_SHARD otherwise.
if (( num_gpus > 1 )); then
    sharding_strategy=FULL_SHARD
else
    sharding_strategy=NO_SHARD
fi
echo "FSDP sharding strategy as $sharding_strategy"
sed -i "s/^  fsdp_sharding_strategy:.*/  fsdp_sharding_strategy: $sharding_strategy/" fsdp_config.yaml

#######################################
# Pick a random port in the range 6000-39999 that lsof does not report as
# being in use, retrying with a new random port until one is found.
# Arguments: none
# Outputs:   the chosen port number on stdout
# Returns:   0 once a port is printed; 1 if shuf fails to produce a number
#######################################
find_free_port() {
    local port
    while :; do
        # Declaration and assignment are kept separate so a shuf failure is
        # not masked by the exit status of `local`.
        port=$(shuf -i6000-39999 -n1) || return 1
        # lsof exits non-zero when no open socket matches the port, which is
        # taken to mean the port is free.
        if ! lsof -i:"$port" >/dev/null; then
            echo "$port"
            return
        fi
    done
}

# Prepare the Python environment for training. The order matters: each step
# relies on what the previous one put into the shell.
# Load the user's shell configuration.
source ~/.bashrc
# Load the cluster's miniforge environment module.
# NOTE(review): the module version ends in "0s" while the conda.sh path below
# uses "0" - presumably both are valid names on this cluster; confirm.
module load miniforge/23.11.0-0s
# Source conda's shell-integration script so that `conda activate` is usable
# from this batch shell.
source /oscar/runtime/software/external/miniforge/23.11.0-0/etc/profile.d/conda.sh
# Activate the conda environment named "gigapath".
conda activate gigapath

#######################################
# Print the one-line command synopsis and terminate the script.
# Arguments: none
# Outputs:   the synopsis on stdout, fragments separated by single spaces
# Returns:   never returns; exits the shell with status 1
#######################################
usage() {
    # Joining with "${synopsis[*]}" uses the first character of IFS, so pin
    # it to a space for the duration of this function.
    local IFS=' '
    local -a synopsis=(
        "Usage: $0 --data DATA [--project PROJECT]"
        "[--rootpath ROOTPATH] [--taskcfg TASKCFG] [--datasetcsv DATASETCSV] [--presplitdir PRESPLITDIR]"
        "[--epoch EPOCH] [--gradientaccumulation GC] [--baselearningrate BLR] [--lr LR]"
        "[--weightdecay WD] [--layerdecay LD] [--featlayer FEATLAYER] [--dropout DROPOUT]"
        "[--batchsize BATCHSIZE] [--savedir SAVEDIR] [--numworkers NUMWORKERS] [--seed SEED]"
        "[--scheduler SCHEDULER] [--profile PROFILE] [--start_fold STARTFOLD] [--end_fold ENDFOLD]"
        "[--optim OPTIM] [--debug_grads DEBUGGRADS] [--batch_dryrun BATCHDR] [--epoch_dryrun EPOCHDR]"
        "[--test_strat TESTSTRAT] [--val_strat VALSTRAT] [--train_strat TRAINSTRAT] [--pat_strat PATSTRAT]"
        "[--num_checkpoints NUMCHECKPOINTS]"
    )
    echo "${synopsis[*]}"
    exit 1
}

# Default values for every setting; the command-line parsing further down
# overwrites whichever ones the caller supplies.
#
# NOTE(review): DATASETCSV, ROOTPATH and SAVEDIR are expanded right here using
# the default DATA. Passing only --data later does not re-derive them, so they
# must be overridden explicitly when a different dataset is used.

# --- Experiment tracking ---
PROJECT="prov-gigapath"  # wandb project

# --- Task and dataset ---
DATA="TCGA"
TASKCFG="data/task_configs/mutation_5_gene.yaml"
DATASETCSV="data/dataset_csv/mutation/tile_aug_LUAD-5-gene_${DATA}.csv"
PRESPLITDIR="data/dataset_csv/mutation/"  # use the predefined split
ROOTPATH="data/${DATA}/pt_files"
# Maximum WSI size in pixels along the longer side (width or height).
# The prov-path default was 250k; for TCGA-LUAD it is 110848.
MAX_WSI_SIZE=150000
TILE_SIZE=256
# Stratification switches for the test / validation / train / patient splits.
TESTSTRAT=1
VALSTRAT=1
TRAINSTRAT=1
PATSTRAT=1

# --- Model ---
HFMODEL="hf_hub:prov-gigapath/prov-gigapath"  # Huggingface model name
MODELARCH="gigapath_slide_enc12l768d"
TILEEMBEDSIZE=1536
LATENTDIM=768
MODELSELECT="val"  # one of: 'val', 'last_epoch'

# --- Training ---
BATCHDR=0
EPOCHDR=0
EPOCH=50
GC=32  # gradient accumulation per GPU
LR=0.00008
BLR=0.002
WD=0.01
LD=0.95
FEATLAYER="12"
DROPOUT=0.1
BATCHSIZE=1
SEED=0
PROFILE=0
SCHEDULER="fixed"  # one of: 'cosine', 'fixed'
STARTFOLD=0
ENDFOLD=9
DEBUGGRADS=0
OPTIM="adamw_schedulefree"
NUMCHECKPOINTS=5

# --- Data loading ---
NUMWORKERS=1

# --- Output ---
SAVEDIR="finetune_output/${DATA}_luad"

#######################################
# Parse "--option VALUE" pairs and store each value in the matching setting
# variable, overriding the defaults assigned earlier in the script.
# Globals:   writes one variable per recognised option (see the case arms)
# Arguments: the script's command-line arguments
# Outputs:   an error line on stdout for an unknown option or a missing value
# Returns:   0 when every argument was consumed; on an unknown option or a
#            missing value it calls usage, which exits the script
#######################################
parse_args() {
    local target
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --data)                 target=DATA ;;
            # --project is advertised by usage but was previously rejected
            # as an unknown argument.
            --project)              target=PROJECT ;;
            --rootpath)             target=ROOTPATH ;;
            --taskcfg)              target=TASKCFG ;;
            --datasetcsv)           target=DATASETCSV ;;
            --presplitdir)          target=PRESPLITDIR ;;
            --epoch)                target=EPOCH ;;
            --gradientaccumulation) target=GC ;;
            --baselearningrate)     target=BLR ;;
            --weightdecay)          target=WD ;;
            --layerdecay)           target=LD ;;
            --featlayer)            target=FEATLAYER ;;
            --dropout)              target=DROPOUT ;;
            --batchsize)            target=BATCHSIZE ;;
            --savedir)              target=SAVEDIR ;;
            --numworkers)           target=NUMWORKERS ;;
            --seed)                 target=SEED ;;
            --profile)              target=PROFILE ;;
            # Previously this arm assigned PROFILE, so --scheduler silently
            # changed the profiling flag and left the scheduler untouched.
            --scheduler)            target=SCHEDULER ;;
            # Accepted for backward compatibility; FOLDS is not read anywhere
            # else in this script.
            --folds)                target=FOLDS ;;
            --debug_grads)          target=DEBUGGRADS ;;
            --start_fold)           target=STARTFOLD ;;
            --end_fold)             target=ENDFOLD ;;
            --lr)                   target=LR ;;
            --optim)                target=OPTIM ;;
            --batch_dryrun)         target=BATCHDR ;;
            --epoch_dryrun)         target=EPOCHDR ;;
            --test_strat)           target=TESTSTRAT ;;
            --val_strat)            target=VALSTRAT ;;
            --train_strat)          target=TRAINSTRAT ;;
            --pat_strat)            target=PATSTRAT ;;
            --num_checkpoints)      target=NUMCHECKPOINTS ;;
            *)
                echo "Unknown argument: $1"
                usage
                ;;
        esac

        # `shift 2` fails without shifting when only one argument is left,
        # which used to make this loop spin forever on a trailing option.
        if [[ $# -lt 2 ]]; then
            echo "Error: $1 requires a value."
            usage
        fi

        # Assign by name; none of the targets is declared local here, so the
        # value lands in the script-level variable.
        printf -v "$target" '%s' "$2"
        shift 2
    done
}

parse_args "$@"


# Check for required inputs. The Hugging Face token is read from the HF_TOKEN
# environment variable (the parser has no command-line flag for it); DATA
# comes from --data or its default.
if [[ -z "${HF_TOKEN:-}" || -z "$DATA" ]]; then
    # The old message named a "--hftoken" flag that this script does not accept.
    echo "Error: the HF_TOKEN environment variable and --data are required."
    usage
fi

echo "HF token value was set!"

# Experiment name built from the dataset and the main hyperparameters.
EXPNAME="${DATA}-run_epoch-${EPOCH}_lr-${LR}_wd-${WD}_ld-${LD}_feat-${FEATLAYER}"

printf 'Data directory set to %s\n' "$ROOTPATH"
printf 'Using %s gradient accumulation per GPU\n' "$GC"

printf 'Starting training with seed: %s\n' "$SEED"
# Write a per-run copy of the fsdp config whose main process port is an unused
# one, so two experiments on the same node do not collide on the port.
free_port="$(find_free_port)"
printf 'Setting the fsdp main proc port as %s.\n' "$free_port"
sed -e "s/^main_process_port:.*/main_process_port: $free_port/" fsdp_config.yaml > "fsdp_config_${free_port}.yaml"

# Run one training job per fold from STARTFOLD to ENDFOLD (both inclusive).
# A failing fold no longer masquerades as "finished": its exit status is
# reported, the remaining folds still run, and the script exits non-zero at
# the end if any fold failed.
#
# NOTE(review): the srun step always requests a single GPU, while the fsdp
# config's num_processes is set from the job allocation earlier in the script;
# confirm the two agree for multi-GPU jobs.
failed_folds=()
FOLD=$STARTFOLD
while [ "$FOLD" -le "$ENDFOLD" ]
do
    # Arguments for main.py, kept in an array so every value stays one word
    # even if it contains spaces. --dataset_csv used to be passed twice with
    # the same value; it is now passed once.
    train_args=(
        --task_cfg_path "$TASKCFG"
        --dataset_csv "$DATASETCSV"
        --root_path "$ROOTPATH"
        --model_arch "$MODELARCH"
        --lr "$LR"
        --blr "$BLR"
        --layer_decay "$LD"
        --optim "$OPTIM"
        --optim_wd "$WD"
        --dropout "$DROPOUT"
        --drop_path_rate 0.0
        --val_r 0.1
        --epochs "$EPOCH"
        --input_dim "$TILEEMBEDSIZE"
        --latent_dim "$LATENTDIM"
        --feat_layer "$FEATLAYER"
        --warmup_epochs 1
        --gc "$GC"
        --lr_scheduler "$SCHEDULER"
        --start_fold "$FOLD"
        --end_fold "$FOLD"
        --pre_split_dir "$PRESPLITDIR"
        --save_dir "$SAVEDIR"
        --pretrained "$HFMODEL"
        --report_to wandb
        --exp_name "$EXPNAME"
        --max_wsi_size "$MAX_WSI_SIZE"
        --num_workers "$NUMWORKERS"
        --batch_size "$BATCHSIZE"
        --seed "$SEED"
        --wandb_project "$PROJECT"
        --data "$DATA"
        --profile "$PROFILE"
        --debug_grads "$DEBUGGRADS"
        --batch_dryrun "$BATCHDR"
        --epoch_dryrun "$EPOCHDR"
        --pat_strat "$PATSTRAT"
        --test_strat "$TESTSTRAT"
        --val_strat "$VALSTRAT"
        --train_strat "$TRAINSTRAT"
        --num_checkpoints "$NUMCHECKPOINTS"
    )

    echo "$EXPNAME with fold $FOLD started at $(date)"
    if srun -n 1 -c 2 -p gpu --gres=gpu:1 --gres-flags=enforce-binding \
            python -u -m accelerate.commands.launch --config_file "fsdp_config_${free_port}.yaml" \
            models_and_papers/prov_gigapath/finetune/main.py \
            "${train_args[@]}"; then
        echo "$EXPNAME with fold $FOLD finished at $(date)"
    else
        # In the else branch $? still holds the exit status of the srun command.
        fold_status=$?
        echo "$EXPNAME with fold $FOLD failed with exit code $fold_status at $(date)" >&2
        failed_folds+=("$FOLD")
    fi
    FOLD=$(( FOLD + 1 ))
done

echo "$EXPNAME for folds $STARTFOLD to $ENDFOLD (both inclusive) finished at $(date)"

# Surface failures to the scheduler instead of always exiting 0.
if (( ${#failed_folds[@]} > 0 )); then
    echo "Folds that failed: ${failed_folds[*]}" >&2
    exit 1
fi