Skip to content

Commit

Permalink
Enable Auto-resume
Browse files Browse the repository at this point in the history
Signed-off-by: Sean Smith <seaam@amazon.com>
  • Loading branch information
sean-smith committed Apr 1, 2024
1 parent c016b59 commit 38b6b54
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 3 deletions.
9 changes: 7 additions & 2 deletions 3.test_cases/16.pytorch-cpu-ddp/1.conda-train.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#SBATCH --exclusive
#SBATCH --wait-all-nodes=1
#SBATCH --nodes 2
#SBATCH --cpus-per-task=4
#SBATCH --output=logs/%x_%j.out # logfile for stdout/stderr

nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
Expand All @@ -14,7 +13,13 @@ head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
echo Node IP: $head_node_ip
export LOGLEVEL=INFO

srun ./pt_cpu/bin/torchrun \
# Decide whether srun receives the auto-resume flag. The flag is set only when
# the /opt/sagemaker_cluster directory exists, which this script treats as the
# marker of a Hyperpod cluster. Otherwise AUTO_RESUME is the empty string, so
# the unquoted ${AUTO_RESUME} in the srun invocation contributes no argument.
if [[ -d /opt/sagemaker_cluster ]]; then
  echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
  AUTO_RESUME="--auto-resume=1"
else
  AUTO_RESUME=""
fi

srun ${AUTO_RESUME} ./pt_cpu/bin/torchrun \
--nnodes 2 \
--nproc_per_node 4 \
--rdzv_id $RANDOM \
Expand Down
8 changes: 7 additions & 1 deletion 3.test_cases/16.pytorch-cpu-ddp/3.container-train.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,13 @@ declare -a ARGS=(
--container-mounts ${PWD}
)

srun -l "${ARGS[@]}" torchrun \
# Decide whether srun receives the auto-resume flag. The flag is set only when
# the /opt/sagemaker_cluster directory exists, which this script treats as the
# marker of a Hyperpod cluster. Otherwise AUTO_RESUME is the empty string, so
# the unquoted ${AUTO_RESUME} in the srun invocation contributes no argument.
if [[ -d /opt/sagemaker_cluster ]]; then
  echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
  AUTO_RESUME="--auto-resume=1"
else
  AUTO_RESUME=""
fi

srun ${AUTO_RESUME} -l "${ARGS[@]}" torchrun \
--nnodes 2 \
--nproc_per_node 4 \
--rdzv_id $RANDOM \
Expand Down

0 comments on commit 38b6b54

Please sign in to comment.