Skip to content

Commit

Permalink
Merge pull request #1295 from AI-Hypercomputer:mtc
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 729682186
  • Loading branch information
maxtext authors committed Feb 21, 2025
2 parents e7038bc + 3f043eb commit e2b9df0
Showing 1 changed file with 19 additions and 0 deletions.
19 changes: 19 additions & 0 deletions end_to_end/test_multi_tier_checkpointing.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# End-to-end test for multi-tier (emergency) checkpointing.
#
# Usage:
#   bash end_to_end/test_multi_tier_checkpointing.sh <run_name_prefix> <output_path> <dataset_path>
#
# Arguments:
#   $1 - prefix for the run name; a "_YYYY-MM-DD-HH" timestamp is appended
#   $2 - value passed to train.py as base_output_directory
#   $3 - value passed to train.py as dataset_path
#
# Flow:
#   1. Train for 100 steps with emergency checkpointing enabled, writing
#      metrics to saved_metrics.txt.
#   2. Run training again under the same run name up to step 110, writing
#      metrics to restored_metrics.txt.
#   3. Run eval_assert.py in checkpoint_save_restore mode on learning/loss.
#
# Must be run from the repository root: all script paths are relative.
set -ex

# Computed once so both training runs share the same run name.
RUN_NAME="${1}_$(date +%Y-%m-%d-%H)"
OUTPUT_PATH="${2}"
DATASET_PATH="${3}"
export TPU_PREMAPPED_BUFFER_SIZE=20000014336
export TPU_PREMAPPED_BUFFER_TRANSFER_THRESHOLD_BYTES=20000014336

#######################################
# Launch MaxText training with the checkpointing settings shared by both
# phases of this test. Expansions are quoted so that paths containing
# whitespace or glob characters reach train.py as a single argument.
# Globals:   OUTPUT_PATH, DATASET_PATH, RUN_NAME (read)
# Arguments: $1 - value for steps
#            $2 - value for metrics_file
# Returns:   exit status of train.py
#######################################
run_train() {
  local steps=$1
  local metrics_file=$2
  # NOTE(review): checkpoint_period (200) is larger than either step count, so
  # presumably only the local checkpoints (every 20 steps, under /local) are
  # written during this test - confirm against train.py.
  python3 MaxText/train.py MaxText/configs/base.yml remat_policy=full \
    "base_output_directory=${OUTPUT_PATH}" "dataset_path=${DATASET_PATH}" \
    "steps=${steps}" enable_emergency_checkpoint=true checkpoint_period=200 \
    local_checkpoint_directory=/local local_checkpoint_period=20 \
    "run_name=${RUN_NAME}" "metrics_file=${metrics_file}"
}

# Train and save checkpoint
run_train 100 'saved_metrics.txt'

# Retrieve checkpoint
run_train 110 'restored_metrics.txt'

# NOTE(review): the runs above write saved_metrics.txt / restored_metrics.txt;
# presumably eval_assert.py derives those names from "metrics.txt" - confirm.
python3 end_to_end/tpu/eval_assert.py checkpoint_save_restore metrics.txt learning/loss

0 comments on commit e2b9df0

Please sign in to comment.