Merge pull request #272 from aws-samples/olcf-6
Add OLCF-6 test case
Showing 25 changed files with 577 additions and 60 deletions.
@@ -0,0 +1,2 @@
*.csv
*.ipynb
@@ -0,0 +1,61 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

#SBATCH --job-name="neox"
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-node=8        # number of GPUs per node
#SBATCH --output=logs/%x_%j.out  # logfile for stdout
#SBATCH --error=logs/%x_%j.err   # logfile for stderr; remove it to merge both outputs
#SBATCH --wait-all-nodes=1
#SBATCH --exclusive
set -uxo pipefail

# Default variables for Enroot; values already set in the environment take precedence
: "${FSX_PATH:=/fsx}"
: "${IMAGE:=$FSX_PATH/apps/gpt-neox.sqsh}"
: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}"

## EFA settings
export FI_LOG_LEVEL=warn
export FI_PROVIDER=efa           # change to eth if you want to use ENA for comparisons
export FI_EFA_USE_HUGE_PAGE=0
# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352
# https://github.com/pytorch/pytorch/issues/68893
export NCCL_SOCKET_IFNAME=en
export NCCL_ASYNC_ERROR_HANDLING=1
#export NCCL_DEBUG=INFO

export DATA_CONFIG=${PWD}/configs/frontier.yml
export MODEL_CONFIG=${PWD}/configs/forge-m.yml

# Some potentially useful distributed environment variables
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
NODES_ARRAY=( $(scontrol show hostnames "$SLURM_JOB_NODELIST") )  # bash arrays cannot be exported
export HEAD_NODE=${NODES_ARRAY[0]}
export MASTER_ADDR=$(hostname --ip-address)  # IP of the first allocated node, where this script runs
export MASTER_PORT=$RANDOM
export NNODES=$SLURM_JOB_NUM_NODES
export NPROC=$SLURM_GPUS_PER_NODE
export WORLD_SIZE=$(( NNODES * NPROC ))

declare -a ARGS=(
    --container-image $IMAGE
    --container-mounts $CONTAINER_MOUNT
)

# torchrun arguments, kept for reference only; the launch below goes through
# deepy.py, which drives the DeepSpeed launcher, so these are not passed to srun
declare -a TORCHRUN_ARGS=(
    --master_addr $MASTER_ADDR
    --master_port $MASTER_PORT
    --nproc_per_node=8           # change this to match the number of GPUs per node
    --nnodes=$SLURM_JOB_NUM_NODES
    --rdzv_id=$SLURM_JOB_ID
    --rdzv_backend=c10d
    --rdzv_endpoint=$(hostname)
)

srun -l "${ARGS[@]}" python deepy.py train.py ${MODEL_CONFIG} ${DATA_CONFIG}
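Because the Enroot defaults above use the `: "${VAR:=default}"` pattern, they only apply when the variables are unset, so a different image or mount can be supplied at submission time without editing the script (sbatch exports the submitting environment by default). The path below is a hypothetical example:

```bash
# One-off override of the container image for a single submission (illustrative path)
IMAGE=/fsx/apps/gpt-neox-dev.sqsh sbatch 1.train.sbatch
```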
@@ -0,0 +1,31 @@

# Pythia GPT-NeoX Test Case <!-- omit in toc -->

This test case illustrates how to train a [Pythia](https://arxiv.org/abs/2304.01373) model using GPT-NeoX.

## 1. Preparation

This test case assumes that you have built the GPT-NeoX container from [`../../0.gpt-neox.dockerfile`](https://github.com/aws-samples/awsome-distributed-training/tree/main/3.test_cases/15.gpt-neox).
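If the image has not been built yet, the sketch below shows one way to produce the squashfs that `1.train.sbatch` expects at `/fsx/apps/gpt-neox.sqsh`; the Docker tag and build context are assumptions to adapt to your checkout:

```bash
# Build the GPT-NeoX image, then convert it to an Enroot squashfs.
# The tag "gpt-neox" and the build context are illustrative assumptions.
docker build -t gpt-neox -f 0.gpt-neox.dockerfile .   # run from 3.test_cases/15.gpt-neox
enroot import -o /fsx/apps/gpt-neox.sqsh dockerd://gpt-neox:latest
```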
## 2. Download Dataset

This test case makes use of the [Tokenized Data for FORGE Foundation Models](https://doi.ccs.ornl.gov/ui/doi/453). Download the data and place it as follows:

```bash
/fsx/data/olcf
├── README.txt
├── all_text_document.bin
├── all_text_document.idx
└── all_vocab.json
```
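A minimal staging sketch, assuming the download from the DOI landing page yields the four files above (the source directory is a placeholder):

```bash
# Stage the downloaded artifacts on FSx; adjust the source path to wherever the transfer landed
mkdir -p /fsx/data/olcf
mv /path/to/download/{README.txt,all_text_document.bin,all_text_document.idx,all_vocab.json} /fsx/data/olcf/
```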

This dataset comprises a corpus of 257 billion tokens, together with the vocabulary file used to pre-train the FORGE foundation models. The primary data source is scientific documents drawn from diverse origins, tokenized with the Hugging Face BPE tokenizer. Further details can be found in "FORGE: Pre-Training Open Foundation Models for Science" by Junqi Yin, Sajal Dash, Feiyi Wang, and Mallikarjun (Arjun) Shankar, presented at SC'23. The data tokenization pipeline and resulting artifacts use CORE data [Ref: Knoth, P., & Zdrahal, Z. (2012). CORE: three access levels to underpin open access. D-Lib Magazine, 18(11/12)]. For use of these datasets for any purpose, please follow the guidelines at https://core.ac.uk/terms.
## 3. Train

Now you can kick off the training with:

```bash
sbatch 1.train.sbatch
```
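Once the job is queued, progress can be followed through Slurm and the log files configured in the sbatch header (`logs/%x_%j.out` expands to `logs/neox_<jobid>.out`):

```bash
squeue -u $USER                  # check the job's queue state
tail -f logs/neox_<jobid>.out    # follow training stdout; substitute the Slurm job id
```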
3.test_cases/15.gpt-neox/examples/olcf-6/configs/forge-l.yml (101 additions, 0 deletions)
@@ -0,0 +1,101 @@
# GPT-2 pretraining setup
{

  "tokenizer_type": "HFTokenizer",
  "data-path": "/fsx/data/olcf/all_text_document",
  "vocab-file": "/fsx/data/olcf/all_vocab.json",

  # parallelism settings (you will want to change these based on your cluster setup,
  # ideally scheduling pipeline stages across the node boundaries)
  #"pipe-parallel-size": 1,
  "model-parallel-size": 2,

  # batch / data settings
  "train_micro_batch_size_per_gpu": 16,
  "gradient_accumulation_steps": 1,
  "data-impl": "mmap",

  # aws-rccl workaround
  "num_workers": 0,

  # model settings
  "num-layers": 48,
  "hidden-size": 6144,
  "num-attention-heads": 48,
  "seq-length": 2048,
  "max-position-embeddings": 2048,
  "norm": "layernorm",
  "pos-emb": "rotary",
  "no-weight-tying": true,
  "gpt_j_residual": false,
  "output_layer_parallelism": "column",

  # these should provide some speedup but take a while to build; set to true if desired
  "scaled-upper-triang-masked-softmax-fusion": true,
  "bias-gelu-fusion": true,

  # init methods
  "init_method": "small_init",
  "output_layer_init_method": "wang_init",

  # optimizer settings
  "optimizer": {
    "type": "adam",
    "params": {
      "lr": 0.006,
      "betas": [0.9, 0.999],
      "eps": 1.0e-8,
    }
  },
  "min_lr": 0.00006,

  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": true,
    "allgather_bucket_size": 450000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 450000000,
    "contiguous_gradients": true,
  },

  # activation checkpointing
  "checkpoint-activations": true,
  "checkpoint-num-layers": 1,
  "partition-activations": true,
  "synchronize-each-layer": true,

  # regularization
  "gradient_clipping": 1.0,
  "weight-decay": 0.1,
  "hidden-dropout": 0,
  "attention-dropout": 0,

  # precision settings
  "fp16": {
    "type": "bfloat16",
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },

  # misc. training settings
  "train-iters": 15300,
  "lr-decay-iters": 15300,
  "distributed-backend": "nccl",
  "lr-decay-style": "cosine",
  "warmup": 0.01,
  "checkpoint-factor": 50,
  "eval-interval": 100,
  "eval-iters": 10,

  # logging
  "log-interval": 20,
  "steps_per_print": 10,
  "keep-last-n-checkpoints": 200,
  "wall_clock_breakdown": true,
}
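As a sanity check of how these settings combine, the effective global batch size follows the usual DeepSpeed relation global = micro_batch × grad_accum × data_parallel_size, where the data-parallel size is the world size divided by `model-parallel-size`. A sketch, assuming the 2-node × 8-GPU layout from `1.train.sbatch`:

```bash
# Hypothetical 16-GPU run (2 nodes x 8 GPUs); model parallel 2 => 8 data-parallel replicas
WORLD_SIZE=16; MODEL_PARALLEL=2
MICRO_BATCH=16; GRAD_ACCUM=1                             # values from forge-l.yml
DP_SIZE=$(( WORLD_SIZE / MODEL_PARALLEL ))
GLOBAL_BATCH=$(( MICRO_BATCH * GRAD_ACCUM * DP_SIZE ))   # 128 sequences per step
echo "${GLOBAL_BATCH} sequences x 2048 tokens = $(( GLOBAL_BATCH * 2048 )) tokens/step"
```

The same relation applied to `forge-m.yml` below (12 × 2 × 8) gives 192 sequences per step.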
3.test_cases/15.gpt-neox/examples/olcf-6/configs/forge-m.yml (102 additions, 0 deletions)
@@ -0,0 +1,102 @@
# GPT-2 pretraining setup
{

  "tokenizer_type": "HFTokenizer",
  "data-path": "/fsx/data/olcf/all_text_document",
  "vocab-file": "/fsx/data/olcf/all_vocab.json",

  # parallelism settings (you will want to change these based on your cluster setup,
  # ideally scheduling pipeline stages across the node boundaries)
  #"pipe-parallel-size": 1,
  "model-parallel-size": 2,

  # batch / data settings
  "train_micro_batch_size_per_gpu": 12,
  "gradient_accumulation_steps": 2,
  "data-impl": "mmap",

  # aws-rccl workaround
  "num_workers": 0,

  # model settings
  "num-layers": 40,
  "hidden-size": 5120,
  "num-attention-heads": 40,
  "seq-length": 2048,
  "max-position-embeddings": 2048,
  "norm": "layernorm",
  "pos-emb": "rotary",
  "no-weight-tying": true,
  "gpt_j_residual": false,
  "output_layer_parallelism": "column",

  # these should provide some speedup but take a while to build; set to true if desired
  "scaled-upper-triang-masked-softmax-fusion": true,
  "bias-gelu-fusion": true,

  # init methods
  "init_method": "small_init",
  "output_layer_init_method": "wang_init",

  # optimizer settings
  "optimizer": {
    "type": "adam",
    "params": {
      "lr": 0.006,
      "betas": [0.9, 0.999],
      "eps": 1.0e-8,
    }
  },
  "min_lr": 0.00006,

  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": true,
    "allgather_bucket_size": 500000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": true,
  },

  # activation checkpointing
  "checkpoint-activations": true,
  "checkpoint-num-layers": 1,
  "partition-activations": true,
  "synchronize-each-layer": true,

  # regularization
  "gradient_clipping": 1.0,
  "weight-decay": 0.1,
  "hidden-dropout": 0,
  "attention-dropout": 0,

  # precision settings
  "fp16": {
    "type": "bfloat16",
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },

  # misc. training settings
  "train-iters": 15300,
  "lr-decay-iters": 15300,
  "distributed-backend": "nccl",
  "lr-decay-style": "cosine",
  "warmup": 0.01,
  "checkpoint-factor": 50,
  "eval-interval": 100,
  "eval-iters": 10,

  # logging
  "log-interval": 50,
  "steps_per_print": 10,
  "keep-last-n-checkpoints": 200,
  "wall_clock_breakdown": true,
}
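Note that `1.train.sbatch` sets `MODEL_CONFIG=${PWD}/configs/forge-m.yml` unconditionally, so selecting the larger FORGE-L configuration means editing that line rather than overriding the variable in the environment. One way, as a sketch:

```bash
# Point the job at the larger config, then resubmit
sed -i 's|configs/forge-m.yml|configs/forge-l.yml|' 1.train.sbatch
sbatch 1.train.sbatch
```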