Adds clip_grad_norm to all recipe configs that support it
thomasjpfan committed Dec 31, 2024
1 parent 5d1866f · commit a035a35
Showing 122 changed files with 122 additions and 0 deletions.
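
Every file gains the same default, clip_grad_norm: null, which leaves gradient clipping disabled until a user replaces the null with a number (the maximum allowed gradient norm). As a minimal sketch of the mechanics, not torchtune's actual recipe code, a training step can consume such a setting as below; model, optimizer, and training_step are hypothetical stand-ins, while torch.nn.utils.clip_grad_norm_ is the standard PyTorch utility an option like this typically maps onto.

import torch

# Hypothetical stand-ins for objects a recipe would build from its YAML config.
model = torch.nn.Linear(16, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
clip_grad_norm = None  # mirrors `clip_grad_norm: null`; set e.g. 1.0 to enable clipping

def training_step(inputs, labels):
    loss = torch.nn.functional.cross_entropy(model(inputs), labels)
    loss.backward()
    if clip_grad_norm is not None:
        # Rescale all gradients in place so their global L2 norm is at most the cap.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=float(clip_grad_norm))
    optimizer.step()
    optimizer.zero_grad()
    return loss.detach()

training_step(torch.randn(8, 16), torch.randint(0, 4, (8,)))

Defaulting to null makes the key visible and overridable in every recipe without changing current behavior.
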
recipes/configs/code_llama2/7B_full_low_memory.yaml (1 addition, 0 deletions)
@@ -64,6 +64,7 @@ optimizer:
 optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

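A caveat on this first config: it also sets optimizer_in_bwd: True, which fuses the optimizer step into the backward pass and frees each parameter's gradient as soon as it has been applied. In that mode there is no complete set of gradients left to clip after the backward pass, so a recipe generally has to reject or specially handle a non-null clip_grad_norm when optimizer_in_bwd is enabled; the null default avoids that conflict out of the box.
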
recipes/configs/code_llama2/7B_lora_single_device.yaml (1 addition, 0 deletions)
@@ -72,6 +72,7 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/code_llama2/7B_qlora_single_device.yaml (1 addition, 0 deletions)
@@ -71,6 +71,7 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_full.yaml (1 addition, 0 deletions)
@@ -57,6 +57,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
recipes/configs/gemma/2B_lora.yaml (1 addition, 0 deletions)
@@ -69,6 +69,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_lora_single_device.yaml (1 addition, 0 deletions)
@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_qlora_single_device.yaml (1 addition, 0 deletions)
@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/7B_full.yaml (1 addition, 0 deletions)
@@ -59,6 +59,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
recipes/configs/gemma/7B_lora.yaml (1 addition, 0 deletions)
@@ -71,6 +71,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/7B_lora_single_device.yaml (1 addition, 0 deletions)
@@ -70,6 +70,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/7B_qlora_single_device.yaml (1 addition, 0 deletions)
@@ -70,6 +70,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/27B_full.yaml (1 addition, 0 deletions)
@@ -56,6 +56,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
recipes/configs/gemma2/27B_lora.yaml (1 addition, 0 deletions)
@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/27B_lora_single_device.yaml (1 addition, 0 deletions)
@@ -67,6 +67,7 @@ batch_size: 2
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/27B_qlora_single_device.yaml (1 addition, 0 deletions)
@@ -67,6 +67,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/2B_full.yaml (1 addition, 0 deletions)
@@ -58,6 +58,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
recipes/configs/gemma2/2B_lora.yaml (1 addition, 0 deletions)
@@ -70,6 +70,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/2B_lora_single_device.yaml (1 addition, 0 deletions)
@@ -69,6 +69,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/2B_qlora_single_device.yaml (1 addition, 0 deletions)
@@ -69,6 +69,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/9B_full.yaml (1 addition, 0 deletions)
@@ -56,6 +56,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
recipes/configs/gemma2/9B_lora.yaml (1 addition, 0 deletions)
@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/9B_lora_single_device.yaml (1 addition, 0 deletions)
@@ -67,6 +67,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/9B_qlora_single_device.yaml (1 addition, 0 deletions)
@@ -67,6 +67,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/llama2/13B_full.yaml (1 addition, 0 deletions)
@@ -61,6 +61,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
recipes/configs/llama2/13B_lora.yaml (1 addition, 0 deletions)
@@ -77,6 +77,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/13B_qlora_single_device.yaml (1 addition, 0 deletions)
@@ -72,6 +72,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/70B_lora.yaml (1 addition, 0 deletions)
@@ -62,6 +62,7 @@ loss:
 # Training
 epochs: 1
 max_steps_per_epoch: null
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 gradient_accumulation_steps: 1 # Use to increase effective batch size
 
recipes/configs/llama2/70B_qlora.yaml (1 addition, 0 deletions)
@@ -72,6 +72,7 @@ fsdp:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/7B_full.yaml (1 addition, 0 deletions)
@@ -60,6 +60,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
recipes/configs/llama2/7B_full_low_memory.yaml (1 addition, 0 deletions)
@@ -65,6 +65,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training environment

recipes/configs/llama2/7B_lora.yaml (1 addition, 0 deletions)
@@ -73,6 +73,7 @@ loss:
 # Training
 epochs: 1
 max_steps_per_epoch: null
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 gradient_accumulation_steps: 8 # Use to increase effective batch size
 
recipes/configs/llama2/7B_lora_dpo.yaml (1 addition, 0 deletions)
@@ -72,6 +72,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: 1000
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/7B_lora_dpo_single_device.yaml (1 addition, 0 deletions)
@@ -69,6 +69,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: 1000
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/7B_lora_single_device.yaml (1 addition, 0 deletions)
@@ -72,6 +72,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/7B_qat_full.yaml (1 addition, 0 deletions)
@@ -56,6 +56,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
recipes/configs/llama2/7B_qlora.yaml (1 addition, 0 deletions)
@@ -77,6 +77,7 @@ fsdp:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/7B_qlora_single_device.yaml (1 addition, 0 deletions)
@@ -71,6 +71,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/70B_full.yaml (1 addition, 0 deletions)
@@ -69,6 +69,7 @@ enable_activation_checkpointing: True # True reduces memory
 enable_activation_offloading: False # True reduces memory
 custom_sharded_layers: ['tok_embeddings', 'output'] # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed.
 fsdp_cpu_offload: True
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
recipes/configs/llama3/70B_lora.yaml (1 addition, 0 deletions)
@@ -63,6 +63,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_dora.yaml (1 addition, 0 deletions)
@@ -67,6 +67,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_dora_single_device.yaml (1 addition, 0 deletions)
@@ -69,6 +69,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_full.yaml (1 addition, 0 deletions)
@@ -60,6 +60,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
recipes/configs/llama3/8B_full_single_device.yaml (1 addition, 0 deletions)
@@ -64,6 +64,7 @@ loss:
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
 optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training environment

recipes/configs/llama3/8B_lora.yaml (1 addition, 0 deletions)
@@ -72,6 +72,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_lora_single_device.yaml (1 addition, 0 deletions)
@@ -71,6 +71,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_qat_full.yaml (1 addition, 0 deletions)
@@ -60,6 +60,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
recipes/configs/llama3/8B_qat_lora.yaml (1 addition, 0 deletions)
@@ -68,6 +68,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_qdora_single_device.yaml (1 addition, 0 deletions)
@@ -70,6 +70,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_qlora_single_device.yaml (1 addition, 0 deletions)
@@ -70,6 +70,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3_1/405B_qlora.yaml (1 addition, 0 deletions)
@@ -70,6 +70,7 @@ fsdp:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

[Listing truncated: the remaining changed files are not shown; per the summary above, each receives the same single-line addition.]
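
To turn clipping on, replace the null with a positive value in any of these configs, e.g. clip_grad_norm: 1.0. Assuming torchtune's usual key=value override syntax, the same override can be passed on the command line without editing the file, along the lines of tune run lora_finetune_single_device --config llama3/8B_lora_single_device clip_grad_norm=1.0 (the recipe and config names here are illustrative).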
