From 8d3f9b9627cb6af6b5846584e9118574523292eb Mon Sep 17 00:00:00 2001
From: Aleksa Gordic
Date: Mon, 1 Jul 2024 17:00:29 +0200
Subject: [PATCH] mup set to false by default

---
 train_gpt2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train_gpt2.py b/train_gpt2.py
index cc4bab332..e04df4fb0 100644
--- a/train_gpt2.py
+++ b/train_gpt2.py
@@ -592,7 +592,7 @@ def print0(*args, **kwargs):
     parser.add_argument("--weight_decay", type=float, default=0.0, help="weight decay")
     parser.add_argument("--grad_clip", type=float, default=1.0, help="maximum gradient magnitude")
     # mup - maximum update parametrization
-    parser.add_argument("--use_mup", type=int, default=1, help="should we use maximum update parametrization")
+    parser.add_argument("--use_mup", type=int, default=0, help="should we use maximum update parametrization")
     parser.add_argument("--mup_width_mult", type=float, default=1.0, help="width multiplier - ratio of width to base model width")
     parser.add_argument("--mup_base_attn_mult", type=float, default=1.0, help="base attention multiplier")
     # evaluation