
Commit

Add DeepSeek V2 config
RissyRan committed Feb 21, 2025
1 parent 3c0be9c commit a1ac837
Showing 6 changed files with 97 additions and 3 deletions.
45 changes: 45 additions & 0 deletions MaxText/configs/models/deepseek2-16b.yml
@@ -0,0 +1,45 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# model config for DeepSeek V2-Lite - 16B
# Please note: DeepSeek-style models are not fully supported at this time.

base_emb_dim: 2048
base_num_query_heads: 16
base_num_kv_heads: 16
base_mlp_dim: 10944
base_moe_mlp_dim: 1408
base_num_decoder_layers: 27
first_num_dense_layers: 1
mlp_activations: ["silu","linear"]
vocab_size: 102400
enable_dropout: False
logits_via_embedding: False
normalization_layer_epsilon: 1.0e-6
num_experts: 64
num_experts_per_tok: 6
shared_experts: 2
routed_scaling_factor: 1.0
routed_score_func: "softmax"
routed_bias: False
# MLA
attention_type: "mla"
q_lora_rank: 0
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
rope_type: "yarn"
mscale: 0.707
decoder_block: "deepseek"
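
For reference, a minimal sketch (not part of this commit) of loading the new config with PyYAML and deriving a few quantities implied by the values above; the file path is taken from this diff, and the derived names are illustrative:

import yaml

with open("MaxText/configs/models/deepseek2-16b.yml") as f:
    cfg = yaml.safe_load(f)

# Experts active per token: routed top-k plus the always-on shared experts.
active_experts = cfg["num_experts_per_tok"] + cfg["shared_experts"]          # 6 + 2 = 8

# All decoder layers except the leading dense ones use the MoE block.
moe_layers = cfg["base_num_decoder_layers"] - cfg["first_num_dense_layers"]  # 27 - 1 = 26

# MLA query/key width per head is the non-RoPE part plus the RoPE part.
qk_head_dim = cfg["qk_nope_head_dim"] + cfg["qk_rope_head_dim"]              # 128 + 64 = 192

print(active_experts, moe_layers, qk_head_dim)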
45 changes: 45 additions & 0 deletions MaxText/configs/models/deepseek2-236b.yml
@@ -0,0 +1,45 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# model config for DeepSeek V2 - 236B
# Please note: DeepSeek-style models are not fully supported at this time.

base_emb_dim: 5120
base_num_query_heads: 128
base_num_kv_heads: 128
base_mlp_dim: 12288
base_moe_mlp_dim: 1536
base_num_decoder_layers: 60
first_num_dense_layers: 1
mlp_activations: ["silu","linear"]
vocab_size: 102400
enable_dropout: False
logits_via_embedding: False
normalization_layer_epsilon: 1.0e-6
num_experts: 160
num_experts_per_tok: 6
shared_experts: 2
routed_scaling_factor: 16.0
routed_score_func: "softmax"
routed_bias: False
# MLA
attention_type: "mla"
q_lora_rank: 1536
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
rope_type: "yarn"
mscale: 0.707
decoder_block: "deepseek"
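
The 236B config follows the same schema; the main divergences from the 16B file are the model width and depth, the expert count (160 vs. 64), the routed scaling factor (16.0 vs. 1.0), and the use of Q LoRA (q_lora_rank: 1536 vs. 0). A quick, hedged way to list every differing field (assuming both files are checked out locally):

import yaml

def load(path):
    with open(path) as f:
        return yaml.safe_load(f)

small = load("MaxText/configs/models/deepseek2-16b.yml")
large = load("MaxText/configs/models/deepseek2-236b.yml")

# Print each key whose value differs between the two configs.
for key in sorted(set(small) | set(large)):
    if small.get(key) != large.get(key):
        print(f"{key}: 16B={small.get(key)!r}  236B={large.get(key)!r}")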
3 changes: 2 additions & 1 deletion MaxText/configs/models/deepseek3-671b.yml
@@ -23,7 +23,7 @@ base_moe_mlp_dim: 2048
base_num_decoder_layers: 61
first_num_dense_layers: 3
mlp_activations: ["silu","linear"]
-vocab_size: 32000 # TODO(b/394635939): update after adding tokenizer
+vocab_size: 129280
enable_dropout: False
logits_via_embedding: False
normalization_layer_epsilon: 1.0e-6
@@ -41,4 +41,5 @@ qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
rope_type: "yarn"
+mscale: 1.0
decoder_block: "deepseek"
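
The 671B file gains the real tokenizer vocabulary size (129280) and an explicit mscale of 1.0, versus 0.707 in the V2 configs. As a hedged sketch of where mscale usually enters: DeepSeek's public reference code feeds it into YaRN's attention-magnitude correction and folds the result into the attention softmax scale (MaxText's exact wiring may differ; the rope factor below is an illustrative assumption):

import math

def yarn_get_mscale(scale: float = 1.0, mscale: float = 1.0) -> float:
    # YaRN magnitude correction; no correction when the context is not extended.
    if scale <= 1.0:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

qk_head_dim = 128 + 64                        # qk_nope_head_dim + qk_rope_head_dim
rope_factor = 40.0                            # assumed context-extension factor
m = yarn_get_mscale(rope_factor, mscale=0.707)
softmax_scale = qk_head_dim ** -0.5 * m * m   # scale applied to the attention logits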
2 changes: 1 addition & 1 deletion MaxText/layers/attentions.py
@@ -1399,7 +1399,7 @@ def setup(self):
    if self.q_lora_rank == 0:
      # Standard Q projection (without LoRA).
      self.query_proj = DenseGeneral(
-         features=(self.num_query_heads, self.head_dim),
+         features=(self.num_query_heads, self.qk_head_dim),
          axis=-1,
          kernel_init=self.kernel_init,
          kernel_axes=("embed", "q_heads", "kv"),
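
The one-line change above matters because, under MLA, the query and key per-head width is the concatenation of a non-RoPE slice and a RoPE slice, so the no-LoRA Q projection must emit qk_head_dim features rather than head_dim. A shapes-only sketch using the 16B values (illustrative, not MaxText code):

num_query_heads = 16
qk_nope_head_dim = 128     # query/key channels that skip RoPE
qk_rope_head_dim = 64      # query/key channels that receive RoPE
qk_head_dim = qk_nope_head_dim + qk_rope_head_dim   # 192

# With q_lora_rank == 0 the projection maps embeddings straight to
# (num_query_heads, qk_head_dim); using head_dim (128 here) would drop the
# 64 RoPE channels and break the later split into nope/rope slices.
q_projection_features = (num_query_heads, qk_head_dim)   # (16, 192)
assert q_projection_features[1] == qk_nope_head_dim + qk_rope_head_dim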
3 changes: 2 additions & 1 deletion MaxText/layers/linears.py
@@ -375,7 +375,8 @@ def deepseek_scale_weights(self, weights):
"""Scales weights according to DeepSeek's v3 reference implementation.
https://github.com/deepseek-ai/DeepSeek-V3/blob/2f7b80eecebf3d1c84da5a0d465f6639ea175012/inference/model.py#L592-L594
"""
weights /= weights.sum(-1, keepdims=True)
if self.config.routed_score_func == "sigmoid":
weights /= weights.sum(-1, keepdims=True)
weights *= self.config.routed_scaling_factor
return weights

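
The guard above matches the V3 reference behavior: top-k weights from the sigmoid score function are renormalized to sum to 1 per token, softmax weights are left as-is, and both are then multiplied by routed_scaling_factor. A standalone NumPy sketch of the same logic (illustrative values, not MaxText's MoE code):

import numpy as np

def scale_routing_weights(weights, score_func, routed_scaling_factor):
    # Renormalize only for sigmoid scores; softmax top-k weights are used as-is.
    if score_func == "sigmoid":
        weights = weights / weights.sum(-1, keepdims=True)
    return weights * routed_scaling_factor

topk_softmax = np.array([[0.5, 0.3, 0.2]])
topk_sigmoid = np.array([[0.9, 0.6, 0.3]])
print(scale_routing_weights(topk_softmax, "softmax", 16.0))   # scaled only
print(scale_routing_weights(topk_sigmoid, "sigmoid", 2.5))    # normalized, then scaled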
2 changes: 2 additions & 0 deletions MaxText/pyconfig.py
@@ -211,6 +211,8 @@ def validate_model_name(s: str) -> bool:
"mistral-7b",
"mixtral-8x7b",
"mixtral-8x22b",
"deepseek2-16b",
"deepseek2-236b",
"deepseek3-671b",
"gemma-7b",
"gemma-2b",
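
The two new names extend the allow-list used by validate_model_name; each name also matches a YAML file under MaxText/configs/models/, following the convention visible in this commit. A minimal standalone sketch of that lookup pattern (the real MaxText implementation may differ):

_VALID_MODEL_NAMES = {
    "mistral-7b", "mixtral-8x7b", "mixtral-8x22b",
    "deepseek2-16b", "deepseek2-236b", "deepseek3-671b",
    "gemma-7b", "gemma-2b",
}

def model_config_path(name: str) -> str:
    # Reject unknown names, then map to the per-model YAML added by this commit.
    if name not in _VALID_MODEL_NAMES:
        raise ValueError(f"unknown model_name: {name}")
    return f"MaxText/configs/models/{name}.yml"

print(model_config_path("deepseek2-16b"))   # MaxText/configs/models/deepseek2-16b.yml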
