
Commit

Add DeepSeek V2 config
RissyRan committed Feb 21, 2025
1 parent 3c0be9c commit a1ac837
Showing 6 changed files with 97 additions and 3 deletions.
45 changes: 45 additions & 0 deletions MaxText/configs/models/deepseek2-16b.yml
@@ -0,0 +1,45 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# model config for DeepSeek V2-Lite - 16B
# Please note: DeepSeek-style models are not fully supported at this time.

base_emb_dim: 2048
base_num_query_heads: 16
base_num_kv_heads: 16
base_mlp_dim: 10944
base_moe_mlp_dim: 1408
base_num_decoder_layers: 27
first_num_dense_layers: 1
mlp_activations: ["silu","linear"]
vocab_size: 102400
enable_dropout: False
logits_via_embedding: False
normalization_layer_epsilon: 1.0e-6
num_experts: 64
num_experts_per_tok: 6
shared_experts: 2
routed_scaling_factor: 1.0
routed_score_func: "softmax"
routed_bias: False
# MLA
attention_type: "mla"
q_lora_rank: 0
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
rope_type: "yarn"
mscale: 0.707
decoder_block: "deepseek"
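
For reference, a minimal sketch (not part of this commit) of loading the new config with PyYAML and deriving a few quantities implied by the values above; the file path is taken from this diff, and the derived names are illustrative:

import yaml

with open("MaxText/configs/models/deepseek2-16b.yml") as f:
    cfg = yaml.safe_load(f)

# Experts active per token: routed top-k plus the always-on shared experts.
active_experts = cfg["num_experts_per_tok"] + cfg["shared_experts"]          # 6 + 2 = 8

# All decoder layers except the leading dense ones use the MoE block.
moe_layers = cfg["base_num_decoder_layers"] - cfg["first_num_dense_layers"]  # 27 - 1 = 26

# MLA query/key width per head is the non-RoPE part plus the RoPE part.
qk_head_dim = cfg["qk_nope_head_dim"] + cfg["qk_rope_head_dim"]              # 128 + 64 = 192

print(active_experts, moe_layers, qk_head_dim)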
45 changes: 45 additions & 0 deletions MaxText/configs/models/deepseek2-236b.yml
@@ -0,0 +1,45 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# model config for DeepSeek V2 - 236B
# Please note: DeepSeek-style models are not fully supported at this time.

base_emb_dim: 5120
base_num_query_heads: 128
base_num_kv_heads: 128
base_mlp_dim: 12288
base_moe_mlp_dim: 1536
base_num_decoder_layers: 60
first_num_dense_layers: 1
mlp_activations: ["silu","linear"]
vocab_size: 102400
enable_dropout: False
logits_via_embedding: False
normalization_layer_epsilon: 1.0e-6
num_experts: 160
num_experts_per_tok: 6
shared_experts: 2
routed_scaling_factor: 16.0
routed_score_func: "softmax"
routed_bias: False
# MLA
attention_type: "mla"
q_lora_rank: 1536
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
rope_type: "yarn"
mscale: 0.707
decoder_block: "deepseek"
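
The 236B config follows the same schema; the main divergences from the 16B file are the model width and depth, the expert count (160 vs. 64), the routed scaling factor (16.0 vs. 1.0), and the use of Q LoRA (q_lora_rank: 1536 vs. 0). A quick, hedged way to list every differing field (assuming both files are checked out locally):

import yaml

def load(path):
    with open(path) as f:
        return yaml.safe_load(f)

small = load("MaxText/configs/models/deepseek2-16b.yml")
large = load("MaxText/configs/models/deepseek2-236b.yml")

# Print each key whose value differs between the two configs.
for key in sorted(set(small) | set(large)):
    if small.get(key) != large.get(key):
        print(f"{key}: 16B={small.get(key)!r}  236B={large.get(key)!r}")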
3 changes: 2 additions & 1 deletion MaxText/configs/models/deepseek3-671b.yml
@@ -23,7 +23,7 @@ base_moe_mlp_dim: 2048
base_num_decoder_layers: 61
first_num_dense_layers: 3
mlp_activations: ["silu","linear"]
-vocab_size: 32000 # TODO(b/394635939): update after adding tokenizer
+vocab_size: 129280
enable_dropout: False
logits_via_embedding: False
normalization_layer_epsilon: 1.0e-6
@@ -41,4 +41,5 @@ qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
rope_type: "yarn"
+mscale: 1.0
decoder_block: "deepseek"
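
The 671B file gains the real tokenizer vocabulary size (129280) and an explicit mscale of 1.0, versus 0.707 in the V2 configs. As a hedged sketch of where mscale usually enters: DeepSeek's public reference code feeds it into YaRN's attention-magnitude correction and folds the result into the attention softmax scale (MaxText's exact wiring may differ; the rope factor below is an illustrative assumption):

import math

def yarn_get_mscale(scale: float = 1.0, mscale: float = 1.0) -> float:
    # YaRN magnitude correction; no correction when the context is not extended.
    if scale <= 1.0:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

qk_head_dim = 128 + 64                        # qk_nope_head_dim + qk_rope_head_dim
rope_factor = 40.0                            # assumed context-extension factor
m = yarn_get_mscale(rope_factor, mscale=0.707)
softmax_scale = qk_head_dim ** -0.5 * m * m   # scale applied to the attention logits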
2 changes: 1 addition & 1 deletion MaxText/layers/attentions.py
@@ -1399,7 +1399,7 @@ def setup(self):
    if self.q_lora_rank == 0:
      # Standard Q projection (without LoRA).
      self.query_proj = DenseGeneral(
-         features=(self.num_query_heads, self.head_dim),
+         features=(self.num_query_heads, self.qk_head_dim),
          axis=-1,
          kernel_init=self.kernel_init,
          kernel_axes=("embed", "q_heads", "kv"),
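
The one-line change above matters because, under MLA, the query and key per-head width is the concatenation of a non-RoPE slice and a RoPE slice, so the no-LoRA Q projection must emit qk_head_dim features rather than head_dim. A shapes-only sketch using the 16B values (illustrative, not MaxText code):

num_query_heads = 16
qk_nope_head_dim = 128     # query/key channels that skip RoPE
qk_rope_head_dim = 64      # query/key channels that receive RoPE
qk_head_dim = qk_nope_head_dim + qk_rope_head_dim   # 192

# With q_lora_rank == 0 the projection maps embeddings straight to
# (num_query_heads, qk_head_dim); using head_dim (128 here) would drop the
# 64 RoPE channels and break the later split into nope/rope slices.
q_projection_features = (num_query_heads, qk_head_dim)   # (16, 192)
assert q_projection_features[1] == qk_nope_head_dim + qk_rope_head_dim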
3 changes: 2 additions & 1 deletion MaxText/layers/linears.py
@@ -375,7 +375,8 @@ def deepseek_scale_weights(self, weights):
"""Scales weights according to DeepSeek's v3 reference implementation.
https://github.com/deepseek-ai/DeepSeek-V3/blob/2f7b80eecebf3d1c84da5a0d465f6639ea175012/inference/model.py#L592-L594
"""
weights /= weights.sum(-1, keepdims=True)
if self.config.routed_score_func == "sigmoid":
weights /= weights.sum(-1, keepdims=True)
weights *= self.config.routed_scaling_factor
return weights

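
The guard above matches the V3 reference behavior: top-k weights from the sigmoid score function are renormalized to sum to 1 per token, softmax weights are left as-is, and both are then multiplied by routed_scaling_factor. A standalone NumPy sketch of the same logic (illustrative values, not MaxText's MoE code):

import numpy as np

def scale_routing_weights(weights, score_func, routed_scaling_factor):
    # Renormalize only for sigmoid scores; softmax top-k weights are used as-is.
    if score_func == "sigmoid":
        weights = weights / weights.sum(-1, keepdims=True)
    return weights * routed_scaling_factor

topk_softmax = np.array([[0.5, 0.3, 0.2]])
topk_sigmoid = np.array([[0.9, 0.6, 0.3]])
print(scale_routing_weights(topk_softmax, "softmax", 16.0))   # scaled only
print(scale_routing_weights(topk_sigmoid, "sigmoid", 2.5))    # normalized, then scaled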
2 changes: 2 additions & 0 deletions MaxText/pyconfig.py
@@ -211,6 +211,8 @@ def validate_model_name(s: str) -> bool:
"mistral-7b",
"mixtral-8x7b",
"mixtral-8x22b",
"deepseek2-16b",
"deepseek2-236b",
"deepseek3-671b",
"gemma-7b",
"gemma-2b",
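
The two new names extend the allow-list used by validate_model_name; each name also matches a YAML file under MaxText/configs/models/, following the convention visible in this commit. A minimal standalone sketch of that lookup pattern (the real MaxText implementation may differ):

_VALID_MODEL_NAMES = {
    "mistral-7b", "mixtral-8x7b", "mixtral-8x22b",
    "deepseek2-16b", "deepseek2-236b", "deepseek3-671b",
    "gemma-7b", "gemma-2b",
}

def model_config_path(name: str) -> str:
    # Reject unknown names, then map to the per-model YAML added by this commit.
    if name not in _VALID_MODEL_NAMES:
        raise ValueError(f"unknown model_name: {name}")
    return f"MaxText/configs/models/{name}.yml"

print(model_config_path("deepseek2-16b"))   # MaxText/configs/models/deepseek2-16b.yml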
