deepseek-r1 Notebook: Add Option for DeepSeek-R1-Distill-Qwen-32B Model #2732

Merged · 7 commits · Feb 10, 2025
1 change: 1 addition & 0 deletions notebooks/deepseek-r1/README.md
@@ -12,6 +12,7 @@ The tutorial supports different models, you can select one from the provided opt
* **DeepSeek-R1-Distill-Qwen-1.5B** is the smallest DeepSeek-R1 distilled model based on [Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B). Despite its compact size, the model demonstrates strong capabilities in solving basic mathematical tasks, though its programming capabilities are limited. Check the [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) for more info.
* **DeepSeek-R1-Distill-Qwen-7B** is a distilled model based on [Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B). The model demonstrates a good balance between mathematical and factual reasoning, but may be less suited for complex coding tasks. Check the [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) for more info.
* **DeepSeek-R1-Distill-Qwen-14B** is a distilled model based on [Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B) that shows great competence in factual reasoning and solving complex mathematical tasks. Check the [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) for more info.
* **DeepSeek-R1-Distill-Qwen-32B** is a distilled model based on [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) with capability comparable to OpenAI o1-mini. Check the [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) for more info. As the original model is about 65GB, quantizing it to INT4 requires 32GB of RAM, a 200GB swap file, and another 200GB of storage to save the models. The INT4-quantized model is about 16GB and requires 32GB of RAM for inference on CPU or 64GB of RAM on iGPU.

Learn how to accelerate **DeepSeek-R1-Distill-Llama-8B** with **FastDraft** and the OpenVINO GenAI speculative decoding pipeline in this [notebook](../../supplementary_materials/notebooks/fastdraft-deepseek/fastdraft_deepseek.ipynb).
## Notebook Contents
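The INT4 footprint described in the README corresponds to 4-bit weight compression at export time. A minimal sketch of how that export might look with optimum-intel is below; the diff does not show the export call itself, so the API usage here is an assumption, though the `sym`/`group_size`/`ratio` values mirror the entry this PR adds to `llm_config.py`.

```python
# Sketch: export DeepSeek-R1-Distill-Qwen-32B to an INT4 OpenVINO IR.
# Assumes optimum-intel with the OpenVINO extras is installed; the
# compression parameters mirror the llm_config.py entry added in this PR.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

quant_config = OVWeightQuantizationConfig(
    bits=4,           # INT4 weights
    sym=True,         # symmetric quantization, as in llm_config.py
    group_size=128,
    ratio=1.0,        # compress 100% of eligible layers
)

model = OVModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    export=True,                      # convert from PyTorch on the fly
    quantization_config=quant_config,
)
model.save_pretrained("DeepSeek-R1-Distill-Qwen-32B-int4-ov")
```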
1 change: 1 addition & 0 deletions notebooks/deepseek-r1/deepseek-r1.ipynb
@@ -110,6 +110,7 @@
"* **DeepSeek-R1-Distill-Qwen-1.5B** is the smallest DeepSeek-R1 distilled model based on [Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B). Despite its compact size, the model demonstrates strong capabilities in solving basic mathematical tasks, at the same time its programming capabilities are limited. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) for more info.\n",
"* **DeepSeek-R1-Distill-Qwen-7B** is a distilled model based on [Qwen-2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B). The model demonstrates a good balance between mathematical and factual reasoning and can be less suited for complex coding tasks. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) for more info.\n",
"* **DeepSeek-R1-Distil-Qwen-14B** is a distilled model based on [Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B) that has great competence in factual reasoning and solving complex mathematical tasks. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) for more info.\n",
"* **DeepSeek-R1-Distil-Qwen-32B** is a distilled model based on [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) that has comparable capability as OpenAI o1-mini. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) for more info. As the original model size is about 65GB, to quantize it to INT4 requires 32GB of RAM with 200GB for Swap File and another 200GB storage to save the models. The INT4 quantized model has about 16GB in size and requires 32GB of RAM when performing inference on CPU or 64GB of RAM on iGPU.\n",
"\n",
"[Weight compression](https://docs.openvino.ai/2024/openvino-workflow/model-optimization-guide/weight-compression.html) is a technique for enhancing the efficiency of models, especially those with large memory requirements. This method reduces the model’s memory footprint, a crucial factor for Large Language Models (LLMs). We provide several options for model weight compression:\n",
"\n",
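For the inference-side RAM figures quoted above (32GB on CPU, 64GB on iGPU), a minimal loading sketch might look like the following. The local directory name carries over from the export sketch earlier, and `"CPU"`/`"GPU"` are standard OpenVINO device identifiers; treat the rest as an illustrative assumption rather than the notebook's exact code.

```python
# Sketch: run the INT4 model on CPU or integrated GPU with optimum-intel.
from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM

model_dir = "DeepSeek-R1-Distill-Qwen-32B-int4-ov"  # from the export sketch
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B")
model = OVModelForCausalLM.from_pretrained(model_dir, device="CPU")  # or "GPU" for iGPU

# Build the prompt with the tokenizer's own chat template.
messages = [{"role": "user", "content": "What is 7 * 6?"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
outputs = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```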
13 changes: 13 additions & 0 deletions notebooks/deepseek-r1/llm_config.py
@@ -40,6 +40,12 @@ def deepseek_partial_text_processor(partial_text, new_text):
"system_prompt": DEFAULT_SYSTEM_PROMPT,
"stop_strings": ["<|end▁of▁sentence|>", "<|User|>", "</User|>", "<|User|>", "<|end_of_sentence|>", "</|"],
},
"DeepSeek-R1-Distill-Qwen-32B": {
"model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
"genai_chat_template": "{% for message in messages %}{% if loop.first %}{{ '<|begin▁of▁sentence|>' }}{% endif %}{% if message['role'] == 'system' and message['content'] %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>' }}{% endif %}{% if loop.last and add_generation_prompt and message['role'] != 'assitant' %}{{ '<|Assistant|>' }}{% endif %}{% endfor %}",
"system_prompt": DEFAULT_SYSTEM_PROMPT,
"stop_strings": ["<|end▁of▁sentence|>", "<|User|>", "</User|>", "<|User|>", "<|end_of_sentence|>", "</|"],
},
},
"Chinese": {
"DeepSeek-R1-Distill-Qwen-1.5B": {
@@ -66,6 +72,12 @@ def deepseek_partial_text_processor(partial_text, new_text):
"system_prompt": DEFAULT_SYSTEM_PROMPT_CHINESE,
"stop_strings": ["<|end▁of▁sentence|>", "<|User|>", "</User|>", "<|User|>", "<|end_of_sentence|>", "</|"],
},
"DeepSeek-R1-Distill-Qwen-32B": {
"model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
"genai_chat_template": "{% for message in messages %}{% if loop.first %}{{ '<|begin▁of▁sentence|>' }}{% endif %}{% if message['role'] == 'system' and message['content'] %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>' }}{% endif %}{% if loop.last and add_generation_prompt and message['role'] != 'assitant' %}{{ '<|Assistant|>' }}{% endif %}{% endfor %}",
"system_prompt": DEFAULT_SYSTEM_PROMPT_CHINESE,
"stop_strings": ["<|end▁of▁sentence|>", "<|User|>", "</User|>", "<|User|>", "<|end_of_sentence|>", "</|"],
},
},
}

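To see what the `genai_chat_template` added above actually produces, it can be rendered standalone with plain jinja2; OpenVINO GenAI applies the template internally, so this rendering is only for illustration.

```python
# Sketch: render the genai_chat_template from llm_config.py with jinja2
# to inspect the prompt string it builds for a short conversation.
from jinja2 import Template

chat_template = (
    "{% for message in messages %}{% if loop.first %}{{ '<|begin▁of▁sentence|>' }}{% endif %}"
    "{% if message['role'] == 'system' and message['content'] %}{{ message['content'] }}"
    "{% elif message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}"
    "{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>' }}"
    "{% endif %}{% if loop.last and add_generation_prompt and message['role'] != 'assistant' %}"
    "{{ '<|Assistant|>' }}{% endif %}{% endfor %}"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = Template(chat_template).render(messages=messages, add_generation_prompt=True)
print(prompt)
# <|begin▁of▁sentence|>You are a helpful assistant.<|User|>Hello!<|Assistant|>
```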
@@ -79,6 +91,7 @@ def deepseek_partial_text_processor(partial_text, new_text):
"DeepSeek-R1-Distill-Qwen-7B": {"sym": True, "group_size": 128, "ratio": 1.0},
"DeepSeek-R1-Distill-Qwen-14B": {"sym": True, "group_size": 128, "ratio": 1.0},
"DeepSeek-R1-Distill-Qwen-1.5B": {"sym": True, "group_size": 128, "ratio": 1.0},
"DeepSeek-R1-Distill-Qwen-32B": {"sym": True, "group_size": 128, "ratio": 1.0},
"default": {
"sym": False,
"group_size": 128,
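The per-model entries above feed the weight-compression step. A sketch of how such a table can be resolved with a `"default"` fallback follows; the dict name `compression_configs`, the helper function, and the default `ratio` value are assumptions for illustration, not code shown in the PR.

```python
# Sketch: resolve a model's compression settings with a "default" fallback,
# mirroring the dict structure shown in the diff above.
compression_configs = {
    "DeepSeek-R1-Distill-Qwen-32B": {"sym": True, "group_size": 128, "ratio": 1.0},
    # The default entry's ratio is truncated in the diff; 0.8 is illustrative.
    "default": {"sym": False, "group_size": 128, "ratio": 0.8},
}

def get_compression_config(model_id: str) -> dict:
    """Return the model-specific config, falling back to 'default'."""
    return compression_configs.get(model_id, compression_configs["default"])

print(get_compression_config("DeepSeek-R1-Distill-Qwen-32B"))
# {'sym': True, 'group_size': 128, 'ratio': 1.0}
```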