From 1a7a3c6d94f98b1defa445c4aec3a27124d4c562 Mon Sep 17 00:00:00 2001
From: "Kurniawan, Eka Antonius"
Date: Sat, 8 Feb 2025 04:28:19 +0800
Subject: [PATCH 1/3] deepseek-r1 Notebook: Add Option for DeepSeek-R1-Distill-Qwen-32B Model

- The 32B model is the distill whose capability is comparable to OpenAI o1-mini.

---
 notebooks/deepseek-r1/llm_config.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/notebooks/deepseek-r1/llm_config.py b/notebooks/deepseek-r1/llm_config.py
index f7bfa330314..1e38e33d4d5 100644
--- a/notebooks/deepseek-r1/llm_config.py
+++ b/notebooks/deepseek-r1/llm_config.py
@@ -40,6 +40,12 @@ def deepseek_partial_text_processor(partial_text, new_text):
             "system_prompt": DEFAULT_SYSTEM_PROMPT,
             "stop_strings": ["<|end▁of▁sentence|>", "<|User|>", "", "<|User|>", "<|end_of_sentence|>", ""],
         },
+        "DeepSeek-R1-Distill-Qwen-32B": {
+            "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+            "genai_chat_template": "{% for message in messages %}{% if loop.first %}{{ '<|begin▁of▁sentence|>' }}{% endif %}{% if message['role'] == 'system' and message['content'] %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>' }}{% endif %}{% if loop.last and add_generation_prompt and message['role'] != 'assitant' %}{{ '<|Assistant|>' }}{% endif %}{% endfor %}",
+            "system_prompt": DEFAULT_SYSTEM_PROMPT,
+            "stop_strings": ["<|end▁of▁sentence|>", "<|User|>", "", "<|User|>", "<|end_of_sentence|>", ""],
+        },
@@ -… +… @@
             "stop_strings": ["<|User|>", "", "<|User|>", "<|end_of_sentence|>", ""],
         },
+        "DeepSeek-R1-Distill-Qwen-32B": {
+            "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+            "genai_chat_template": "{% for message in messages %}{% if loop.first %}{{ '<|begin▁of▁sentence|>' }}{% endif %}{% if message['role'] == 'system' and message['content'] %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>' }}{% endif %}{% if loop.last and add_generation_prompt and message['role'] != 'assitant' %}{{ '<|Assistant|>' }}{% endif %}{% endfor %}",
+            "system_prompt": DEFAULT_SYSTEM_PROMPT_CHINESE,
+            "stop_strings": ["<|end▁of▁sentence|>", "<|User|>", "", "<|User|>", "<|end_of_sentence|>", ""],
+        },
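The entry added above mirrors the shape of the existing DeepSeek distill entries, so the notebook's generation code can pick it up unchanged. For illustration, here is a minimal sketch of how such an entry drives an OpenVINO GenAI pipeline; the `SUPPORTED_LLM_MODELS` lookup path, the model key, and the converted-model directory name are assumptions, and only `system_prompt` and `stop_strings` come from the patch itself:

```python
# Minimal usage sketch. The lookup path and directory name are assumed for
# illustration; "system_prompt" and "stop_strings" are the fields added above.
import openvino_genai as ov_genai

from llm_config import SUPPORTED_LLM_MODELS  # table modified by this patch

model_config = SUPPORTED_LLM_MODELS["English"]["DeepSeek-R1-Distill-Qwen-32B"]

# Directory holding the converted INT4 OpenVINO model (name assumed).
pipe = ov_genai.LLMPipeline("DeepSeek-R1-Distill-Qwen-32B-int4-ov", "CPU")

config = ov_genai.GenerationConfig()
config.max_new_tokens = 1024
# Stop generation at the turn boundaries declared by the new entry.
config.stop_strings = set(model_config["stop_strings"])

print(pipe.generate("Why is the sky blue?", config))
```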
From: "Kurniawan, Eka Antonius"
Date: Mon, 10 Feb 2025 20:07:23 +0800
Subject: [PATCH 2/3] Update deepseek-r1 Notebook with New Model Information

- Add DeepSeek-R1-Distill-Qwen-32B model information to the deepseek-r1
  Notebook and README.md.

---
 notebooks/deepseek-r1/README.md         | 1 +
 notebooks/deepseek-r1/deepseek-r1.ipynb | 1 +
 2 files changed, 2 insertions(+)

diff --git a/notebooks/deepseek-r1/README.md b/notebooks/deepseek-r1/README.md
index e76d386feb2..841ac803d94 100644
--- a/notebooks/deepseek-r1/README.md
+++ b/notebooks/deepseek-r1/README.md
@@ -12,6 +12,7 @@ The tutorial supports different models, you can select one from the provided opt
 * **DeepSeek-R1-Distill-Qwen-1.5B** is the smallest DeepSeek-R1 distilled model based on [Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B). Despite its compact size, the model demonstrates strong capabilities in solving basic mathematical tasks, at the same time its programming capabilities are limited. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) for more info.
 * **DeepSeek-R1-Distill-Qwen-7B** is a distilled model based on [Qwen-2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B). The model demonstrates a good balance between mathematical and factual reasoning and can be less suited for complex coding tasks. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) for more info.
 * **DeepSeek-R1-Distil-Qwen-14B** is a distilled model based on [Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B) that has great competence in factual reasoning and solving complex mathematical tasks. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) for more info.
+* **DeepSeek-R1-Distill-Qwen-32B** is a distilled model based on [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) with capability comparable to OpenAI o1-mini. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) for more info. As the original model is about 65GB, quantizing it to INT4 requires 32GB of RAM with 200GB for vRAM (Swap File) and another 200GB of storage to save the models. The INT4 quantized model is about 16GB in size and requires 32GB of RAM for inference on CPU or 64GB of RAM on iGPU.
 
 Learn how to accelerate **DeepSeek-R1-Distill-Llama-8B** with **FastDraft** and OpenVINO GenAI speculative decoding pipeline in this [notebook](../../supplementary_materials/notebooks/fastdraft-deepseek/fastdraft_deepseek.ipynb)
 
 ## Notebook Contents

diff --git a/notebooks/deepseek-r1/deepseek-r1.ipynb b/notebooks/deepseek-r1/deepseek-r1.ipynb
index 07bf0d982e1..6bc95dced9f 100644
--- a/notebooks/deepseek-r1/deepseek-r1.ipynb
+++ b/notebooks/deepseek-r1/deepseek-r1.ipynb
@@ -110,6 +110,7 @@
 "* **DeepSeek-R1-Distill-Qwen-1.5B** is the smallest DeepSeek-R1 distilled model based on [Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B). Despite its compact size, the model demonstrates strong capabilities in solving basic mathematical tasks, at the same time its programming capabilities are limited. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) for more info.\n",
 "* **DeepSeek-R1-Distill-Qwen-7B** is a distilled model based on [Qwen-2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B). The model demonstrates a good balance between mathematical and factual reasoning and can be less suited for complex coding tasks. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) for more info.\n",
 "* **DeepSeek-R1-Distil-Qwen-14B** is a distilled model based on [Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B) that has great competence in factual reasoning and solving complex mathematical tasks. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) for more info.\n",
+"* **DeepSeek-R1-Distill-Qwen-32B** is a distilled model based on [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) with capability comparable to OpenAI o1-mini. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) for more info. As the original model is about 65GB, quantizing it to INT4 requires 32GB of RAM with 200GB for vRAM (Swap File) and another 200GB of storage to save the models. The INT4 quantized model is about 16GB in size and requires 32GB of RAM for inference on CPU or 64GB of RAM on iGPU.\n",
 "\n",
 "[Weight compression](https://docs.openvino.ai/2024/openvino-workflow/model-optimization-guide/weight-compression.html) is a technique for enhancing the efficiency of models, especially those with large memory requirements. This method reduces the model’s memory footprint, a crucial factor for Large Language Models (LLMs). We provide several options for model weight compression:\n",
 "\n",
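The memory figures in the new bullet describe the INT4 weight-compression step. A rough sketch of that step with optimum-intel's OpenVINO integration follows; the compression parameters (`group_size`, `ratio`) are illustrative assumptions, not values taken from the notebook:

```python
# INT4 weight-compression sketch with optimum-intel. Compression parameters
# are illustrative; conversion loads the full ~65GB checkpoint, which is why
# the README calls for 32GB of RAM plus a large swap file.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"

quant_config = OVWeightQuantizationConfig(bits=4, sym=False, group_size=128, ratio=1.0)

model = OVModelForCausalLM.from_pretrained(
    model_id,
    export=True,                       # convert from the Hugging Face checkpoint
    quantization_config=quant_config,  # compress weights to 4 bit (~16GB on disk)
)
model.save_pretrained("DeepSeek-R1-Distill-Qwen-32B-int4-ov")
```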
From e41ece8a65e4525c5fa8a74a160d70da5630d4fc Mon Sep 17 00:00:00 2001
From: "Kurniawan, Eka Antonius"
Date: Mon, 10 Feb 2025 20:33:20 +0800
Subject: [PATCH 3/3] Fix deepseek-r1 Notebook Spell Check

- Remove the word "vRAM".

---
 notebooks/deepseek-r1/README.md         | 2 +-
 notebooks/deepseek-r1/deepseek-r1.ipynb | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/notebooks/deepseek-r1/README.md b/notebooks/deepseek-r1/README.md
index 841ac803d94..be0fe192a26 100644
--- a/notebooks/deepseek-r1/README.md
+++ b/notebooks/deepseek-r1/README.md
@@ -12,7 +12,7 @@ The tutorial supports different models, you can select one from the provided opt
 * **DeepSeek-R1-Distill-Qwen-1.5B** is the smallest DeepSeek-R1 distilled model based on [Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B). Despite its compact size, the model demonstrates strong capabilities in solving basic mathematical tasks, at the same time its programming capabilities are limited. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) for more info.
 * **DeepSeek-R1-Distill-Qwen-7B** is a distilled model based on [Qwen-2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B). The model demonstrates a good balance between mathematical and factual reasoning and can be less suited for complex coding tasks. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) for more info.
 * **DeepSeek-R1-Distil-Qwen-14B** is a distilled model based on [Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B) that has great competence in factual reasoning and solving complex mathematical tasks. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) for more info.
-* **DeepSeek-R1-Distill-Qwen-32B** is a distilled model based on [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) with capability comparable to OpenAI o1-mini. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) for more info. As the original model is about 65GB, quantizing it to INT4 requires 32GB of RAM with 200GB for vRAM (Swap File) and another 200GB of storage to save the models. The INT4 quantized model is about 16GB in size and requires 32GB of RAM for inference on CPU or 64GB of RAM on iGPU.
+* **DeepSeek-R1-Distill-Qwen-32B** is a distilled model based on [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) with capability comparable to OpenAI o1-mini. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) for more info. As the original model is about 65GB, quantizing it to INT4 requires 32GB of RAM with a 200GB swap file and another 200GB of storage to save the models. The INT4 quantized model is about 16GB in size and requires 32GB of RAM for inference on CPU or 64GB of RAM on iGPU.
 
 Learn how to accelerate **DeepSeek-R1-Distill-Llama-8B** with **FastDraft** and OpenVINO GenAI speculative decoding pipeline in this [notebook](../../supplementary_materials/notebooks/fastdraft-deepseek/fastdraft_deepseek.ipynb)
 
 ## Notebook Contents

diff --git a/notebooks/deepseek-r1/deepseek-r1.ipynb b/notebooks/deepseek-r1/deepseek-r1.ipynb
index 6bc95dced9f..7f20a819e95 100644
--- a/notebooks/deepseek-r1/deepseek-r1.ipynb
+++ b/notebooks/deepseek-r1/deepseek-r1.ipynb
@@ -110,7 +110,7 @@
 "* **DeepSeek-R1-Distill-Qwen-1.5B** is the smallest DeepSeek-R1 distilled model based on [Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B). Despite its compact size, the model demonstrates strong capabilities in solving basic mathematical tasks, at the same time its programming capabilities are limited. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) for more info.\n",
 "* **DeepSeek-R1-Distill-Qwen-7B** is a distilled model based on [Qwen-2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B). The model demonstrates a good balance between mathematical and factual reasoning and can be less suited for complex coding tasks. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) for more info.\n",
 "* **DeepSeek-R1-Distil-Qwen-14B** is a distilled model based on [Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B) that has great competence in factual reasoning and solving complex mathematical tasks. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) for more info.\n",
-"* **DeepSeek-R1-Distill-Qwen-32B** is a distilled model based on [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) with capability comparable to OpenAI o1-mini. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) for more info. As the original model is about 65GB, quantizing it to INT4 requires 32GB of RAM with 200GB for vRAM (Swap File) and another 200GB of storage to save the models. The INT4 quantized model is about 16GB in size and requires 32GB of RAM for inference on CPU or 64GB of RAM on iGPU.\n",
+"* **DeepSeek-R1-Distill-Qwen-32B** is a distilled model based on [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) with capability comparable to OpenAI o1-mini. Check [model card](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) for more info. As the original model is about 65GB, quantizing it to INT4 requires 32GB of RAM with a 200GB swap file and another 200GB of storage to save the models. The INT4 quantized model is about 16GB in size and requires 32GB of RAM for inference on CPU or 64GB of RAM on iGPU.\n",
 "\n",
 "[Weight compression](https://docs.openvino.ai/2024/openvino-workflow/model-optimization-guide/weight-compression.html) is a technique for enhancing the efficiency of models, especially those with large memory requirements. This method reduces the model’s memory footprint, a crucial factor for Large Language Models (LLMs). We provide several options for model weight compression:\n",
 "\n",
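The README keeps the pointer to FastDraft with the OpenVINO GenAI speculative decoding pipeline. A minimal sketch of that pipeline, with placeholder model directories and an illustrative draft-token setting:

```python
# Speculative decoding sketch with OpenVINO GenAI. Both model directories are
# placeholders; num_assistant_tokens is an illustrative setting.
import openvino_genai as ov_genai

draft = ov_genai.draft_model("fastdraft-llama3-draft-ov", "CPU")
pipe = ov_genai.LLMPipeline("DeepSeek-R1-Distill-Llama-8B-int4-ov", "CPU", draft_model=draft)

config = ov_genai.GenerationConfig()
config.max_new_tokens = 256
config.num_assistant_tokens = 5  # tokens the draft model proposes per step

print(pipe.generate("Explain speculative decoding in one paragraph.", config))
```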