From 3c4b8239a0a036133649f273867d825ef6d1b004 Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Tue, 25 Feb 2025 22:44:24 +0800
Subject: [PATCH 01/16] [Feature] Add reasoning parser support to chat
 generation and completions

---
 python/sglang/srt/openai_api/adapter.py  |  50 +++++++++--
 python/sglang/srt/openai_api/protocol.py |   2 +
 python/sglang/srt/reasoning_parser.py    | 109 +++++++++++++++++++++++
 python/sglang/srt/server_args.py         |  19 ++++
 4 files changed, 175 insertions(+), 5 deletions(-)
 create mode 100644 python/sglang/srt/reasoning_parser.py

diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index 0556f852a32..777c2b7a9c0 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -74,6 +74,7 @@
     TopLogprob,
     UsageInfo,
 )
+from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.utils import get_exception_traceback
 
 logger = logging.getLogger(__name__)
@@ -1038,7 +1039,12 @@ def v1_chat_generate_request(
 
 
 def v1_chat_generate_response(
-    request, ret, to_file=False, cache_report=False, tool_call_parser=None
+    request,
+    ret,
+    to_file=False,
+    cache_report=False,
+    tool_call_parser=None,
+    reasoning_parser=None,
 ):
     choices = []
 
@@ -1086,9 +1092,22 @@ def v1_chat_generate_response(
 
         finish_reason = ret_item["meta_info"]["finish_reason"]
 
+        reasoning_content = None
+        if reasoning_parser:
+            try:
+                parser = ReasoningParser(reasoning_parser)
+                reasoning_content, ret_item["text"] = parser.parse_non_stream(
+                    ret_item["text"]
+                )
+            except Exception as e:
+                logger.error(f"Exception: {e}")
+                return create_error_response(
+                    HTTPStatus.BAD_REQUEST,
+                    "Failed to parse reasoning content",
+                )
+
         tool_calls = None
         text = ret_item["text"]
-
         if isinstance(request, list):
             tool_choice = request[idx].tool_choice
             tools = request[idx].tools
@@ -1125,6 +1144,9 @@ def v1_chat_generate_response(
                 "message": {
                     "role": "assistant",
                     "content": ret_item["text"] if tool_calls is None else None,
+                    "reasoning_content": (
+                        reasoning_content if tool_calls is None else None
+                    ),
                     "tool_calls": tool_calls,
                 },
                 "logprobs": choice_logprobs,
@@ -1141,6 +1163,7 @@ def v1_chat_generate_response(
                 message=ChatMessage(
                     role="assistant",
                     content=ret_item["text"] if tool_calls is None else None,
+                    reasoning_content=reasoning_content if tool_calls is None else None,
                     tool_calls=tool_calls,
                 ),
                 logprobs=choice_logprobs,
@@ -1208,6 +1231,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
 
     if adapted_request.stream:
         parser_dict = {}
+        reasoning_parser_dict = {}
 
         async def generate_stream_resp():
             is_firsts = {}
@@ -1302,6 +1326,16 @@ async def generate_stream_resp():
                     delta = text[len(stream_buffer) :]
                     new_stream_buffer = stream_buffer + delta
 
+                    reasoning_content = None
+                    if tokenizer_manager.server_args.enable_reasoning:
+                        if index not in reasoning_parser_dict:
+                            reasoning_parser_dict[index] = ReasoningParser(
+                                tokenizer_manager.server_args.reasoning_parser
+                            )
+                        reasoning_content, delta = reasoning_parser_dict[
+                            index
+                        ].parse_stream_chunk(delta)
+
                     if request.tool_choice != "none" and request.tools:
                         if index not in parser_dict:
                             parser_dict[index] = FunctionCallParser(
@@ -1313,11 +1347,14 @@ async def generate_stream_resp():
                         # parse_increment => returns (normal_text, calls)
                         normal_text, calls = parser.parse_stream_chunk(delta)
 
-                        # 1) if there's normal_text, output it as normal content
+                        # 1) if there's normal_text, output it as normal content, the reasoning content is also included
                         if normal_text:
                             choice_data = ChatCompletionResponseStreamChoice(
                                 index=index,
-                                delta=DeltaMessage(content=normal_text),
+                                delta=DeltaMessage(
+                                    content=normal_text,
+                                    reasoning_content=reasoning_content,
+                                ),
                                 finish_reason=(
                                     finish_reason["type"] if finish_reason else ""
                                 ),
@@ -1386,7 +1423,9 @@ async def generate_stream_resp():
                         # No tool calls => just treat this as normal text
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=index,
-                            delta=DeltaMessage(content=delta),
+                            delta=DeltaMessage(
+                                content=delta, reasoning_content=reasoning_content
+                            ),
                             finish_reason=(
                                 finish_reason["type"] if finish_reason else ""
                             ),
@@ -1456,6 +1495,7 @@ async def generate_stream_resp():
         ret,
         cache_report=tokenizer_manager.server_args.enable_cache_report,
         tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
+        reasoning_parser=tokenizer_manager.server_args.reasoning_parser,
     )
 
     return response
diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py
index 95b34527edb..6e2ffe2015b 100644
--- a/python/sglang/srt/openai_api/protocol.py
+++ b/python/sglang/srt/openai_api/protocol.py
@@ -344,6 +344,7 @@ class ToolCall(BaseModel):
 class ChatMessage(BaseModel):
     role: Optional[str] = None
     content: Optional[str] = None
+    reasoning_content: Optional[str] = None
     tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
 
 
@@ -367,6 +368,7 @@ class ChatCompletionResponse(BaseModel):
 class DeltaMessage(BaseModel):
     role: Optional[str] = None
     content: Optional[str] = None
+    reasoning_content: Optional[str] = None
     tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
 
 
diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py
new file mode 100644
index 00000000000..7c8e4a42cd9
--- /dev/null
+++ b/python/sglang/srt/reasoning_parser.py
@@ -0,0 +1,109 @@
+import json
+import logging
+import re
+from typing import Any, Dict, List, Optional, Tuple, Type
+
+
+class BaseReasoningParser:
+    """Base class for reasoning parser."""
+
+    def __init__(self):
+        self._buffer = ""
+
+    def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]:
+        """Detect and parse the text, return reasoning_content and content."""
+        raise NotImplementedError
+
+    def parse_streaming_increment(
+        self, new_text: str
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Parse the new text incrementally, return reasoning_content and content."""
+        raise NotImplementedError
+
+
+class DeepSeekR1ReasoningParser(BaseReasoningParser):
+    """
+    DeepSeekR1 reasoning parser, which use "<think>\n" and "\n</think>" to detect the reasoning part.
+    Referring to https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#usage-recommendations~.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.think_start_token = "<think>\n"
+        self.think_end_token = "\n</think>"
+        self.pattern = re.compile(
+            rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL
+        )
+
+        self.is_reasoning = False
+
+    def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]:
+        # After DeepSeek update their chat templates in R1 series models, the reasoning models do not output "<think>\n"
+        # We assume the output has an invisible "<think>\n", and the reasoning part is the whole text.
+        if self.think_end_token not in text:
+            return text, None
+
+        else:
+            # Add the start token to the beginning of the text.
+            if self.think_start_token not in text:
+                text = self.think_start_token + text
+
+            reasoning_content = self.pattern.findall(text)[0]
+            content = text[
+                len(self.think_start_token)
+                + len(reasoning_content)
+                + len(self.think_end_token) :
+            ]
+
+            return reasoning_content, content if len(content) > 0 else None
+
+    def parse_streaming_increment(
+        self, new_text: str
+    ) -> Tuple[Optional[str], Optional[str]]:
+        # Again, we assume the output has an invisible "<think>\n"
+        self._buffer += new_text
+
+        # Should parse
+        if self.is_reasoning:
+            # Reasoning continues
+            if self.think_end_token not in self._buffer:
+                return new_text, None
+            # Reasoning ends
+            else:
+                reasoning_part = new_text.split(self.think_end_token)[0]
+                content_part = new_text.split(self.think_end_token)[1]
+
+                self.is_reasoning = False
+                self._buffer = ""
+
+                return reasoning_part if len(reasoning_part) > 0 else None, (
+                    content_part if len(content_part) > 0 else None
+                )
+
+        else:
+            return None, new_text
+
+
+class ReasoningParser:
+    """Reasoning parser for different reasoning models."""
+
+    ReasoningParserDict: Dict[str, Type[BaseReasoningParser]] = {
+        "deepseek-r1": DeepSeekR1ReasoningParser
+    }
+
+    def __init__(self, reasoning_parser: str):
+        self.parser = self.ReasoningParserDict[reasoning_parser]()
+
+    def parse_non_stream(self, full_text: str) -> Tuple[Optional[str], Optional[str]]:
+        """
+        Non-streaming parsing for reasoning models.
+        Return: reasoning_content, content
+        """
+        return self.parser.detect_and_parse(full_text)
+
+    def parse_stream_chunk(self, chunk_text: str):
+        """
+        Streaming parsing for reasoning models.
+        Return: reasoning_content, content
+        """
+        return self.parser.parse_streaming_increment(chunk_text)
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index a81228ce34f..ed2ad09dffa 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -94,6 +94,8 @@ class ServerArgs:
     api_key: Optional[str] = None
     file_storage_pth: str = "sglang_storage"
     enable_cache_report: bool = False
+    enable_reasoning: bool = False
+    reasoning_parser: Optional[str] = None
 
     # Data parallelism
     dp_size: int = 1
@@ -282,6 +284,12 @@ def __post_init__(self):
         if is_hip():
             self.triton_attention_num_kv_splits = 16
 
+        # API Related
+        if self.enable_reasoning and not self.reasoning_parser:
+            raise ValueError(
+                "Reasoning parser must be specified when reasoning is enabled."
+            )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and port args
@@ -600,6 +608,17 @@ def add_cli_args(parser: argparse.ArgumentParser):
             action="store_true",
             help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
         )
+        parser.add_argument(
+            "--enable-reasoning",
+            action="store_true",
+            help="Enable the reasoning feature.",
+        )
+        parser.add_argument(
+            "--reasoning-parser",
+            type=str,
+            default=ServerArgs.reasoning_parser,
+            help="Specify the parser for reasoning tasks.",
+        )
 
         # Data parallelism
         parser.add_argument(

From 1727ba436e895e1ee1671506d49d3ac5853470a6 Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Tue, 25 Feb 2025 22:49:59 +0800
Subject: [PATCH 02/16] fix: add choices for reasoning-parser

---
 python/sglang/srt/server_args.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index ed2ad09dffa..b5a17de660e 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -616,6 +616,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         parser.add_argument(
             "--reasoning-parser",
             type=str,
+            choices=["deepseek-r1"],
             default=ServerArgs.reasoning_parser,
             help="Specify the parser for reasoning tasks.",
         )

From 4759bedd1385d847c8a6142bba04f4af3294f4df Mon Sep 17 00:00:00 2001
From: Xihuai Wang <leoxhwang@sjtu.edu.cn>
Date: Wed, 26 Feb 2025 00:12:31 +0800
Subject: [PATCH 03/16] Fix reasoning_parser.py

---
 python/sglang/srt/reasoning_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py
index 7c8e4a42cd9..1920e11feee 100644
--- a/python/sglang/srt/reasoning_parser.py
+++ b/python/sglang/srt/reasoning_parser.py
@@ -35,7 +35,7 @@ def __init__(self):
             rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL
         )
 
-        self.is_reasoning = False
+        self.is_reasoning = True 
 
     def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]:
         # After DeepSeek update their chat templates in R1 series models, the reasoning models do not output "<think>\n"

From 31e4dd5a2e5f9f88a77c42d1358511f1aeefdccb Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Wed, 26 Feb 2025 01:16:23 +0800
Subject: [PATCH 04/16] fix: handle possible "<think>\n" in reasoning parser

---
 python/sglang/srt/reasoning_parser.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py
index 1920e11feee..1a78dc8c126 100644
--- a/python/sglang/srt/reasoning_parser.py
+++ b/python/sglang/srt/reasoning_parser.py
@@ -39,9 +39,10 @@ def __init__(self):
 
     def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]:
         # After DeepSeek update their chat templates in R1 series models, the reasoning models do not output "<think>\n"
-        # We assume the output has an invisible "<think>\n", and the reasoning part is the whole text.
+        # We assume the output has an "<think>\n", and the reasoning part is the whole text.
         if self.think_end_token not in text:
-            return text, None
+            # Remove "<think>\n" if exists
+            return text.replace(self.think_start_token, ""), None
 
         else:
             # Add the start token to the beginning of the text.
@@ -60,11 +61,13 @@ def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]:
     def parse_streaming_increment(
         self, new_text: str
     ) -> Tuple[Optional[str], Optional[str]]:
-        # Again, we assume the output has an invisible "<think>\n"
-        self._buffer += new_text
 
         # Should parse
         if self.is_reasoning:
+            # Again, we assume the output has an "<think>\n"
+            if len(self._buffer) == 0:
+                new_text = new_text.replace(self.think_start_token, "")
+            self._buffer += new_text
             # Reasoning continues
             if self.think_end_token not in self._buffer:
                 return new_text, None

From d19b7a0bffcd1553ba8f9d635550d5d56f01fd4e Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Wed, 26 Feb 2025 01:26:28 +0800
Subject: [PATCH 05/16] fix: update help text for reasoning parser to include
 supported models

---
 python/sglang/srt/server_args.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index b5a17de660e..dc0af6b3cd3 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -618,7 +618,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
             type=str,
             choices=["deepseek-r1"],
             default=ServerArgs.reasoning_parser,
-            help="Specify the parser for reasoning tasks.",
+            help="Specify the parser for reasoning models, supported parsers are: deepseek-r1.",
         )
 
         # Data parallelism

From addaeb564feb1e4346fe8b68bb77dde26ba26c07 Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Wed, 26 Feb 2025 02:16:29 +0800
Subject: [PATCH 06/16] fix: update reasoning parser to handle changes in
 DeepSeek output format

---
 python/sglang/srt/openai_api/adapter.py |  4 ++++
 python/sglang/srt/reasoning_parser.py   | 14 +++++++-------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index 777c2b7a9c0..11a1abbcfb5 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -1099,6 +1099,8 @@ def v1_chat_generate_response(
                 reasoning_content, ret_item["text"] = parser.parse_non_stream(
                     ret_item["text"]
                 )
+                if not ret_item["text"]:
+                    ret_item["text"] = ""
             except Exception as e:
                 logger.error(f"Exception: {e}")
                 return create_error_response(
@@ -1335,6 +1337,8 @@ async def generate_stream_resp():
                         reasoning_content, delta = reasoning_parser_dict[
                             index
                         ].parse_stream_chunk(delta)
+                        if not delta:
+                            delta = ""
 
                     if request.tool_choice != "none" and request.tools:
                         if index not in parser_dict:
diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py
index 1a78dc8c126..f769096ddc4 100644
--- a/python/sglang/srt/reasoning_parser.py
+++ b/python/sglang/srt/reasoning_parser.py
@@ -23,14 +23,14 @@ def parse_streaming_increment(
 
 class DeepSeekR1ReasoningParser(BaseReasoningParser):
     """
-    DeepSeekR1 reasoning parser, which use "<think>\n" and "\n</think>" to detect the reasoning part.
+    DeepSeekR1 reasoning parser, which use "<think>" and "</think>" to detect the reasoning part.
     Referring to https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#usage-recommendations~.
     """
 
     def __init__(self):
         super().__init__()
-        self.think_start_token = "<think>\n"
-        self.think_end_token = "\n</think>"
+        self.think_start_token = "<think>"
+        self.think_end_token = "</think>"
         self.pattern = re.compile(
             rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL
         )
@@ -38,10 +38,10 @@ def __init__(self):
         self.is_reasoning = True 
 
     def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]:
-        # After DeepSeek update their chat templates in R1 series models, the reasoning models do not output "<think>\n"
-        # We assume the output has an "<think>\n", and the reasoning part is the whole text.
+        # After DeepSeek update their chat templates in R1 series models, the reasoning models do not output "<think>"
+        # We assume the output has an "<think>", and the reasoning part is the whole text.
         if self.think_end_token not in text:
-            # Remove "<think>\n" if exists
+            # Remove "<think>" if exists
             return text.replace(self.think_start_token, ""), None
 
         else:
@@ -64,7 +64,7 @@ def parse_streaming_increment(
 
         # Should parse
         if self.is_reasoning:
-            # Again, we assume the output has an "<think>\n"
+            # Again, we assume the output has an "<think>"
             if len(self._buffer) == 0:
                 new_text = new_text.replace(self.think_start_token, "")
             self._buffer += new_text

From f362249fcd16fe1ecf53c8c96cee383a563d6d24 Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Wed, 26 Feb 2025 11:44:29 +0800
Subject: [PATCH 07/16] fix: refine reasoning parser output handling and clean
 up response logic

---
 python/sglang/srt/openai_api/adapter.py |  7 +------
 python/sglang/srt/reasoning_parser.py   | 14 ++++++--------
 2 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index 11a1abbcfb5..fdd50f55e94 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -1090,8 +1090,6 @@ def v1_chat_generate_response(
         else:
             choice_logprobs = None
 
-        finish_reason = ret_item["meta_info"]["finish_reason"]
-
         reasoning_content = None
         if reasoning_parser:
             try:
@@ -1099,8 +1097,6 @@ def v1_chat_generate_response(
                 reasoning_content, ret_item["text"] = parser.parse_non_stream(
                     ret_item["text"]
                 )
-                if not ret_item["text"]:
-                    ret_item["text"] = ""
             except Exception as e:
                 logger.error(f"Exception: {e}")
                 return create_error_response(
@@ -1108,6 +1104,7 @@ def v1_chat_generate_response(
                     "Failed to parse reasoning content",
                 )
 
+        finish_reason = ret_item["meta_info"]["finish_reason"]
         tool_calls = None
         text = ret_item["text"]
         if isinstance(request, list):
@@ -1337,8 +1334,6 @@ async def generate_stream_resp():
                         reasoning_content, delta = reasoning_parser_dict[
                             index
                         ].parse_stream_chunk(delta)
-                        if not delta:
-                            delta = ""
 
                     if request.tool_choice != "none" and request.tools:
                         if index not in parser_dict:
diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py
index f769096ddc4..81d8ed780f2 100644
--- a/python/sglang/srt/reasoning_parser.py
+++ b/python/sglang/srt/reasoning_parser.py
@@ -35,14 +35,14 @@ def __init__(self):
             rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL
         )
 
-        self.is_reasoning = True 
+        self.is_reasoning = True
 
     def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]:
         # After DeepSeek update their chat templates in R1 series models, the reasoning models do not output "<think>"
         # We assume the output has an "<think>", and the reasoning part is the whole text.
         if self.think_end_token not in text:
             # Remove "<think>" if exists
-            return text.replace(self.think_start_token, ""), None
+            return text.replace(self.think_start_token, ""), ""
 
         else:
             # Add the start token to the beginning of the text.
@@ -56,7 +56,7 @@ def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]:
                 + len(self.think_end_token) :
             ]
 
-            return reasoning_content, content if len(content) > 0 else None
+            return reasoning_content, content
 
     def parse_streaming_increment(
         self, new_text: str
@@ -70,7 +70,7 @@ def parse_streaming_increment(
             self._buffer += new_text
             # Reasoning continues
             if self.think_end_token not in self._buffer:
-                return new_text, None
+                return new_text, ""
             # Reasoning ends
             else:
                 reasoning_part = new_text.split(self.think_end_token)[0]
@@ -79,12 +79,10 @@ def parse_streaming_increment(
                 self.is_reasoning = False
                 self._buffer = ""
 
-                return reasoning_part if len(reasoning_part) > 0 else None, (
-                    content_part if len(content_part) > 0 else None
-                )
+                return reasoning_part, content_part
 
         else:
-            return None, new_text
+            return "", new_text
 
 
 class ReasoningParser:

From 2ed2b6d7d4cc4ac7f97d491740b8d9c3b862af80 Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Wed, 26 Feb 2025 14:34:50 +0800
Subject: [PATCH 08/16] docs: add reasoning parser documentation for DeepSeek
 model support

---
 docs/backend/reasoning_parser.md | 127 +++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 docs/backend/reasoning_parser.md

diff --git a/docs/backend/reasoning_parser.md b/docs/backend/reasoning_parser.md
new file mode 100644
index 00000000000..607acd6c012
--- /dev/null
+++ b/docs/backend/reasoning_parser.md
@@ -0,0 +1,127 @@
+# Reasoning Parser
+
+SGLang supports parsing the reasoning content from reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) for convenient output processing in downstream applications.
+
+Following the official [DeepSeek API design](https://api-docs.deepseek.com/guides/reasoning_model), SGLang returns the reasoning content and the final answer in separate fields:
+
+- `reasoning_content`: The content of the CoT.
+- `content`: The content of the final answer.
+
+## Supported Models
+
+Currently, SGLang supports the following reasoning models:
+- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d): The reasoning content is wrapped with `<think>` and `</think>` tags.
+
+## Usage
+
+You need to enable the reasoning parser in the SGLang API server by setting the `--enable-reasoning` and `--reasoning-parser` options. The `--reasoning-parser` option specifies the reasoning parser to extract the reasoning content and final answer.
+
+```bash
+python -m sglang.launch_server --host 0.0.0.0 \
+--model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
+--enable-reasoning --reasoning-parser deepseek-r1
+```
+
+### Non-streaming Request
+
+Make a request to the reasoning model to get the reasoning content and the final answer.
+
+Using the OpenAI Python API:
+```python
+import openai
+
+client = openai.Client(base_url="http://localhost:30000/v1", api_key="None")
+
+response = client.chat.completions.create(
+    model="deepseek-r1:14b",
+    messages=[{"role": "user", "content": "Compute 1+3"}],
+    max_tokens=1024,
+    stream=False
+)
+
+response.choices[0].message.reasoning_content
+# 'First, I recognize that the problem requires adding the numbers 1 and 3.\n\nNext, I identify the numbers to be added, which are 1 and 3.\n\nThen, I perform the addition operation: 1 plus 3 equals 4.\n\nFinally, I conclude that the sum of 1 and 3 is 4.\n'
+response.choices[0].message.content
+# '\n\nTo compute \\(1 + 3\\), follow these simple steps:\n\n1. **Identify the numbers to add:**  \n   The numbers are **1** and **3**.\n\n2. **Add the numbers together:**  \n   \\[\n   1 + 3 = 4\n   \\]\n\n3. **Write the final answer:**  \n   The sum of \\(1 + 3\\) is \\(\\boxed{4}\\).'
+```
+
+### Streaming Request
+
+`reasoning_content` is available in the `delta` field of the streaming response.
+
+Using the OpenAI Python API:
+
+```python
+# ... Initialize the client as before ...
+
+response = client.chat.completions.create(
+    model="deepseek-r1:14b",
+    messages=[{"role": "user", "content": "Compute 1+3"}],
+    max_tokens=1024,
+    stream=True
+)
+reasoning_content = ""
+content = ""
+for chunk in response:
+    if chunk.choices[0].delta.content:
+      content += chunk.choices[0].delta.content
+    elif chunk.choices[0].delta.reasoning_content:
+      reasoning_content += chunk.choices[0].delta.reasoning_content
+
+reasoning_content
+# 'I need to calculate the sum of 1 and 3. \n\nFirst, I identify the numbers involved in the addition: 1 and 3.\n\nNext, I add these two numbers together to find the total.\n\nFinally, the result of the addition is 4.\n'
+content
+# '\n\n**Solution:**\n\nWe need to compute the sum of 1 and 3.\n\n1. **Identify the numbers to add:**\n   - Number 1\n   - Number 3\n\n2. **Add the numbers together:**\n   \\[\n   1 + 3 = 4\n   \\]\n\n3. **Final Answer:**\n   \\[\n   \\boxed{4}\n   \\]'
+```
+
+
+## Supporting More Reasoning Models
+
+For future reasoning models, you can implement the reasoning parser as a subclass of `BaseReasoningParser` in `python/sglang/srt/reasoning_parser.py`.
+
+```python
+class BaseReasoningParser:
+    """Base class for reasoning parser."""
+
+    def __init__(self):
+        self._buffer = ""
+
+    def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]:
+        """Detect and parse the text, return reasoning_content and content."""
+        raise NotImplementedError
+
+    def parse_streaming_increment(
+        self, new_text: str
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Parse the new text incrementally, return reasoning_content and content."""
+        raise NotImplementedError
+```
+
+Then register the new parser class in `ReasoningParser.ReasoningParserDict` so that it can be selected by name:
+
+```python
+class ReasoningParser:
+    """Reasoning parser for different reasoning models."""
+
+    # Specify the reasoning parser for each reasoning model here
+    ReasoningParserDict: Dict[str, Type[BaseReasoningParser]] = {
+        "deepseek-r1": DeepSeekR1ReasoningParser
+    }
+
+    def __init__(self, reasoning_parser: str):
+        self.parser = self.ReasoningParserDict[reasoning_parser]()
+
+    def parse_non_stream(self, full_text: str) -> Tuple[Optional[str], Optional[str]]:
+        """
+        Non-streaming parsing for reasoning models.
+        Return: reasoning_content, content
+        """
+        return self.parser.detect_and_parse(full_text)
+
+    def parse_stream_chunk(self, chunk_text: str):
+        """
+        Streaming parsing for reasoning models.
+        Return: reasoning_content, content
+        """
+        return self.parser.parse_streaming_increment(chunk_text)
+```

From 6cedd5ff3e6253cc134b3586b3e5c4e1f32875ab Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Wed, 26 Feb 2025 20:34:08 +0800
Subject: [PATCH 09/16] refactor: enhance DeepSeekR1ReasoningParser
 initialization and parsing logic

---
 python/sglang/srt/reasoning_parser.py | 94 ++++++++++++++-------------
 1 file changed, 50 insertions(+), 44 deletions(-)

diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py
index 81d8ed780f2..15398a5eb6c 100644
--- a/python/sglang/srt/reasoning_parser.py
+++ b/python/sglang/srt/reasoning_parser.py
@@ -7,42 +7,64 @@
 class BaseReasoningParser:
     """Base class for reasoning parser."""
 
-    def __init__(self):
+    def __init__(self, think_start_token: str, think_end_token: str, force_think: bool):
         self._buffer = ""
+        self.think_start_token = think_start_token
+        self.think_end_token = think_end_token
+        self.pattern = re.compile(
+            rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL
+        )
+
+        # whether we assume the output must have a `think_start_token`
+        self.force_think = force_think
+        self.is_reasoning = (
+            self.force_think
+        )  # assume the output has a `think_start_token` at the beginning
 
-    def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]:
-        """Detect and parse the text, return reasoning_content and content."""
         raise NotImplementedError
 
     def parse_streaming_increment(
         self, new_text: str
     ) -> Tuple[Optional[str], Optional[str]]:
         """Parse the new text incrementally, return reasoning_content and content."""
-        raise NotImplementedError
+        # Detect the start token for toggling `is_reasoning` when `force_think` is False
+        if not self.force_think and self.think_start_token in new_text:
+            self.is_reasoning = True
 
+        # Should parse
+        if self.is_reasoning:
+            if len(self._buffer) == 0:
+                self._buffer += new_text
+                new_text = new_text.replace(self.think_start_token, "")
+            else:
+                self._buffer += new_text
 
-class DeepSeekR1ReasoningParser(BaseReasoningParser):
-    """
-    DeepSeekR1 reasoning parser, which use "<think>" and "</think>" to detect the reasoning part.
-    Referring to https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#usage-recommendations~.
-    """
+            # Reasoning continues
+            if self.think_end_token not in self._buffer:
+                return new_text, ""
+            # Reasoning ends
+            else:
+                reasoning_part = new_text.split(self.think_end_token)[0]
+                content_part = new_text.split(self.think_end_token)[1]
 
-    def __init__(self):
-        super().__init__()
-        self.think_start_token = "<think>"
-        self.think_end_token = "</think>"
-        self.pattern = re.compile(
-            rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL
-        )
+                self.is_reasoning = False
+                self._buffer = ""
+
+                return reasoning_part, content_part
 
-        self.is_reasoning = True
+        else:
+            return "", new_text
 
     def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]:
-        # After DeepSeek update their chat templates in R1 series models, the reasoning models do not output "<think>"
-        # We assume the output has an "<think>", and the reasoning part is the whole text.
+        """Detect and parse the text, return reasoning_content and content."""
         if self.think_end_token not in text:
-            # Remove "<think>" if exists
-            return text.replace(self.think_start_token, ""), ""
+            if self.force_think:  # all the output are reasoning content
+                # Remove "<think>" if exists
+                return text.replace(self.think_start_token, ""), ""
+            elif self.think_start_token in text:
+                return text.replace(self.think_start_token, ""), ""
+            else:
+                return "", text
 
         else:
             # Add the start token to the beginning of the text.
@@ -58,31 +80,15 @@ def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]:
 
             return reasoning_content, content
 
-    def parse_streaming_increment(
-        self, new_text: str
-    ) -> Tuple[Optional[str], Optional[str]]:
-
-        # Should parse
-        if self.is_reasoning:
-            # Again, we assume the output has an "<think>"
-            if len(self._buffer) == 0:
-                new_text = new_text.replace(self.think_start_token, "")
-            self._buffer += new_text
-            # Reasoning continues
-            if self.think_end_token not in self._buffer:
-                return new_text, ""
-            # Reasoning ends
-            else:
-                reasoning_part = new_text.split(self.think_end_token)[0]
-                content_part = new_text.split(self.think_end_token)[1]
-
-                self.is_reasoning = False
-                self._buffer = ""
 
-                return reasoning_part, content_part
+class DeepSeekR1ReasoningParser(BaseReasoningParser):
+    """
+    DeepSeekR1 reasoning parser, which use "<think>" and "</think>" to detect the reasoning part.
+    Referring to https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#usage-recommendations~.
+    """
 
-        else:
-            return "", new_text
+    def __init__(self):
+        super().__init__("<think> ", "</think> ", True)
 
 
 class ReasoningParser:

From 857297cd765804bfc13f4eff857b666182a963f9 Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Wed, 26 Feb 2025 20:52:31 +0800
Subject: [PATCH 10/16] refactor: remove NotImplementedError from
 BaseReasoningParser class

---
 python/sglang/srt/reasoning_parser.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py
index 15398a5eb6c..dcb22f8192d 100644
--- a/python/sglang/srt/reasoning_parser.py
+++ b/python/sglang/srt/reasoning_parser.py
@@ -21,8 +21,6 @@ def __init__(self, think_start_token: str, think_end_token: str, force_think: bo
             self.force_think
         )  # assume the output has a `think_start_token` at the beginning
 
-        raise NotImplementedError
-
     def parse_streaming_increment(
         self, new_text: str
     ) -> Tuple[Optional[str], Optional[str]]:

From 9b209bab3af678e282a43fff93ad5d00ac39834e Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Wed, 26 Feb 2025 21:10:12 +0800
Subject: [PATCH 11/16] refactor: simplify BaseReasoningParser initialization
 and parsing logic

---
 python/sglang/srt/reasoning_parser.py | 27 +++++----------------------
 1 file changed, 5 insertions(+), 22 deletions(-)

diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py
index dcb22f8192d..90af3e57c63 100644
--- a/python/sglang/srt/reasoning_parser.py
+++ b/python/sglang/srt/reasoning_parser.py
@@ -7,28 +7,19 @@
 class BaseReasoningParser:
     """Base class for reasoning parser."""
 
-    def __init__(self, think_start_token: str, think_end_token: str, force_think: bool):
+    def __init__(self, think_start_token: str, think_end_token: str):
         self._buffer = ""
         self.think_start_token = think_start_token
         self.think_end_token = think_end_token
         self.pattern = re.compile(
             rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL
         )
-
-        # whether we assume the output must have a `think_start_token`
-        self.force_think = force_think
-        self.is_reasoning = (
-            self.force_think
-        )  # assume the output has a `think_start_token` at the beginning
+        self.is_reasoning = True
 
     def parse_streaming_increment(
         self, new_text: str
     ) -> Tuple[Optional[str], Optional[str]]:
         """Parse the new text incrementally, return reasoning_content and content."""
-        # Detect the start token for toggling `is_reasoning` when `force_think` is False
-        if not self.force_think and self.think_start_token in new_text:
-            self.is_reasoning = True
-
         # Should parse
         if self.is_reasoning:
             if len(self._buffer) == 0:
@@ -56,18 +47,10 @@ def parse_streaming_increment(
     def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]:
         """Detect and parse the text, return reasoning_content and content."""
         if self.think_end_token not in text:
-            if self.force_think:  # all the output are reasoning content
-                # Remove "<think>" if exists
-                return text.replace(self.think_start_token, ""), ""
-            elif self.think_start_token in text:
-                return text.replace(self.think_start_token, ""), ""
-            else:
-                return "", text
-
+            return text, ""
         else:
             # Add the start token to the beginning of the text.
-            if self.think_start_token not in text:
-                text = self.think_start_token + text
+            text = self.think_start_token + text
 
             reasoning_content = self.pattern.findall(text)[0]
             content = text[
@@ -86,7 +69,7 @@ class DeepSeekR1ReasoningParser(BaseReasoningParser):
     """
 
     def __init__(self):
-        super().__init__("<think> ", "</think> ", True)
+        super().__init__("<think> ", "</think> ")
 
 
 class ReasoningParser:

From 7c61f89e4965600058a6638e812cc468d1f72282 Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Wed, 26 Feb 2025 21:11:36 +0800
Subject: [PATCH 12/16] refactor: streamline buffer handling in
 BaseReasoningParser

---
 python/sglang/srt/reasoning_parser.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py
index 90af3e57c63..28412abc515 100644
--- a/python/sglang/srt/reasoning_parser.py
+++ b/python/sglang/srt/reasoning_parser.py
@@ -22,11 +22,7 @@ def parse_streaming_increment(
         """Parse the new text incrementally, return reasoning_content and content."""
         # Should parse
         if self.is_reasoning:
-            if len(self._buffer) == 0:
-                self._buffer += new_text
-                new_text = new_text.replace(self.think_start_token, "")
-            else:
-                self._buffer += new_text
+            self._buffer += new_text
 
             # Reasoning continues
             if self.think_end_token not in self._buffer:

From d3aa86f1a4aade5a3ac2684507ec876c31079410 Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Wed, 26 Feb 2025 21:37:20 +0800
Subject: [PATCH 13/16] refactor: improve text handling in
 v1_chat_generate_response function

---
 python/sglang/srt/openai_api/adapter.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index fdd50f55e94..c8180978430 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -1091,12 +1091,11 @@ def v1_chat_generate_response(
             choice_logprobs = None
 
         reasoning_content = None
+        text = ret_item["text"]
         if reasoning_parser:
             try:
                 parser = ReasoningParser(reasoning_parser)
-                reasoning_content, ret_item["text"] = parser.parse_non_stream(
-                    ret_item["text"]
-                )
+                reasoning_content, text = parser.parse_non_stream(text)
             except Exception as e:
                 logger.error(f"Exception: {e}")
                 return create_error_response(
@@ -1106,7 +1105,6 @@ def v1_chat_generate_response(
 
         finish_reason = ret_item["meta_info"]["finish_reason"]
         tool_calls = None
-        text = ret_item["text"]
         if isinstance(request, list):
             tool_choice = request[idx].tool_choice
             tools = request[idx].tools

From 07bcd23dc4f90c2935335ce6cc08ced604bc0d4b Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Fri, 28 Feb 2025 17:39:32 +0800
Subject: [PATCH 14/16] fix: DeepSeekR1ReasoningParser init arguments, remove
 spaces so the think tags are well-formed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 python/sglang/srt/reasoning_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py
index 28412abc515..5dc3c0eb52f 100644
--- a/python/sglang/srt/reasoning_parser.py
+++ b/python/sglang/srt/reasoning_parser.py
@@ -65,7 +65,7 @@ class DeepSeekR1ReasoningParser(BaseReasoningParser):
     """
 
     def __init__(self):
-        super().__init__("<think> ", "</think> ")
+        super().__init__("<think>", "</think>")
 
 
 class ReasoningParser:

From fb412be2191409a4da8b43e88fa5efce202ffd8a Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Fri, 28 Feb 2025 17:50:03 +0800
Subject: [PATCH 15/16] fix: use parsed text for message content in adapter

---
 python/sglang/srt/openai_api/adapter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index c8180978430..79fe287c479 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -1140,7 +1140,7 @@ def v1_chat_generate_response(
                 "index": 0,
                 "message": {
                     "role": "assistant",
-                    "content": ret_item["text"] if tool_calls is None else None,
+                    "content": text if tool_calls is None else None,
                     "reasoning_content": (
                         reasoning_content if tool_calls is None else None
                     ),

From 305238d5977b9206bc76d8f9adfcf7c76c7ac068 Mon Sep 17 00:00:00 2001
From: xihuai18 <leoxhwang@sjtu.edu.cn>
Date: Fri, 28 Feb 2025 18:02:47 +0800
Subject: [PATCH 16/16] fix: use parsed text for ChatMessage content in adapter

---
 python/sglang/srt/openai_api/adapter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index 79fe287c479..3fc400cb5b6 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -1159,7 +1159,7 @@ def v1_chat_generate_response(
                 index=idx,
                 message=ChatMessage(
                     role="assistant",
-                    content=ret_item["text"] if tool_calls is None else None,
+                    content=text if tool_calls is None else None,
                     reasoning_content=reasoning_content if tool_calls is None else None,
                     tool_calls=tool_calls,
                 ),