From 3c4b8239a0a036133649f273867d825ef6d1b004 Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Tue, 25 Feb 2025 22:44:24 +0800 Subject: [PATCH 01/16] [Feature] Add reasoning parser support to chat generation and completions --- python/sglang/srt/openai_api/adapter.py | 50 +++++++++-- python/sglang/srt/openai_api/protocol.py | 2 + python/sglang/srt/reasoning_parser.py | 109 +++++++++++++++++++++++ python/sglang/srt/server_args.py | 19 ++++ 4 files changed, 175 insertions(+), 5 deletions(-) create mode 100644 python/sglang/srt/reasoning_parser.py diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 0556f852a32..777c2b7a9c0 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -74,6 +74,7 @@ TopLogprob, UsageInfo, ) +from sglang.srt.reasoning_parser import ReasoningParser from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) @@ -1038,7 +1039,12 @@ def v1_chat_generate_request( def v1_chat_generate_response( - request, ret, to_file=False, cache_report=False, tool_call_parser=None + request, + ret, + to_file=False, + cache_report=False, + tool_call_parser=None, + reasoning_parser=None, ): choices = [] @@ -1086,9 +1092,22 @@ def v1_chat_generate_response( finish_reason = ret_item["meta_info"]["finish_reason"] + reasoning_content = None + if reasoning_parser: + try: + parser = ReasoningParser(reasoning_parser) + reasoning_content, ret_item["text"] = parser.parse_non_stream( + ret_item["text"] + ) + except Exception as e: + logger.error(f"Exception: {e}") + return create_error_response( + HTTPStatus.BAD_REQUEST, + "Failed to parse reasoning content", + ) + tool_calls = None text = ret_item["text"] - if isinstance(request, list): tool_choice = request[idx].tool_choice tools = request[idx].tools @@ -1125,6 +1144,9 @@ def v1_chat_generate_response( "message": { "role": "assistant", "content": ret_item["text"] if tool_calls is None else None, + 
"reasoning_content": ( + reasoning_content if tool_calls is None else None + ), "tool_calls": tool_calls, }, "logprobs": choice_logprobs, @@ -1141,6 +1163,7 @@ def v1_chat_generate_response( message=ChatMessage( role="assistant", content=ret_item["text"] if tool_calls is None else None, + reasoning_content=reasoning_content if tool_calls is None else None, tool_calls=tool_calls, ), logprobs=choice_logprobs, @@ -1208,6 +1231,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request): if adapted_request.stream: parser_dict = {} + reasoning_parser_dict = {} async def generate_stream_resp(): is_firsts = {} @@ -1302,6 +1326,16 @@ async def generate_stream_resp(): delta = text[len(stream_buffer) :] new_stream_buffer = stream_buffer + delta + reasoning_content = None + if tokenizer_manager.server_args.enable_reasoning: + if index not in reasoning_parser_dict: + reasoning_parser_dict[index] = ReasoningParser( + tokenizer_manager.server_args.reasoning_parser + ) + reasoning_content, delta = reasoning_parser_dict[ + index + ].parse_stream_chunk(delta) + if request.tool_choice != "none" and request.tools: if index not in parser_dict: parser_dict[index] = FunctionCallParser( @@ -1313,11 +1347,14 @@ async def generate_stream_resp(): # parse_increment => returns (normal_text, calls) normal_text, calls = parser.parse_stream_chunk(delta) - # 1) if there's normal_text, output it as normal content + # 1) if there's normal_text, output it as normal content, the reasoning content is also included if normal_text: choice_data = ChatCompletionResponseStreamChoice( index=index, - delta=DeltaMessage(content=normal_text), + delta=DeltaMessage( + content=normal_text, + reasoning_content=reasoning_content, + ), finish_reason=( finish_reason["type"] if finish_reason else "" ), @@ -1386,7 +1423,9 @@ async def generate_stream_resp(): # No tool calls => just treat this as normal text choice_data = ChatCompletionResponseStreamChoice( index=index, - 
delta=DeltaMessage(content=delta), + delta=DeltaMessage( + content=delta, reasoning_content=reasoning_content + ), finish_reason=( finish_reason["type"] if finish_reason else "" ), @@ -1456,6 +1495,7 @@ async def generate_stream_resp(): ret, cache_report=tokenizer_manager.server_args.enable_cache_report, tool_call_parser=tokenizer_manager.server_args.tool_call_parser, + reasoning_parser=tokenizer_manager.server_args.reasoning_parser, ) return response diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py index 95b34527edb..6e2ffe2015b 100644 --- a/python/sglang/srt/openai_api/protocol.py +++ b/python/sglang/srt/openai_api/protocol.py @@ -344,6 +344,7 @@ class ToolCall(BaseModel): class ChatMessage(BaseModel): role: Optional[str] = None content: Optional[str] = None + reasoning_content: Optional[str] = None tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None]) @@ -367,6 +368,7 @@ class ChatCompletionResponse(BaseModel): class DeltaMessage(BaseModel): role: Optional[str] = None content: Optional[str] = None + reasoning_content: Optional[str] = None tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None]) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py new file mode 100644 index 00000000000..7c8e4a42cd9 --- /dev/null +++ b/python/sglang/srt/reasoning_parser.py @@ -0,0 +1,109 @@ +import json +import logging +import re +from typing import Any, Dict, List, Optional, Tuple, Type + + +class BaseReasoningParser: + """Base class for reasoning parser.""" + + def __init__(self): + self._buffer = "" + + def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]: + """Detect and parse the text, return reasoning_content and content.""" + raise NotImplementedError + + def parse_streaming_increment( + self, new_text: str + ) -> Tuple[Optional[str], Optional[str]]: + """Parse the new text incrementally, return reasoning_content and 
content.""" + raise NotImplementedError + + +class DeepSeekR1ReasoningParser(BaseReasoningParser): + """ + DeepSeekR1 reasoning parser, which use "\n" and "\n" to detect the reasoning part. + Referring to https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#usage-recommendations~. + """ + + def __init__(self): + super().__init__() + self.think_start_token = "\n" + self.think_end_token = "\n" + self.pattern = re.compile( + rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL + ) + + self.is_reasoning = False + + def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]: + # After DeepSeek update their chat templates in R1 series models, the reasoning models do not output "\n" + # We assume the output has an invisible "\n", and the reasoning part is the whole text. + if self.think_end_token not in text: + return text, None + + else: + # Add the start token to the beginning of the text. + if self.think_start_token not in text: + text = self.think_start_token + text + + reasoning_content = self.pattern.findall(text)[0] + content = text[ + len(self.think_start_token) + + len(reasoning_content) + + len(self.think_end_token) : + ] + + return reasoning_content, content if len(content) > 0 else None + + def parse_streaming_increment( + self, new_text: str + ) -> Tuple[Optional[str], Optional[str]]: + # Again, we assume the output has an invisible "\n" + self._buffer += new_text + + # Should parse + if self.is_reasoning: + # Reasoning continues + if self.think_end_token not in self._buffer: + return new_text, None + # Reasoning ends + else: + reasoning_part = new_text.split(self.think_end_token)[0] + content_part = new_text.split(self.think_end_token)[1] + + self.is_reasoning = False + self._buffer = "" + + return reasoning_part if len(reasoning_part) > 0 else None, ( + content_part if len(content_part) > 0 else None + ) + + else: + return None, new_text + + +class ReasoningParser: + """Reasoning parser for different reasoning 
models.""" + + ReasoningParserDict: Dict[str, Type[BaseReasoningParser]] = { + "deepseek-r1": DeepSeekR1ReasoningParser + } + + def __init__(self, reasoning_parser: str): + self.parser = self.ReasoningParserDict[reasoning_parser]() + + def parse_non_stream(self, full_text: str) -> Tuple[Optional[str], Optional[str]]: + """ + Non-streaming parsing for reasoning models. + Return: reasoning_content, content + """ + return self.parser.detect_and_parse(full_text) + + def parse_stream_chunk(self, chunk_text: str): + """ + Streaming parsing for reasoning models. + Return: reasoning_content, content + """ + return self.parser.parse_streaming_increment(chunk_text) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index a81228ce34f..ed2ad09dffa 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -94,6 +94,8 @@ class ServerArgs: api_key: Optional[str] = None file_storage_pth: str = "sglang_storage" enable_cache_report: bool = False + enable_reasoning: bool = False + reasoning_parser: Optional[str] = None # Data parallelism dp_size: int = 1 @@ -282,6 +284,12 @@ def __post_init__(self): if is_hip(): self.triton_attention_num_kv_splits = 16 + # API Related + if self.enable_reasoning and not self.reasoning_parser: + raise ValueError( + "Reasoning parser must be specified when reasoning is enabled." 
+ ) + @staticmethod def add_cli_args(parser: argparse.ArgumentParser): # Model and port args @@ -600,6 +608,17 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.", ) + parser.add_argument( + "--enable-reasoning", + action="store_true", + help="Enable the reasoning feature.", + ) + parser.add_argument( + "--reasoning-parser", + type=str, + default=ServerArgs.reasoning_parser, + help="Specify the parser for reasoning tasks.", + ) # Data parallelism parser.add_argument( From 1727ba436e895e1ee1671506d49d3ac5853470a6 Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Tue, 25 Feb 2025 22:49:59 +0800 Subject: [PATCH 02/16] fix: add choices for reasoning-parser --- python/sglang/srt/server_args.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index ed2ad09dffa..b5a17de660e 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -616,6 +616,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--reasoning-parser", type=str, + choices=["deepseek-r1"], default=ServerArgs.reasoning_parser, help="Specify the parser for reasoning tasks.", ) From 4759bedd1385d847c8a6142bba04f4af3294f4df Mon Sep 17 00:00:00 2001 From: Xihuai Wang Date: Wed, 26 Feb 2025 00:12:31 +0800 Subject: [PATCH 03/16] Fix reasoning_parser.py --- python/sglang/srt/reasoning_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 7c8e4a42cd9..1920e11feee 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -35,7 +35,7 @@ def __init__(self): rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL ) - self.is_reasoning = False + self.is_reasoning = True def detect_and_parse(self, text: str) -> Tuple[Optional[str], 
Optional[str]]: # After DeepSeek update their chat templates in R1 series models, the reasoning models do not output "\n" From 31e4dd5a2e5f9f88a77c42d1358511f1aeefdccb Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Wed, 26 Feb 2025 01:16:23 +0800 Subject: [PATCH 04/16] fix: handle possible "\n" in reasoning parser --- python/sglang/srt/reasoning_parser.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 1920e11feee..1a78dc8c126 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -39,9 +39,10 @@ def __init__(self): def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]: # After DeepSeek update their chat templates in R1 series models, the reasoning models do not output "\n" - # We assume the output has an invisible "\n", and the reasoning part is the whole text. + # We assume the output has an "\n", and the reasoning part is the whole text. if self.think_end_token not in text: - return text, None + # Remove "\n" if exists + return text.replace(self.think_start_token, ""), None else: # Add the start token to the beginning of the text. 
@@ -60,11 +61,13 @@ def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]: def parse_streaming_increment( self, new_text: str ) -> Tuple[Optional[str], Optional[str]]: - # Again, we assume the output has an invisible "\n" - self._buffer += new_text # Should parse if self.is_reasoning: + # Again, we assume the output has an "\n" + if len(self._buffer) == 0: + new_text = new_text.replace(self.think_start_token, "") + self._buffer += new_text # Reasoning continues if self.think_end_token not in self._buffer: return new_text, None From d19b7a0bffcd1553ba8f9d635550d5d56f01fd4e Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Wed, 26 Feb 2025 01:26:28 +0800 Subject: [PATCH 05/16] fix: update help text for reasoning parser to include supported models --- python/sglang/srt/server_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index b5a17de660e..dc0af6b3cd3 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -618,7 +618,7 @@ def add_cli_args(parser: argparse.ArgumentParser): type=str, choices=["deepseek-r1"], default=ServerArgs.reasoning_parser, - help="Specify the parser for reasoning tasks.", + help="Specify the parser for reasoning models, supported parsers are: deepseek-r1.", ) # Data parallelism From addaeb564feb1e4346fe8b68bb77dde26ba26c07 Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Wed, 26 Feb 2025 02:16:29 +0800 Subject: [PATCH 06/16] fix: update reasoning parser to handle changes in DeepSeek output format --- python/sglang/srt/openai_api/adapter.py | 4 ++++ python/sglang/srt/reasoning_parser.py | 14 +++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 777c2b7a9c0..11a1abbcfb5 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1099,6 +1099,8 @@ def 
v1_chat_generate_response( reasoning_content, ret_item["text"] = parser.parse_non_stream( ret_item["text"] ) + if not ret_item["text"]: + ret_item["text"] = "" except Exception as e: logger.error(f"Exception: {e}") return create_error_response( @@ -1335,6 +1337,8 @@ async def generate_stream_resp(): reasoning_content, delta = reasoning_parser_dict[ index ].parse_stream_chunk(delta) + if not delta: + delta = "" if request.tool_choice != "none" and request.tools: if index not in parser_dict: diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 1a78dc8c126..f769096ddc4 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -23,14 +23,14 @@ def parse_streaming_increment( class DeepSeekR1ReasoningParser(BaseReasoningParser): """ - DeepSeekR1 reasoning parser, which use "\n" and "\n" to detect the reasoning part. + DeepSeekR1 reasoning parser, which use "" and "" to detect the reasoning part. Referring to https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#usage-recommendations~. """ def __init__(self): super().__init__() - self.think_start_token = "\n" - self.think_end_token = "\n" + self.think_start_token = "" + self.think_end_token = "" self.pattern = re.compile( rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL ) @@ -38,10 +38,10 @@ def __init__(self): self.is_reasoning = True def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]: - # After DeepSeek update their chat templates in R1 series models, the reasoning models do not output "\n" - # We assume the output has an "\n", and the reasoning part is the whole text. + # After DeepSeek update their chat templates in R1 series models, the reasoning models do not output "" + # We assume the output has an "", and the reasoning part is the whole text. 
if self.think_end_token not in text: - # Remove "\n" if exists + # Remove "" if exists return text.replace(self.think_start_token, ""), None else: @@ -64,7 +64,7 @@ def parse_streaming_increment( # Should parse if self.is_reasoning: - # Again, we assume the output has an "\n" + # Again, we assume the output has an "" if len(self._buffer) == 0: new_text = new_text.replace(self.think_start_token, "") self._buffer += new_text From f362249fcd16fe1ecf53c8c96cee383a563d6d24 Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Wed, 26 Feb 2025 11:44:29 +0800 Subject: [PATCH 07/16] fix: refine reasoning parser output handling and clean up response logic --- python/sglang/srt/openai_api/adapter.py | 7 +------ python/sglang/srt/reasoning_parser.py | 14 ++++++-------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 11a1abbcfb5..fdd50f55e94 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1090,8 +1090,6 @@ def v1_chat_generate_response( else: choice_logprobs = None - finish_reason = ret_item["meta_info"]["finish_reason"] - reasoning_content = None if reasoning_parser: try: @@ -1099,8 +1097,6 @@ def v1_chat_generate_response( reasoning_content, ret_item["text"] = parser.parse_non_stream( ret_item["text"] ) - if not ret_item["text"]: - ret_item["text"] = "" except Exception as e: logger.error(f"Exception: {e}") return create_error_response( @@ -1108,6 +1104,7 @@ def v1_chat_generate_response( "Failed to parse reasoning content", ) + finish_reason = ret_item["meta_info"]["finish_reason"] tool_calls = None text = ret_item["text"] if isinstance(request, list): @@ -1337,8 +1334,6 @@ async def generate_stream_resp(): reasoning_content, delta = reasoning_parser_dict[ index ].parse_stream_chunk(delta) - if not delta: - delta = "" if request.tool_choice != "none" and request.tools: if index not in parser_dict: diff --git 
a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index f769096ddc4..81d8ed780f2 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -35,14 +35,14 @@ def __init__(self): rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL ) - self.is_reasoning = True + self.is_reasoning = True def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]: # After DeepSeek update their chat templates in R1 series models, the reasoning models do not output "" # We assume the output has an "", and the reasoning part is the whole text. if self.think_end_token not in text: # Remove "" if exists - return text.replace(self.think_start_token, ""), None + return text.replace(self.think_start_token, ""), "" else: # Add the start token to the beginning of the text. @@ -56,7 +56,7 @@ def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]: + len(self.think_end_token) : ] - return reasoning_content, content if len(content) > 0 else None + return reasoning_content, content def parse_streaming_increment( self, new_text: str @@ -70,7 +70,7 @@ def parse_streaming_increment( self._buffer += new_text # Reasoning continues if self.think_end_token not in self._buffer: - return new_text, None + return new_text, "" # Reasoning ends else: reasoning_part = new_text.split(self.think_end_token)[0] @@ -79,12 +79,10 @@ def parse_streaming_increment( self.is_reasoning = False self._buffer = "" - return reasoning_part if len(reasoning_part) > 0 else None, ( - content_part if len(content_part) > 0 else None - ) + return reasoning_part, content_part else: - return None, new_text + return "", new_text class ReasoningParser: From 2ed2b6d7d4cc4ac7f97d491740b8d9c3b862af80 Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Wed, 26 Feb 2025 14:34:50 +0800 Subject: [PATCH 08/16] docs: add reasoning parser documentation for DeepSeek model support --- docs/backend/reasoning_parser.md | 127 
+++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 docs/backend/reasoning_parser.md diff --git a/docs/backend/reasoning_parser.md b/docs/backend/reasoning_parser.md new file mode 100644 index 00000000000..607acd6c012 --- /dev/null +++ b/docs/backend/reasoning_parser.md @@ -0,0 +1,127 @@ +# Reasoning Parser + +SGLang supports parsing the reasoning content from reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) for convenient output processing in downstream applications. + +Following the official [DeepSeek API design](https://api-docs.deepseek.com/guides/reasoning_model), SGLang offers reasoning content and final conclusions: + +- `reasoning_content`: The content of the CoT. +- `content`: The content of the final answer. + +## Supported Models + +Currently, SGLang supports the following reasoning models: +- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d): The reasoning content is wrapped with `<think>` and `</think>` tags. + +## Usage + +You need to enable the reasoning parser in the SGLang API server by setting the `--enable-reasoning` and `--reasoning-parser` options. The `--reasoning-parser` option specifies the reasoning parser to extract the reasoning content and final answer. + +```bash +python -m sglang.launch_server --host 0.0.0.0 \ +--model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ +--enable-reasoning --reasoning-parser deepseek-r1 +``` + +### Non-streaming Request + +Make a request to the reasoning model to get the reasoning content and the final answer. 
+ +Using the OpenAI Python API: +```python +import openai + +client = openai.Client(base_url="http://localhost:30000/v1", api_key="None") + +response = client.chat.completions.create( + model="deepseek-r1:14b", + messages=[{"role": "user", "content": "Compute 1+3"}], + max_tokens=1024, + stream=False +) + +response.choices[0].message.reasoning_content +# 'First, I recognize that the problem requires adding the numbers 1 and 3.\n\nNext, I identify the numbers to be added, which are 1 and 3.\n\nThen, I perform the addition operation: 1 plus 3 equals 4.\n\nFinally, I conclude that the sum of 1 and 3 is 4.\n' +response.choices[0].message.content +# '\n\nTo compute \\(1 + 3\\), follow these simple steps:\n\n1. **Identify the numbers to add:** \n The numbers are **1** and **3**.\n\n2. **Add the numbers together:** \n \\[\n 1 + 3 = 4\n \\]\n\n3. **Write the final answer:** \n The sum of \\(1 + 3\\) is \\(\\boxed{4}\\).' +``` + +### Streaming Request + +`reasoning_content` is available in the `delta` field of the streaming response. + +Using the OpenAI Python API: + +```python +# ... Initialize the client as before ... + +response = client.chat.completions.create( + model="deepseek-r1:14b", + messages=[{"role": "user", "content": "Compute 1+3"}], + max_tokens=1024, + stream=True +) +reasoning_content = "" +content = "" +for chunk in response: + if chunk.choices[0].delta.content: + content += chunk.choices[0].delta.content + elif chunk.choices[0].delta.reasoning_content: + reasoning_content += chunk.choices[0].delta.reasoning_content + +reasoning_content +# 'I need to calculate the sum of 1 and 3. \n\nFirst, I identify the numbers involved in the addition: 1 and 3.\n\nNext, I add these two numbers together to find the total.\n\nFinally, the result of the addition is 4.\n' +content +# '\n\n**Solution:**\n\nWe need to compute the sum of 1 and 3.\n\n1. **Identify the numbers to add:**\n - Number 1\n - Number 3\n\n2. **Add the numbers together:**\n \\[\n 1 + 3 = 4\n \\]\n\n3. 
**Final Answer:**\n \\[\n \\boxed{4}\n \\]' +``` + + +## Supporting More Reasoning Models + +To support a new reasoning model, implement its reasoning parser as a subclass of `BaseReasoningParser` in `python/sglang/srt/reasoning_parser.py`. + +```python +class BaseReasoningParser: + """Base class for reasoning parser.""" + + def __init__(self): + self._buffer = "" + + def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]: + """Detect and parse the text, return reasoning_content and content.""" + raise NotImplementedError + + def parse_streaming_increment( + self, new_text: str + ) -> Tuple[Optional[str], Optional[str]]: + """Parse the new text incrementally, return reasoning_content and content.""" + raise NotImplementedError +``` + +Then register the new parser in `ReasoningParserDict` under the name of the reasoning model. + +```python +class ReasoningParser: + """Reasoning parser for different reasoning models.""" + + # Specify the reasoning parser for each reasoning model here + ReasoningParserDict: Dict[str, Type[BaseReasoningParser]] = { + "deepseek-r1": DeepSeekR1ReasoningParser + } + + def __init__(self, reasoning_parser: str): + self.parser = self.ReasoningParserDict[reasoning_parser]() + + def parse_non_stream(self, full_text: str) -> Tuple[Optional[str], Optional[str]]: + """ + Non-streaming parsing for reasoning models. + Return: reasoning_content, content + """ + return self.parser.detect_and_parse(full_text) + + def parse_stream_chunk(self, chunk_text: str): + """ + Streaming parsing for reasoning models. 
+ Return: reasoning_content, content + """ + return self.parser.parse_streaming_increment(chunk_text) +``` From 6cedd5ff3e6253cc134b3586b3e5c4e1f32875ab Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Wed, 26 Feb 2025 20:34:08 +0800 Subject: [PATCH 09/16] refactor: enhance DeepSeekR1ReasoningParser initialization and parsing logic --- python/sglang/srt/reasoning_parser.py | 94 ++++++++++++++------------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 81d8ed780f2..15398a5eb6c 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -7,42 +7,64 @@ class BaseReasoningParser: """Base class for reasoning parser.""" - def __init__(self): + def __init__(self, think_start_token: str, think_end_token: str, force_think: bool): self._buffer = "" + self.think_start_token = think_start_token + self.think_end_token = think_end_token + self.pattern = re.compile( + rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL + ) + + # whether we assume the output must have a `think_start_token` + self.force_think = force_think + self.is_reasoning = ( + self.force_think + ) # assume the output has a `think_start_token` at the beginning - def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]: - """Detect and parse the text, return reasoning_content and content.""" raise NotImplementedError def parse_streaming_increment( self, new_text: str ) -> Tuple[Optional[str], Optional[str]]: """Parse the new text incrementally, return reasoning_content and content.""" - raise NotImplementedError + # Detect the start token for toggling `is_reasoning` when `force_think` is False + if not self.force_think and self.think_start_token in new_text: + self.is_reasoning = True + # Should parse + if self.is_reasoning: + if len(self._buffer) == 0: + self._buffer += new_text + new_text = new_text.replace(self.think_start_token, "") + else: 
+ self._buffer += new_text -class DeepSeekR1ReasoningParser(BaseReasoningParser): - """ - DeepSeekR1 reasoning parser, which use "" and "" to detect the reasoning part. - Referring to https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#usage-recommendations~. - """ + # Reasoning continues + if self.think_end_token not in self._buffer: + return new_text, "" + # Reasoning ends + else: + reasoning_part = new_text.split(self.think_end_token)[0] + content_part = new_text.split(self.think_end_token)[1] - def __init__(self): - super().__init__() - self.think_start_token = "" - self.think_end_token = "" - self.pattern = re.compile( - rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL - ) + self.is_reasoning = False + self._buffer = "" + + return reasoning_part, content_part - self.is_reasoning = True + else: + return "", new_text def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]: - # After DeepSeek update their chat templates in R1 series models, the reasoning models do not output "" - # We assume the output has an "", and the reasoning part is the whole text. + """Detect and parse the text, return reasoning_content and content.""" if self.think_end_token not in text: - # Remove "" if exists - return text.replace(self.think_start_token, ""), "" + if self.force_think: # all the output are reasoning content + # Remove "" if exists + return text.replace(self.think_start_token, ""), "" + elif self.think_start_token in text: + return text.replace(self.think_start_token, ""), "" + else: + return "", text else: # Add the start token to the beginning of the text. 
@@ -58,31 +80,15 @@ def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]: return reasoning_content, content - def parse_streaming_increment( - self, new_text: str - ) -> Tuple[Optional[str], Optional[str]]: - - # Should parse - if self.is_reasoning: - # Again, we assume the output has an "" - if len(self._buffer) == 0: - new_text = new_text.replace(self.think_start_token, "") - self._buffer += new_text - # Reasoning continues - if self.think_end_token not in self._buffer: - return new_text, "" - # Reasoning ends - else: - reasoning_part = new_text.split(self.think_end_token)[0] - content_part = new_text.split(self.think_end_token)[1] - - self.is_reasoning = False - self._buffer = "" - return reasoning_part, content_part +class DeepSeekR1ReasoningParser(BaseReasoningParser): + """ + DeepSeekR1 reasoning parser, which use "" and "" to detect the reasoning part. + Referring to https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#usage-recommendations~. + """ - else: - return "", new_text + def __init__(self): + super().__init__(" ", " ", True) class ReasoningParser: From 857297cd765804bfc13f4eff857b666182a963f9 Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Wed, 26 Feb 2025 20:52:31 +0800 Subject: [PATCH 10/16] refactor: remove NotImplementedError from BaseReasoningParser class --- python/sglang/srt/reasoning_parser.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 15398a5eb6c..dcb22f8192d 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -21,8 +21,6 @@ def __init__(self, think_start_token: str, think_end_token: str, force_think: bo self.force_think ) # assume the output has a `think_start_token` at the beginning - raise NotImplementedError - def parse_streaming_increment( self, new_text: str ) -> Tuple[Optional[str], Optional[str]]: From 9b209bab3af678e282a43fff93ad5d00ac39834e Mon Sep 17 00:00:00 2001 
From: xihuai18 Date: Wed, 26 Feb 2025 21:10:12 +0800 Subject: [PATCH 11/16] refactor: simplify BaseReasoningParser initialization and parsing logic --- python/sglang/srt/reasoning_parser.py | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index dcb22f8192d..90af3e57c63 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -7,28 +7,19 @@ class BaseReasoningParser: """Base class for reasoning parser.""" - def __init__(self, think_start_token: str, think_end_token: str, force_think: bool): + def __init__(self, think_start_token: str, think_end_token: str): self._buffer = "" self.think_start_token = think_start_token self.think_end_token = think_end_token self.pattern = re.compile( rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL ) - - # whether we assume the output must have a `think_start_token` - self.force_think = force_think - self.is_reasoning = ( - self.force_think - ) # assume the output has a `think_start_token` at the beginning + self.is_reasoning = True def parse_streaming_increment( self, new_text: str ) -> Tuple[Optional[str], Optional[str]]: """Parse the new text incrementally, return reasoning_content and content.""" - # Detect the start token for toggling `is_reasoning` when `force_think` is False - if not self.force_think and self.think_start_token in new_text: - self.is_reasoning = True - # Should parse if self.is_reasoning: if len(self._buffer) == 0: @@ -56,18 +47,10 @@ def parse_streaming_increment( def detect_and_parse(self, text: str) -> Tuple[Optional[str], Optional[str]]: """Detect and parse the text, return reasoning_content and content.""" if self.think_end_token not in text: - if self.force_think: # all the output are reasoning content - # Remove "" if exists - return text.replace(self.think_start_token, ""), "" - elif self.think_start_token in text: - return 
text.replace(self.think_start_token, ""), "" - else: - return "", text - + return text, "" else: # Add the start token to the beginning of the text. - if self.think_start_token not in text: - text = self.think_start_token + text + text = self.think_start_token + text reasoning_content = self.pattern.findall(text)[0] content = text[ @@ -86,7 +69,7 @@ class DeepSeekR1ReasoningParser(BaseReasoningParser): """ def __init__(self): - super().__init__(" ", " ", True) + super().__init__(" ", " ") class ReasoningParser: From 7c61f89e4965600058a6638e812cc468d1f72282 Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Wed, 26 Feb 2025 21:11:36 +0800 Subject: [PATCH 12/16] refactor: streamline buffer handling in BaseReasoningParser --- python/sglang/srt/reasoning_parser.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 90af3e57c63..28412abc515 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -22,11 +22,7 @@ def parse_streaming_increment( """Parse the new text incrementally, return reasoning_content and content.""" # Should parse if self.is_reasoning: - if len(self._buffer) == 0: - self._buffer += new_text - new_text = new_text.replace(self.think_start_token, "") - else: - self._buffer += new_text + self._buffer += new_text # Reasoning continues if self.think_end_token not in self._buffer: From d3aa86f1a4aade5a3ac2684507ec876c31079410 Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Wed, 26 Feb 2025 21:37:20 +0800 Subject: [PATCH 13/16] refactor: improve text handling in v1_chat_generate_response function --- python/sglang/srt/openai_api/adapter.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index fdd50f55e94..c8180978430 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ 
-1091,12 +1091,11 @@ def v1_chat_generate_response( choice_logprobs = None reasoning_content = None + text = ret_item["text"] if reasoning_parser: try: parser = ReasoningParser(reasoning_parser) - reasoning_content, ret_item["text"] = parser.parse_non_stream( - ret_item["text"] - ) + reasoning_content, text = parser.parse_non_stream(text) except Exception as e: logger.error(f"Exception: {e}") return create_error_response( @@ -1106,7 +1105,6 @@ def v1_chat_generate_response( finish_reason = ret_item["meta_info"]["finish_reason"] tool_calls = None - text = ret_item["text"] if isinstance(request, list): tool_choice = request[idx].tool_choice tools = request[idx].tools From 07bcd23dc4f90c2935335ce6cc08ced604bc0d4b Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Fri, 28 Feb 2025 17:39:32 +0800 Subject: [PATCH 14/16] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=20DeepSeekR1ReasoningP?= =?UTF-8?q?arser=20=E7=9A=84=E5=88=9D=E5=A7=8B=E5=8C=96=E5=8F=82=E6=95=B0?= =?UTF-8?q?=EF=BC=8C=E5=8E=BB=E6=8E=89=E4=BA=86=E7=A9=BA=E6=A0=BC=E4=BB=A5?= =?UTF-8?q?=E7=A1=AE=E4=BF=9D=E6=A0=87=E7=AD=BE=E6=A0=BC=E5=BC=8F=E6=AD=A3?= =?UTF-8?q?=E7=A1=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/sglang/srt/reasoning_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 28412abc515..5dc3c0eb52f 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -65,7 +65,7 @@ class DeepSeekR1ReasoningParser(BaseReasoningParser): """ def __init__(self): - super().__init__(" ", " ") + super().__init__("", "") class ReasoningParser: From fb412be2191409a4da8b43e88fa5efce202ffd8a Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Fri, 28 Feb 2025 17:50:03 +0800 Subject: [PATCH 15/16] fix: ret bug in adapter --- python/sglang/srt/openai_api/adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index c8180978430..79fe287c479 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1140,7 +1140,7 @@ def v1_chat_generate_response( "index": 0, "message": { "role": "assistant", - "content": ret_item["text"] if tool_calls is None else None, + "content": text if tool_calls is None else None, "reasoning_content": ( reasoning_content if tool_calls is None else None ), From 305238d5977b9206bc76d8f9adfcf7c76c7ac068 Mon Sep 17 00:00:00 2001 From: xihuai18 Date: Fri, 28 Feb 2025 18:02:47 +0800 Subject: [PATCH 16/16] fix: ret bug in adapter --- python/sglang/srt/openai_api/adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 79fe287c479..3fc400cb5b6 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1159,7 +1159,7 @@ def v1_chat_generate_response( index=idx, message=ChatMessage( role="assistant", - content=ret_item["text"] if tool_calls is None else None, + content=text if tool_calls is None else None, reasoning_content=reasoning_content if tool_calls is None else None, tool_calls=tool_calls, ),