From c37d2c488745905f757f3f03e49a4dc834f918a4 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 8 Jan 2025 09:28:57 +0000 Subject: [PATCH 1/4] test: removing logprobs from detokenizer Signed-off-by: Andrew Feldman --- vllm/outputs.py | 6 ++-- vllm/v1/engine/detokenizer.py | 60 +++++++++++++++++------------------ 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index fcaed0f95a92a..8f0ccd5fd7c5f 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -140,9 +140,9 @@ def new( prompt_token_ids: Optional[List[int]], text: str, token_ids: List[int], - logprobs: Optional[SampleLogprobs], - prompt_logprobs: Optional[PromptLogprobs], - cumulative_logprob: Optional[float], + logprobs: Optional[SampleLogprobs] = None, + prompt_logprobs: Optional[PromptLogprobs] = None, + cumulative_logprob: Optional[float] = None, finished: bool = False, ) -> "RequestOutput": """Initialize a new RequestOutput object.""" diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 0725b1ac3742e..fb1c6add30dd8 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -46,17 +46,17 @@ class IncrementalDetokenizer: # Tokenizer for this request tokenizer: AnyTokenizer - # Logprobs for this request - logprobs: Optional[SampleLogprobs] - prompt_logprobs: Optional[PromptLogprobs] - cumulative_logprob: Optional[float] - num_logprobs: int - num_prompt_logprobs: int - # Accounting for stop string buffering stop_buffer_length: int _last_output_text_offset: int = 0 + # Logprobs for this request + logprobs: Optional[SampleLogprobs] = None + prompt_logprobs: Optional[PromptLogprobs] = None + cumulative_logprob: Optional[float] = None + num_logprobs: int = 0 + num_prompt_logprobs: int = 0 + @property def output_token_ids(self) -> List[int]: assert len(self.token_ids) >= len(self.prompt_token_ids) @@ -83,8 +83,8 @@ def from_new_request( else: stop_buffer_length = 0 - logprobs = request.sampling_params.logprobs - prompt_logprobs = request.sampling_params.prompt_logprobs + #logprobs = request.sampling_params.logprobs + #prompt_logprobs = request.sampling_params.prompt_logprobs return cls( output_text="", tokens=tokens, @@ -105,11 +105,11 @@ def from_new_request( prompt_token_ids=request.prompt_token_ids, tokenizer=tokenizer, stop_buffer_length=stop_buffer_length, - cumulative_logprob=(0. if logprobs else None), - logprobs=([] if logprobs else None), - prompt_logprobs=([] if prompt_logprobs else None), - num_prompt_logprobs=(prompt_logprobs or 0), - num_logprobs=(logprobs or 0), + #cumulative_logprob=(0. if logprobs else None), + #logprobs=([] if logprobs else None), + #prompt_logprobs=([] if prompt_logprobs else None), + #num_prompt_logprobs=(prompt_logprobs or 0), + #num_logprobs=(logprobs or 0), ) def _update_sample_logprobs( @@ -330,18 +330,18 @@ def add_tokens( finish_reason = "stop" # TODO: use constant stop_reason = stop_str - # 3) Make Sample Logprobs. - logprobs = self._update_sample_logprobs( - new_token_ids, - new_logprobs_token_ids, - new_logprobs, - ) + # # 3) Make Sample Logprobs. + # logprobs = self._update_sample_logprobs( + # new_token_ids, + # new_logprobs_token_ids, + # new_logprobs, + # ) - # 4) Make Prompt Logprobs. - prompt_logprobs = self._update_prompt_logprobs( - new_prompt_logprobs_token_ids, - new_prompt_logprobs, - ) + # # 4) Make Prompt Logprobs. 
+ # prompt_logprobs = self._update_prompt_logprobs( + # new_prompt_logprobs_token_ids, + # new_prompt_logprobs, + # ) # 5) Makes the RequestOutput object with the new text. finished = bool(finish_reason) @@ -352,8 +352,8 @@ def add_tokens( delta = self.output_kind == RequestOutputKind.DELTA output_text = self._get_next_output_text(finished, delta) token_ids = new_token_ids if delta else self.output_token_ids - logprobs = logprobs if delta else self.logprobs - prompt_logprobs = prompt_logprobs if delta else self.prompt_logprobs + # logprobs = logprobs if delta else self.logprobs + # prompt_logprobs = prompt_logprobs if delta else self.prompt_logprobs request_output = RequestOutput.new( self.request_id, @@ -361,9 +361,9 @@ def add_tokens( self.prompt_token_ids, output_text, token_ids, - logprobs, - prompt_logprobs, - self.cumulative_logprob, + None, + None, + None, finished, ) From edd0f94b145ea2029af46c4923fd2c47d2193dc2 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 8 Jan 2025 09:50:23 +0000 Subject: [PATCH 2/4] removed scheduler logprobs code Signed-off-by: Andrew Feldman --- vllm/v1/core/scheduler.py | 73 ++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index e1ccbab5948b9..7c6c6183fadab 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -3,8 +3,6 @@ from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set, Tuple, Union) -import torch - from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.logger import init_logger from vllm.sampling_params import SamplingParams @@ -14,6 +12,8 @@ from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus +# import torch + if TYPE_CHECKING: from vllm.multimodal import MultiModalKwargs from vllm.multimodal.base import PlaceholderRange @@ -403,9 +403,9 @@ def update_from_output( ) -> List[EngineCoreOutput]: # NOTE(woosuk): This method doesn't consider speculative decoding. sampled_token_ids = model_runner_output.sampled_token_ids - logprobs_token_ids_cpu = model_runner_output.logprob_token_ids_cpu - logprobs_cpu = model_runner_output.logprobs_cpu - prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict + # logprobs_token_ids_cpu = model_runner_output.logprob_token_ids_cpu + # logprobs_cpu = model_runner_output.logprobs_cpu + # prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: List[Request] = [] engine_core_outputs: List[EngineCoreOutput] = [] @@ -427,9 +427,9 @@ def update_from_output( # in the decoder's KV cache. self.encoder_cache_manager.free(request, input_id) - # Extract prompt logprobs for this req if needed. - prompt_logprobs, prompt_logprobs_token_ids = ( - prompt_logprobs_dict.get(req_id, (None, None))) + # # Extract prompt logprobs for this req if needed. + # prompt_logprobs, prompt_logprobs_token_ids = ( + # prompt_logprobs_dict.get(req_id, (None, None))) if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] @@ -444,15 +444,15 @@ def update_from_output( # This must be called before me make the EngineCoreOutput. stopped = self._check_stop(request) - # Extract sample logprobs if needed. 
- logprobs_token_ids: List[torch.Tensor] = [] - logprobs: List[torch.Tensor] = [] - if request.sampling_params.logprobs: - assert logprobs_token_ids_cpu is not None - assert logprobs_cpu is not None - # Here we assume there is 1 generated token per step. - logprobs_token_ids = [logprobs_token_ids_cpu[req_index]] - logprobs = [logprobs_cpu[req_index]] + # # Extract sample logprobs if needed. + # logprobs_token_ids: List[torch.Tensor] = [] + # logprobs: List[torch.Tensor] = [] + # if request.sampling_params.logprobs: + # assert logprobs_token_ids_cpu is not None + # assert logprobs_cpu is not None + # # Here we assume there is 1 generated token per step. + # logprobs_token_ids = [logprobs_token_ids_cpu[req_index]] + # logprobs = [logprobs_cpu[req_index]] # Add EngineCoreOutput for this Request. output = EngineCoreOutput( @@ -461,31 +461,32 @@ def update_from_output( finished=request.is_finished(), finish_reason=request.get_finished_reason(), stop_reason=request.stop_reason, - logprobs_token_ids=logprobs_token_ids, - logprobs=logprobs, - prompt_logprobs_token_ids=prompt_logprobs_token_ids, - prompt_logprobs=prompt_logprobs) + logprobs_token_ids=[], + logprobs=[], + prompt_logprobs_token_ids=None, + prompt_logprobs=None) engine_core_outputs.append(output) # Breakout of the loop. if stopped: continue - elif prompt_logprobs is not None: - # Chunked prefill & prompt logprobs is enabled; transmit partial - # logprobs via EngineCoreOutput - # Add EngineCoreOutput for this Request. - output = EngineCoreOutput( - request_id=req_id, - new_token_ids=[], - finished=request.is_finished(), - finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason, - logprobs_token_ids=[], - logprobs=[], - prompt_logprobs_token_ids=prompt_logprobs_token_ids, - prompt_logprobs=prompt_logprobs) - engine_core_outputs.append(output) + # elif prompt_logprobs is not None: + # # Chunked prefill & prompt logprobs is enabled; + # transmit partial + # # logprobs via EngineCoreOutput + # # Add EngineCoreOutput for this Request. + # output = EngineCoreOutput( + # request_id=req_id, + # new_token_ids=[], + # finished=request.is_finished(), + # finish_reason=request.get_finished_reason(), + # stop_reason=request.stop_reason, + # logprobs_token_ids=[], + # logprobs=[], + # prompt_logprobs_token_ids=prompt_logprobs_token_ids, + # prompt_logprobs=prompt_logprobs) + # engine_core_outputs.append(output) new_running.append(request) self.running = new_running From 8482b5c32e7d128f794d8fe152b684714bff6791 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 8 Jan 2025 10:10:45 +0000 Subject: [PATCH 3/4] disabled logprobs and loras process_inputs checks Signed-off-by: Andrew Feldman --- vllm/v1/engine/processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index cd55c37c4d77f..375e01a9617ab 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -92,8 +92,8 @@ def process_inputs( # TODO(woosuk): Support pooling models. # TODO(woosuk): Support encoder-decoder models. 
- self._validate_logprobs(params) - self._validate_lora(lora_request) + # self._validate_logprobs(params) + # self._validate_lora(lora_request) if arrival_time is None: arrival_time = time.time() From d2fcc64fa2c4c1973891f0dbba94c80288afc072 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 8 Jan 2025 10:26:30 +0000 Subject: [PATCH 4/4] further stripped down scheduler code Signed-off-by: Andrew Feldman --- vllm/v1/core/scheduler.py | 6 +----- vllm/v1/engine/__init__.py | 8 ++++---- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 7c6c6183fadab..9431e1903d5bf 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -460,11 +460,7 @@ def update_from_output( new_token_ids=request.output_token_ids[-num_new_tokens:], finished=request.is_finished(), finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason, - logprobs_token_ids=[], - logprobs=[], - prompt_logprobs_token_ids=None, - prompt_logprobs=None) + stop_reason=request.stop_reason) engine_core_outputs.append(output) # Breakout of the loop. diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index be6c7a441eaab..2e3943b81e544 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -41,11 +41,11 @@ class EngineCoreOutput( request_id: str new_token_ids: List[int] - logprobs: List[torch.Tensor] - logprobs_token_ids: List[torch.Tensor] - prompt_logprobs: Optional[torch.Tensor] - prompt_logprobs_token_ids: Optional[torch.Tensor] finished: bool + logprobs: List[torch.Tensor] = [] + logprobs_token_ids: List[torch.Tensor] = [] + prompt_logprobs: Optional[torch.Tensor] = None + prompt_logprobs_token_ids: Optional[torch.Tensor] = None finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None
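
Note on the pattern the patches above apply: the logprobs-related parameters and fields are given optional defaults (None, 0, or an empty list) so call sites can simply omit them while logprobs support is stripped out. Below is a minimal standalone sketch of that pattern using plain dataclasses; the names here (ToyEngineCoreOutput, the float payloads) are stand-ins for illustration only, not the actual vLLM classes, their constructors, or their field types.

from dataclasses import dataclass, field
from typing import List, Optional, Union


@dataclass
class ToyEngineCoreOutput:
    # Required per-step fields stay as-is.
    request_id: str
    new_token_ids: List[int]
    finished: bool
    # Logprobs payloads default to empty/None, so a producer that no longer
    # computes them (as in the scheduler changes above) can drop the arguments.
    logprobs: List[float] = field(default_factory=list)
    prompt_logprobs: Optional[List[float]] = None
    finish_reason: Optional[str] = None
    stop_reason: Union[int, str, None] = None


# Call site after the change: no logprobs arguments required.
output = ToyEngineCoreOutput(
    request_id="req-0",
    new_token_ids=[42],
    finished=False,
)
print(output.logprobs, output.prompt_logprobs)  # [] None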