ignore #51

Draft · wants to merge 4 commits into base: afeldman-nm/v1_logprobs

6 changes: 3 additions & 3 deletions vllm/outputs.py
@@ -140,9 +140,9 @@ def new(
         prompt_token_ids: Optional[List[int]],
         text: str,
         token_ids: List[int],
-        logprobs: Optional[SampleLogprobs],
-        prompt_logprobs: Optional[PromptLogprobs],
-        cumulative_logprob: Optional[float],
+        logprobs: Optional[SampleLogprobs] = None,
+        prompt_logprobs: Optional[PromptLogprobs] = None,
+        cumulative_logprob: Optional[float] = None,
         finished: bool = False,
     ) -> "RequestOutput":
         """Initialize a new RequestOutput object."""
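With these defaults, call sites that do not track logprobs can simply omit the three trailing arguments. A minimal sketch of the simplified call, with the argument order inferred from the RequestOutput.new call in detokenizer.py further down and purely illustrative values:

```python
from vllm.outputs import RequestOutput

# Illustrative values only; the logprobs, prompt_logprobs and
# cumulative_logprob arguments now fall back to their new None defaults.
output = RequestOutput.new(
    "req-0",        # request_id
    "Hello",        # prompt
    [1, 2, 3],      # prompt_token_ids
    " world",       # text
    [4, 5],         # token_ids
    finished=False,
)
```
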
71 changes: 34 additions & 37 deletions vllm/v1/core/scheduler.py
@@ -3,8 +3,6 @@
 from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set,
                     Tuple, Union)
 
-import torch
-
 from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
@@ -14,6 +12,8 @@
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 
+# import torch
+
 if TYPE_CHECKING:
     from vllm.multimodal import MultiModalKwargs
     from vllm.multimodal.base import PlaceholderRange
@@ -403,9 +403,9 @@ def update_from_output(
     ) -> List[EngineCoreOutput]:
         # NOTE(woosuk): This method doesn't consider speculative decoding.
         sampled_token_ids = model_runner_output.sampled_token_ids
-        logprobs_token_ids_cpu = model_runner_output.logprob_token_ids_cpu
-        logprobs_cpu = model_runner_output.logprobs_cpu
-        prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict
+        # logprobs_token_ids_cpu = model_runner_output.logprob_token_ids_cpu
+        # logprobs_cpu = model_runner_output.logprobs_cpu
+        # prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict
         num_scheduled_tokens = scheduler_output.num_scheduled_tokens
         new_running: List[Request] = []
         engine_core_outputs: List[EngineCoreOutput] = []
@@ -427,9 +427,9 @@ def update_from_output(
                     # in the decoder's KV cache.
                     self.encoder_cache_manager.free(request, input_id)
 
-            # Extract prompt logprobs for this req if needed.
-            prompt_logprobs, prompt_logprobs_token_ids = (
-                prompt_logprobs_dict.get(req_id, (None, None)))
+            # # Extract prompt logprobs for this req if needed.
+            # prompt_logprobs, prompt_logprobs_token_ids = (
+            #     prompt_logprobs_dict.get(req_id, (None, None)))
 
             if request.num_computed_tokens == request.num_tokens:
                 req_index = model_runner_output.req_id_to_index[req_id]
@@ -444,48 +444,45 @@
                 # This must be called before me make the EngineCoreOutput.
                 stopped = self._check_stop(request)
 
-                # Extract sample logprobs if needed.
-                logprobs_token_ids: List[torch.Tensor] = []
-                logprobs: List[torch.Tensor] = []
-                if request.sampling_params.logprobs:
-                    assert logprobs_token_ids_cpu is not None
-                    assert logprobs_cpu is not None
-                    # Here we assume there is 1 generated token per step.
-                    logprobs_token_ids = [logprobs_token_ids_cpu[req_index]]
-                    logprobs = [logprobs_cpu[req_index]]
+                # # Extract sample logprobs if needed.
+                # logprobs_token_ids: List[torch.Tensor] = []
+                # logprobs: List[torch.Tensor] = []
+                # if request.sampling_params.logprobs:
+                #     assert logprobs_token_ids_cpu is not None
+                #     assert logprobs_cpu is not None
+                #     # Here we assume there is 1 generated token per step.
+                #     logprobs_token_ids = [logprobs_token_ids_cpu[req_index]]
+                #     logprobs = [logprobs_cpu[req_index]]
 
                 # Add EngineCoreOutput for this Request.
                 output = EngineCoreOutput(
                     request_id=req_id,
                     new_token_ids=request.output_token_ids[-num_new_tokens:],
                     finished=request.is_finished(),
                     finish_reason=request.get_finished_reason(),
-                    stop_reason=request.stop_reason,
-                    logprobs_token_ids=logprobs_token_ids,
-                    logprobs=logprobs,
-                    prompt_logprobs_token_ids=prompt_logprobs_token_ids,
-                    prompt_logprobs=prompt_logprobs)
+                    stop_reason=request.stop_reason)
                 engine_core_outputs.append(output)
 
                 # Breakout of the loop.
                 if stopped:
                     continue
 
-            elif prompt_logprobs is not None:
-                # Chunked prefill & prompt logprobs is enabled; transmit partial
-                # logprobs via EngineCoreOutput
-                # Add EngineCoreOutput for this Request.
-                output = EngineCoreOutput(
-                    request_id=req_id,
-                    new_token_ids=[],
-                    finished=request.is_finished(),
-                    finish_reason=request.get_finished_reason(),
-                    stop_reason=request.stop_reason,
-                    logprobs_token_ids=[],
-                    logprobs=[],
-                    prompt_logprobs_token_ids=prompt_logprobs_token_ids,
-                    prompt_logprobs=prompt_logprobs)
-                engine_core_outputs.append(output)
+            # elif prompt_logprobs is not None:
+            #     # Chunked prefill & prompt logprobs is enabled;
+            #     # transmit partial
+            #     # logprobs via EngineCoreOutput
+            #     # Add EngineCoreOutput for this Request.
+            #     output = EngineCoreOutput(
+            #         request_id=req_id,
+            #         new_token_ids=[],
+            #         finished=request.is_finished(),
+            #         finish_reason=request.get_finished_reason(),
+            #         stop_reason=request.stop_reason,
+            #         logprobs_token_ids=[],
+            #         logprobs=[],
+            #         prompt_logprobs_token_ids=prompt_logprobs_token_ids,
+            #         prompt_logprobs=prompt_logprobs)
+            #     engine_core_outputs.append(output)
 
             new_running.append(request)
         self.running = new_running
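For readers skimming the disabled block above: the removed code indexed a per-step CPU logprobs batch by the request's row. A toy illustration of that indexing pattern with made-up shapes and names (not vLLM API, just the tensor slicing the commented-out lines performed):

```python
import torch

# Made-up shapes: 3 requests in the batch, top-2 logprobs per sampled token.
logprobs_cpu = torch.randn(3, 2)                    # [num_reqs, num_logprobs]
logprobs_token_ids_cpu = torch.randint(0, 32_000, (3, 2))

req_index = 1  # row assigned to this request in the batch
# One generated token per step, so each request contributes a single row,
# wrapped in a list exactly as the disabled scheduler code did.
step_logprobs = [logprobs_cpu[req_index]]
step_logprob_token_ids = [logprobs_token_ids_cpu[req_index]]
```
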
8 changes: 4 additions & 4 deletions vllm/v1/engine/__init__.py
@@ -41,11 +41,11 @@ class EngineCoreOutput(

     request_id: str
     new_token_ids: List[int]
-    logprobs: List[torch.Tensor]
-    logprobs_token_ids: List[torch.Tensor]
-    prompt_logprobs: Optional[torch.Tensor]
-    prompt_logprobs_token_ids: Optional[torch.Tensor]
     finished: bool
+    logprobs: List[torch.Tensor] = []
+    logprobs_token_ids: List[torch.Tensor] = []
+    prompt_logprobs: Optional[torch.Tensor] = None
+    prompt_logprobs_token_ids: Optional[torch.Tensor] = None
     finish_reason: Optional[str] = None
     stop_reason: Union[int, str, None] = None

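With these defaults, the scheduler change above can build an EngineCoreOutput without naming any logprobs field. A minimal sketch, assuming the fields shown in this hunk are the only ones without defaults and using illustrative values:

```python
from vllm.v1.engine import EngineCoreOutput

# Illustrative values; every field not passed here falls back to a default.
out = EngineCoreOutput(
    request_id="req-0",
    new_token_ids=[42],
    finished=False,
)
assert out.logprobs == [] and out.prompt_logprobs is None
```

One note on the mutable [] defaults: if EngineCoreOutput subclasses msgspec.Struct (as it appears to upstream), msgspec copies empty-container defaults per instance, so this is safe; a plain dataclass would need field(default_factory=list) instead.
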
60 changes: 30 additions & 30 deletions vllm/v1/engine/detokenizer.py
@@ -46,17 +46,17 @@ class IncrementalDetokenizer:
     # Tokenizer for this request
     tokenizer: AnyTokenizer
 
-    # Logprobs for this request
-    logprobs: Optional[SampleLogprobs]
-    prompt_logprobs: Optional[PromptLogprobs]
-    cumulative_logprob: Optional[float]
-    num_logprobs: int
-    num_prompt_logprobs: int
-
     # Accounting for stop string buffering
     stop_buffer_length: int
     _last_output_text_offset: int = 0
 
+    # Logprobs for this request
+    logprobs: Optional[SampleLogprobs] = None
+    prompt_logprobs: Optional[PromptLogprobs] = None
+    cumulative_logprob: Optional[float] = None
+    num_logprobs: int = 0
+    num_prompt_logprobs: int = 0
+
     @property
     def output_token_ids(self) -> List[int]:
         assert len(self.token_ids) >= len(self.prompt_token_ids)
@@ -83,8 +83,8 @@ def from_new_request(
         else:
             stop_buffer_length = 0
 
-        logprobs = request.sampling_params.logprobs
-        prompt_logprobs = request.sampling_params.prompt_logprobs
+        #logprobs = request.sampling_params.logprobs
+        #prompt_logprobs = request.sampling_params.prompt_logprobs
         return cls(
             output_text="",
             tokens=tokens,
@@ -105,11 +105,11 @@
             prompt_token_ids=request.prompt_token_ids,
             tokenizer=tokenizer,
             stop_buffer_length=stop_buffer_length,
-            cumulative_logprob=(0. if logprobs else None),
-            logprobs=([] if logprobs else None),
-            prompt_logprobs=([] if prompt_logprobs else None),
-            num_prompt_logprobs=(prompt_logprobs or 0),
-            num_logprobs=(logprobs or 0),
+            #cumulative_logprob=(0. if logprobs else None),
+            #logprobs=([] if logprobs else None),
+            #prompt_logprobs=([] if prompt_logprobs else None),
+            #num_prompt_logprobs=(prompt_logprobs or 0),
+            #num_logprobs=(logprobs or 0),
         )

def _update_sample_logprobs(
@@ -330,18 +330,18 @@ def add_tokens(
finish_reason = "stop" # TODO: use constant
stop_reason = stop_str

# 3) Make Sample Logprobs.
logprobs = self._update_sample_logprobs(
new_token_ids,
new_logprobs_token_ids,
new_logprobs,
)
# # 3) Make Sample Logprobs.
# logprobs = self._update_sample_logprobs(
# new_token_ids,
# new_logprobs_token_ids,
# new_logprobs,
# )

# 4) Make Prompt Logprobs.
prompt_logprobs = self._update_prompt_logprobs(
new_prompt_logprobs_token_ids,
new_prompt_logprobs,
)
# # 4) Make Prompt Logprobs.
# prompt_logprobs = self._update_prompt_logprobs(
# new_prompt_logprobs_token_ids,
# new_prompt_logprobs,
# )

# 5) Makes the RequestOutput object with the new text.
finished = bool(finish_reason)
@@ -352,18 +352,18 @@
         delta = self.output_kind == RequestOutputKind.DELTA
         output_text = self._get_next_output_text(finished, delta)
         token_ids = new_token_ids if delta else self.output_token_ids
-        logprobs = logprobs if delta else self.logprobs
-        prompt_logprobs = prompt_logprobs if delta else self.prompt_logprobs
+        # logprobs = logprobs if delta else self.logprobs
+        # prompt_logprobs = prompt_logprobs if delta else self.prompt_logprobs
 
         request_output = RequestOutput.new(
             self.request_id,
             self.prompt,
             self.prompt_token_ids,
             output_text,
             token_ids,
-            logprobs,
-            prompt_logprobs,
-            self.cumulative_logprob,
+            None,
+            None,
+            None,
             finished,
         )

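A side note on why the logprobs fields were moved below stop_buffer_length rather than edited in place: assuming IncrementalDetokenizer is a standard @dataclass, fields with defaults must come after fields without them, so the newly defaulted logprobs fields have to sit below the required ones. A tiny standalone sketch of that rule (names are illustrative, not vLLM's):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Example:
    required_field: int                     # no default: must come first
    optional_count: int = 0                 # defaulted fields follow
    optional_value: Optional[float] = None

# Example(required_field=1) is fine; placing a defaulted field before
# required_field would raise "non-default argument follows default argument"
# when the class is defined.
```
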
4 changes: 2 additions & 2 deletions vllm/v1/engine/processor.py
@@ -92,8 +92,8 @@ def process_inputs(
         # TODO(woosuk): Support pooling models.
         # TODO(woosuk): Support encoder-decoder models.
 
-        self._validate_logprobs(params)
-        self._validate_lora(lora_request)
+        # self._validate_logprobs(params)
+        # self._validate_lora(lora_request)
 
         if arrival_time is None:
             arrival_time = time.time()
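Worth flagging for review: with these two checks commented out, requests that ask for logprobs (and any LoRA requests) presumably reach the engine unvalidated, even though the logprobs plumbing is disabled in the hunks above. For example, a request like the following would no longer be rejected up front (sketch only; the SamplingParams values are illustrative):

```python
from vllm import SamplingParams

# With _validate_logprobs skipped, this is presumably accepted even though
# no logprobs will be produced while the scheduler/detokenizer paths above
# remain commented out.
params = SamplingParams(logprobs=5, prompt_logprobs=5)
```
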