Skip to content

Commit

Permalink
[Platform] Move async output check to platform (vllm-project#10768)
Browse files Browse the repository at this point in the history
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
  • Loading branch information
wangxiyuan authored Dec 9, 2024
1 parent e691b26 commit aea2fc3
Show file tree
Hide file tree
Showing 10 changed files with 66 additions and 22 deletions.
17 changes: 3 additions & 14 deletions vllm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,11 +513,10 @@ def verify_async_output_proc(self, parallel_config, speculative_config,

# Reminder: Please update docs/source/usage/compatibility_matrix.rst
# if the feature combination becomes valid.
if device_config.device_type not in ("cuda", "tpu", "xpu", "hpu"):
if not current_platform.is_async_output_supported(self.enforce_eager):
logger.warning(
"Async output processing is only supported for CUDA, TPU, XPU "
"and HPU."
"Disabling it for other platforms.")
"Async output processing is not supported on the "
"current platform type %s.", current_platform.device_type)
self.use_async_output_proc = False
return

Expand All @@ -527,16 +526,6 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
self.use_async_output_proc = False
return

# Reminder: Please update docs/source/usage/compatibility_matrix.rst
# if the feature combination becomes valid.
if device_config.device_type == "cuda" and self.enforce_eager:
logger.warning(
"To see benefits of async output processing, enable CUDA "
"graph. Since, enforce-eager is enabled, async output "
"processor cannot be used")
self.use_async_output_proc = not self.enforce_eager
return

# Async postprocessor is not necessary with embedding mode
# since there is no token generation
if self.task == "embedding":
Expand Down
6 changes: 5 additions & 1 deletion vllm/platforms/cpu.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional

import psutil
import torch
Expand Down Expand Up @@ -37,6 +37,10 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
def get_device_total_memory(cls, device_id: int = 0) -> int:
return psutil.virtual_memory().total

@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
return False

@classmethod
def inference_mode(cls):
return torch.no_grad()
Expand Down
12 changes: 11 additions & 1 deletion vllm/platforms/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import os
from functools import lru_cache, wraps
from typing import TYPE_CHECKING, Callable, List, TypeVar
from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar

import pynvml
import torch
Expand Down Expand Up @@ -88,6 +88,16 @@ def get_device_name(cls, device_id: int = 0) -> str:
def get_device_total_memory(cls, device_id: int = 0) -> int:
raise NotImplementedError

@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
if enforce_eager:
logger.warning(
"To see benefits of async output processing, enable CUDA "
"graph. Since, enforce-eager is enabled, async output "
"processor cannot be used")
return False
return True

@classmethod
def is_full_nvlink(cls, device_ids: List[int]) -> bool:
raise NotImplementedError
Expand Down
6 changes: 5 additions & 1 deletion vllm/platforms/hpu.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional

import torch

Expand All @@ -20,6 +20,10 @@ class HpuPlatform(Platform):
def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
return _Backend.HPU_ATTN

@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
return True

@staticmethod
def inference_mode():
return torch.no_grad()
Expand Down
11 changes: 11 additions & 0 deletions vllm/platforms/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,15 @@
import numpy as np
import torch

from vllm.logger import init_logger

if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None

logger = init_logger(__name__)


class _Backend(enum.Enum):
FLASH_ATTN = enum.auto()
Expand Down Expand Up @@ -147,6 +151,13 @@ def get_device_total_memory(cls, device_id: int = 0) -> int:
"""Get the total memory of a device in bytes."""
raise NotImplementedError

@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
"""
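Check if the current platform supports async output processing.

`enforce_eager` is passed in because some platforms (e.g. CUDA and
ROCm) report async output processing as unsupported when eager mode
is enforced.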
"""
raise NotImplementedError

@classmethod
def inference_mode(cls):
"""A device-specific wrapper of `torch.inference_mode`.
Expand Down
6 changes: 5 additions & 1 deletion vllm/platforms/neuron.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional

from .interface import Platform, PlatformEnum

Expand All @@ -18,6 +18,10 @@ class NeuronPlatform(Platform):
def get_device_name(cls, device_id: int = 0) -> str:
return "neuron"

@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
return False

@classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
parallel_config = vllm_config.parallel_config
Expand Down
6 changes: 5 additions & 1 deletion vllm/platforms/openvino.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional

import torch

Expand Down Expand Up @@ -37,6 +37,10 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
def get_device_name(self, device_id: int = 0) -> str:
return "openvino"

@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
return False

@classmethod
def inference_mode(self):
return torch.inference_mode(mode=True)
Expand Down
12 changes: 11 additions & 1 deletion vllm/platforms/rocm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
from functools import lru_cache
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional

import torch

Expand Down Expand Up @@ -72,6 +72,16 @@ def get_device_total_memory(cls, device_id: int = 0) -> int:
device_props = torch.cuda.get_device_properties(device_id)
return device_props.total_memory

@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
if enforce_eager:
logger.warning(
"To see benefits of async output processing, enable CUDA "
"graph. Since, enforce-eager is enabled, async output "
"processor cannot be used")
return False
return True

@classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
parallel_config = vllm_config.parallel_config
Expand Down
6 changes: 5 additions & 1 deletion vllm/platforms/tpu.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional

import torch

Expand Down Expand Up @@ -35,6 +35,10 @@ def get_device_name(cls, device_id: int = 0) -> str:
def get_device_total_memory(cls, device_id: int = 0) -> int:
raise NotImplementedError

@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
return True

@classmethod
def inference_mode(cls):
return torch.no_grad()
Expand Down
6 changes: 5 additions & 1 deletion vllm/platforms/xpu.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional

import torch

Expand Down Expand Up @@ -41,6 +41,10 @@ def get_device_total_memory(cls, device_id: int = 0) -> int:
device_props = torch.xpu.get_device_properties(device_id)
return device_props.total_memory

@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
return True

@staticmethod
def inference_mode():
return torch.no_grad()
Expand Down

0 comments on commit aea2fc3

Please sign in to comment.