[Hardware][Misc] Make device agnostic #8961

Open · wants to merge 1 commit into main
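The diff below swaps the ad-hoc device probes in `vllm.utils` (`is_xpu`, `is_openvino`, `is_neuron`, `is_pin_memory_available`) for methods on the `current_platform` object exported by `vllm.platforms`. The snippet below is only a sketch of that interface shape, assuming a platform object resolved once at import time; it is not the actual `vllm.platforms` implementation.

```python
# Illustrative sketch only -- not the real vllm.platforms implementation.
# It shows the interface shape the diff relies on: device probes live on a
# single platform object instead of free functions scattered in vllm.utils.
from dataclasses import dataclass


@dataclass(frozen=True)
class _PlatformSketch:
    device_type: str

    def is_xpu(self) -> bool:
        return self.device_type == "xpu"

    def is_openvino(self) -> bool:
        return self.device_type == "openvino"

    def is_neuron(self) -> bool:
        return self.device_type == "neuron"

    def is_pin_memory_available(self) -> bool:
        # Pinned host memory only helps when a discrete accelerator is present.
        return self.device_type in ("cuda", "rocm", "xpu")


# Hypothetical detection result; the real package resolves this at import time.
current_platform = _PlatformSketch(device_type="cuda")
```

Call sites then read `current_platform.is_xpu()` instead of importing `is_xpu` from `vllm.utils`, which keeps per-device logic behind one object.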
3 changes: 2 additions & 1 deletion tests/kernels/test_attention_selector.py
@@ -29,7 +29,8 @@ def test_env(name: str, device: str, monkeypatch):
torch.float16, 16)
assert backend.name == "ROCM_FLASH"
elif device == "openvino":
with patch("vllm.attention.selector.is_openvino", return_value=True):
with patch("vllm.attention.selector.current_platform.is_openvino",
return_value=True):
backend = which_attn_to_use(8, 16, 8, None, torch.float16,
torch.float16, 16)
assert backend.name == "OPENVINO"
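Because the probe is now an attribute of a shared platform object, the test patches the attribute path on the selector module instead of a standalone function. A minimal, self-contained sketch of that pattern follows; every name in it is a placeholder, not a vLLM API.

```python
# Self-contained sketch of the patching pattern above: the device probe is a
# method on a shared object, so tests patch the attribute path rather than a
# free function. All names below are placeholders, not vLLM APIs.
from types import SimpleNamespace
from unittest.mock import patch

# Stand-in for vllm.platforms.current_platform as imported by the selector.
current_platform = SimpleNamespace(is_openvino=lambda: False)


def which_backend() -> str:
    """Stand-in for which_attn_to_use(): choose a backend from platform probes."""
    if current_platform.is_openvino():
        return "OPENVINO"
    return "FLASH_ATTN"


def test_openvino_selected() -> None:
    # Mirrors patch("vllm.attention.selector.current_platform.is_openvino", ...)
    with patch(f"{__name__}.current_platform.is_openvino", return_value=True):
        assert which_backend() == "OPENVINO"


test_openvino_selected()
```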
13 changes: 7 additions & 6 deletions tests/samplers/test_sampler.py
@@ -12,8 +12,9 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_random_seed
from vllm.platforms import current_platform
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import Counter, is_pin_memory_available
from vllm.utils import Counter


class MockLogitsSampler(Sampler):
@@ -69,7 +70,7 @@ def _do_sample(
seq_lens,
query_lens=seq_lens,
device=device,
pin_memory=is_pin_memory_available())
pin_memory=current_platform.is_pin_memory_available())
return sampler(logits=input_tensor, sampling_metadata=sampling_metadata)


@@ -416,7 +417,7 @@ def run_test_case(*, expected_penalization: List[bool],
seq_lens=seq_lens if seq_lens else None,
query_lens=seq_lens if seq_lens else [1] * batch_size,
device=device,
pin_memory=is_pin_memory_available())
pin_memory=current_platform.is_pin_memory_available())
# the logits tensor is modified in-place by the sampler
_ = sampler(logits=fake_logits, sampling_metadata=sampling_metadata)

@@ -498,7 +499,7 @@ def test_sampling():
seq_lens,
query_lens=seq_lens,
device=device,
pin_memory=is_pin_memory_available(),
pin_memory=current_platform.is_pin_memory_available(),
generators=generators)
sampler_output = sampler(logits=fake_logits,
sampling_metadata=sampling_metadata)
@@ -607,7 +608,7 @@ class MockConfig:
seq_lens,
query_lens=seq_lens,
device=device,
pin_memory=is_pin_memory_available())
pin_memory=current_platform.is_pin_memory_available())

sample_probs = None

@@ -687,7 +688,7 @@ def test_sampling_params(sampling_params: List[SamplingParams]):
seq_lens,
query_lens=seq_lens,
device=device,
pin_memory=is_pin_memory_available())
pin_memory=current_platform.is_pin_memory_available())

fake_logits = torch.full((2, vocab_size),
1e-2,
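All five sampler-test hunks make the same substitution: the `pin_memory` flag passed when building `SamplingMetadata` now comes from `current_platform.is_pin_memory_available()`. A short sketch of why that flag exists, in plain PyTorch with no vLLM imports (the CUDA probe below stands in for the platform check):

```python
# Sketch: pinned (page-locked) host tensors allow asynchronous host-to-device
# copies; platforms without that support must pass pin_memory=False.
import torch

pin_memory = torch.cuda.is_available()  # stand-in for the platform probe

seq_lens = torch.tensor([7, 12, 3], dtype=torch.int32, pin_memory=pin_memory)
if pin_memory:
    # non_blocking only takes effect when the source tensor is pinned.
    seq_lens_gpu = seq_lens.to("cuda", non_blocking=True)
```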
4 changes: 2 additions & 2 deletions tests/test_logits_processor.py
@@ -8,8 +8,8 @@
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_random_seed
from vllm.platforms import current_platform
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import is_pin_memory_available


class MockLogitsProcessor(LogitsProcessor):
@@ -81,7 +81,7 @@ def pick_ith(token_ids, logits):
seq_lens,
query_lens=seq_lens,
device=device,
pin_memory=is_pin_memory_available())
pin_memory=current_platform.is_pin_memory_available())
logits_processor_output = logits_processor(
lm_head=None,
hidden_states=input_tensor,
8 changes: 4 additions & 4 deletions vllm/attention/selector.py
@@ -10,7 +10,7 @@
from vllm.attention.backends.abstract import AttentionBackend
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR, is_cpu, is_hip, is_openvino, is_xpu
from vllm.utils import STR_BACKEND_ENV_VAR, is_cpu, is_hip

logger = init_logger(__name__)

@@ -133,7 +133,7 @@ def get_attn_backend(
from vllm.attention.backends.openvino import OpenVINOAttentionBackend
return OpenVINOAttentionBackend
elif backend == _Backend.IPEX:
assert is_xpu(), RuntimeError(
assert current_platform.is_xpu(), RuntimeError(
"IPEX attention backend is only used for the XPU device.")
logger.info("Using IPEX attention backend.")
from vllm.attention.backends.ipex_attn import IpexAttnBackend
@@ -183,12 +183,12 @@ def which_attn_to_use(
logger.info("Cannot use %s backend on CPU.", selected_backend)
return _Backend.TORCH_SDPA

if is_openvino():
if current_platform.is_openvino():
if selected_backend != _Backend.OPENVINO:
logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
return _Backend.OPENVINO

if is_xpu():
if current_platform.is_xpu():
if selected_backend != _Backend.IPEX:
logger.info("Cannot use %s backend on XPU.", selected_backend)
return _Backend.IPEX
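After this change `which_attn_to_use` mixes the two styles: CPU and HIP checks still come from `vllm.utils`, while OpenVINO and XPU go through `current_platform`. A rough sketch of the resulting cascade, with an illustrative enum and a single `platform` parameter standing in for that mix of probes:

```python
# Rough sketch of the selection cascade; backend names are illustrative and
# the `platform` argument stands in for the mix of vllm.utils helpers and
# current_platform methods used in the real function.
import enum


class Backend(enum.Enum):
    TORCH_SDPA = enum.auto()
    OPENVINO = enum.auto()
    IPEX = enum.auto()
    FLASH_ATTN = enum.auto()


def pick_backend(platform, selected: Backend) -> Backend:
    if platform.is_cpu():
        return Backend.TORCH_SDPA   # CPU can only run the SDPA backend
    if platform.is_openvino():
        return Backend.OPENVINO
    if platform.is_xpu():
        return Backend.IPEX         # XPU devices use the IPEX backend
    return selected                 # otherwise honor the requested backend
```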
53 changes: 26 additions & 27 deletions vllm/config.py
@@ -17,8 +17,7 @@
get_hf_image_processor_config,
get_hf_text_config)
from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory,
is_hip, is_neuron, is_openvino, is_xpu,
print_warning_once)
is_hip, print_warning_once)

if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
@@ -39,8 +38,8 @@ class ModelConfig:

Args:
model: Name or path of the huggingface model to use.
It is also used as the content for `model_name` tag in metrics
output when `served_model_name` is not specified.
It is also used as the content for `model_name` tag in metrics
output when `served_model_name` is not specified.
tokenizer: Name or path of the huggingface tokenizer to use.
tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
available, "slow" will always use the slow tokenizer, and
@@ -91,15 +90,15 @@ class ModelConfig:
skip_tokenizer_init: If true, skip initialization of tokenizer and
detokenizer.
served_model_name: The model name used in metrics tag `model_name`,
matches the model name exposed via the APIs. If multiple model
names provided, the first name will be used. If not specified,
matches the model name exposed via the APIs. If multiple model
names provided, the first name will be used. If not specified,
the model name will be the same as `model`.
limit_mm_per_prompt: Maximum number of data instances per modality
limit_mm_per_prompt: Maximum number of data instances per modality
per prompt. Only applicable for multimodal models.
override_neuron_config: Initialize non default neuron config or
override default neuron config that are specific to Neuron devices,
this argument will be used to configure the neuron config that
can not be gathered from the vllm arguments.
override_neuron_config: Initialize non default neuron config or
override default neuron config that are specific to Neuron devices,
this argument will be used to configure the neuron config that
can not be gathered from the vllm arguments.
config_format: The config format which shall be loaded.
Defaults to 'auto' which defaults to 'hf'.
mm_processor_kwargs: Arguments to be forwarded to the model's processor
@@ -196,8 +195,8 @@ def __init__(self,
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()

self.override_neuron_config = override_neuron_config if is_neuron(
) else None
self.override_neuron_config = \
override_neuron_config if current_platform.is_neuron() else None
self._verify_embedding_mode()
self._verify_quantization()
self._verify_cuda_graph()
@@ -302,7 +301,7 @@ def _verify_quantization(self) -> None:
"Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
" is not set, enabling VLLM_USE_TRITON_AWQ.")
envs.VLLM_USE_TRITON_AWQ = True
if is_neuron(
if current_platform.is_neuron(
) and self.quantization not in neuron_supported_quantization:
raise ValueError(
f"{self.quantization} quantization is currently not "
@@ -742,7 +741,7 @@ class LoadConfig:
fast weight loading.
"bitsandbytes" will load nf4 type weights.
ignore_patterns: The list of patterns to ignore when loading the model.
Default to "original/**/*" to avoid repeated loading of llama's
Default to "original/**/*" to avoid repeated loading of llama's
checkpoints.

"""
@@ -929,7 +928,7 @@ class SchedulerConfig:
enable_chunked_prefill: If True, prefill requests can be chunked based
on the remaining max_num_batched_tokens.
embedding_mode: Whether the running model is for embedding.
preemption_mode: Whether to perform preemption by swapping or
preemption_mode: Whether to perform preemption by swapping or
recomputation. If not specified, we determine the mode as follows:
We use recomputation by default since it incurs lower overhead than
swapping. However, when the sequence group has multiple sequences
@@ -1050,15 +1049,15 @@ def __init__(self, device: str = "auto") -> None:
# Automated device type detection
if current_platform.is_cuda_alike():
self.device_type = "cuda"
elif is_neuron():
elif current_platform.is_neuron():
self.device_type = "neuron"
elif is_openvino():
elif current_platform.is_openvino():
self.device_type = "openvino"
elif current_platform.is_tpu():
self.device_type = "tpu"
elif current_platform.is_cpu():
self.device_type = "cpu"
elif is_xpu():
elif current_platform.is_xpu():
self.device_type = "xpu"
else:
raise RuntimeError("Failed to infer device type")
@@ -1154,7 +1153,7 @@ def maybe_create_spec_config(
typical_acceptance_sampler_posterior_threshold (Optional[float]):
A threshold value that sets a lower bound on the posterior
probability of a token in the target model for it to be
accepted. This threshold is used only when we use the
accepted. This threshold is used only when we use the
TypicalAcceptanceSampler for token acceptance.
typical_acceptance_sampler_posterior_alpha (Optional[float]):
A scaling factor for the entropy-based threshold in the
@@ -1164,7 +1163,7 @@
If set to False, token log probabilities are returned
according to the log probability settings in SamplingParams.
If not specified, it defaults to True.

Returns:
Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
the necessary conditions are met, else None.
@@ -1411,13 +1410,13 @@ def __init__(
typical_acceptance_sampler_posterior_threshold (Optional[float]):
A threshold value that sets a lower bound on the posterior
probability of a token in the target model for it to be
accepted. This threshold is used only when we use the
accepted. This threshold is used only when we use the
TypicalAcceptanceSampler for token acceptance.
typical_acceptance_sampler_posterior_alpha (Optional[float]):
A scaling factor for the entropy-based threshold in the
TypicalAcceptanceSampler.
disable_logprobs: If set to True, token log probabilities will not
be returned even if requested by sampling parameters. This
be returned even if requested by sampling parameters. This
reduces latency by skipping logprob calculation in proposal
sampling, target sampling, and after accepted tokens are
determined. If set to False, log probabilities will be
@@ -1778,10 +1777,10 @@ def _get_and_verify_max_len(
def get_served_model_name(model: str,
served_model_name: Optional[Union[str, List[str]]]):
"""
If the input is a non-empty list, the first model_name in
`served_model_name` is taken.
If the input is a non-empty string, it is used directly.
For cases where the input is either an empty string or an
If the input is a non-empty list, the first model_name in
`served_model_name` is taken.
If the input is a non-empty string, it is used directly.
For cases where the input is either an empty string or an
empty list, the fallback is to use `self.model`.
"""
if not served_model_name:
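The `DeviceConfig.__init__` hunk above is the clearest beneficiary: every auto-detection branch can now ask the same object. A compact sketch of that cascade (the helper function is made up; the probes mirror the `current_platform` methods in the hunk):

```python
# Compact sketch of the auto-detection cascade in DeviceConfig.__init__ above;
# the probes mirror current_platform methods, the helper itself is made up.
def infer_device_type(platform) -> str:
    probes = [
        ("cuda", platform.is_cuda_alike),
        ("neuron", platform.is_neuron),
        ("openvino", platform.is_openvino),
        ("tpu", platform.is_tpu),
        ("cpu", platform.is_cpu),
        ("xpu", platform.is_xpu),
    ]
    for device_type, probe in probes:
        if probe():
            return device_type
    raise RuntimeError("Failed to infer device type")
```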
4 changes: 2 additions & 2 deletions vllm/executor/ray_utils.py
@@ -10,7 +10,7 @@
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.sequence import ExecuteModelRequest, IntermediateTensors
from vllm.utils import get_ip, is_hip, is_xpu
from vllm.utils import get_ip, is_hip
from vllm.worker.worker_base import WorkerWrapperBase

logger = init_logger(__name__)
@@ -231,7 +231,7 @@ def initialize_ray_cluster(
assert_ray_available()

# Connect to a ray cluster.
if is_hip() or is_xpu():
if is_hip() or current_platform.is_xpu():
ray.init(address=ray_address,
ignore_reinit_error=True,
num_gpus=parallel_config.world_size)
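The Ray bootstrap keeps `is_hip()` from `vllm.utils` but reads the XPU probe from `current_platform`. A minimal sketch of the branch, with placeholder values (assumes the `ray` package; the boolean stands in for `is_hip() or current_platform.is_xpu()`):

```python
# Minimal sketch of the branch above; values are placeholders.
import ray

world_size = 4       # stand-in for parallel_config.world_size
ray_address = None   # None starts or attaches to a local cluster

needs_explicit_gpus = True  # stand-in for is_hip() or current_platform.is_xpu()
if needs_explicit_gpus:
    # Declare the accelerator count explicitly rather than relying on Ray's
    # GPU auto-detection.
    ray.init(address=ray_address, ignore_reinit_error=True,
             num_gpus=world_size)
else:
    ray.init(address=ray_address, ignore_reinit_error=True)
```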
5 changes: 3 additions & 2 deletions vllm/lora/lora.py
@@ -4,7 +4,7 @@
import torch
import torch.types

from vllm.utils import is_pin_memory_available
from vllm.platforms import current_platform


class LoRALayerWeights:
@@ -67,7 +67,8 @@ def create_dummy_lora_weights(
dtype: torch.dtype,
device: torch.types.Device,
embeddings_tensor_dim: Optional[int] = None) -> "LoRALayerWeights":
pin_memory = str(device) == "cpu" and is_pin_memory_available()
pin_memory = str(device) == "cpu" \
and current_platform.is_pin_memory_available()
lora_a = torch.zeros([input_dim, rank],
dtype=dtype,
device=device,
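The guard above only pins LoRA buffers that actually live on the CPU, and only when the platform reports pinned-memory support. A small sketch of the same guard in isolation (`make_zeros` is a hypothetical helper, not a vLLM function):

```python
# Small sketch of the pinning guard; `make_zeros` is a hypothetical helper.
import torch


def make_zeros(shape, dtype, device, platform) -> torch.Tensor:
    # Pinning is a property of host memory, so it only applies to CPU tensors,
    # and only when the platform supports page-locked allocations.
    pin_memory = str(device) == "cpu" and platform.is_pin_memory_available()
    return torch.zeros(shape, dtype=dtype, device=device,
                       pin_memory=pin_memory)
```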
7 changes: 4 additions & 3 deletions vllm/lora/models.py
@@ -27,7 +27,7 @@
from vllm.model_executor.models import SupportsLoRA, supports_multimodal
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.utils import PPMissingLayer
from vllm.utils import is_pin_memory_available
from vllm.platforms import current_platform

logger = init_logger(__name__)

@@ -115,7 +115,8 @@ def from_lora_tensors(
embedding_padding_modules: Optional[List[str]] = None,
) -> "LoRAModel":
"""Create a LoRAModel from a dictionary of tensors."""
pin_memory = str(device) == "cpu" and is_pin_memory_available()
pin_memory = (str(device) == "cpu"
and current_platform.is_pin_memory_available())
loras: Dict[str, LoRALayerWeights] = {}
for tensor_name, tensor in tensors.items():
module_name, is_lora_a = parse_fine_tuned_lora_name(tensor_name)
@@ -177,7 +178,7 @@ def from_local_checkpoint(
embedding_padding_modules: Optional[List[str]] = None,
) -> "LoRAModel":
"""Create a LoRAModel from a local checkpoint.

Args:
lora_dir: The local path that has lora data.
expected_lora_modules: Name of modules that are expected to be
4 changes: 2 additions & 2 deletions vllm/model_executor/custom_op.py
@@ -2,7 +2,7 @@

import vllm.envs as envs
from vllm.platforms import current_platform
from vllm.utils import is_cpu, is_hip, is_xpu
from vllm.utils import is_cpu, is_hip


class CustomOp(nn.Module):
@@ -64,7 +64,7 @@ def dispatch_forward(self):
return self.forward_cpu
elif current_platform.is_tpu():
return self.forward_tpu
elif is_xpu():
elif current_platform.is_xpu():
return self.forward_xpu
else:
return self.forward_cuda
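`CustomOp.dispatch_forward` picks a per-backend forward implementation from the same probes. A self-contained sketch of the dispatch idea (the class and its backend methods are illustrative stand-ins, not the vLLM `CustomOp` API beyond what the hunk shows):

```python
# Self-contained sketch of the dispatch idea in CustomOp.dispatch_forward;
# class and backend methods here are illustrative stand-ins.
import torch
import torch.nn as nn


class PlatformDispatchedOp(nn.Module):

    def __init__(self, platform):
        super().__init__()
        # Resolve the implementation once, at construction time.
        if platform.is_cpu():
            self._forward = self.forward_cpu
        elif platform.is_tpu():
            self._forward = self.forward_tpu
        elif platform.is_xpu():
            self._forward = self.forward_xpu
        else:
            self._forward = self.forward_cuda

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self._forward(x)

    # Real implementations would differ per backend; identity keeps it runnable.
    def forward_cpu(self, x):
        return x

    forward_tpu = forward_xpu = forward_cuda = forward_cpu
```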
7 changes: 3 additions & 4 deletions vllm/model_executor/model_loader/loader.py
@@ -45,7 +45,6 @@
supports_multimodal)
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.utils import is_pin_memory_available


@contextmanager
@@ -70,7 +69,7 @@ def device_loading_context(module: torch.nn.Module,

finally:
# Restore parameters to their original devices, ignoring new parameters
pin_memory = is_pin_memory_available()
pin_memory = current_platform.is_pin_memory_available()
for name, p in module.named_parameters():
if name in original_device_states:
original_device: torch.device = original_device_states[name]
@@ -794,8 +793,8 @@ def _get_weight_files(
model_name_or_path: str,
allowed_patterns: List[str],
revision: Optional[str] = None) -> Tuple[List[str], str]:
"""Retrieve weight files. Download the files if necessary.
"""Retrieve weight files. Download the files if necessary.

Return the weight files and the file pattern."""
is_local = os.path.isdir(model_name_or_path)

4 changes: 2 additions & 2 deletions vllm/model_executor/models/utils.py
@@ -14,8 +14,8 @@
from vllm.model_executor.model_loader.loader import build_model
from vllm.model_executor.models import ModelRegistry
from vllm.multimodal.base import NestedTensors
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
from vllm.utils import is_pin_memory_available


class WeightsGroup(UserDict):
@@ -215,7 +215,7 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
return module

pin_memory = is_pin_memory_available()
pin_memory = current_platform.is_pin_memory_available()

# offload parameters to CPU
# use pin_memory if possible, which helps cudagraph capture speed
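`maybe_offload_to_cpu` is the last call site: parameters moved to host memory are pinned when the platform allows it, which the comment in the hunk ties to CUDA graph capture speed. A hedged sketch of that offload step (the helper name is hypothetical):

```python
# Hedged sketch of the offload step; `offload_params_to_cpu` is hypothetical.
import torch


def offload_params_to_cpu(module: torch.nn.Module, pin_memory: bool) -> None:
    for p in module.parameters():
        # Allocate a host buffer (pinned if supported), copy, and rebind so
        # later device transfers can be asynchronous.
        cpu_data = torch.empty(p.data.size(), dtype=p.data.dtype,
                               device="cpu", pin_memory=pin_memory)
        cpu_data.copy_(p.data)
        p.data = cpu_data
```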