[Hardware][Misc] Make device agnostic #8961

Open · wants to merge 1 commit into main
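The diff below swaps the ad-hoc device probes in `vllm.utils` (`is_xpu`, `is_openvino`, `is_neuron`, `is_pin_memory_available`) for methods on the `current_platform` object exported by `vllm.platforms`. The snippet below is only a sketch of that interface shape, assuming a platform object resolved once at import time; it is not the actual `vllm.platforms` implementation.

```python
# Illustrative sketch only -- not the real vllm.platforms implementation.
# It shows the interface shape the diff relies on: device probes live on a
# single platform object instead of free functions scattered in vllm.utils.
from dataclasses import dataclass


@dataclass(frozen=True)
class _PlatformSketch:
    device_type: str

    def is_xpu(self) -> bool:
        return self.device_type == "xpu"

    def is_openvino(self) -> bool:
        return self.device_type == "openvino"

    def is_neuron(self) -> bool:
        return self.device_type == "neuron"

    def is_pin_memory_available(self) -> bool:
        # Pinned host memory only helps when a discrete accelerator is present.
        return self.device_type in ("cuda", "rocm", "xpu")


# Hypothetical detection result; the real package resolves this at import time.
current_platform = _PlatformSketch(device_type="cuda")
```

Call sites then read `current_platform.is_xpu()` instead of importing `is_xpu` from `vllm.utils`, which keeps per-device logic behind one object.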
3 changes: 2 additions & 1 deletion tests/kernels/test_attention_selector.py
@@ -29,7 +29,8 @@ def test_env(name: str, device: str, monkeypatch):
torch.float16, 16)
assert backend.name == "ROCM_FLASH"
elif device == "openvino":
with patch("vllm.attention.selector.is_openvino", return_value=True):
with patch("vllm.attention.selector.current_platform.is_openvino",
return_value=True):
backend = which_attn_to_use(8, 16, 8, None, torch.float16,
torch.float16, 16)
assert backend.name == "OPENVINO"
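Because the probe is now an attribute of a shared platform object, the test patches the attribute path on the selector module instead of a standalone function. A minimal, self-contained sketch of that pattern follows; every name in it is a placeholder, not a vLLM API.

```python
# Self-contained sketch of the patching pattern above: the device probe is a
# method on a shared object, so tests patch the attribute path rather than a
# free function. All names below are placeholders, not vLLM APIs.
from types import SimpleNamespace
from unittest.mock import patch

# Stand-in for vllm.platforms.current_platform as imported by the selector.
current_platform = SimpleNamespace(is_openvino=lambda: False)


def which_backend() -> str:
    """Stand-in for which_attn_to_use(): choose a backend from platform probes."""
    if current_platform.is_openvino():
        return "OPENVINO"
    return "FLASH_ATTN"


def test_openvino_selected() -> None:
    # Mirrors patch("vllm.attention.selector.current_platform.is_openvino", ...)
    with patch(f"{__name__}.current_platform.is_openvino", return_value=True):
        assert which_backend() == "OPENVINO"


test_openvino_selected()
```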
13 changes: 7 additions & 6 deletions tests/samplers/test_sampler.py
@@ -12,8 +12,9 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_random_seed
from vllm.platforms import current_platform
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import Counter, is_pin_memory_available
from vllm.utils import Counter


class MockLogitsSampler(Sampler):
@@ -69,7 +70,7 @@ def _do_sample(
seq_lens,
query_lens=seq_lens,
device=device,
pin_memory=is_pin_memory_available())
pin_memory=current_platform.is_pin_memory_available())
return sampler(logits=input_tensor, sampling_metadata=sampling_metadata)


@@ -416,7 +417,7 @@ def run_test_case(*, expected_penalization: List[bool],
seq_lens=seq_lens if seq_lens else None,
query_lens=seq_lens if seq_lens else [1] * batch_size,
device=device,
pin_memory=is_pin_memory_available())
pin_memory=current_platform.is_pin_memory_available())
# the logits tensor is modified in-place by the sampler
_ = sampler(logits=fake_logits, sampling_metadata=sampling_metadata)

@@ -498,7 +499,7 @@ def test_sampling():
seq_lens,
query_lens=seq_lens,
device=device,
pin_memory=is_pin_memory_available(),
pin_memory=current_platform.is_pin_memory_available(),
generators=generators)
sampler_output = sampler(logits=fake_logits,
sampling_metadata=sampling_metadata)
@@ -607,7 +608,7 @@ class MockConfig:
seq_lens,
query_lens=seq_lens,
device=device,
pin_memory=is_pin_memory_available())
pin_memory=current_platform.is_pin_memory_available())

sample_probs = None

@@ -687,7 +688,7 @@ def test_sampling_params(sampling_params: List[SamplingParams]):
seq_lens,
query_lens=seq_lens,
device=device,
pin_memory=is_pin_memory_available())
pin_memory=current_platform.is_pin_memory_available())

fake_logits = torch.full((2, vocab_size),
1e-2,
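All five sampler-test hunks make the same substitution: the `pin_memory` flag passed when building `SamplingMetadata` now comes from `current_platform.is_pin_memory_available()`. A short sketch of why that flag exists, in plain PyTorch with no vLLM imports (the CUDA probe below stands in for the platform check):

```python
# Sketch: pinned (page-locked) host tensors allow asynchronous host-to-device
# copies; platforms without that support must pass pin_memory=False.
import torch

pin_memory = torch.cuda.is_available()  # stand-in for the platform probe

seq_lens = torch.tensor([7, 12, 3], dtype=torch.int32, pin_memory=pin_memory)
if pin_memory:
    # non_blocking only takes effect when the source tensor is pinned.
    seq_lens_gpu = seq_lens.to("cuda", non_blocking=True)
```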
4 changes: 2 additions & 2 deletions tests/test_logits_processor.py
@@ -8,8 +8,8 @@
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_random_seed
from vllm.platforms import current_platform
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import is_pin_memory_available


class MockLogitsProcessor(LogitsProcessor):
@@ -81,7 +81,7 @@ def pick_ith(token_ids, logits):
seq_lens,
query_lens=seq_lens,
device=device,
pin_memory=is_pin_memory_available())
pin_memory=current_platform.is_pin_memory_available())
logits_processor_output = logits_processor(
lm_head=None,
hidden_states=input_tensor,
8 changes: 4 additions & 4 deletions vllm/attention/selector.py
@@ -10,7 +10,7 @@
from vllm.attention.backends.abstract import AttentionBackend
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR, is_cpu, is_hip, is_openvino, is_xpu
from vllm.utils import STR_BACKEND_ENV_VAR, is_cpu, is_hip

logger = init_logger(__name__)

@@ -133,7 +133,7 @@ def get_attn_backend(
from vllm.attention.backends.openvino import OpenVINOAttentionBackend
return OpenVINOAttentionBackend
elif backend == _Backend.IPEX:
assert is_xpu(), RuntimeError(
assert current_platform.is_xpu(), RuntimeError(
"IPEX attention backend is only used for the XPU device.")
logger.info("Using IPEX attention backend.")
from vllm.attention.backends.ipex_attn import IpexAttnBackend
@@ -183,12 +183,12 @@ def which_attn_to_use(
logger.info("Cannot use %s backend on CPU.", selected_backend)
return _Backend.TORCH_SDPA

if is_openvino():
if current_platform.is_openvino():
if selected_backend != _Backend.OPENVINO:
logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
return _Backend.OPENVINO

if is_xpu():
if current_platform.is_xpu():
if selected_backend != _Backend.IPEX:
logger.info("Cannot use %s backend on XPU.", selected_backend)
return _Backend.IPEX
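After this change `which_attn_to_use` mixes the two styles: CPU and HIP checks still come from `vllm.utils`, while OpenVINO and XPU go through `current_platform`. A rough sketch of the resulting cascade, with an illustrative enum and a single `platform` parameter standing in for that mix of probes:

```python
# Rough sketch of the selection cascade; backend names are illustrative and
# the `platform` argument stands in for the mix of vllm.utils helpers and
# current_platform methods used in the real function.
import enum


class Backend(enum.Enum):
    TORCH_SDPA = enum.auto()
    OPENVINO = enum.auto()
    IPEX = enum.auto()
    FLASH_ATTN = enum.auto()


def pick_backend(platform, selected: Backend) -> Backend:
    if platform.is_cpu():
        return Backend.TORCH_SDPA   # CPU can only run the SDPA backend
    if platform.is_openvino():
        return Backend.OPENVINO
    if platform.is_xpu():
        return Backend.IPEX         # XPU devices use the IPEX backend
    return selected                 # otherwise honor the requested backend
```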
53 changes: 26 additions & 27 deletions vllm/config.py
@@ -17,8 +17,7 @@
get_hf_image_processor_config,
get_hf_text_config)
from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory,
is_hip, is_neuron, is_openvino, is_xpu,
print_warning_once)
is_hip, print_warning_once)

if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
@@ -39,8 +38,8 @@ class ModelConfig:

Args:
model: Name or path of the huggingface model to use.
It is also used as the content for `model_name` tag in metrics
output when `served_model_name` is not specified.
It is also used as the content for `model_name` tag in metrics
output when `served_model_name` is not specified.
tokenizer: Name or path of the huggingface tokenizer to use.
tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
available, "slow" will always use the slow tokenizer, and
@@ -91,15 +90,15 @@ class ModelConfig:
skip_tokenizer_init: If true, skip initialization of tokenizer and
detokenizer.
served_model_name: The model name used in metrics tag `model_name`,
matches the model name exposed via the APIs. If multiple model
names provided, the first name will be used. If not specified,
matches the model name exposed via the APIs. If multiple model
names provided, the first name will be used. If not specified,
the model name will be the same as `model`.
limit_mm_per_prompt: Maximum number of data instances per modality
limit_mm_per_prompt: Maximum number of data instances per modality
per prompt. Only applicable for multimodal models.
override_neuron_config: Initialize non default neuron config or
override default neuron config that are specific to Neuron devices,
this argument will be used to configure the neuron config that
can not be gathered from the vllm arguments.
override_neuron_config: Initialize non default neuron config or
override default neuron config that are specific to Neuron devices,
this argument will be used to configure the neuron config that
can not be gathered from the vllm arguments.
config_format: The config format which shall be loaded.
Defaults to 'auto' which defaults to 'hf'.
mm_processor_kwargs: Arguments to be forwarded to the model's processor
@@ -196,8 +195,8 @@ def __init__(self,
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()

self.override_neuron_config = override_neuron_config if is_neuron(
) else None
self.override_neuron_config = \
override_neuron_config if current_platform.is_neuron() else None
self._verify_embedding_mode()
self._verify_quantization()
self._verify_cuda_graph()
@@ -302,7 +301,7 @@ def _verify_quantization(self) -> None:
"Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
" is not set, enabling VLLM_USE_TRITON_AWQ.")
envs.VLLM_USE_TRITON_AWQ = True
if is_neuron(
if current_platform.is_neuron(
) and self.quantization not in neuron_supported_quantization:
raise ValueError(
f"{self.quantization} quantization is currently not "
@@ -742,7 +741,7 @@ class LoadConfig:
fast weight loading.
"bitsandbytes" will load nf4 type weights.
ignore_patterns: The list of patterns to ignore when loading the model.
Default to "original/**/*" to avoid repeated loading of llama's
Default to "original/**/*" to avoid repeated loading of llama's
checkpoints.

"""
@@ -929,7 +928,7 @@ class SchedulerConfig:
enable_chunked_prefill: If True, prefill requests can be chunked based
on the remaining max_num_batched_tokens.
embedding_mode: Whether the running model is for embedding.
preemption_mode: Whether to perform preemption by swapping or
preemption_mode: Whether to perform preemption by swapping or
recomputation. If not specified, we determine the mode as follows:
We use recomputation by default since it incurs lower overhead than
swapping. However, when the sequence group has multiple sequences
@@ -1050,15 +1049,15 @@ def __init__(self, device: str = "auto") -> None:
# Automated device type detection
if current_platform.is_cuda_alike():
self.device_type = "cuda"
elif is_neuron():
elif current_platform.is_neuron():
self.device_type = "neuron"
elif is_openvino():
elif current_platform.is_openvino():
self.device_type = "openvino"
elif current_platform.is_tpu():
self.device_type = "tpu"
elif current_platform.is_cpu():
self.device_type = "cpu"
elif is_xpu():
elif current_platform.is_xpu():
self.device_type = "xpu"
else:
raise RuntimeError("Failed to infer device type")
@@ -1154,7 +1153,7 @@ def maybe_create_spec_config(
typical_acceptance_sampler_posterior_threshold (Optional[float]):
A threshold value that sets a lower bound on the posterior
probability of a token in the target model for it to be
accepted. This threshold is used only when we use the
accepted. This threshold is used only when we use the
TypicalAcceptanceSampler for token acceptance.
typical_acceptance_sampler_posterior_alpha (Optional[float]):
A scaling factor for the entropy-based threshold in the
@@ -1164,7 +1163,7 @@
If set to False, token log probabilities are returned
according to the log probability settings in SamplingParams.
If not specified, it defaults to True.

Returns:
Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
the necessary conditions are met, else None.
@@ -1411,13 +1410,13 @@ def __init__(
typical_acceptance_sampler_posterior_threshold (Optional[float]):
A threshold value that sets a lower bound on the posterior
probability of a token in the target model for it to be
accepted. This threshold is used only when we use the
accepted. This threshold is used only when we use the
TypicalAcceptanceSampler for token acceptance.
typical_acceptance_sampler_posterior_alpha (Optional[float]):
A scaling factor for the entropy-based threshold in the
TypicalAcceptanceSampler.
disable_logprobs: If set to True, token log probabilities will not
be returned even if requested by sampling parameters. This
be returned even if requested by sampling parameters. This
reduces latency by skipping logprob calculation in proposal
sampling, target sampling, and after accepted tokens are
determined. If set to False, log probabilities will be
@@ -1778,10 +1777,10 @@ def _get_and_verify_max_len(
def get_served_model_name(model: str,
served_model_name: Optional[Union[str, List[str]]]):
"""
If the input is a non-empty list, the first model_name in
`served_model_name` is taken.
If the input is a non-empty string, it is used directly.
For cases where the input is either an empty string or an
If the input is a non-empty list, the first model_name in
`served_model_name` is taken.
If the input is a non-empty string, it is used directly.
For cases where the input is either an empty string or an
empty list, the fallback is to use `self.model`.
"""
if not served_model_name:
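The `DeviceConfig.__init__` hunk above is the clearest beneficiary: every auto-detection branch can now ask the same object. A compact sketch of that cascade (the helper function is made up; the probes mirror the `current_platform` methods in the hunk):

```python
# Compact sketch of the auto-detection cascade in DeviceConfig.__init__ above;
# the probes mirror current_platform methods, the helper itself is made up.
def infer_device_type(platform) -> str:
    probes = [
        ("cuda", platform.is_cuda_alike),
        ("neuron", platform.is_neuron),
        ("openvino", platform.is_openvino),
        ("tpu", platform.is_tpu),
        ("cpu", platform.is_cpu),
        ("xpu", platform.is_xpu),
    ]
    for device_type, probe in probes:
        if probe():
            return device_type
    raise RuntimeError("Failed to infer device type")
```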
4 changes: 2 additions & 2 deletions vllm/executor/ray_utils.py
@@ -10,7 +10,7 @@
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.sequence import ExecuteModelRequest, IntermediateTensors
from vllm.utils import get_ip, is_hip, is_xpu
from vllm.utils import get_ip, is_hip
from vllm.worker.worker_base import WorkerWrapperBase

logger = init_logger(__name__)
@@ -231,7 +231,7 @@ def initialize_ray_cluster(
assert_ray_available()

# Connect to a ray cluster.
if is_hip() or is_xpu():
if is_hip() or current_platform.is_xpu():
ray.init(address=ray_address,
ignore_reinit_error=True,
num_gpus=parallel_config.world_size)
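The Ray bootstrap keeps `is_hip()` from `vllm.utils` but reads the XPU probe from `current_platform`. A minimal sketch of the branch, with placeholder values (assumes the `ray` package; the boolean stands in for `is_hip() or current_platform.is_xpu()`):

```python
# Minimal sketch of the branch above; values are placeholders.
import ray

world_size = 4       # stand-in for parallel_config.world_size
ray_address = None   # None starts or attaches to a local cluster

needs_explicit_gpus = True  # stand-in for is_hip() or current_platform.is_xpu()
if needs_explicit_gpus:
    # Declare the accelerator count explicitly rather than relying on Ray's
    # GPU auto-detection.
    ray.init(address=ray_address, ignore_reinit_error=True,
             num_gpus=world_size)
else:
    ray.init(address=ray_address, ignore_reinit_error=True)
```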
5 changes: 3 additions & 2 deletions vllm/lora/lora.py
@@ -4,7 +4,7 @@
import torch
import torch.types

from vllm.utils import is_pin_memory_available
from vllm.platforms import current_platform


class LoRALayerWeights:
@@ -67,7 +67,8 @@ def create_dummy_lora_weights(
dtype: torch.dtype,
device: torch.types.Device,
embeddings_tensor_dim: Optional[int] = None) -> "LoRALayerWeights":
pin_memory = str(device) == "cpu" and is_pin_memory_available()
pin_memory = str(device) == "cpu" \
and current_platform.is_pin_memory_available()
lora_a = torch.zeros([input_dim, rank],
dtype=dtype,
device=device,
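The guard above only pins LoRA buffers that actually live on the CPU, and only when the platform reports pinned-memory support. A small sketch of the same guard in isolation (`make_zeros` is a hypothetical helper, not a vLLM function):

```python
# Small sketch of the pinning guard; `make_zeros` is a hypothetical helper.
import torch


def make_zeros(shape, dtype, device, platform) -> torch.Tensor:
    # Pinning is a property of host memory, so it only applies to CPU tensors,
    # and only when the platform supports page-locked allocations.
    pin_memory = str(device) == "cpu" and platform.is_pin_memory_available()
    return torch.zeros(shape, dtype=dtype, device=device,
                       pin_memory=pin_memory)
```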
7 changes: 4 additions & 3 deletions vllm/lora/models.py
@@ -27,7 +27,7 @@
from vllm.model_executor.models import SupportsLoRA, supports_multimodal
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.utils import PPMissingLayer
from vllm.utils import is_pin_memory_available
from vllm.platforms import current_platform

logger = init_logger(__name__)

@@ -115,7 +115,8 @@ def from_lora_tensors(
embedding_padding_modules: Optional[List[str]] = None,
) -> "LoRAModel":
"""Create a LoRAModel from a dictionary of tensors."""
pin_memory = str(device) == "cpu" and is_pin_memory_available()
pin_memory = (str(device) == "cpu"
and current_platform.is_pin_memory_available())
loras: Dict[str, LoRALayerWeights] = {}
for tensor_name, tensor in tensors.items():
module_name, is_lora_a = parse_fine_tuned_lora_name(tensor_name)
@@ -177,7 +178,7 @@ def from_local_checkpoint(
embedding_padding_modules: Optional[List[str]] = None,
) -> "LoRAModel":
"""Create a LoRAModel from a local checkpoint.

Args:
lora_dir: The local path that has lora data.
expected_lora_modules: Name of modules that are expected to be
4 changes: 2 additions & 2 deletions vllm/model_executor/custom_op.py
@@ -2,7 +2,7 @@

import vllm.envs as envs
from vllm.platforms import current_platform
from vllm.utils import is_cpu, is_hip, is_xpu
from vllm.utils import is_cpu, is_hip


class CustomOp(nn.Module):
@@ -64,7 +64,7 @@ def dispatch_forward(self):
return self.forward_cpu
elif current_platform.is_tpu():
return self.forward_tpu
elif is_xpu():
elif current_platform.is_xpu():
return self.forward_xpu
else:
return self.forward_cuda
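`CustomOp.dispatch_forward` picks a per-backend forward implementation from the same probes. A self-contained sketch of the dispatch idea (the class and its backend methods are illustrative stand-ins, not the vLLM `CustomOp` API beyond what the hunk shows):

```python
# Self-contained sketch of the dispatch idea in CustomOp.dispatch_forward;
# class and backend methods here are illustrative stand-ins.
import torch
import torch.nn as nn


class PlatformDispatchedOp(nn.Module):

    def __init__(self, platform):
        super().__init__()
        # Resolve the implementation once, at construction time.
        if platform.is_cpu():
            self._forward = self.forward_cpu
        elif platform.is_tpu():
            self._forward = self.forward_tpu
        elif platform.is_xpu():
            self._forward = self.forward_xpu
        else:
            self._forward = self.forward_cuda

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self._forward(x)

    # Real implementations would differ per backend; identity keeps it runnable.
    def forward_cpu(self, x):
        return x

    forward_tpu = forward_xpu = forward_cuda = forward_cpu
```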
7 changes: 3 additions & 4 deletions vllm/model_executor/model_loader/loader.py
@@ -45,7 +45,6 @@
supports_multimodal)
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.utils import is_pin_memory_available


@contextmanager
@@ -70,7 +69,7 @@ def device_loading_context(module: torch.nn.Module,

finally:
# Restore parameters to their original devices, ignoring new parameters
pin_memory = is_pin_memory_available()
pin_memory = current_platform.is_pin_memory_available()
for name, p in module.named_parameters():
if name in original_device_states:
original_device: torch.device = original_device_states[name]
@@ -794,8 +793,8 @@ def _get_weight_files(
model_name_or_path: str,
allowed_patterns: List[str],
revision: Optional[str] = None) -> Tuple[List[str], str]:
"""Retrieve weight files. Download the files if necessary.
"""Retrieve weight files. Download the files if necessary.

Return the weight files and the file pattern."""
is_local = os.path.isdir(model_name_or_path)

4 changes: 2 additions & 2 deletions vllm/model_executor/models/utils.py
@@ -14,8 +14,8 @@
from vllm.model_executor.model_loader.loader import build_model
from vllm.model_executor.models import ModelRegistry
from vllm.multimodal.base import NestedTensors
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
from vllm.utils import is_pin_memory_available


class WeightsGroup(UserDict):
@@ -215,7 +215,7 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
return module

pin_memory = is_pin_memory_available()
pin_memory = current_platform.is_pin_memory_available()

# offload parameters to CPU
# use pin_memory if possible, which helps cudagraph capture speed
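`maybe_offload_to_cpu` is the last call site: parameters moved to host memory are pinned when the platform allows it, which the comment in the hunk ties to CUDA graph capture speed. A hedged sketch of that offload step (the helper name is hypothetical):

```python
# Hedged sketch of the offload step; `offload_params_to_cpu` is hypothetical.
import torch


def offload_params_to_cpu(module: torch.nn.Module, pin_memory: bool) -> None:
    for p in module.parameters():
        # Allocate a host buffer (pinned if supported), copy, and rebind so
        # later device transfers can be asynchronous.
        cpu_data = torch.empty(p.data.size(), dtype=p.data.dtype,
                               device="cpu", pin_memory=pin_memory)
        cpu_data.copy_(p.data)
        p.data = cpu_data
```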