From 4efff036f0dfeee21e82044e9b6e63b861b817a3 Mon Sep 17 00:00:00 2001 From: liuyhwangyh Date: Fri, 7 Jun 2024 00:28:10 +0800 Subject: [PATCH 01/30] Bugfix: fix broken of download models from modelscope (#5233) Co-authored-by: mulin.lyh --- tests/test_regression.py | 21 +++++++++++++++++++++ vllm/config.py | 6 +++++- vllm/transformers_utils/config.py | 7 ++++++- 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/tests/test_regression.py b/tests/test_regression.py index cb68e9ecfc06b..5d27d35793017 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -53,6 +53,27 @@ def test_gc(): assert allocated < 50 * 1024 * 1024 +def test_model_from_modelscope(monkeypatch): + # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary + MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat" + monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True") + try: + llm = LLM(model=MODELSCOPE_MODEL_NAME) + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + outputs = llm.generate(prompts, sampling_params) + assert len(outputs) == 4 + finally: + monkeypatch.delenv("VLLM_USE_MODELSCOPE", raising=False) + + if __name__ == "__main__": import pytest pytest.main([__file__]) diff --git a/vllm/config.py b/vllm/config.py index 3a14c391b7f00..4efdb6cab52c4 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -113,7 +113,11 @@ def __init__( self.revision = revision self.code_revision = code_revision self.rope_scaling = rope_scaling - self.tokenizer_revision = tokenizer_revision + # The tokenizer version is consistent with the model version by default. + if tokenizer_revision is None: + self.tokenizer_revision = revision + else: + self.tokenizer_revision = tokenizer_revision self.quantization = quantization self.quantization_param_path = quantization_param_path self.enforce_eager = enforce_eager diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 044eec6410a54..970645987885a 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,7 +1,8 @@ from typing import Dict, Optional -from transformers import AutoConfig, PretrainedConfig +from transformers import PretrainedConfig +from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, JAISConfig, MPTConfig, RWConfig) @@ -24,6 +25,10 @@ def get_config(model: str, code_revision: Optional[str] = None, rope_scaling: Optional[dict] = None) -> PretrainedConfig: try: + if VLLM_USE_MODELSCOPE: + from modelscope import AutoConfig + else: + from transformers import AutoConfig config = AutoConfig.from_pretrained( model, trust_remote_code=trust_remote_code, From abe855d63774c44e69048dfd188f0333db581d4b Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 6 Jun 2024 09:29:29 -0700 Subject: [PATCH 02/30] [Kernel] Retune Mixtral 8x22b configs for FP8 on H100 (#5294) --- ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 110 +++++++++--------- ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 72 ++++++------ 2 files changed, 91 insertions(+), 91 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json index 0c495e7e290c6..5557187339546 100644 --- 
a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -1,113 +1,113 @@ { "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 }, "2": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5 }, "4": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 5 + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 }, "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 3 + "num_stages": 5 }, "16": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, "num_stages": 3 }, "24": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 2 + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 }, "32": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, "num_stages": 3 }, "48": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 4 + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 }, "64": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 2 + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 8, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, "num_stages": 3 }, "256": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 4 + "num_warps": 4, + "num_stages": 3 }, "512": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 3 + "num_warps": 8, + "num_stages": 5 }, "1024": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4 }, @@ -139,7 +139,7 @@ "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4 } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json 
b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json index 60a65724d68b9..cc614e635ea57 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -1,17 +1,17 @@ { "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5 }, "2": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4 }, @@ -20,59 +20,59 @@ "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 2 + "num_warps": 4, + "num_stages": 3 }, "8": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 3 + "num_stages": 5 }, "16": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 }, "24": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 5 }, "32": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 + "num_warps": 4, + "num_stages": 4 }, "48": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 2 + "num_stages": 3 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 2 + "num_stages": 3 }, "96": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, @@ -81,11 +81,11 @@ }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 + "num_warps": 4, + "num_stages": 3 }, "256": { "BLOCK_SIZE_M": 128, @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 5 + "num_stages": 3 }, "512": { "BLOCK_SIZE_M": 128, From 828da0d44e9124d949909477d6018fc08469a31e Mon Sep 17 00:00:00 2001 From: Matthew Goldey Date: Thu, 6 Jun 2024 16:48:13 -0400 Subject: [PATCH 03/30] [Frontend] enable passing multiple LoRA adapters at once to generate() (#5300) --- .../test_llm_generate_multiple_loras.py | 69 +++++++++++++++++++ vllm/entrypoints/llm.py | 39 ++++++----- 2 files changed, 91 insertions(+), 17 deletions(-) create mode 100644 tests/entrypoints/test_llm_generate_multiple_loras.py diff --git a/tests/entrypoints/test_llm_generate_multiple_loras.py b/tests/entrypoints/test_llm_generate_multiple_loras.py new file mode 100644 index 0000000000000..b429b904c7c35 --- /dev/null +++ b/tests/entrypoints/test_llm_generate_multiple_loras.py @@ -0,0 +1,69 @@ +import weakref + +import pytest +# downloading lora to test lora requests +from huggingface_hub import 
snapshot_download + +from vllm import LLM +from vllm.lora.request import LoRARequest + +from ..conftest import cleanup + +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" + +PROMPTS = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +LORA_NAME = "typeof/zephyr-7b-beta-lora" + +pytestmark = pytest.mark.llm + + +@pytest.fixture(scope="module") +def llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM(model=MODEL_NAME, + tensor_parallel_size=1, + max_model_len=8192, + enable_lora=True, + max_loras=4, + max_lora_rank=64, + max_num_seqs=128, + enforce_eager=True) + + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup() + + +@pytest.fixture(scope="session") +def zephyr_lora_files(): + return snapshot_download(repo_id=LORA_NAME) + + +@pytest.mark.skip_global_cleanup +def test_multiple_lora_requests(llm: LLM, zephyr_lora_files): + lora_request = [ + LoRARequest(LORA_NAME, idx + 1, zephyr_lora_files) + for idx in range(len(PROMPTS)) + ] + # Multiple SamplingParams should be matched with each prompt + outputs = llm.generate(PROMPTS, lora_request=lora_request) + assert len(PROMPTS) == len(outputs) + + # Exception raised, if the size of params does not match the size of prompts + with pytest.raises(ValueError): + outputs = llm.generate(PROMPTS, lora_request=lora_request[:1]) + + # Single LoRARequest should be applied to every prompt + single_lora_request = lora_request[0] + outputs = llm.generate(PROMPTS, lora_request=single_lora_request) + assert len(PROMPTS) == len(outputs) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 905c36afde1e0..411d5256b75b9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -170,7 +170,7 @@ def generate( List[SamplingParams]]] = None, prompt_token_ids: Optional[List[int]] = None, use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> List[RequestOutput]: ... @@ -182,7 +182,7 @@ def generate( List[SamplingParams]]] = None, prompt_token_ids: Optional[List[List[int]]] = None, use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> List[RequestOutput]: ... @@ -195,7 +195,7 @@ def generate( *, prompt_token_ids: List[int], use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> List[RequestOutput]: ... @@ -208,7 +208,7 @@ def generate( *, prompt_token_ids: List[List[int]], use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> List[RequestOutput]: ... @@ -219,7 +219,7 @@ def generate( sampling_params: None, prompt_token_ids: Union[List[int], List[List[int]]], use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> List[RequestOutput]: ... @@ -232,7 +232,7 @@ def generate( sampling_params: Optional[Union[SamplingParams, Sequence[SamplingParams]]] = None, use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> List[RequestOutput]: ... 
@@ -249,7 +249,7 @@ def generate( Sequence[SamplingParams]]] = None, prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None, use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> List[RequestOutput]: """Generates the completions for the input prompts. @@ -312,7 +312,7 @@ def encode( Sequence[PoolingParams]]] = None, prompt_token_ids: Optional[List[int]] = None, use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> List[EmbeddingRequestOutput]: ... @@ -324,7 +324,7 @@ def encode( Sequence[PoolingParams]]] = None, prompt_token_ids: Optional[List[List[int]]] = None, use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> List[EmbeddingRequestOutput]: ... @@ -337,7 +337,7 @@ def encode( *, prompt_token_ids: List[int], use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> List[EmbeddingRequestOutput]: ... @@ -350,7 +350,7 @@ def encode( *, prompt_token_ids: List[List[int]], use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> List[EmbeddingRequestOutput]: ... @@ -361,7 +361,7 @@ def encode( pooling_params: None, prompt_token_ids: Union[List[int], List[List[int]]], use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> List[EmbeddingRequestOutput]: ... @@ -374,7 +374,7 @@ def encode( pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> List[EmbeddingRequestOutput]: ... @@ -391,7 +391,7 @@ def encode( Sequence[PoolingParams]]] = None, prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None, use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> List[EmbeddingRequestOutput]: """Generates the completions for the input prompts. @@ -498,7 +498,7 @@ def _validate_and_add_requests( inputs: Union[PromptStrictInputs, Sequence[PromptStrictInputs]], params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams, Sequence[PoolingParams]], - lora_request: Optional[LoRARequest], + lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]], ) -> None: if isinstance(inputs, (str, dict)): # Convert a single prompt to a list. @@ -509,20 +509,25 @@ def _validate_and_add_requests( if isinstance(params, list) and len(params) != num_requests: raise ValueError("The lengths of prompts and params " "must be the same.") + if isinstance(lora_request, + list) and len(lora_request) != num_requests: + raise ValueError("The lengths of prompts and lora_request " + "must be the same.") # Add requests to the engine. 
for i, request_inputs in enumerate(inputs): self._add_request( request_inputs, params[i] if isinstance(params, Sequence) else params, - lora_request=lora_request, + lora_request=lora_request[i] if isinstance( + lora_request, Sequence) else lora_request, ) def _add_request( self, inputs: PromptInputs, params: Union[SamplingParams, PoolingParams], - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, ) -> None: request_id = str(next(self.request_counter)) self.llm_engine.add_request(request_id, From a31cab7556f540b558b0b454b4a4b9b438542566 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 6 Jun 2024 18:12:00 -0700 Subject: [PATCH 04/30] [Core] Avoid copying prompt/output tokens if no penalties are used (#5289) --- vllm/model_executor/sampling_metadata.py | 80 +++++++++++++++--------- 1 file changed, 50 insertions(+), 30 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 0b3b41e69d6bc..7ad84f51b7e4c 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -386,16 +386,18 @@ def from_sampling_metadata( presence_penalties += [0] * prefill_len frequency_penalties += [0] * prefill_len repetition_penalties += [1] * prefill_len - prompt_tokens.extend([] for _ in range(prefill_len)) - output_tokens.extend([] for _ in range(prefill_len)) + if do_penalties: + prompt_tokens.extend([] for _ in range(prefill_len)) + output_tokens.extend([] for _ in range(prefill_len)) if seq_group.do_sample: sample_lens = len(seq_group.sample_indices) assert sample_lens == len(seq_ids) for seq_id in seq_ids: seq_data = seq_group.seq_data[seq_id] - prompt_tokens.append(seq_data.prompt_token_ids) - output_tokens.append(seq_data.output_token_ids) + if do_penalties: + prompt_tokens.append(seq_data.prompt_token_ids) + output_tokens.append(seq_data.output_token_ids) temperatures += [temperature] * len(seq_ids) top_ps += [top_p] * len(seq_ids) top_ks += [top_k] * len(seq_ids) @@ -443,18 +445,22 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], # Note that the performance will be very bad without # pinned memory. 
pin_memory = is_pin_memory_available() - prompt_max_len = max([len(tokens) for tokens in prompt_tokens], - default=0) - prompt_padded_tokens = [ - tokens + [vocab_size] * (prompt_max_len - len(tokens)) - for tokens in prompt_tokens - ] - output_max_len = max([len(tokens) for tokens in output_tokens], - default=0) - output_padded_tokens = [ - tokens + [vocab_size] * (output_max_len - len(tokens)) - for tokens in output_tokens - ] + + do_penalties = prompt_tokens or output_tokens + + if do_penalties: + prompt_max_len = max([len(tokens) for tokens in prompt_tokens], + default=0) + prompt_padded_tokens = [ + tokens + [vocab_size] * (prompt_max_len - len(tokens)) + for tokens in prompt_tokens + ] + output_max_len = max([len(tokens) for tokens in output_tokens], + default=0) + output_padded_tokens = [ + tokens + [vocab_size] * (output_max_len - len(tokens)) + for tokens in output_tokens + ] temperatures_t = torch.tensor( temperatures, @@ -504,18 +510,22 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], dtype=torch.long, pin_memory=pin_memory, ) - prompt_tensor = torch.tensor( - prompt_padded_tokens, - device="cpu", - dtype=torch.long, - pin_memory=pin_memory, - ) - output_tensor = torch.tensor( - output_padded_tokens, - device="cpu", - dtype=torch.long, - pin_memory=pin_memory, - ) + if do_penalties: + prompt_tensor = torch.tensor( + prompt_padded_tokens, + device="cpu", + dtype=torch.long, + pin_memory=pin_memory, + ) + output_tensor = torch.tensor( + output_padded_tokens, + device="cpu", + dtype=torch.long, + pin_memory=pin_memory, + ) + else: + prompt_tensor = None + output_tensor = None # need to transpose and make contiguous to # copy the tensor correctly. # [batch_size, n_seeds] -> [n_seeds, batch_size] @@ -538,6 +548,16 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], extra_seeds_gpu = None sampling_seeds_gpu = sampling_seeds_gpu[:num_base_seeds] + if do_penalties: + prompt_tokens_gpu = prompt_tensor.to(device=device, + non_blocking=True) + output_tokens_gpu = output_tensor.to(device=device, + non_blocking=True) + else: + empty_tensor = torch.empty(0, device=device, dtype=torch.long) + prompt_tokens_gpu = empty_tensor + output_tokens_gpu = empty_tensor + return cls( temperatures=temperatures_t.to(device=device, non_blocking=True), top_ps=top_ps_t.to(device=device, non_blocking=True), @@ -549,8 +569,8 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], non_blocking=True), repetition_penalties=repetition_penalties_t.to(device=device, non_blocking=True), - prompt_tokens=prompt_tensor.to(device=device, non_blocking=True), - output_tokens=output_tensor.to(device=device, non_blocking=True), + prompt_tokens=prompt_tokens_gpu, + output_tokens=output_tokens_gpu, sampling_seeds=sampling_seeds_gpu, sample_indices=sample_indices_t.to(device=device, non_blocking=True), From ccdc490dda3f534c63c1faf29a638e65594d0dc3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 6 Jun 2024 19:07:57 -0700 Subject: [PATCH 05/30] [Core] Change LoRA embedding sharding to support loading methods (#5038) --- .buildkite/test-pipeline.yaml | 10 +- tests/conftest.py | 21 ++ tests/lora/conftest.py | 18 +- tests/lora/test_layers.py | 219 ++++++++++++++- tests/lora/test_llama.py | 17 +- tests/lora/test_long_context.py | 23 +- tests/test_sharded_state_loader.py | 124 ++++++--- vllm/lora/layers.py | 76 +++-- vllm/lora/utils.py | 3 +- .../layers/vocab_parallel_embedding.py | 260 ++++++++++++++++-- vllm/worker/model_runner.py | 19 +- 11 files changed, 661 
insertions(+), 129 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d8030ab219ccb..b48ef31bc4163 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -46,6 +46,7 @@ steps: - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - pytest -v -s spec_decode/e2e/test_integration_dist.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - label: Distributed Tests (Multiple Groups) #mirror_hardwares: [amd] @@ -138,14 +139,7 @@ steps: num_gpus: 4 # This test runs llama 13B, so it is required to run on 4 GPUs. commands: - # Temporarily run this way because we cannot clean up GPU mem usage - # for multi GPU tests. - # TODO(sang): Fix it. - - pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced - - pytest -v -s lora/test_long_context.py::test_batched_rope_kernel - - pytest -v -s lora/test_long_context.py::test_self_consistency - - pytest -v -s lora/test_long_context.py::test_quality - - pytest -v -s lora/test_long_context.py::test_max_len + - pytest -v -s -x lora/test_long_context.py - label: Tensorizer Test #mirror_hardwares: [amd] diff --git a/tests/conftest.py b/tests/conftest.py index a481daa3c23e4..1a7037eb2f290 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,8 @@ import contextlib import gc import os +import subprocess +import sys from typing import Any, Dict, List, Optional, Tuple, TypeVar import pytest @@ -522,3 +524,22 @@ def caplog_vllm(temporary_enable_log_propagate, caplog): # To capture vllm log, we should enable propagate=True temporarily # because caplog depends on logs propagated to the root logger. yield caplog + + +@pytest.fixture(scope="session") +def num_gpus_available(): + """Get number of GPUs without initializing the CUDA context + in current process.""" + + try: + out = subprocess.run([ + sys.executable, "-c", + "import torch; print(torch.cuda.device_count())" + ], + capture_output=True, + check=True, + text=True) + except subprocess.CalledProcessError as e: + logger.warning("Failed to get number of GPUs.", exc_info=e) + return 0 + return int(out.stdout.strip()) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index e5cf9cd48b65d..400333066b9fa 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -42,10 +42,24 @@ def cleanup(): ray.shutdown() +@pytest.fixture() +def should_do_global_cleanup_after_test(request) -> bool: + """Allow subdirectories to skip global cleanup by overriding this fixture. + This can provide a ~10x speedup for non-GPU unit tests since they don't need + to initialize torch. 
+ """ + + if request.node.get_closest_marker("skip_global_cleanup"): + return False + + return True + + @pytest.fixture(autouse=True) -def cleanup_fixture(): +def cleanup_fixture(should_do_global_cleanup_after_test: bool): yield - cleanup() + if should_do_global_cleanup_after_test: + cleanup() @pytest.fixture diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 9a2c8b04dac47..fc4445c657f1b 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -2,6 +2,7 @@ from copy import deepcopy from dataclasses import dataclass from typing import Dict, List, Optional, Tuple +from unittest.mock import patch import pytest import torch @@ -32,7 +33,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) + ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask) from vllm.model_executor.utils import set_random_seed from .utils import DummyLoRAManager @@ -427,7 +428,8 @@ def _pretest(): logits_processor = LogitsProcessor( vocab_size + lora_config.lora_extra_vocab_size, vocab_size) lora_logits_processor = LogitsProcessorWithLoRA( - logits_processor, 1024, linear.weight.dtype, linear.weight.device) + logits_processor, 1024, linear.weight.dtype, linear.weight.device, + None) lora_logits_processor.create_lora_weights(max_loras, lora_config) return linear, logits_processor, lora_logits_processor @@ -867,3 +869,216 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, torch.allclose(ref_q, actual_q) torch.allclose(ref_k, actual_k) + + +@pytest.mark.parametrize("tp_size", [1, 2, 4, 8]) +@pytest.mark.parametrize("seed", list(range(256))) +def test_vocab_parallel_embedding_indices(tp_size, seed): + random.seed(seed) + vocab_size = random.randint(4000, 64000) + added_vocab_size = random.randint(0, 1024) + org_vocab_size = vocab_size - added_vocab_size + last_org_vocab_end_index = 0 + last_added_vocab_end_index = org_vocab_size + computed_vocab_size = 0 + computed_org_vocab_size = 0 + computed_added_vocab_size = 0 + vocab_size_padded = -1 + + all_org_tokens = [] + all_added_tokens = [] + token_ids = [] + + for tp_rank in range(tp_size): + with patch( + "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_rank", + return_value=tp_rank + ), patch( + "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_world_size", + return_value=tp_size): + vocab_embedding = VocabParallelEmbedding( + vocab_size, 1, org_num_embeddings=org_vocab_size) + vocab_size_padded = vocab_embedding.num_embeddings_padded + shard_indices = vocab_embedding.shard_indices + # Assert that the ranges are contiguous + assert shard_indices.org_vocab_start_index == last_org_vocab_end_index + assert (shard_indices.added_vocab_start_index == + last_added_vocab_end_index) + + # Ensure that we are not exceeding the vocab size + computed_vocab_size += shard_indices.num_elements_padded + computed_org_vocab_size += shard_indices.num_org_elements + computed_added_vocab_size += shard_indices.num_added_elements + + # Ensure that the ranges are not overlapping + all_org_tokens.extend( + range(shard_indices.org_vocab_start_index, + shard_indices.org_vocab_end_index)) + all_added_tokens.extend( + range(shard_indices.added_vocab_start_index, + shard_indices.added_vocab_end_index)) + + token_ids.extend( + range(shard_indices.org_vocab_start_index, + 
shard_indices.org_vocab_end_index)) + token_ids.extend([-1] * (shard_indices.num_org_elements_padded - + shard_indices.num_org_elements)) + token_ids.extend( + range(shard_indices.added_vocab_start_index, + shard_indices.added_vocab_end_index)) + token_ids.extend([-1] * (shard_indices.num_added_elements_padded - + shard_indices.num_added_elements)) + + last_org_vocab_end_index = shard_indices.org_vocab_end_index + last_added_vocab_end_index = shard_indices.added_vocab_end_index + + assert computed_vocab_size == vocab_size_padded + assert computed_org_vocab_size == org_vocab_size + assert computed_added_vocab_size == added_vocab_size + + # Ensure that the ranges are not overlapping + assert len(all_org_tokens) == len(set(all_org_tokens)) + assert len(all_added_tokens) == len(set(all_added_tokens)) + assert not set(all_org_tokens).intersection(set(all_added_tokens)) + + token_ids_tensor = torch.tensor(token_ids, dtype=torch.long) + reindex_mapping = vocab_embedding.get_sharded_to_full_mapping() + assert reindex_mapping is not None or tp_size == 1 + if reindex_mapping is not None: + reindexed_token_ids = token_ids_tensor[reindex_mapping] + expected = torch.tensor(list(range(0, vocab_size))) + assert reindexed_token_ids[:vocab_size].equal(expected) + assert torch.all(reindexed_token_ids[vocab_size:] == -1) + + +def test_get_masked_input_and_mask(): + x = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) + + # base tp 1 case, no padding + modified_x, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=8, + added_vocab_start_index=8, + added_vocab_end_index=12, + num_org_vocab_padding=0) + assert torch.equal(x, modified_x) + + # tp 2 case, no padding + modified_x_rank_0, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=4, + added_vocab_start_index=8, + added_vocab_end_index=10, + num_org_vocab_padding=0) + modified_x_rank_1, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=4, + org_vocab_end_index=8, + added_vocab_start_index=10, + added_vocab_end_index=12, + num_org_vocab_padding=0) + assert torch.equal(modified_x_rank_0, + torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 0, 0])) + assert torch.equal(modified_x_rank_1, + torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 4, 5])) + + # tp 4 case, no padding + modified_x_rank_0, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=2, + added_vocab_start_index=8, + added_vocab_end_index=9, + num_org_vocab_padding=0) + modified_x_rank_1, _ = get_masked_input_and_mask(x, + org_vocab_start_index=2, + org_vocab_end_index=4, + added_vocab_start_index=9, + added_vocab_end_index=10, + num_org_vocab_padding=0) + modified_x_rank_2, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=4, + org_vocab_end_index=6, + added_vocab_start_index=10, + added_vocab_end_index=11, + num_org_vocab_padding=0) + modified_x_rank_3, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=6, + org_vocab_end_index=8, + added_vocab_start_index=11, + added_vocab_end_index=12, + num_org_vocab_padding=0) + assert torch.equal(modified_x_rank_0, + torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0])) + assert torch.equal(modified_x_rank_1, + torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0])) + assert torch.equal(modified_x_rank_2, + torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0])) + assert torch.equal(modified_x_rank_3, + torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2])) + + # base tp 1 case, with padding + modified_x, _ = get_masked_input_and_mask(x, + 
org_vocab_start_index=0, + org_vocab_end_index=8, + added_vocab_start_index=8, + added_vocab_end_index=12, + num_org_vocab_padding=2) + assert torch.equal(modified_x, + torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13])) + + # tp 2 case, with padding + modified_x_rank_0, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=4, + added_vocab_start_index=8, + added_vocab_end_index=10, + num_org_vocab_padding=2) + modified_x_rank_1, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=4, + org_vocab_end_index=8, + added_vocab_start_index=10, + added_vocab_end_index=12, + num_org_vocab_padding=2) + assert torch.equal(modified_x_rank_0, + torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 6, 7, 0, 0])) + assert torch.equal(modified_x_rank_1, + torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 6, 7])) + + # tp 4 case, with padding + modified_x_rank_0, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=2, + added_vocab_start_index=8, + added_vocab_end_index=9, + num_org_vocab_padding=2) + modified_x_rank_1, _ = get_masked_input_and_mask(x, + org_vocab_start_index=2, + org_vocab_end_index=4, + added_vocab_start_index=9, + added_vocab_end_index=10, + num_org_vocab_padding=2) + modified_x_rank_2, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=4, + org_vocab_end_index=6, + added_vocab_start_index=10, + added_vocab_end_index=11, + num_org_vocab_padding=2) + modified_x_rank_3, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=6, + org_vocab_end_index=8, + added_vocab_start_index=11, + added_vocab_end_index=12, + num_org_vocab_padding=2) + assert torch.equal(modified_x_rank_0, + torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0])) + assert torch.equal(modified_x_rank_1, + torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0])) + assert torch.equal(modified_x_rank_2, + torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0])) + assert torch.equal(modified_x_rank_3, + torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4])) diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index f5a571e81acba..7143a99bea081 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -36,11 +36,10 @@ def do_sample(llm, lora_path: str, lora_id: int): return generated_texts -@pytest.mark.parametrize("tp_size", [1]) -def test_llama_lora(sql_lora_files, tp_size): - # Cannot use as it will initialize torch.cuda too early... - # if torch.cuda.device_count() < tp_size: - # pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") +@pytest.mark.parametrize("tp_size", [1, 2, 4]) +def test_llama_lora(sql_lora_files, tp_size, num_gpus_available): + if num_gpus_available < tp_size: + pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") llm = vllm.LLM(MODEL_PATH, enable_lora=True, @@ -80,11 +79,9 @@ def test_llama_lora(sql_lora_files, tp_size): print("removing lora") -@pytest.mark.skip("Requires multiple GPUs") -def test_llama_tensor_parallel_equality(sql_lora_files): - # Cannot use as it will initialize torch.cuda too early... 
- # if torch.cuda.device_count() < 4: - # pytest.skip(f"Not enough GPUs for tensor parallelism {4}") +def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): + if num_gpus_available < 4: + pytest.skip("Not enough GPUs for tensor parallelism 4") llm_tp1 = vllm.LLM(MODEL_PATH, enable_lora=True, diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index 4361e5452cdff..b58145eda2141 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -102,22 +102,21 @@ def batched_generate( return [outputs[i].outputs[0].text.strip() for i in range(len(outputs))] -@pytest.fixture +@pytest.fixture(scope="module") def lora_llm(long_context_infos): scaling_factors = [ context_len_to_scaling_factor[info["context_length"]] for info in long_context_infos.values() ] - llm = vllm.LLM( - "meta-llama/Llama-2-13b-chat-hf", - enable_lora=True, - max_num_seqs=16, - max_loras=2, - long_lora_scaling_factors=tuple(scaling_factors), - max_num_batched_tokens=4096 * 8, - tensor_parallel_size=4, - ) + llm = vllm.LLM("meta-llama/Llama-2-13b-chat-hf", + enable_lora=True, + max_num_seqs=16, + max_loras=2, + long_lora_scaling_factors=tuple(scaling_factors), + max_num_batched_tokens=4096 * 8, + tensor_parallel_size=4, + distributed_executor_backend="mp") yield llm del llm @@ -154,6 +153,7 @@ def test_rotary_emb_replaced(dist_init): assert rotary_emb_count == 32 +@pytest.mark.skip_global_cleanup def test_batched_rope_kernel(lora_llm, long_context_infos): """We test the batched kernel by comparing the results of batched an non-batched generation. @@ -188,6 +188,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos): f"same:\n{batched}\n{non_batched}") +@pytest.mark.skip_global_cleanup def test_self_consistency(lora_llm, long_context_infos): """We test consistency of the batched kernel by permuting batched inputs and comparing the results to the non-permuted batched results. @@ -227,6 +228,7 @@ def test_self_consistency(lora_llm, long_context_infos): f"\n{permutated_batched_results[permutation[i]]}") +@pytest.mark.skip_global_cleanup def test_quality(lora_llm, long_context_infos): """We test the quality of the answers given by the LoRA model by comparing the generated text to the merged model's outputs. @@ -257,6 +259,7 @@ def test_quality(lora_llm, long_context_infos): assert np.mean(scores) > 0.5 +@pytest.mark.skip_global_cleanup def test_max_len(lora_llm, long_context_infos): """Test that we raise an ValueError when the input of a given LoRA model exceeds the maximum length.""" diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index 8540e98da366a..022fb36b346f4 100644 --- a/tests/test_sharded_state_loader.py +++ b/tests/test_sharded_state_loader.py @@ -1,3 +1,4 @@ +import multiprocessing as mp import os import shutil from tempfile import TemporaryDirectory @@ -18,9 +19,7 @@ # Create a sampling params object. 
sampling_params = SamplingParams( - temperature=0.8, - top_p=0.95, - seed=0, + temperature=0, max_tokens=256, ignore_eos=True, ) @@ -43,48 +42,85 @@ def test_filter_subtensors(): assert tensor.equal(state_dict[key]) +@pytest.fixture(scope="module") +def llama_2_7b_files(): + with TemporaryDirectory() as cache_dir: + input_dir = snapshot_download("meta-llama/Llama-2-7b-hf", + cache_dir=cache_dir, + ignore_patterns="*.bin*") + yield input_dir + + +def _run_writer(input_dir, output_dir, weights_patterns, **kwargs): + llm_sharded_writer = LLM(model=input_dir, **kwargs) + + # Dump worker states to output directory + llm_sharded_writer.llm_engine.model_executor.save_sharded_state( + path=output_dir) + # Copy metadata files to output directory + for file in os.listdir(input_dir): + if not any(file.endswith(ext) for ext in weights_patterns): + shutil.copy(f"{input_dir}/{file}", output_dir) + + +def _run_generate(input_dir, queue: mp.Queue, **kwargs): + llm = LLM(model=input_dir, **kwargs) + gen = llm.generate(prompts, sampling_params) + queue.put([g.outputs[0].__dict__ for g in gen]) + queue.close() + queue.join_thread() + + @pytest.mark.parametrize("enable_lora", [False, True]) -def test_sharded_state_loader(enable_lora): - weights_patterns = ("*.bin", "*.pt", "*.safetensors") +@pytest.mark.parametrize("tp_size", [1, 2]) +def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, + llama_2_7b_files): + if num_gpus_available < tp_size: + pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") - with TemporaryDirectory() as cache_dir, TemporaryDirectory() as output_dir: - input_dir = snapshot_download("meta-llama/Llama-2-7b-hf", - cache_dir=cache_dir) - - llm = LLM( - model=input_dir, - worker_use_ray=True, - gpu_memory_utilization=0.3, - ) - - # Dump worker states to output directory - model_executor = llm.llm_engine.model_executor - model_executor.save_sharded_state(path=output_dir) - # Copy metadata files to output directory - for file in os.listdir(input_dir): - if not any(file.endswith(ext) for ext in weights_patterns): - shutil.copy(f"{input_dir}/{file}", output_dir) - del llm.llm_engine.model_executor - - llm_before = LLM( - model=input_dir, - worker_use_ray=True, - enable_lora=enable_lora, - gpu_memory_utilization=0.3, - ) - gen_before = llm_before.generate(prompts, sampling_params) - out_before = [gen.outputs[0].__dict__ for gen in gen_before] - del llm_before.llm_engine.model_executor - - llm_after = LLM( - model=output_dir, - worker_use_ray=True, - enable_lora=enable_lora, - gpu_memory_utilization=0.3, - load_format="sharded_state", - ) - gen_after = llm_after.generate(prompts, sampling_params) - out_after = [gen.outputs[0].__dict__ for gen in gen_after] - del llm_after.llm_engine.model_executor + weights_patterns = ("*.safetensors", ) + gpu_memory_utilization = 0.8 + input_dir = llama_2_7b_files + ctx = mp.get_context("spawn") + + # Run in separate processes for memory & CUDA isolation + with TemporaryDirectory() as output_dir: + p = ctx.Process(target=_run_writer, + args=(input_dir, output_dir, weights_patterns), + kwargs=dict( + tensor_parallel_size=tp_size, + distributed_executor_backend="mp", + gpu_memory_utilization=gpu_memory_utilization, + enforce_eager=True, + )) + p.start() + p.join() + + queue = ctx.Queue() + + p = ctx.Process(target=_run_generate, + args=(input_dir, queue), + kwargs=dict( + distributed_executor_backend="mp", + enable_lora=enable_lora, + gpu_memory_utilization=gpu_memory_utilization, + tensor_parallel_size=tp_size, + )) + p.start() + 
p.join() + out_before = queue.get() + + p = ctx.Process(target=_run_generate, + args=(output_dir, queue), + kwargs=dict( + distributed_executor_backend="mp", + enable_lora=enable_lora, + gpu_memory_utilization=gpu_memory_utilization, + tensor_parallel_size=tp_size, + load_format="sharded_state", + )) + p.start() + p.join() + out_after = queue.get() assert out_before == out_after diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 24b74476c3b85..e3ab1708c3fdf 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -215,19 +215,19 @@ def create_lora_weights( lora_config: LoRAConfig, model_config: Optional[PretrainedConfig] = None) -> None: - lora_vocab_start_idx = self.base_layer.org_vocab_size - weights_idx = None - if self.base_layer.vocab_end_index > lora_vocab_start_idx: + if self.base_layer.num_added_embeddings_per_partition > 0: # We can start adding lora weights - weights_idx = max( - lora_vocab_start_idx - self.base_layer.vocab_start_index, 0) - self.embeddings_slice = (self.base_layer.vocab_start_index - - self.base_layer.org_vocab_size + - weights_idx, - self.base_layer.vocab_end_index - - self.base_layer.org_vocab_size) - self.embeddings_weights = self.base_layer.weight.data[weights_idx:] - self.embeddings_weights.fill_(0) + self.embeddings_weights = self.base_layer.weight.data[ + self.base_layer.num_org_embeddings_per_partition:self. + base_layer.num_org_embeddings_per_partition + + self.base_layer.num_added_embeddings_per_partition] + self.embeddings_slice = ( + self.base_layer.shard_indices.added_vocab_start_index - + self.base_layer.org_vocab_size, + self.base_layer.shard_indices.added_vocab_end_index - + self.base_layer.org_vocab_size) + self.base_layer.weight.data[ + self.base_layer.num_org_embeddings_per_partition:].fill_(0) else: self.embeddings_slice = None self.embeddings_weights = None @@ -1025,19 +1025,31 @@ def can_replace_layer(cls, source_layer: nn.Module, class LogitsProcessorWithLoRA(BaseLayerWithLoRA): + """ + LoRA wrapper for LogitsProcessor, with extra logic to handle the + application of the LoRA adapter and added LoRA vocabulary. + + Args: + base_layer: LogitsProcessor layer + hidden_size: hidden size of the model + dtype: data type of the model + device: device of the model + sharded_to_full_mapping: index mapping from sharded vocab to full vocab + received from base_layer.get_sharded_to_full_mapping(). If None, + no reindexing will be done. + """ - def __init__( - self, - base_layer: LogitsProcessor, - hidden_size: int, - dtype: torch.dtype, - device: torch.device, - ) -> None: + def __init__(self, base_layer: LogitsProcessor, hidden_size: int, + dtype: torch.dtype, device: torch.device, + sharded_to_full_mapping: Optional[List[int]]) -> None: super().__init__() self.base_layer = base_layer self.hidden_size = hidden_size self.dtype = dtype self.device = device + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + self.sharded_to_full_mapping = sharded_to_full_mapping @property def logits_as_input(self): @@ -1098,6 +1110,13 @@ def create_lora_weights( dtype=self.dtype, device=self.device, ) + if self.sharded_to_full_mapping is not None: + self.sharded_to_full_mapping_gpu = torch.tensor( + self.sharded_to_full_mapping, + device=self.device, + dtype=torch.long) + else: + self.sharded_to_full_mapping_gpu = None # Lazily initialized. 
self.indices: torch.Tensor self.indices_len: List[int] @@ -1154,6 +1173,25 @@ def _get_logits( if logits is None: return None + if self.sharded_to_full_mapping_gpu is not None: + # Reindex full logits tensor to ensure 1:1 mapping between + # index and token_id + # Example for: + # org_vocab_size = 4 + # added_vocab_size = 2 + # pad_to_size = 8 + # tp_size = 2 + + # indices: [0, 1, 2, 3, 4, 5, 6, 7] + # token_id: [0, 1, 4, -1, 2, 3, 5, -1] + + # Therefore, the mapping is expected to be: + # [0, 1, 4, 6, 2, 3, 5, 7] so that when we reindex, + # we get: + # indices: [0, 1, 2, 3, 4, 5, 6, 7] + # token_id: [0, 1, 2, 3, 4, 5, -1, -1] + logits = logits[:, self.sharded_to_full_mapping_gpu] + lora_logits = torch.empty( self.embeddings_tensors.shape[0] + 1, self.embeddings_tensors.shape[1], diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index fcc7f24721939..b0198a50b1c52 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -67,7 +67,8 @@ def from_layer_logits_processor( model_config: Optional[PretrainedConfig] = None, ) -> LogitsProcessorWithLoRA: ret = LogitsProcessorWithLoRA(layer, lm_head.embedding_dim, - lm_head.weight.dtype, lm_head.weight.device) + lm_head.weight.dtype, lm_head.weight.device, + lm_head.get_sharded_to_full_mapping()) ret.create_lora_weights(max_loras, lora_config, model_config) return ret diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 4585b1679cb5c..60eb5b404e2ca 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -1,4 +1,5 @@ -from typing import Optional, Sequence +from dataclasses import dataclass +from typing import List, Optional, Sequence, Tuple import torch import torch.nn.functional as F @@ -18,18 +19,107 @@ def pad_vocab_size(vocab_size: int, return ((vocab_size + pad_to - 1) // pad_to) * pad_to -def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size: int, - rank: int) -> Sequence[int]: +def vocab_range_from_per_partition_vocab_size( + per_partition_vocab_size: int, + rank: int, + offset: int = 0) -> Sequence[int]: index_f = rank * per_partition_vocab_size index_l = index_f + per_partition_vocab_size - return index_f, index_l + return index_f + offset, index_l + offset -def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, - world_size: int) -> Sequence[int]: +def vocab_range_from_global_vocab_size(global_vocab_size: int, + rank: int, + world_size: int, + offset: int = 0) -> Sequence[int]: per_partition_vocab_size = divide(global_vocab_size, world_size) return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, - rank) + rank, + offset=offset) + + +@dataclass +class VocabParallelEmbeddingShardIndices: + """Indices for a shard of a vocab parallel embedding.""" + padded_org_vocab_start_index: int + padded_org_vocab_end_index: int + padded_added_vocab_start_index: int + padded_added_vocab_end_index: int + + org_vocab_start_index: int + org_vocab_end_index: int + added_vocab_start_index: int + added_vocab_end_index: int + + @property + def num_org_elements(self) -> int: + return self.org_vocab_end_index - self.org_vocab_start_index + + @property + def num_added_elements(self) -> int: + return self.added_vocab_end_index - self.added_vocab_start_index + + @property + def num_org_elements_padded(self) -> int: + return (self.padded_org_vocab_end_index - + self.padded_org_vocab_start_index) + + @property + def num_added_elements_padded(self) -> int: + 
return (self.padded_added_vocab_end_index - + self.padded_added_vocab_start_index) + + @property + def num_org_vocab_padding(self) -> int: + return self.num_org_elements_padded - self.num_org_elements + + @property + def num_added_vocab_padding(self) -> int: + return self.num_added_elements_padded - self.num_added_elements + + @property + def num_elements_padded(self) -> int: + return self.num_org_elements_padded + self.num_added_elements_padded + + def __post_init__(self): + # sanity checks + assert (self.padded_org_vocab_start_index <= + self.padded_org_vocab_end_index) + assert (self.padded_added_vocab_start_index <= + self.padded_added_vocab_end_index) + + assert self.org_vocab_start_index <= self.org_vocab_end_index + assert self.added_vocab_start_index <= self.added_vocab_end_index + + assert self.org_vocab_start_index <= self.padded_org_vocab_start_index + assert (self.added_vocab_start_index <= + self.padded_added_vocab_start_index) + assert self.org_vocab_end_index <= self.padded_org_vocab_end_index + assert self.added_vocab_end_index <= self.padded_added_vocab_end_index + + assert self.num_org_elements <= self.num_org_elements_padded + assert self.num_added_elements <= self.num_added_elements_padded + + +@torch.jit.script +def get_masked_input_and_mask( + input_: torch.Tensor, org_vocab_start_index: int, + org_vocab_end_index: int, num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: + # torch.jit.script will fuse all of the pointwise ops below + # into a single kernel, making it very fast + org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < + org_vocab_end_index) + added_vocab_mask = (input_ >= added_vocab_start_index) & ( + input_ < added_vocab_end_index) + added_offset = added_vocab_start_index - ( + org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding + valid_offset = (org_vocab_start_index * + org_vocab_mask) + (added_offset * added_vocab_mask) + vocab_mask = org_vocab_mask | added_vocab_mask + input_ = vocab_mask * (input_ - valid_offset) + return input_, ~vocab_mask class VocabParallelEmbedding(torch.nn.Module): @@ -38,13 +128,36 @@ class VocabParallelEmbedding(torch.nn.Module): Adapted from torch.nn.Embedding, note that we pad the vocabulary size to make sure it is divisible by the number of model parallel GPUs. + In order to support various loading methods, we ensure that LoRA-added + embeddings are always at the end of TP-sharded tensors. In other words, + we shard base embeddings and LoRA embeddings separately (both padded), + and place them in the same tensor. + In this example, we will have the original vocab size = 1010, + added vocab size = 16 and padding to 64. Therefore, the total + vocab size with padding will be 1088 (because we first pad 1010 to + 1024, add 16, and then pad to 1088). + Therefore, the tensor format looks like the following: + TP1, rank 0 (no sharding): + |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >| + corresponding token_id: | 0 | 1 | ... | 1009 | -1 | ... | -1 | 1010 | ... | 1015 | -1 | ... | -1 | + index: | 0 | 1 | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 | + + TP2, rank 0: + |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >| + corresponding token_id: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 1000 | ... | 1015 | -1 | ... | -1 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 527 | 520 | ... 
| 543 | + TP2, rank 1: + |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >| + corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1 | ... | -1 | -1 | ... | -1 | -1 | ... | -1 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... | 543 | + Args: num_embeddings: vocabulary size. embedding_dim: size of hidden state. params_dtype: type of the parameters. org_num_embeddings: original vocabulary size (without LoRA). padding_size: padding size for the vocabulary. - """ + """ # noqa: E501 def __init__(self, num_embeddings: int, @@ -55,21 +168,39 @@ def __init__(self, super().__init__() # Keep the input dimensions. + tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() self.num_embeddings = num_embeddings + self.padding_size = padding_size self.org_vocab_size = org_num_embeddings or num_embeddings - self.num_embeddings_padded = pad_vocab_size(num_embeddings, - padding_size) + num_added_embeddings = num_embeddings - self.org_vocab_size + self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size, + self.padding_size) + self.num_embeddings_padded = pad_vocab_size( + self.org_vocab_size_padded + num_added_embeddings, + self.padding_size) + assert self.org_vocab_size_padded <= self.num_embeddings_padded + + self.shard_indices = self._get_indices(self.num_embeddings_padded, + self.org_vocab_size_padded, + self.num_embeddings, + self.org_vocab_size, tp_rank, + self.tp_size) self.embedding_dim = embedding_dim if params_dtype is None: params_dtype = torch.get_default_dtype() - self.tp_size = get_tensor_model_parallel_world_size() # Divide the weight matrix along the vocaburaly dimension. - self.vocab_start_index, self.vocab_end_index = ( - vocab_range_from_global_vocab_size( - self.num_embeddings_padded, get_tensor_model_parallel_rank(), - self.tp_size)) - self.num_embeddings_per_partition = (self.vocab_end_index - - self.vocab_start_index) + self.num_added_embeddings = self.num_embeddings - self.org_vocab_size + self.num_embeddings_per_partition = divide(self.num_embeddings_padded, + self.tp_size) + assert (self.shard_indices.num_elements_padded == + self.num_embeddings_per_partition) + self.num_org_embeddings_per_partition = ( + self.shard_indices.org_vocab_end_index - + self.shard_indices.org_vocab_start_index) + self.num_added_embeddings_per_partition = ( + self.shard_indices.added_vocab_end_index - + self.shard_indices.added_vocab_start_index) self.weight = Parameter( torch.empty(self.num_embeddings_per_partition, self.embedding_dim, @@ -79,28 +210,107 @@ def __init__(self, "weight_loader": self.weight_loader }) + @classmethod + def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int, + vocab_size: int, org_vocab_size: int, tp_rank: int, + tp_size: int) -> VocabParallelEmbeddingShardIndices: + """Get start and end indices for vocab parallel embedding, following the + layout outlined in the class docstring, based on the given tp_rank and + tp_size.""" + num_added_embeddings_padded = vocab_size_padded - org_vocab_size_padded + padded_org_vocab_start_index, padded_org_vocab_end_index = ( + vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, + tp_size)) + padded_added_vocab_start_index, padded_added_vocab_end_index = ( + vocab_range_from_global_vocab_size(num_added_embeddings_padded, + tp_rank, + tp_size, + offset=org_vocab_size)) + # remove padding + org_vocab_start_index = min(padded_org_vocab_start_index, + org_vocab_size) + org_vocab_end_index = 
min(padded_org_vocab_end_index, org_vocab_size) + added_vocab_start_index = min(padded_added_vocab_start_index, + vocab_size) + added_vocab_end_index = min(padded_added_vocab_end_index, vocab_size) + return VocabParallelEmbeddingShardIndices( + padded_org_vocab_start_index, padded_org_vocab_end_index, + padded_added_vocab_start_index, padded_added_vocab_end_index, + org_vocab_start_index, org_vocab_end_index, + added_vocab_start_index, added_vocab_end_index) + + def get_sharded_to_full_mapping(self) -> Optional[List[int]]: + """Get a mapping that can be used to reindex the gathered + logits for sampling. + + During sampling, we gather logits from all ranks. The relationship + of index->token_id will follow the same format as outlined in the class + docstring. However, after the gather, we want to reindex the final + logits tensor to map index->token_id one-to-one (the index is always + equal the token_id it corresponds to). The indices returned by this + method allow us to do that. + """ + if self.tp_size < 2: + return None + + base_embeddings: List[int] = [] + added_embeddings: List[int] = [] + padding: List[int] = [] + for tp_rank in range(self.tp_size): + shard_indices = self._get_indices(self.num_embeddings_padded, + self.org_vocab_size_padded, + self.num_embeddings, + self.org_vocab_size, tp_rank, + self.tp_size) + range_start = self.num_embeddings_per_partition * tp_rank + range_end = self.num_embeddings_per_partition * (tp_rank + 1) + base_embeddings.extend( + range(range_start, + range_start + shard_indices.num_org_elements)) + padding.extend( + range(range_start + shard_indices.num_org_elements, + range_start + shard_indices.num_org_elements_padded)) + added_embeddings.extend( + range( + range_start + shard_indices.num_org_elements_padded, + range_start + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements)) + padding.extend( + range( + range_start + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements, + range_start + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements_padded)) + assert (range_start + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements_padded == range_end) + ret = base_embeddings + added_embeddings + padding + assert len(ret) == self.num_embeddings_padded + return ret + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): parallel_dim = param.parallel_dim assert loaded_weight.shape[parallel_dim] == self.org_vocab_size - loaded_weight = loaded_weight[self.vocab_start_index:self. - vocab_end_index] + loaded_weight = loaded_weight[self.shard_indices.org_vocab_start_index: + self.shard_indices.org_vocab_end_index] param[:loaded_weight.shape[0]].data.copy_(loaded_weight) + param[loaded_weight.shape[0]:].data.fill_(0) def forward(self, input_): if self.tp_size > 1: # Build the mask. - input_mask = ((input_ < self.vocab_start_index) | - (input_ >= self.vocab_end_index)) - # Mask the input. - masked_input = input_.clone() - self.vocab_start_index - masked_input[input_mask] = 0 + masked_input, input_mask = get_masked_input_and_mask( + input_, self.shard_indices.org_vocab_start_index, + self.shard_indices.org_vocab_end_index, + self.shard_indices.num_org_vocab_padding, + self.shard_indices.added_vocab_start_index, + self.shard_indices.added_vocab_end_index) else: masked_input = input_ # Get the embeddings. output_parallel = F.embedding(masked_input, self.weight) # Mask the output embedding. 
if self.tp_size > 1: - output_parallel[input_mask, :] = 0.0 + output_parallel.masked_fill_(input_mask.unsqueeze(1), 0) # Reduce across all the model parallel GPUs. output = tensor_model_parallel_all_reduce(output_parallel) return output diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 67c03ad600083..c59288b4f73c6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -35,6 +35,7 @@ _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) ] +_NUM_WARMUP_ITERS = 2 class ModelInput(NamedTuple): @@ -975,16 +976,18 @@ def capture( **kwargs, ) -> None: assert self._graph is None - # Run the model once without capturing the graph. + # Run the model a few times without capturing the graph. # This is to make sure that the captured graph does not include the # kernel launches for initial benchmarking (e.g., Triton autotune). - self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - **kwargs, - ) + # Note one iteration is not enough for torch.jit.script + for _ in range(_NUM_WARMUP_ITERS): + self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + **kwargs, + ) torch.cuda.synchronize() # Capture the graph. From 15063741e30881d7a982c3436c3299a0551327dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Fri, 7 Jun 2024 11:17:21 +0800 Subject: [PATCH 06/30] [Misc] Missing error message for custom ops import (#5282) --- vllm/_custom_ops.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 8a6f6d96d81f3..462ba8a753105 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -5,8 +5,10 @@ try: from vllm._C import cache_ops as vllm_cache_ops from vllm._C import ops as vllm_ops -except ImportError: - pass +except ImportError as e: + from vllm.logger import init_logger + logger = init_logger(__name__) + logger.warning("Failed to import from vllm._C with %r", e) # activation ops From baa15a9ec320a6b90222df0aaed13b89e3bafc9c Mon Sep 17 00:00:00 2001 From: Itay Etelis <92247226+Etelis@users.noreply.github.com> Date: Fri, 7 Jun 2024 06:29:24 +0300 Subject: [PATCH 07/30] [Feature][Frontend]: Add support for `stream_options` in `ChatCompletionRequest` (#5135) --- tests/entrypoints/test_openai_server.py | 101 ++++++++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 14 ++++ vllm/entrypoints/openai/serving_chat.py | 44 ++++++++--- 3 files changed, 149 insertions(+), 10 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 3721b047e43d9..b7d0946ba7244 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1343,5 +1343,106 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.total_tokens == 17 +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_stream_options(server, client: openai.AsyncOpenAI, + model_name: str): + prompt = "What is the capital of France?" 
+ + # Test stream=True, stream_options=None + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options=None, + ) + chunks = [] + async for chunk in stream: + chunks.append(chunk.choices[0].text) + assert len(chunks) > 0 + assert "usage" not in chunk + + # Test stream=True, stream_options={"include_usage": False} + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={"include_usage": False}, + ) + chunks = [] + async for chunk in stream: + chunks.append(chunk.choices[0].text) + assert len(chunks) > 0 + assert "usage" not in chunk + + # Test stream=True, stream_options={"include_usage": True} + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={"include_usage": True}, + ) + chunks = [] + finish_reason_count = 0 + async for chunk in stream: + if chunk.choices[0].finish_reason is None: + assert chunk.usage is None + chunks.append(chunk.choices[0].text) + else: + assert chunk.usage is None + finish_reason_count += 1 + + # The last message should have usage and no choices + last_message = await stream.__anext__() + assert last_message.usage is not None + assert last_message.usage.prompt_tokens > 0 + assert last_message.usage.completion_tokens > 0 + assert last_message.usage.total_tokens == ( + last_message.usage.prompt_tokens + + last_message.usage.completion_tokens) + assert last_message.choices == [] + + # Test stream=False, stream_options={"include_usage": None} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": None}, + ) + + # Test stream=False, stream_options={"include_usage": False} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": False}, + ) + + # Test stream=False, stream_options={"include_usage": True} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}, + ) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 11ac28e758c39..fa33318786b9a 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -102,6 +102,10 @@ class ResponseFormat(OpenAIBaseModel): type: Literal["text", "json_object"] +class StreamOptions(OpenAIBaseModel): + include_usage: Optional[bool] + + class FunctionDefinition(OpenAIBaseModel): name: str description: Optional[str] = None @@ -140,6 +144,7 @@ class ChatCompletionRequest(OpenAIBaseModel): le=torch.iinfo(torch.long).max) stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stream: Optional[bool] = False + stream_options: Optional[StreamOptions] = None temperature: Optional[float] = 0.7 top_p: Optional[float] = 1.0 tools: Optional[List[ChatCompletionToolsParam]] = None @@ -269,6 +274,15 @@ def logit_bias_logits_processor( logits_processors=logits_processors, ) + @model_validator(mode='before') + @classmethod + def validate_stream_options(cls, values): + if (values.get('stream_options') is not None + and not 
values.get('stream')): + raise ValueError( + "stream_options can only be set if stream is true") + return values + @model_validator(mode="before") @classmethod def check_guided_decoding_count(cls, data): diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index afd87f49c1c45..883567abf415b 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -247,6 +247,9 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) + if (request.stream_options + and request.stream_options.include_usage): + chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -274,6 +277,9 @@ async def chat_completion_stream_generator( choices=[choice_data], logprobs=None, model=model_name) + if (request.stream_options and + request.stream_options.include_usage): + chunk.usage = None data = chunk.model_dump_json( exclude_unset=True) yield f"data: {data}\n\n" @@ -327,17 +333,14 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) + if (request.stream_options + and request.stream_options.include_usage): + chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" else: # Send the finish response for each request.n only once prompt_tokens = len(res.prompt_token_ids) - final_usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=previous_num_tokens[i], - total_tokens=prompt_tokens + - previous_num_tokens[i], - ) choice_data = ChatCompletionResponseStreamChoice( index=i, delta=delta_message, @@ -350,12 +353,33 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if final_usage is not None: - chunk.usage = final_usage - data = chunk.model_dump_json(exclude_unset=True, - exclude_none=True) + if (request.stream_options + and request.stream_options.include_usage): + chunk.usage = None + data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" finish_reason_sent[i] = True + + if (request.stream_options + and request.stream_options.include_usage): + final_usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=previous_num_tokens[i], + total_tokens=prompt_tokens + + previous_num_tokens[i], + ) + + final_usage_chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[], + model=model_name, + usage=final_usage) + final_usage_data = (final_usage_chunk.model_dump_json( + exclude_unset=True, exclude_none=True)) + yield f"data: {final_usage_data}\n\n" + except ValueError as e: # TODO: Use a vllm-specific Validation Error data = self.create_streaming_error_response(str(e)) From 388596c91437a51d428a447594e9faec340c29b2 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 6 Jun 2024 22:15:11 -0700 Subject: [PATCH 08/30] [Misc][Utils] allow get_open_port to be called for multiple times (#5333) --- tests/test_utils.py | 16 +++++++++++++++- vllm/envs.py | 3 +++ vllm/utils.py | 10 +++++++++- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index a6c3896fa43bf..0b674ea6a85c1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,11 +1,13 @@ import asyncio +import os +import socket import sys from typing import (TYPE_CHECKING, Any, AsyncIterator, Awaitable, Protocol, Tuple, TypeVar) import pytest -from vllm.utils import deprecate_kwargs, 
merge_async_iterators +from vllm.utils import deprecate_kwargs, get_open_port, merge_async_iterators from .utils import error_on_warning @@ -116,3 +118,15 @@ def dummy(*, old_arg: object = None, new_arg: object = None): with pytest.warns(DeprecationWarning, match="abcd"): dummy(old_arg=1) + + +def test_get_open_port(): + os.environ["VLLM_PORT"] = "5678" + # make sure we can get multiple ports, even if the env var is set + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1: + s1.bind(("localhost", get_open_port())) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2: + s2.bind(("localhost", get_open_port())) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3: + s3.bind(("localhost", get_open_port())) + os.environ.pop("VLLM_PORT") diff --git a/vllm/envs.py b/vllm/envs.py index bef343d08429c..7d5c7371b7741 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -99,6 +99,9 @@ lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""), # used in distributed environment to manually set the communication port + # Note: if VLLM_PORT is set, and some code asks for multiple ports, the + # VLLM_PORT will be used as the first port, and the rest will be generated + # by incrementing the VLLM_PORT value. # '0' is used to make mypy happy 'VLLM_PORT': lambda: int(os.getenv('VLLM_PORT', '0')) diff --git a/vllm/utils.py b/vllm/utils.py index 2781eceb7ba98..2bd24d086f690 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -289,7 +289,15 @@ def get_distributed_init_method(ip: str, port: int) -> str: def get_open_port() -> int: port = envs.VLLM_PORT if port is not None: - return port + while True: + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", port)) + return port + except OSError: + port += 1 # Increment port number if already in use + logger.info("Port %d is already in use, trying port %d", + port - 1, port) # try ipv4 try: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: From 8d75fe48ca5f46b7af0f5201d8500b9604eed769 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 7 Jun 2024 04:42:35 -0400 Subject: [PATCH 09/30] [Kernel] Switch fp8 layers to use the CUTLASS kernels (#5183) Switching from torch._scaled_mm to vLLM's cutlass fp8 kernels when supported as we are seeing 5-15% improvement in e2e performance on neuralmagic/Meta-Llama-3-8B-Instruct-FP8 see https://docs.google.com/spreadsheets/d/1GiAnmzyGHgZ6zL_LDSTm35Bdrt4A8AaFEurDlISYYA4/ for some quick e2e benchmarks and #5144 for comparisons across different GEMM sizes. 
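
In essence, the change below makes Fp8LinearMethod.apply() choose between vLLM's CUTLASS scaled GEMM and torch._scaled_mm at runtime. The following is a simplified sketch of that control flow only — the function name apply_fp8_linear and the cutlass_ok flag are stand-ins for the real method and its self.cutlass_fp8_supported attribute; see the full diff for the exact parameter handling.

import torch

from vllm import _custom_ops as ops


def apply_fp8_linear(x, weight, weight_scale, act_scale, bias, cutlass_ok):
    if bias is None and cutlass_ok:
        # Quantize the activation (dynamic if act_scale is None, static
        # otherwise) and let the CUTLASS kernel apply both scales while
        # dequantizing the GEMM output.
        qinput, x_scale = ops.scaled_fp8_quant(x, act_scale)
        return ops.cutlass_scaled_mm_dq(qinput, weight,
                                        out_dtype=x.dtype,
                                        scale_a=x_scale,
                                        scale_b=weight_scale)
    # Fallback: pad the batch dimension past 16 because torch._scaled_mm is
    # more performant there, then trim the padding off the output.
    qinput, x_scale = ops.scaled_fp8_quant(x, act_scale, batch_dim_padding=17)
    output, _ = torch._scaled_mm(qinput, weight, out_dtype=x.dtype,
                                 scale_a=x_scale, scale_b=weight_scale,
                                 bias=bias)
    return torch.narrow(output, 0, 0, x.shape[0])

Layers with a bias keep the torch._scaled_mm path because cutlass_scaled_mm_dq takes no bias argument.
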
--- vllm/_custom_ops.py | 4 +- .../model_executor/layers/quantization/fp8.py | 66 ++++++++++++++----- 2 files changed, 52 insertions(+), 18 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 462ba8a753105..cae6822166b66 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -179,7 +179,7 @@ def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # cutlass def cutlass_scaled_mm_dq(a: torch.Tensor, b: torch.Tensor, - a_scales: torch.Tensor, b_scales: torch.Tensor, + scale_a: torch.Tensor, scale_b: torch.Tensor, out_dtype: Type[torch.dtype]) -> torch.Tensor: assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0) assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16) @@ -188,7 +188,7 @@ def cutlass_scaled_mm_dq(a: torch.Tensor, b: torch.Tensor, n = b.shape[1] out = torch.empty((m, n), dtype=out_dtype, device=a.device) - vllm_ops.cutlass_scaled_mm_dq(out, a, b, a_scales, b_scales) + vllm_ops.cutlass_scaled_mm_dq(out, a, b, scale_a, scale_b) return out diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index bf3a59e3d709b..136a64623d7fb 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -17,6 +17,24 @@ logger = init_logger(__name__) +def cutlass_fp8_supported() -> bool: + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + version = torch.version.cuda + version = version[0] * 10 + version[1] + + # CUTLASS FP8 kernels need at least + # CUDA 12.0 on SM90 systems (Hopper) + # CUDA 12.4 on SM89 systems (Lovelace) + gpu_is_supported = False + if capability >= 900: + gpu_is_supported = version > 120 + elif capability >= 890: + gpu_is_supported = version > 124 + + return gpu_is_supported + + class Fp8Config(QuantizationConfig): """Config class for FP8.""" @@ -92,6 +110,7 @@ class Fp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config + self.cutlass_fp8_supported = cutlass_fp8_supported() def _create_scale_param( self, @@ -233,25 +252,40 @@ def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: + # ops.scaled_fp8_quant supports both dynamic and static quant. # If dynamic, layer.act_scale is None and x_scale computed from x. # If static, layer.act_scale is scalar and x_scale set to act_scale. - qinput, x_scale = ops.scaled_fp8_quant(x, - layer.act_scale, - batch_dim_padding=17) - - # Fused GEMM_DQ -- note we padded the input above because - # torch._scaled_mm is more performant for matrices with - # batch dimension > 16. Note that this could change - # in the future. - output, _ = torch._scaled_mm( - qinput, - layer.weight, - out_dtype=x.dtype, - scale_a=x_scale, - scale_b=layer.weight_scale, - bias=bias, - ) + + if bias is None and self.cutlass_fp8_supported: + qinput, x_scale = ops.scaled_fp8_quant(x, layer.act_scale) + + # Fused GEMM_DQ + output = ops.cutlass_scaled_mm_dq( + qinput, + layer.weight, + out_dtype=x.dtype, + scale_a=x_scale, + scale_b=layer.weight_scale, + ) + + else: + qinput, x_scale = ops.scaled_fp8_quant(x, + layer.act_scale, + batch_dim_padding=17) + + # Fused GEMM_DQ -- note we padded the input above because + # torch._scaled_mm is more performant for matrices with + # batch dimension > 16. Note that this could change + # in the future. 
+ output, _ = torch._scaled_mm( + qinput, + layer.weight, + out_dtype=x.dtype, + scale_a=x_scale, + scale_b=layer.weight_scale, + bias=bias, + ) return torch.narrow(output, 0, 0, x.shape[0]) From 18a277b52dd2a64ee4c0111fc8cda126031e5889 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 7 Jun 2024 03:01:56 -0700 Subject: [PATCH 10/30] Remove Ray health check (#4693) --- vllm/executor/ray_gpu_executor.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index bed356d1b6e58..89d1c4ac7cbc2 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -293,23 +293,6 @@ def _compiled_ray_dag(self): ]) return forward_dag.experimental_compile() - def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - self._check_if_any_actor_is_dead() - - def _check_if_any_actor_is_dead(self): - if not self.workers: - return - - dead_actors = [] - for actor in self.workers: - actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access - if actor_state["State"] == "DEAD": - dead_actors.append(actor) - if dead_actors: - raise RuntimeError("At least one Worker is dead. " - f"Dead Workers: {dead_actors}. ") - class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync): From dc49fb892ca32cb364dfc39d711ab84d3b35a28f Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Fri, 7 Jun 2024 21:35:42 +0800 Subject: [PATCH 11/30] Addition of lacked ignored_seq_groups in _schedule_chunked_prefill (#5296) --- vllm/core/scheduler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 399665082f838..0159053b4dc6a 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -905,7 +905,8 @@ def _schedule_chunked_prefill(self): blocks_to_swap_out=running_scheduled.blocks_to_swap_out, blocks_to_copy=running_scheduled.blocks_to_copy + swapped_in.blocks_to_copy, - ignored_seq_groups=prefills.ignored_seq_groups, + ignored_seq_groups=prefills.ignored_seq_groups + + swapped_in.infeasible_seq_groups, num_lookahead_slots=running_scheduled.num_lookahead_slots, running_queue_size=len(self.running), preempted=(len(running_scheduled.preempted) + From ca3ea51bde6c22d0afb3aa0a3fdba6d568095a0a Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 7 Jun 2024 12:36:26 -0400 Subject: [PATCH 12/30] [Kernel] Dynamic Per-Token Activation Quantization (#5037) Co-authored-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- csrc/ops.h | 3 + csrc/pybind.cpp | 3 + .../compressed_tensors/int8_quant_kernels.cu | 75 ++++++++++-- csrc/reduction_utils.cuh | 54 +++++++-- tests/kernels/test_int8_quant.py | 44 ++++++- tests/quantization/test_compressed_tensors.py | 19 ++- vllm/_custom_ops.py | 28 +++-- .../compressed_tensors/compressed_tensors.py | 87 +++++++------ .../compressed_tensors/schemes/__init__.py | 2 + .../compressed_tensors_w8a8_dynamictoken.py | 85 +++++++++++++ .../compressed_tensors_w8a8_statictensor.py | 2 +- .../quantization/compressed_tensors/utils.py | 114 ++++++++++++++++++ 12 files changed, 440 insertions(+), 76 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/utils.py diff --git a/csrc/ops.h b/csrc/ops.h index 4952e826ec8ac..06b60e748886f 100644 --- 
a/csrc/ops.h +++ b/csrc/ops.h @@ -97,6 +97,9 @@ int cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a, void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor const& scale); +void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, + torch::Tensor& scales); + void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor lookup_table); diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index cdbec4a34d77f..547823aa1b04e 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -70,6 +70,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ops.def("static_scaled_int8_quant", &static_scaled_int8_quant, "Compute int8 quantized tensor for given scaling factor"); + ops.def("dynamic_scaled_int8_quant", &dynamic_scaled_int8_quant, + "Compute int8 quantized tensor and scaling factor"); + // Cache ops pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); cache_ops.def("swap_blocks", &swap_blocks, diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index 11baa5d414c19..280b0327111da 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -3,6 +3,7 @@ #include #include "../../dispatch_utils.h" +#include "../../reduction_utils.cuh" static inline __device__ int8_t float_to_int8_rn(float x) { #ifdef USE_ROCM @@ -27,17 +28,48 @@ namespace vllm { template __global__ void static_scaled_int8_quant_kernel( - const scalar_t* __restrict__ input, int8_t* __restrict__ out, - const scale_type* scale_ptr, const int hidden_size) { - const int tid = threadIdx.x; - const int token_idx = blockIdx.x; - scale_type scale = *scale_ptr; + scalar_t const* __restrict__ input, int8_t* __restrict__ out, + scale_type const* scale_ptr, const int hidden_size) { + int const tid = threadIdx.x; + int const token_idx = blockIdx.x; + scale_type const scale = *scale_ptr; for (int i = tid; i < hidden_size; i += blockDim.x) { - out[token_idx * hidden_size + i] = - float_to_int8_rn(((float)input[token_idx * hidden_size + i]) / scale); + out[token_idx * hidden_size + i] = float_to_int8_rn( + static_cast(input[token_idx * hidden_size + i]) / scale); } } + +template +__global__ void dynamic_scaled_int8_quant_kernel( + scalar_t const* __restrict__ input, int8_t* __restrict__ out, + scale_type* scale, const int hidden_size) { + int const tid = threadIdx.x; + int const token_idx = blockIdx.x; + float absmax_val = 0.0f; + float const zero = 0.0f; + + for (int i = tid; i < hidden_size; i += blockDim.x) { + float val = static_cast(input[token_idx * hidden_size + i]); + val = val > zero ? val : -val; + absmax_val = val > absmax_val ? 
val : absmax_val; + } + + float const block_absmax_val_maybe = blockReduceMax(absmax_val); + __shared__ float block_absmax_val; + if (tid == 0) { + block_absmax_val = block_absmax_val_maybe; + scale[token_idx] = block_absmax_val / 127.0f; + } + __syncthreads(); + + float const tmp_scale = 127.0f / block_absmax_val; + for (int i = tid; i < hidden_size; i += blockDim.x) { + out[token_idx * hidden_size + i] = float_to_int8_rn( + static_cast(input[token_idx * hidden_size + i]) * tmp_scale); + } +} + } // namespace vllm void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] @@ -47,10 +79,10 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] TORCH_CHECK(out.is_contiguous()); TORCH_CHECK(scale.numel() == 1); - int hidden_size = input.size(-1); - int num_tokens = input.numel() / hidden_size; - dim3 grid(num_tokens); - dim3 block(std::min(hidden_size, 1024)); + int const hidden_size = input.size(-1); + int const num_tokens = input.numel() / hidden_size; + dim3 const grid(num_tokens); + dim3 const block(std::min(hidden_size, 1024)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES( input.scalar_type(), "static_scaled_int8_quant_kernel", [&] { @@ -60,3 +92,24 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] scale.data_ptr(), hidden_size); }); } + +void dynamic_scaled_int8_quant( + torch::Tensor& out, // [..., hidden_size] + torch::Tensor const& input, // [..., hidden_size] + torch::Tensor& scales) { + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(out.is_contiguous()); + + int const hidden_size = input.size(-1); + int const num_tokens = input.numel() / hidden_size; + dim3 const grid(num_tokens); + dim3 const block(std::min(hidden_size, 1024)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] { + vllm::dynamic_scaled_int8_quant_kernel + <<>>(input.data_ptr(), + out.data_ptr(), + scales.data_ptr(), hidden_size); + }); +} diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh index 9af4aae516151..08063356012b8 100644 --- a/csrc/reduction_utils.cuh +++ b/csrc/reduction_utils.cuh @@ -21,29 +21,47 @@ #include "cuda_compat.h" namespace vllm { + +namespace detail { + +template +__inline__ __device__ T _max(T a, T b) { + return max(a, b); +} + +template +__inline__ __device__ T _sum(T a, T b) { + return a + b; +} + +} // namespace detail + +template +using ReduceFnType = T (*)(T, T); + +// Helper function to return the next largest power of 2 +static constexpr int _nextPow2(unsigned int num) { + if (num <= 1) return num; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + template -__inline__ __device__ T warpReduceSum(T val) { +__inline__ __device__ T warpReduce(T val, ReduceFnType fn) { static_assert(numLanes > 0 && (numLanes & (numLanes - 1)) == 0, "numLanes is not a positive power of 2!"); static_assert(numLanes <= WARP_SIZE); #pragma unroll for (int mask = numLanes >> 1; mask > 0; mask >>= 1) - val += VLLM_SHFL_XOR_SYNC(val, mask); - return val; -} + val = fn(val, VLLM_SHFL_XOR_SYNC(val, mask)); -// Helper function to return the next largest power of 2 -static constexpr int _nextPow2(unsigned int num) { - if (num <= 1) return num; - return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); + return val; } -/* Calculate the sum of all elements in a block */ template -__inline__ __device__ T blockReduceSum(T val) { +__inline__ __device__ T blockReduce(T 
val, ReduceFnType fn) { static_assert(maxBlockSize <= 1024); if constexpr (maxBlockSize > WARP_SIZE) { - val = warpReduceSum(val); + val = warpReduce(val, fn); // Calculates max number of lanes that need to participate in the last // warpReduce constexpr int maxActiveLanes = (maxBlockSize + WARP_SIZE - 1) / WARP_SIZE; @@ -56,12 +74,22 @@ __inline__ __device__ T blockReduceSum(T val) { val = (threadIdx.x < blockDim.x / float(WARP_SIZE)) ? shared[lane] : (T)(0.0f); - val = warpReduceSum(val); + val = warpReduce(val, fn); } else { // A single warpReduce is equal to blockReduce - val = warpReduceSum(val); + val = warpReduce(val, fn); } return val; } +template +__inline__ __device__ T blockReduceMax(T val) { + return blockReduce(val, detail::_max); +} + +template +__inline__ __device__ T blockReduceSum(T val) { + return blockReduce(val, detail::_sum); +} + } // namespace vllm diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 29890118c93dc..aab7af9d2cbf6 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -4,27 +4,59 @@ from vllm._C import ops DTYPES = [torch.half, torch.bfloat16, torch.float] -HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 8192] # Arbitrary values for testing +HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192, + 8193] # Arbitrary values for testing NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing SEEDS = [0] SCALE = [0.1, 0.5, 0.8, 1.2, 2.1] +@pytest.mark.parametrize("num_tokens", NUM_TOKENS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@torch.inference_mode() +def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, + dtype: torch.dtype, seed: int) -> None: + torch.random.manual_seed(seed) + torch.cuda.manual_seed(seed) + int8_traits = torch.iinfo(torch.int8) + + x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 + + x_token_max, _ = x.max(dim=1) + x_token_max = x_token_max.to(dtype=torch.float32) + scales = (x_token_max / float(127.0))[:, None].to(device="cuda", + dtype=torch.float32) + torch_out = (x / scales).round().clamp(int8_traits.min, + int8_traits.max).to(torch.int8) + + ops_out = torch.empty_like(x, dtype=torch.int8, device="cuda") + scales_out = torch.empty_like(scales, dtype=torch.float32, device="cuda") + ops.dynamic_scaled_int8_quant(ops_out, x, scales_out) + + assert torch.allclose(scales_out, scales) + assert torch.allclose(torch_out, ops_out, + atol=1) # big atol to account for rounding errors + + @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("scale", SCALE) @torch.inference_mode() -def test_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, - seed: int, scale: float) -> None: +def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, + dtype: torch.dtype, seed: int, + scale: float) -> None: torch.random.manual_seed(seed) torch.cuda.manual_seed(seed) + int8_traits = torch.iinfo(torch.int8) + x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - out1 = (x / scale).round().clamp( - torch.iinfo(torch.int8).min, - torch.iinfo(torch.int8).max).to(torch.int8) + out1 = (x / scale).round().clamp(int8_traits.min, + int8_traits.max).to(torch.int8) out2 = torch.empty_like(x, dtype=torch.int8) scale_argument = torch.tensor([scale], dtype=torch.float32, 
device="cuda") diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index b83286992da3d..8b48f418fe49f 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -6,7 +6,8 @@ import torch from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 - CompressedTensorsLinearMethod, CompressedTensorsW8A8StaticTensor) + CompressedTensorsLinearMethod, CompressedTensorsW8A8DynamicToken, + CompressedTensorsW8A8StaticTensor) def test_compressed_tensors_w8a8_static_setup(vllm_runner): @@ -34,3 +35,19 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner): assert qkv_proj.weight_scale.shard_splitter is not None assert qkv_proj.weight_scale.logical_widths is not None assert qkv_proj.input_scale.dtype is torch.float32 + + +def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): + model_path = "nm-testing/tinyllama-one-shot-dynamic-test" + llm = vllm_runner(model_path, + quantization="sparseml", + enforce_eager=True, + dtype=torch.float16) + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken) + assert qkv_proj.weight.dtype is torch.int8 diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index cae6822166b66..7e12f1ba14cde 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -266,21 +266,33 @@ def scaled_fp8_quant( # int8 -def static_scaled_int8_quant(input: torch.Tensor, - scale: torch.Tensor) -> torch.Tensor: +def scaled_int8_quant( + input: torch.Tensor, + scale: Optional[torch.Tensor] = None +) -> Tuple[torch.Tensor, torch.Tensor]: """ - Quantize the input tensor to int8 and return the quantized tensor. + Quantize the input tensor to int8 and return the quantized tensor and scale. Args: input: The input tensor to be quantized to int8. - scale: Scaling factor for the int8 quantization. + scale: Optional scaling factor for the int8 quantization. + When not provided, we invoke dynamic-per-token quantization. Returns: - torch.Tensor: Output tensor in int8. + Tuple[Torch.Tensor, Torch.Tensor] : Output int8 tensor and scales. """ - q = torch.empty_like(input, dtype=torch.int8) - vllm_ops.static_scaled_int8_quant(q, input, scale) - return q + output = torch.empty_like(input, dtype=torch.int8) + if scale is not None: + # static-per-tensor quantization. + vllm_ops.static_scaled_int8_quant(output, input, scale) + return output, scale + + # dynamic-per-token quantization. 
+ input_scales = torch.empty((input.numel() // input.shape[-1], 1), + device=input.device, + dtype=torch.float32) + vllm_ops.dynamic_scaled_int8_quant(output, input, input_scales) + return output, input_scales # moe diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 19e464bd64325..d2b0ce0dbbf0b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -1,12 +1,16 @@ from typing import Any, Dict, List, Optional import torch +from pydantic import BaseModel from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 QuantizationConfig) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme, CompressedTensorsW8A8StaticTensor) + CompressedTensorsScheme, CompressedTensorsW8A8DynamicToken, + CompressedTensorsW8A8StaticTensor) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match) class CompressedTensorsConfig(QuantizationConfig): @@ -47,10 +51,12 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": targets = quant_config.get("targets") for target in targets: layer_quant_details[target] = {} - layer_quant_details[target]["weight"] = quant_config.get( - "weights") - layer_quant_details[target]["input"] = quant_config.get( - "input_activations") + layer_quant_details[target][ + "weight"] = QuantizationArgs.parse_obj( + quant_config.get("weights")) + layer_quant_details[target][ + "input"] = QuantizationArgs.parse_obj( + quant_config.get("input_activations")) return cls(layer_quant_details=layer_quant_details, ignore=ignore) @@ -58,40 +64,46 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": def get_config_filenames(cls) -> List[str]: return [] - def _get_schema(self, weight_quant: Dict, input_quant: Dict): - # TODO: Refactor as additional cases are supported - - weight_bit = weight_quant.get("num_bits") - input_bit = input_quant.get("num_bits") - - weight_strategy = weight_quant.get("strategy") - input_strategy = input_quant.get("strategy") - - weight_symmetric = weight_quant.get("symmetric") - input_symmetric = input_quant.get("symmetric") + def _is_static_tensor_w8a8(self, weight_quant: BaseModel, + input_quant: BaseModel) -> bool: + is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 + is_tensor = (weight_quant.strategy == input_quant.strategy == + QuantizationStrategy.TENSOR.value) + is_symmetric = weight_quant.symmetric and input_quant.symmetric + is_static = not weight_quant.dynamic and not input_quant.dynamic + + return is_8_bits and is_tensor and is_symmetric and is_static + + def _is_dynamic_token_w8a8(self, weight_quant: BaseModel, + input_quant: BaseModel) -> bool: + is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 + is_token_tensor = (weight_quant.strategy + == QuantizationStrategy.TENSOR.value) and ( + input_quant.strategy + == QuantizationStrategy.TOKEN.value) + is_symmetric = weight_quant.symmetric and input_quant.symmetric + is_dynamic = not weight_quant.dynamic and input_quant.dynamic + + return is_8_bits and is_token_tensor and is_symmetric and is_dynamic + + def _get_schema(self, weight_quant: BaseModel, + 
input_quant: BaseModel) -> "CompressedTensorsScheme": + if self._is_static_tensor_w8a8(weight_quant, input_quant): + return CompressedTensorsW8A8StaticTensor() - is_8_bits = weight_bit == input_bit == 8 - is_tensor = weight_strategy == input_strategy == "tensor" - is_symmetric = weight_symmetric and input_symmetric + if self._is_dynamic_token_w8a8(weight_quant, input_quant): + return CompressedTensorsW8A8DynamicToken() - if is_8_bits and is_tensor and is_symmetric and \ - torch.cuda.is_available(): - # CompressedTensorsW8A8StaticTensor only supports CUDA path for - # now. - return CompressedTensorsW8A8StaticTensor() - raise NotImplementedError( - "Scheme not supported. Only CUDA, 8-bit static symmtetric " - "per tensor quantization is currently supported") + raise NotImplementedError("Scheme not supported.") def get_scheme(self, layer: torch.nn.Module) -> "CompressedTensorsScheme": - # TODO: update with matching function from `compressed_tensors` - layer_type_name = None - layer_name_class = type(layer).__name__.lower() - for target in self.layer_quant_details: - if target.lower() in layer_name_class: - layer_type_name = target - break + layer_type_name = find_first_name_or_class_match( + name="", + module=layer, + targets=self.layer_quant_details.keys(), + check_contains=True) + if layer_type_name is None: raise ValueError(f"Could not matching target for layer {layer}") @@ -117,7 +129,9 @@ def create_weights(self, layer: torch.nn.Module, **extra_weight_attrs): """ Use the CompressedTensorsScheme associated with each layer to create - the necessary parameters for the layer. + the necessary parameters for the layer. See LinearMethodBase for param + details + """ weight_loader = extra_weight_attrs.get("weight_loader") @@ -139,7 +153,8 @@ def apply(self, """ Use the output of create_weights and the CompressedTensorsScheme associated with the layer to apply the forward pass with the - layer input. + layer input. 
See LinearMethodBase for param details + """ if bias is not None: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index 831905b63e2c9..9a910f061f580 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -1,5 +1,7 @@ from .compressed_tensors_scheme import CompressedTensorsScheme # noqa: F401 from .compressed_tensors_unquantized import ( # noqa: F401 CompressedTensorsUnquantized) +from .compressed_tensors_w8a8_dynamictoken import ( # noqa: F401, E501 + CompressedTensorsW8A8DynamicToken) from .compressed_tensors_w8a8_statictensor import ( # noqa: F401, E501 CompressedTensorsW8A8StaticTensor) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py new file mode 100644 index 0000000000000..25b707caeef33 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py @@ -0,0 +1,85 @@ +from typing import Callable, List, Tuple, Union + +import torch +from torch.nn import Parameter + +from vllm import _custom_ops as custom_ops +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.utils import set_weight_attrs + +__all__ = ["CompressedTensorsW8A8DynamicToken"] + + +class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme): + + def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: + if isinstance(shard_id, int): + return shard_id + + assert isinstance(shard_id, str) + qkv_idxs = {"q": 0, "k": 1, "v": 2} + assert shard_id in qkv_idxs + return qkv_idxs[shard_id] + + def scales_shard_splitter( + self, param: torch.Tensor, loaded_weight: torch.Tensor, + shard_id: Union[str, int], + logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + shard_id = self._shard_id_as_int(shard_id) + offset = sum(logical_widths[:shard_id]) + size = logical_widths[shard_id] + # update loaded weight with copies for broadcast. + loaded_weight = loaded_weight.repeat(size) + return param[offset:offset + size], loaded_weight + + def create_weights(self, layer: torch.nn.Module, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + # When the scales have a single value, it is required that they be + # on the CPU for performance and CUDA Graphs compatibility. Please + # refer to the comment in + # CompressedTensorsW8A8StaticTensor::create_weights for further + # information. 
+ is_tensor_partitioned = len(output_partition_sizes) != 1 + weight_scale_dim = sum( + output_partition_sizes) if is_tensor_partitioned else 1 + + weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8), + requires_grad=False) + + weight_scale = Parameter(torch.empty(weight_scale_dim, + dtype=torch.float32), + requires_grad=False) + + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=torch.int8), + requires_grad=False) + + layer.register_parameter("weight", weight) + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + set_weight_attrs(weight, {"weight_loader": weight_loader}) + set_weight_attrs(weight, {"logical_widths": output_partition_sizes}) + + layer.register_parameter("weight_scale", weight_scale) + set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) + set_weight_attrs( + weight_scale, { + "shard_splitter": self.scales_shard_splitter, + "logical_widths": output_partition_sizes + }) + + layer.register_parameter("weight_zero_point", weight_zero_point) + set_weight_attrs(weight_zero_point, {"weight_loader": weight_loader}) + + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): + weight = layer.weight + weight_scale = layer.weight_scale + + x_q, input_scales = custom_ops.scaled_int8_quant(x) + return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), input_scales, + weight_scale, x.dtype) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py index 2dfc6e2b07782..7559fc0f95b24 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py @@ -97,7 +97,7 @@ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): act_scale = layer.input_scale # Input quantize - x_q = custom_ops.static_scaled_int8_quant(x, act_scale) + x_q, _ = custom_ops.scaled_int8_quant(x, act_scale) return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), act_scale, weight_scale, x.dtype) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py new file mode 100644 index 0000000000000..fcc6649101845 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -0,0 +1,114 @@ +import re +from enum import Enum +from typing import Any, Dict, Iterable, Optional + +from pydantic import BaseModel, Field +from torch.nn import Module + + +class QuantizationType(str, Enum): + """ + Enum storing quantization type options + """ + + INT = "int" + FLOAT = "float" + + +class QuantizationStrategy(str, Enum): + """ + Enum storing quantization strategy options + """ + + TENSOR = "tensor" + CHANNEL = "channel" + GROUP = "group" + BLOCK = "block" + TOKEN = "token" + + +class QuantizationArgs(BaseModel): + """ + User facing arguments used to define a quantization config + for weights or activations + + :param num_bits: quantization bit depth + :param type: dtype to quantized to, either int or float + :param symmetric: whether or not quantization scale is symmetric + :param strategy: string determining the scope of scale/zero-point to apply + :param group_size: group length to use for the group strategy + :param block_structure: 2d block structure to use for the block + strategy, must 
be of the format "2x4", "8x16", etc. + :param dynamic: set True to perform dynamic quantization - + values will not be calibrated during calibration phase, + instead during inference new quantization ranges will be + observed with every sample. Defaults to False for static + quantization. Note that enabling dynamic quantization + will change the default observer to a memoryless one + """ + + num_bits: int = 8 + type: QuantizationType = QuantizationType.INT + symmetric: bool = True + group_size: Optional[int] = None + strategy: Optional[QuantizationStrategy] = None + block_structure: Optional[str] = None + dynamic: bool = False + observer: str = Field( + default="minmax", + description=("The class to use to compute the quantization param - " + "scale and zero-point'"), + ) + observer_kwargs: Dict[str, Any] = Field( + default_factory=dict, + description= + ("optional dict of kwargs to be passed directly to torch quantization " + "Observers constructor excluding quantization range or symmetry"), + ) + + +def find_first_name_or_class_match( + name: str, + module: Module, + targets: Iterable[str], + check_contains: bool = False) -> Optional[str]: + """ + Helper function to map the quantization details listed in the config + for a given list of targets against each model layer. First uses the + layer name to try and find a match. If no name match is found, uses + the layer class name. Returns None otherwise. + + :param name: layer name + :param module: torch.nn.Module + :param targets: list of targets to match the layer against + :param check_contains: whether or not to do a substring match + """ + + return _find_first_match(name, targets) or _find_first_match( + module.__class__.__name__, targets, check_contains) + + +def _find_first_match(value: str, + targets: Iterable[str], + check_contains: bool = False) -> Optional[str]: + """ + Returns first element of target that matches value either + exactly or as a regex after 're:'. If check_contains is set to True, + additionally checks if the target string is contained within the value. 
+ + :param value: string to compare the list of targets against + :param targets: list of targets to match the layer against + :param check_contains: whether or not to do a substring match + """ + + for target in targets: + if target.startswith("re:"): + pattern = target[3:] + if re.match(pattern, value): + return target + elif check_contains: + if target.lower() in value.lower(): + return target + elif target == value: + return target + return None From 7a9cb294ae317b28a60165b34c8398c762869a74 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Fri, 7 Jun 2024 11:23:32 -0700 Subject: [PATCH 13/30] [Frontend] Add OpenAI Vision API Support (#5237) Co-authored-by: DarkLight1337 --- docs/source/models/vlm.rst | 68 ++++- .../serving/openai_compatible_server.md | 4 +- examples/template_llava.jinja | 23 ++ tests/entrypoints/test_openai_vision.py | 286 ++++++++++++++++++ tests/multimodal/test_utils.py | 75 +++++ vllm/config.py | 12 +- vllm/entrypoints/openai/serving_chat.py | 113 ++++++- vllm/envs.py | 6 + vllm/multimodal/utils.py | 85 ++++++ 9 files changed, 653 insertions(+), 19 deletions(-) create mode 100644 examples/template_llava.jinja create mode 100644 tests/entrypoints/test_openai_vision.py create mode 100644 tests/multimodal/test_utils.py create mode 100644 vllm/multimodal/utils.py diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 52afda747aab8..b917688a529d1 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -3,7 +3,7 @@ Using VLMs ========== -This document shows you how to run and serve Vision Language Models (VLMs) using vLLM. +vLLM provides experimental support for Vision Language Models (VLMs). This document shows you how to run and serve these models using vLLM. Engine Arguments ---------------- @@ -54,3 +54,69 @@ For now, we only support a single image per text prompt. To pass an image to the print(generated_text) A code example can be found in `examples/llava_example.py `_. + +Online OpenAI Vision API Compatible Inference +---------------------------------------------- + +You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API `_. + +.. note:: + Currently, vLLM supports only **single** ``image_url`` input per ``messages``. Support for multi-image inputs will be + added in the future. + +Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with vLLM API server. + +.. important:: + Since OpenAI Vision API is based on `Chat `_ API, a chat template + is **required** to launch the API server if the model's tokenizer does not come with one. In this example, we use the + HuggingFace Llava chat template that you can find in the example folder `here `_. + +.. code-block:: bash + + python -m vllm.entrypoints.openai.api_server \ + --model llava-hf/llava-1.5-7b-hf \ + --image-input-type pixel_values \ + --image-token-id 32000 \ + --image-input-shape 1,3,336,336 \ + --image-feature-size 576 \ + --chat-template template_llava.jinja + +To consume the server, you can use the OpenAI client like in the example below: + +.. 
code-block:: python + + from openai import OpenAI + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + chat_response = client.chat.completions.create( + model="llava-hf/llava-1.5-7b-hf", + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + }, + }, + ], + }], + ) + print("Chat response:", chat_response) + +.. note:: + + By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable: + + .. code-block:: shell + + export VLLM_IMAGE_FETCH_TIMEOUT= + +.. note:: + The prompt formatting with the image token ```` is not needed when serving VLMs with the API server since the prompt will be + processed automatically by the server. diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index a912949352b86..6248d84683753 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -30,6 +30,8 @@ Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-refer - Chat: `tools`, and `tool_choice`. - Completions: `suffix`. +vLLM also provides experimental support for OpenAI Vision API compatible inference. See more details in [Using VLMs](../models/vlm.rst). + ## Extra Parameters vLLM supports a set of parameters that are not part of the OpenAI API. In order to use them, you can pass them as extra parameters in the OpenAI client. @@ -120,4 +122,4 @@ It is the callers responsibility to prompt the model with the tool information, vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. -Please refer to the OpenAI API reference documentation for more information. \ No newline at end of file +Please refer to the OpenAI API reference documentation for more information. 
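
As a companion to the documentation above and the tests added below, here is a minimal client-side sketch of sending a local image as a base64 data URL — the same data:image/...;base64, form exercised by test_single_chat_session_image_base64encoded. The file name and server address are illustrative placeholders; it assumes the server was launched as shown in the vlm.rst example.

import base64

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

# Hypothetical local file; any image format the server can decode should work.
with open("example.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

chat_response = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image_b64}"
                },
            },
        ],
    }],
)
print("Chat response:", chat_response.choices[0].message.content)
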
diff --git a/examples/template_llava.jinja b/examples/template_llava.jinja new file mode 100644 index 0000000000000..6a902ee167725 --- /dev/null +++ b/examples/template_llava.jinja @@ -0,0 +1,23 @@ +{%- if messages[0]['role'] == 'system' -%} + {%- set system_message = messages[0]['content'] -%} + {%- set messages = messages[1:] -%} +{%- else -%} + {% set system_message = '' -%} +{%- endif -%} + +{{ bos_token + system_message }} +{%- for message in messages -%} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} + {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} + {%- endif -%} + + {%- if message['role'] == 'user' -%} + {{ 'USER: ' + message['content'] + '\n' }} + {%- elif message['role'] == 'assistant' -%} + {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {{ 'ASSISTANT:' }} +{% endif %} diff --git a/tests/entrypoints/test_openai_vision.py b/tests/entrypoints/test_openai_vision.py new file mode 100644 index 0000000000000..cc03b04e0b0e0 --- /dev/null +++ b/tests/entrypoints/test_openai_vision.py @@ -0,0 +1,286 @@ +from pathlib import Path +from typing import Dict + +import openai +import pytest +import pytest_asyncio +import ray + +from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64 + +from ..utils import ServerRunner + +MODEL_NAME = "llava-hf/llava-1.5-7b-hf" +LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent / + "examples/template_llava.jinja") +assert LLAVA_CHAT_TEMPLATE.exists() +# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) +TEST_IMAGE_URLS = [ + "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +] + +pytestmark = pytest.mark.openai + + +@pytest.fixture(scope="module") +def server(): + ray.init() + server_runner = ServerRunner.remote([ + "--model", + MODEL_NAME, + "--dtype", + "bfloat16", + "--max-model-len", + "4096", + "--enforce-eager", + "--image-input-type", + "pixel_values", + "--image-token-id", + "32000", + "--image-input-shape", + "1,3,336,336", + "--image-feature-size", + "576", + "--chat-template", + str(LLAVA_CHAT_TEMPLATE), + ]) + ray.get(server_runner.ready.remote()) + yield server_runner + ray.shutdown() + + +@pytest.fixture(scope="session") +def client(): + client = openai.AsyncOpenAI( + base_url="http://localhost:8000/v1", + api_key="token-abc123", + ) + yield client + + +@pytest_asyncio.fixture(scope="session") +async def base64_encoded_image() -> Dict[str, str]: + return { + image_url: + encode_image_base64(await ImageFetchAiohttp.fetch_image(image_url)) + for image_url in TEST_IMAGE_URLS + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_single_chat_session_image(server, client: openai.AsyncOpenAI, + model_name: str, image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=5) + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=596, total_tokens=606) + + message = choice.message + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 10 + assert message.role == "assistant" + messages.append({"role": "assistant", "content": message.content}) + + # test multi-turn dialogue + messages.append({"role": "user", "content": "express your result in json"}) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_single_chat_session_image_base64encoded( + server, client: openai.AsyncOpenAI, model_name: str, image_url: str, + base64_encoded_image: Dict[str, str]): + + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": + f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" + } + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=5) + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=596, total_tokens=606) + + message = choice.message + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 10 + assert message.role == "assistant" + messages.append({"role": "assistant", "content": message.content}) + + # test multi-turn dialogue + messages.append({"role": "user", "content": "express your result in json"}) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_chat_streaming_image(server, client: openai.AsyncOpenAI, + model_name: str, image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + ) + output = chat_completion.choices[0].message.content + stop_reason = chat_completion.choices[0].finish_reason + + # test streaming + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + ) + chunks = [] + finish_reason_count = 0 + async for chunk in stream: + delta = chunk.choices[0].delta + if delta.role: + assert delta.role == "assistant" + if delta.content: + chunks.append(delta.content) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + # finish reason should only return in last block + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == stop_reason + assert delta.content + assert "".join(chunks) == output + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_multi_image_input(server, client: openai.AsyncOpenAI, + model_name: str, image_url: str): + + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + + with pytest.raises(openai.BadRequestError): # test multi-image input + await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + ) + + # the server should still work afterwards + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + completion = completion.choices[0].text + assert completion is not None and len(completion) >= 0 + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py new file mode 100644 index 0000000000000..5a6395ac9e42a --- /dev/null +++ b/tests/multimodal/test_utils.py @@ -0,0 +1,75 @@ +import base64 +import mimetypes +from tempfile import NamedTemporaryFile +from typing import Dict, Tuple + +import numpy as np +import pytest +import pytest_asyncio +from PIL import Image + +from vllm.multimodal.utils import ImageFetchAiohttp + +# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) +TEST_IMAGE_URLS = [ + "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +] + + +@pytest_asyncio.fixture(scope="session") +async def url_images() -> Dict[str, Image.Image]: + return { + image_url: await ImageFetchAiohttp.fetch_image(image_url) + for image_url in TEST_IMAGE_URLS + } + + +def get_supported_suffixes() -> Tuple[str, ...]: + # We should at least test the file types mentioned in GPT-4 with Vision + OPENAI_SUPPORTED_SUFFIXES = ('.png', '.jpeg', '.jpg', '.webp', '.gif') + + # Additional file types that are supported by us + EXTRA_SUPPORTED_SUFFIXES = ('.bmp', '.tiff') + + return OPENAI_SUPPORTED_SUFFIXES + EXTRA_SUPPORTED_SUFFIXES + + +def 
_image_equals(a: Image.Image, b: Image.Image) -> bool: + return (np.asarray(a) == np.asarray(b.convert(a.mode))).all() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("suffix", get_supported_suffixes()) +async def test_fetch_image_base64(url_images: Dict[str, Image.Image], + image_url: str, suffix: str): + url_image = url_images[image_url] + + try: + mime_type = Image.MIME[Image.registered_extensions()[suffix]] + except KeyError: + try: + mime_type = mimetypes.types_map[suffix] + except KeyError: + pytest.skip('No MIME type') + + with NamedTemporaryFile(suffix=suffix) as f: + try: + url_image.save(f.name) + except Exception as e: + if e.args[0] == 'cannot write mode RGBA as JPEG': + pytest.skip('Conversion not supported') + + raise + + base64_image = base64.b64encode(f.read()).decode("utf-8") + data_url = f"data:{mime_type};base64,{base64_image}" + + data_image = await ImageFetchAiohttp.fetch_image(data_url) + if _image_equals(url_image, Image.open(f)): + assert _image_equals(url_image, data_image) + else: + pass # Lossy format; only check that image can be opened diff --git a/vllm/config.py b/vllm/config.py index 4efdb6cab52c4..a980168190adc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -5,7 +5,7 @@ Union) import torch -from transformers import PretrainedConfig +from transformers import PretrainedConfig, PreTrainedTokenizerBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -1119,6 +1119,16 @@ def get_image_input_enum_type(cls, value: str) -> ImageInputType: f"Expecting to choose from " f"{[x.name for x in cls.ImageInputType]}.") from e + #TODO(ywang96): make this a cached property once we refactor the + # VisionLanguageConfig class. + def get_image_token_text( + self, tokenizer: PreTrainedTokenizerBase) -> Tuple[str, str]: + """Get the image token placeholder text to be inserted into the + text prompt and the string representation of the image token id. + """ + image_token_str = tokenizer.decode(self.image_token_id) + return image_token_str * self.image_feature_size, image_token_str + def as_cli_args_dict(self) -> Dict[str, Any]: """Flatten vision language config to pure args. 
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 883567abf415b..c025e7e96826c 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1,15 +1,16 @@ import codecs import time -from dataclasses import dataclass -from typing import (AsyncGenerator, AsyncIterator, Dict, Iterable, List, - Optional) +from dataclasses import dataclass, field +from typing import (AsyncGenerator, AsyncIterator, Awaitable, Dict, Iterable, + List, Optional) from typing import Sequence as GenericSequence from typing import TypedDict, Union, cast, final from fastapi import Request -from openai.types.chat import ChatCompletionContentPartTextParam +from openai.types.chat import (ChatCompletionContentPartImageParam, + ChatCompletionContentPartTextParam) -from vllm.config import ModelConfig +from vllm.config import ModelConfig, VisionLanguageConfig from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import ( ChatCompletionContentPartParam, ChatCompletionLogProb, @@ -21,9 +22,13 @@ FunctionCall, ToolCall, UsageInfo) from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, OpenAIServing) +from vllm.inputs import PromptInputs from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) +from vllm.multimodal.image import ImagePixelData +from vllm.multimodal.utils import (async_get_and_parse_image, + get_full_image_text_prompt) from vllm.outputs import RequestOutput from vllm.sequence import Logprob from vllm.utils import random_uuid @@ -40,6 +45,8 @@ class ConversationMessage(TypedDict): @dataclass(frozen=True) class ChatMessageParseResult: messages: List[ConversationMessage] + image_futures: List[Awaitable[ImagePixelData]] = field( + default_factory=list) class OpenAIServingChat(OpenAIServing): @@ -94,19 +101,76 @@ def _parse_chat_message_content_parts( parts: Iterable[ChatCompletionContentPartParam], ) -> ChatMessageParseResult: texts: List[str] = [] + image_futures: List[Awaitable[ImagePixelData]] = [] - for _, part in enumerate(parts): + vlm_config: Optional[VisionLanguageConfig] = getattr( + self.engine.engine, "vision_language_config", None) + model_config = getattr(self.engine.engine, "model_config", None) + + for part in parts: part_type = part["type"] if part_type == "text": text = cast(ChatCompletionContentPartTextParam, part)["text"] texts.append(text) + elif part_type == "image_url": + if vlm_config is None: + raise ValueError( + "'image_url' input is not supported as the loaded " + "model is not multimodal.") + + elif len(image_futures) == 0: + assert self.tokenizer is not None + image_url = cast(ChatCompletionContentPartImageParam, + part)["image_url"] + + if image_url.get("detail", "auto") != "auto": + logger.warning( + "'image_url.detail' is currently not supported and " + "will be ignored.") + + image_future = async_get_and_parse_image(image_url["url"]) + image_futures.append(image_future) + + else: + raise NotImplementedError( + "Multiple 'image_url' input is currently not supported." 
+ ) + else: raise NotImplementedError(f"Unknown part type: {part_type}") - messages = [ConversationMessage(role=role, content="\n".join(texts))] + text_prompt = "\n".join(texts) + + if vlm_config is not None and len(image_futures): + + (image_token_prompt, + image_token_str) = vlm_config.get_image_token_text(self.tokenizer) - return ChatMessageParseResult(messages=messages) + # NOTE: If image token string (e.g, ) is already present + # in the text prompt, we assume it follows the same format required + # by the engine. + if image_token_str in text_prompt: + logger.warning( + "Detected image token string in the text prompt. " + "Skipping prompt formatting.") + messages = [ + ConversationMessage(role=role, content=text_prompt) + ] + + else: + full_prompt = get_full_image_text_prompt( + image_prompt=image_token_prompt, + text_prompt=text_prompt, + config=model_config) + messages = [ + ConversationMessage(role=role, content=full_prompt) + ] + else: + messages = [ConversationMessage(role=role, content=text_prompt)] + + return ChatMessageParseResult(messages=messages, + image_futures=image_futures) def _parse_chat_message_content( self, @@ -116,10 +180,10 @@ def _parse_chat_message_content( content = message.get("content") if content is None: - return ChatMessageParseResult(messages=[]) + return ChatMessageParseResult(messages=[], image_futures=[]) if isinstance(content, str): messages = [ConversationMessage(role=role, content=content)] - return ChatMessageParseResult(messages=messages) + return ChatMessageParseResult(messages=messages, image_futures=[]) return self._parse_chat_message_content_parts(role, content) @@ -144,11 +208,13 @@ async def create_chat_completion( try: conversation: List[ConversationMessage] = [] + image_futures: List[Awaitable[ImagePixelData]] = [] for msg in request.messages: - parsed_msg = self._parse_chat_message_content(msg) + chat_parsed_result = self._parse_chat_message_content(msg) - conversation.extend(parsed_msg.messages) + conversation.extend(chat_parsed_result.messages) + image_futures.extend(chat_parsed_result.image_futures) prompt = self.tokenizer.apply_chat_template( conversation=conversation, @@ -159,6 +225,17 @@ async def create_chat_completion( logger.error("Error in applying chat template from request: %s", e) return self.create_error_response(str(e)) + # Fetch image data + image_data: Optional[ImagePixelData] = None + try: + if len(image_futures): + # since we support only single image currently + assert len(image_futures) == 1 + image_data = await image_futures[0] + except Exception as e: + logger.error("Error in loading image data: %s", e) + return self.create_error_response(str(e)) + request_id = f"cmpl-{random_uuid()}" try: # Tokenize/detokenize depending on prompt format (string/token list) @@ -183,11 +260,15 @@ async def create_chat_completion( except ValueError as e: return self.create_error_response(str(e)) + inputs: PromptInputs = { + "prompt": prompt_text, + "prompt_token_ids": prompt_ids, + } + if image_data is not None: + inputs["multi_modal_data"] = image_data + result_generator = self.engine.generate( - { - "prompt": prompt_text, - "prompt_token_ids": prompt_ids - }, + inputs, sampling_params, request_id, lora_request, diff --git a/vllm/envs.py b/vllm/envs.py index 7d5c7371b7741..b140aa6d658e6 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -29,6 +29,7 @@ VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_WORKER_MULTIPROC_METHOD: str = "spawn" + VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_TARGET_DEVICE: str = "cuda" 
MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None @@ -216,6 +217,11 @@ # Both spawn and fork work "VLLM_WORKER_MULTIPROC_METHOD": lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"), + + # Timeout for fetching images when serving multimodal models + # Default is 5 seconds + "VLLM_IMAGE_FETCH_TIMEOUT": + lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")), } # end-env-vars-definition diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py new file mode 100644 index 0000000000000..b8ad6f8f78e26 --- /dev/null +++ b/vllm/multimodal/utils.py @@ -0,0 +1,85 @@ +import base64 +from io import BytesIO +from typing import Optional, Union + +import aiohttp +from PIL import Image + +from vllm.config import ModelConfig +from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT +from vllm.multimodal.image import ImagePixelData + + +class ImageFetchAiohttp: + aiohttp_client: Optional[aiohttp.ClientSession] = None + + @classmethod + def get_aiohttp_client(cls) -> aiohttp.ClientSession: + if cls.aiohttp_client is None: + timeout = aiohttp.ClientTimeout(total=VLLM_IMAGE_FETCH_TIMEOUT) + connector = aiohttp.TCPConnector() + cls.aiohttp_client = aiohttp.ClientSession(timeout=timeout, + connector=connector) + + return cls.aiohttp_client + + @classmethod + async def fetch_image(cls, image_url: str) -> Image.Image: + """Load PIL image from a url or base64 encoded openai GPT4V format""" + + if image_url.startswith('http'): + # Avoid circular import + from vllm import __version__ as VLLM_VERSION + + client = cls.get_aiohttp_client() + headers = {"User-Agent": f"vLLM/{VLLM_VERSION}"} + + async with client.get(url=image_url, headers=headers) as response: + response.raise_for_status() + image_raw = await response.read() + image = Image.open(BytesIO(image_raw)) + + # Only split once and assume the second part is the base64 encoded image + elif image_url.startswith('data:image'): + image = load_image_from_base64(image_url.split(',', 1)[1]) + + else: + raise ValueError("Invalid image url: A valid image url must start " + "with either 'data:image' or 'http'.") + + return image + + +async def async_get_and_parse_image(image_url: str) -> ImagePixelData: + with await ImageFetchAiohttp.fetch_image(image_url) as image: + return ImagePixelData(image) + + +def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str: + """encode image to base64 format.""" + + buffered = BytesIO() + if format == 'JPEG': + image = image.convert('RGB') + image.save(buffered, format) + return base64.b64encode(buffered.getvalue()).decode('utf-8') + + +def load_image_from_base64(image: Union[bytes, str]) -> Image.Image: + """Load image from base64 format.""" + return Image.open(BytesIO(base64.b64decode(image))) + + +# TODO(ywang96): move this to a model registry for preprocessing vision +# language prompts based on the model type. 
+def get_full_image_text_prompt(image_prompt: str, text_prompt: str, + config: ModelConfig) -> str: + """Combine image and text prompts for vision language model depending on + the model architecture.""" + + if config.hf_config.model_type == "llava": + full_prompt = f"{image_prompt}\n{text_prompt}" + else: + raise ValueError( + f"Unsupported model type: {config.hf_config.model_type}") + return full_prompt From 6840a716104c8c17303b938673c2ac019e541700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Sat, 8 Jun 2024 05:09:13 +0800 Subject: [PATCH 14/30] [Misc] Remove unused cuda_utils.h in CPU backend (#5345) --- csrc/cpu/pybind.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/csrc/cpu/pybind.cpp b/csrc/cpu/pybind.cpp index 63082393c8102..e5b2ce4f30113 100644 --- a/csrc/cpu/pybind.cpp +++ b/csrc/cpu/pybind.cpp @@ -1,5 +1,4 @@ #include "cache.h" -#include "cuda_utils.h" #include "ops.h" #include From 767c727a81ae9ec570d30d55b7afc783775d5a05 Mon Sep 17 00:00:00 2001 From: Calvinn Ng <39899397+Calvinnncy97@users.noreply.github.com> Date: Sat, 8 Jun 2024 05:10:21 +0800 Subject: [PATCH 15/30] fix DbrxFusedNormAttention missing cache_config (#5340) Co-authored-by: team --- vllm/model_executor/models/dbrx.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 8ff19a2015e0f..59af42445f323 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -247,11 +247,12 @@ class DbrxFusedNormAttention(nn.Module): def __init__( self, config: DbrxConfig, + cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() self.d_model = config.d_model - self.attn = DbrxAttention(config, quant_config) + self.attn = DbrxAttention(config, cache_config, quant_config) self.norm_1 = nn.LayerNorm(self.d_model) self.norm_2 = nn.LayerNorm(self.d_model) From e69ded7d1c8a4f6ed26e64090bdc050c06cde3b9 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Fri, 7 Jun 2024 17:42:05 -0700 Subject: [PATCH 16/30] [Bug Fix] Fix the support check for FP8 CUTLASS (#5352) Bug description: With torch 2.4.0.dev20240603+cu121, cutlass_fp8_supported outputs False, and the (capability, version) before the comparison is (90, 11111111112) This PR fixes the support check for FP8 CUTLASS ( cutlass_fp8_supported) which was introduced in https://github.com/vllm-project/vllm/pull/5183. 
--- vllm/model_executor/layers/quantization/fp8.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 136a64623d7fb..de94bad7c38e6 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -20,16 +20,16 @@ def cutlass_fp8_supported() -> bool: capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] - version = torch.version.cuda - version = version[0] * 10 + version[1] + major, minor = torch.version.cuda.split(".") + version = int(major) * 10 + int(minor) # CUTLASS FP8 kernels need at least # CUDA 12.0 on SM90 systems (Hopper) # CUDA 12.4 on SM89 systems (Lovelace) gpu_is_supported = False - if capability >= 900: + if capability >= 90: gpu_is_supported = version > 120 - elif capability >= 890: + elif capability >= 89: gpu_is_supported = version > 124 return gpu_is_supported @@ -103,7 +103,7 @@ class Fp8LinearMethod(LinearMethodBase): 1. Only support per-tensor quantization due to torch._scaled_mm support. 2. Only support float8_e4m3fn data type due to the limitation of torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856) - + Args: quant_config: The quantization config. """ @@ -298,8 +298,8 @@ def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config def create_weights(self, layer: torch.nn.Module): - """Create "weight" (aka kv_scale) for an attention layer. - + """Create "weight" (aka kv_scale) for an attention layer. + Args: layer: The layer that is using the QuantizeMethodBase factory. """ From b3376e5c76c199acb216addec7c32ac5299bef31 Mon Sep 17 00:00:00 2001 From: Benjamin Kitor Date: Fri, 7 Jun 2024 18:20:16 -0700 Subject: [PATCH 17/30] [Misc] Add args for selecting distributed executor to benchmarks (#5335) --- benchmarks/benchmark_latency.py | 10 +++++++++- benchmarks/benchmark_throughput.py | 13 +++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index f69d91a086a9f..1a41b66b38824 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -36,7 +36,8 @@ def main(args: argparse.Namespace): enable_chunked_prefill=args.enable_chunked_prefill, download_dir=args.download_dir, block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization) + gpu_memory_utilization=args.gpu_memory_utilization, + distributed_executor_backend=args.distributed_executor_backend) sampling_params = SamplingParams( n=args.n, @@ -221,5 +222,12 @@ def run_to_completion(profile_dir: Optional[str] = None): help='the fraction of GPU memory to be used for ' 'the model executor, which can range from 0 to 1.' 'If unspecified, will use the default value of 0.9.') + parser.add_argument( + '--distributed-executor-backend', + choices=['ray', 'mp'], + default=None, + help='Backend to use for distributed serving. 
When more than 1 GPU ' + 'is used, will be automatically set to "ray" if installed ' + 'or "mp" (multiprocessing) otherwise.') args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 7c8cb5ee8cea2..90f7433e0ae28 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -78,6 +78,7 @@ def run_vllm( enable_prefix_caching: bool, enable_chunked_prefill: bool, max_num_batched_tokens: int, + distributed_executor_backend: Optional[str], gpu_memory_utilization: float = 0.9, download_dir: Optional[str] = None, ) -> float: @@ -100,6 +101,7 @@ def run_vllm( download_dir=download_dir, enable_chunked_prefill=enable_chunked_prefill, max_num_batched_tokens=max_num_batched_tokens, + distributed_executor_backend=distributed_executor_backend, ) # Add the requests to the engine. @@ -225,8 +227,8 @@ def main(args: argparse.Namespace): args.enforce_eager, args.kv_cache_dtype, args.quantization_param_path, args.device, args.enable_prefix_caching, args.enable_chunked_prefill, - args.max_num_batched_tokens, args.gpu_memory_utilization, - args.download_dir) + args.max_num_batched_tokens, args.distributed_executor_backend, + args.gpu_memory_utilization, args.download_dir) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -368,6 +370,13 @@ def main(args: argparse.Namespace): type=str, default=None, help='Path to save the throughput results in JSON format.') + parser.add_argument( + '--distributed-executor-backend', + choices=['ray', 'mp'], + default=None, + help='Backend to use for distributed serving. When more than 1 GPU ' + 'is used, will be automatically set to "ray" if installed ' + 'or "mp" (multiprocessing) otherwise.') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model From c96fc067479453b02e92d9378eeeaebb6b3816de Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Fri, 7 Jun 2024 22:13:12 -0400 Subject: [PATCH 18/30] [ROCm][AMD] Use pytorch sdpa math backend to do naive attention (#4965) --- vllm/attention/backends/rocm_flash_attn.py | 62 ++++++++++------------ 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index e92e6c5e2dc8d..9294068c64d1a 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -247,7 +247,7 @@ def __init__( self.use_naive_attn = True if self.use_naive_attn: - self.attn_func = _naive_attention + self.attn_func = _sdpa_attention logger.debug("Using naive attention in ROCmBackend") def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor: @@ -342,11 +342,18 @@ def forward( # Interleave for MQA workaround. 
key = self.repeat_kv(key, self.num_queries_per_kv) value = self.repeat_kv(value, self.num_queries_per_kv) + query = query.movedim(0, query.dim() - 2) + key = key.movedim(0, key.dim() - 2) + value = value.movedim(0, value.dim() - 2) + # sdpa math backend attention out = self.attn_func( query, key, value, prefill_meta.seq_lens, + num_tokens, + self.num_heads, + self.head_size, self.scale, ) else: @@ -402,45 +409,34 @@ def forward( return output.view(num_tokens, hidden_size) -def _naive_attention( +def _sdpa_attention( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, seq_lens: List[int], + num_tokens: int, + num_heads: int, + head_size: int, scale: float, ) -> torch.Tensor: - output = torch.empty_like(query) start = 0 - for _, seq_len in enumerate(seq_lens): + output = torch.empty((num_tokens, num_heads, head_size), + dtype=query.dtype, + device=query.device) + + for seq_len in seq_lens: end = start + seq_len - out = _naive_masked_attention( - query[start:end], - key[start:end], - value[start:end], - scale, - ) - # TODO(woosuk): Unnecessary copy. Optimize. - output[start:end].copy_(out) - start += seq_len + with torch.backends.cuda.sdp_kernel(enable_math=True, + enable_flash=False, + enable_mem_efficient=False): + sub_out = torch.nn.functional.scaled_dot_product_attention( + query[:, start:end, :], + key[:, start:end, :], + value[:, start:end, :], + dropout_p=0.0, + is_causal=True, + scale=scale).movedim(query.dim() - 2, 0) + output[start:end, :, :] = sub_out + start = end return output - - -def _naive_masked_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - scale: float, -) -> torch.Tensor: - seq_len, head_size, head_dim = query.shape - attn_mask = torch.triu(torch.ones(seq_len, - seq_len, - dtype=query.dtype, - device=query.device), - diagonal=1) - attn_mask = attn_mask * torch.finfo(query.dtype).min - attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float() - attn_weights = attn_weights + attn_mask.float() - attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) - out = torch.einsum("hqk,khd->qhd", attn_weights, value) - return out From 9fb900f90cbb5614c3e7d67446325ad8b7ac04b2 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 7 Jun 2024 22:31:32 -0700 Subject: [PATCH 19/30] [CI/Test] improve robustness of test (hf_runner) (#5347) [CI/Test] improve robustness of test by replacing del with context manager (hf_runner) (#5347) --- .../test_basic_correctness.py | 5 ++--- .../basic_correctness/test_chunked_prefill.py | 5 ++--- tests/basic_correctness/test_preemption.py | 17 +++++++---------- tests/conftest.py | 5 ++++- .../test_basic_distributed_correctness.py | 5 ++--- .../test_chunked_prefill_distributed.py | 5 ++--- tests/models/test_big_models.py | 5 ++--- tests/models/test_embedding.py | 5 ++--- tests/models/test_llava.py | 9 ++++----- tests/models/test_mistral.py | 7 +++---- tests/models/test_models.py | 5 ++--- tests/samplers/test_beam_search.py | 7 +++---- tests/samplers/test_logprobs.py | 11 +++++------ tests/tensorizer_loader/test_tensorizer.py | 18 ++++++++---------- 14 files changed, 48 insertions(+), 61 deletions(-) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 7d8117447ca0a..4561c8b1e143a 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -43,9 +43,8 @@ def test_models( if backend_by_env_var == "FLASHINFER" and enforce_eager is False: pytest.skip("Skipping non-eager 
test for FlashInferBackend.") - hf_model = hf_runner(model, dtype=dtype) - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - del hf_model + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) vllm_model = vllm_runner(model, dtype=dtype, diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 47d582c726c66..8f8494f355b78 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -40,9 +40,8 @@ def test_models( enable_chunked_prefill = True max_num_batched_tokens = chunked_prefill_token_size - hf_model = hf_runner(model, dtype=dtype) - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - del hf_model + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) vllm_model = vllm_runner( model, diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 29a4c39cd25a1..58610e9e1016c 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -43,9 +43,8 @@ def test_chunked_prefill_recompute( enable_chunked_prefill = True max_num_batched_tokens = chunked_prefill_token_size - hf_model = hf_runner(model, dtype=dtype) - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - del hf_model + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) vllm_model = vllm_runner( model, @@ -82,9 +81,8 @@ def test_preemption( ) -> None: """By default, recompute preemption is enabled""" - hf_model = hf_runner(model, dtype=dtype) - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - del hf_model + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) vllm_model = vllm_runner( model, @@ -137,10 +135,9 @@ def test_swap( ) -> None: """Use beam search enables swapping.""" example_prompts = example_prompts[:1] - hf_model = hf_runner(model, dtype=dtype) - hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, - max_tokens) - del hf_model + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, + max_tokens) vllm_model = vllm_runner( model, diff --git a/tests/conftest.py b/tests/conftest.py index 1a7037eb2f290..5becf84a1245f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -354,7 +354,10 @@ def generate_greedy_logprobs_limit( def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]: return self.model.encode(prompts) - def __del__(self): + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): del self.model cleanup() diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 3ba5cea389c2f..23e35482d6f16 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -42,9 +42,8 @@ def test_models( backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND) enforce_eager = backend_by_env_var == "FLASHINFER" - hf_model = hf_runner(model, dtype=dtype) - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - del hf_model + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = 
hf_model.generate_greedy(example_prompts, max_tokens) vllm_model = vllm_runner( model, diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py index db938cc613c6b..9af48831db97f 100644 --- a/tests/distributed/test_chunked_prefill_distributed.py +++ b/tests/distributed/test_chunked_prefill_distributed.py @@ -45,9 +45,8 @@ def test_models( enable_chunked_prefill = True max_num_batched_tokens = chunked_prefill_token_size - hf_model = hf_runner(model, dtype=dtype) - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - del hf_model + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) vllm_model = vllm_runner( model, diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index ea95e6a49f03a..4af9824e0cbed 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -34,9 +34,8 @@ def test_models( dtype: str, max_tokens: int, ) -> None: - hf_model = hf_runner(model, dtype=dtype) - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - del hf_model + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) vllm_model = vllm_runner(model, dtype=dtype) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/models/test_embedding.py b/tests/models/test_embedding.py index 668ed3a520a36..8ad9ac2d4f59d 100644 --- a/tests/models/test_embedding.py +++ b/tests/models/test_embedding.py @@ -28,9 +28,8 @@ def test_models( model: str, dtype: str, ) -> None: - hf_model = hf_runner(model, dtype=dtype, is_embedding_model=True) - hf_outputs = hf_model.encode(example_prompts) - del hf_model + with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model: + hf_outputs = hf_model.encode(example_prompts) vllm_model = vllm_runner(model, dtype=dtype) vllm_outputs = vllm_model.encode(example_prompts) diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index f03dbdbb770e5..1e7ee52832b13 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -84,11 +84,10 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, """ model_id, vlm_config = model_and_config - hf_model = hf_runner(model_id, dtype=dtype, is_vision_model=True) - hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, - max_tokens, - images=hf_images) - del hf_model + with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: + hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, + max_tokens, + images=hf_images) vllm_image_prompts = [ p.replace("", "" * vlm_config.image_feature_size) diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 76b248cf14e98..178d23d3382a0 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -26,10 +26,9 @@ def test_models( num_logprobs: int, ) -> None: # TODO(sang): Sliding window should be tested separately. 
- hf_model = hf_runner(model, dtype=dtype) - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) - del hf_model + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) vllm_model = vllm_runner(model, dtype=dtype) vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, diff --git a/tests/models/test_models.py b/tests/models/test_models.py index e4609620387fa..a80ac5d946259 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -34,9 +34,8 @@ def test_models( # To pass the small model tests, we need full precision. assert dtype == "float" - hf_model = hf_runner(model, dtype=dtype) - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - del hf_model + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) vllm_model = vllm_runner(model, dtype=dtype) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 2682f284505bd..2e373cb87cd19 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -30,10 +30,9 @@ def test_beam_search_single_input( beam_width: int, ) -> None: example_prompts = example_prompts[:1] - hf_model = hf_runner(model, dtype=dtype) - hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, - max_tokens) - del hf_model + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, + max_tokens) vllm_model = vllm_runner(model, dtype=dtype) vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 61720cccf50b4..25d59391ca983 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -32,12 +32,11 @@ def test_get_prompt_logprobs( max_num_batched_tokens = chunked_prefill_token_size max_tokens = 5 - hf_model = hf_runner(model, dtype=dtype) - hf_logprobs = hf_model.generate_greedy_logprobs( - example_prompts, - max_tokens=max_tokens, - ) - del hf_model + with hf_runner(model, dtype=dtype) as hf_model: + hf_logprobs = hf_model.generate_greedy_logprobs( + example_prompts, + max_tokens=max_tokens, + ) vllm_model = vllm_runner( model, diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 1579d53a7fe29..648de4dbdccba 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -116,16 +116,14 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, tmp_path): - hf_model = hf_runner(model_ref) - model_path = tmp_path / (model_ref + ".tensors") - max_tokens = 50 - outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens) - with open_stream(model_path, "wb+") as stream: - serializer = TensorSerializer(stream) - serializer.write_module(hf_model.model) - del hf_model - gc.collect() - torch.cuda.empty_cache() + with hf_runner(model_ref) as hf_model: + model_path = tmp_path / (model_ref + ".tensors") + max_tokens = 50 + outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens) + with open_stream(model_path, "wb+") as stream: + serializer = TensorSerializer(stream) + serializer.write_module(hf_model.model) + 
loaded_hf_model = vllm_runner(model_ref, load_format="tensorizer", model_loader_extra_config=TensorizerConfig( From 8ea5e44a435e8731fd6f5ba4c329dd112752532a Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 8 Jun 2024 01:59:20 -0700 Subject: [PATCH 20/30] [CI/Test] improve robustness of test (vllm_runner) (#5357) [CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357) --- .../test_basic_correctness.py | 11 +- .../basic_correctness/test_chunked_prefill.py | 21 ++- tests/basic_correctness/test_preemption.py | 148 +++++++++--------- tests/conftest.py | 5 +- .../test_basic_distributed_correctness.py | 15 +- .../test_chunked_prefill_distributed.py | 21 ++- tests/engine/test_stop_reason.py | 5 +- tests/engine/test_stop_strings.py | 3 +- tests/metrics/test_metrics.py | 84 +++++----- tests/models/test_aqlm.py | 7 +- tests/models/test_big_models.py | 16 +- tests/models/test_embedding.py | 5 +- tests/models/test_gptq_marlin.py | 37 ++--- tests/models/test_gptq_marlin_24.py | 22 ++- tests/models/test_llava.py | 15 +- tests/models/test_marlin.py | 24 ++- tests/models/test_mistral.py | 8 +- tests/models/test_models.py | 16 +- tests/quantization/test_bitsandbytes.py | 124 +++++++-------- tests/quantization/test_compressed_tensors.py | 63 ++++---- tests/quantization/test_fp8.py | 10 +- tests/samplers/test_beam_search.py | 13 +- tests/samplers/test_ignore_eos.py | 15 +- tests/samplers/test_logits_processor.py | 82 +++++----- tests/samplers/test_logprobs.py | 30 ++-- tests/samplers/test_ranks.py | 38 +++-- tests/samplers/test_seeded_generate.py | 5 +- tests/tensorizer_loader/test_tensorizer.py | 106 ++++++------- 28 files changed, 455 insertions(+), 494 deletions(-) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 4561c8b1e143a..805b8883b9d94 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -46,12 +46,11 @@ def test_models( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner(model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - del vllm_model + with vllm_runner(model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 8f8494f355b78..48d6091282b89 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -43,17 +43,16 @@ def test_models( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=enable_chunked_prefill, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - ) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - del vllm_model + with vllm_runner( + model, + dtype=dtype, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=enable_chunked_prefill, + 
tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 58610e9e1016c..7f20b2d934942 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -46,17 +46,16 @@ def test_chunked_prefill_recompute( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=enable_chunked_prefill, - max_num_seqs=max_num_seqs, - ) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < - ARTIFICIAL_PREEMPTION_MAX_CNT) - del vllm_model + with vllm_runner( + model, + dtype=dtype, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=enable_chunked_prefill, + max_num_seqs=max_num_seqs, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < + ARTIFICIAL_PREEMPTION_MAX_CNT) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -84,17 +83,16 @@ def test_preemption( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner( - model, - dtype=dtype, - disable_log_stats=False, - ) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < - ARTIFICIAL_PREEMPTION_MAX_CNT) - total_preemption = ( - vllm_model.model.llm_engine.scheduler.num_cumulative_preemption) - del vllm_model + with vllm_runner( + model, + dtype=dtype, + disable_log_stats=False, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < + ARTIFICIAL_PREEMPTION_MAX_CNT) + total_preemption = ( + vllm_model.model.llm_engine.scheduler.num_cumulative_preemption) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -139,19 +137,18 @@ def test_swap( hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, max_tokens) - vllm_model = vllm_runner( - model, - dtype=dtype, - swap_space=10, - disable_log_stats=False, - ) - vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, - max_tokens) - assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < - ARTIFICIAL_PREEMPTION_MAX_CNT) - total_preemption = ( - vllm_model.model.llm_engine.scheduler.num_cumulative_preemption) - del vllm_model + with vllm_runner( + model, + dtype=dtype, + swap_space=10, + disable_log_stats=False, + ) as vllm_model: + vllm_outputs = vllm_model.generate_beam_search(example_prompts, + beam_width, max_tokens) + assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < + ARTIFICIAL_PREEMPTION_MAX_CNT) + total_preemption = ( + vllm_model.model.llm_engine.scheduler.num_cumulative_preemption) for i in range(len(example_prompts)): hf_output_ids, _ = hf_outputs[i] @@ -196,28 +193,28 @@ def test_swap_infeasible( decode_blocks = max_tokens 
// BLOCK_SIZE example_prompts = example_prompts[:1] - vllm_model = vllm_runner( - model, - dtype=dtype, - swap_space=10, - block_size=BLOCK_SIZE, - # Since beam search have more than 1 sequence, prefill + decode blocks - # are not enough to finish. - num_gpu_blocks_override=prefill_blocks + decode_blocks, - max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE, - ) - sampling_params = SamplingParams(n=beam_width, - use_beam_search=True, - temperature=0.0, - max_tokens=max_tokens, - ignore_eos=True) - req_outputs = vllm_model.model.generate( - example_prompts, - sampling_params=sampling_params, - ) - assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < - ARTIFICIAL_PREEMPTION_MAX_CNT) - del vllm_model + with vllm_runner( + model, + dtype=dtype, + swap_space=10, + block_size=BLOCK_SIZE, + # Since beam search have more than 1 sequence, prefill + + # decode blocks are not enough to finish. + num_gpu_blocks_override=prefill_blocks + decode_blocks, + max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE, + ) as vllm_model: + sampling_params = SamplingParams(n=beam_width, + use_beam_search=True, + temperature=0.0, + max_tokens=max_tokens, + ignore_eos=True) + req_outputs = vllm_model.model.generate( + example_prompts, + sampling_params=sampling_params, + ) + assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < + ARTIFICIAL_PREEMPTION_MAX_CNT) + # Verify the request is ignored and not hang. assert req_outputs[0].outputs[0].finish_reason == "length" @@ -236,25 +233,26 @@ def test_preemption_infeasible( BLOCK_SIZE = 16 prefill_blocks = 2 decode_blocks = max_tokens // BLOCK_SIZE - vllm_model = vllm_runner( - model, - dtype=dtype, - block_size=BLOCK_SIZE, - # Not enough gpu blocks to complete a single sequence. - # preemption should happen, and the sequence should be - # ignored instead of hanging forever. - num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, - max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - ) - sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) - req_outputs = vllm_model.model.generate( - example_prompts, - sampling_params=sampling_params, - ) + with vllm_runner( + model, + dtype=dtype, + block_size=BLOCK_SIZE, + # Not enough gpu blocks to complete a single sequence. + # preemption should happen, and the sequence should be + # ignored instead of hanging forever. + num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, + max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), + ) as vllm_model: + sampling_params = SamplingParams(max_tokens=max_tokens, + ignore_eos=True) + req_outputs = vllm_model.model.generate( + example_prompts, + sampling_params=sampling_params, + ) + + assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < + ARTIFICIAL_PREEMPTION_MAX_CNT) - assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < - ARTIFICIAL_PREEMPTION_MAX_CNT) - del vllm_model # Verify the request is ignored and not hang. 
for req_output in req_outputs: outputs = req_output.outputs diff --git a/tests/conftest.py b/tests/conftest.py index 5becf84a1245f..e0680467d78b9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -493,7 +493,10 @@ def encode(self, prompts: List[str]) -> List[List[float]]: outputs.append(embedding) return outputs - def __del__(self): + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): del self.model cleanup() diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 23e35482d6f16..eb423aef230cb 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -45,14 +45,13 @@ def test_models( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=2, - enforce_eager=enforce_eager, - distributed_executor_backend=distributed_executor_backend) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - del vllm_model + with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=2, + enforce_eager=enforce_eager, + distributed_executor_backend=distributed_executor_backend + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py index 9af48831db97f..4e4e468c4377a 100644 --- a/tests/distributed/test_chunked_prefill_distributed.py +++ b/tests/distributed/test_chunked_prefill_distributed.py @@ -48,17 +48,16 @@ def test_models( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=2, - max_num_seqs=max_num_seqs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - ) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - del vllm_model + with vllm_runner( + model, + dtype=dtype, + tensor_parallel_size=2, + max_num_seqs=max_num_seqs, + enable_chunked_prefill=enable_chunked_prefill, + max_num_batched_tokens=max_num_batched_tokens, + distributed_executor_backend=distributed_executor_backend, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py index 7b886507c04f2..b0bd6c4aa95d3 100644 --- a/tests/engine/test_stop_reason.py +++ b/tests/engine/test_stop_reason.py @@ -19,9 +19,8 @@ @pytest.fixture def vllm_model(vllm_runner): - vllm_model = vllm_runner(MODEL) - yield vllm_model - del vllm_model + with vllm_runner(MODEL) as vllm_model: + yield vllm_model def test_stop_reason(vllm_model, example_prompts): diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 6b747beb4b543..1584b85aeb064 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -10,7 +10,8 @@ @pytest.fixture(scope="session") def vllm_model(vllm_runner): - return vllm_runner(MODEL) + with vllm_runner(MODEL) as vllm_model: + yield vllm_model 
@pytest.mark.skip_global_cleanup diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index e0aa14f165c2d..c1164739eee31 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -23,23 +23,25 @@ def test_metric_counter_prompt_tokens( dtype: str, max_tokens: int, ) -> None: - vllm_model = vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4) - tokenizer = vllm_model.model.get_tokenizer() - prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] - # This test needs at least 2 prompts in a batch of different lengths to - # verify their token count is correct despite padding. - assert len(example_prompts) > 1, "at least 2 prompts are required" - assert prompt_token_counts[0] != prompt_token_counts[1], ( - "prompts of different lengths are required") - vllm_prompt_token_count = sum(prompt_token_counts) - - _ = vllm_model.generate_greedy(example_prompts, max_tokens) - stat_logger = vllm_model.model.llm_engine.stat_logger - metric_count = stat_logger.metrics.counter_prompt_tokens.labels( - **stat_logger.labels)._value.get() + with vllm_runner(model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.4) as vllm_model: + tokenizer = vllm_model.model.get_tokenizer() + prompt_token_counts = [ + len(tokenizer.encode(p)) for p in example_prompts + ] + # This test needs at least 2 prompts in a batch of different lengths to + # verify their token count is correct despite padding. + assert len(example_prompts) > 1, "at least 2 prompts are required" + assert prompt_token_counts[0] != prompt_token_counts[1], ( + "prompts of different lengths are required") + vllm_prompt_token_count = sum(prompt_token_counts) + + _ = vllm_model.generate_greedy(example_prompts, max_tokens) + stat_logger = vllm_model.model.llm_engine.stat_logger + metric_count = stat_logger.metrics.counter_prompt_tokens.labels( + **stat_logger.labels)._value.get() assert vllm_prompt_token_count == metric_count, ( f"prompt token count: {vllm_prompt_token_count!r}\n" @@ -56,22 +58,22 @@ def test_metric_counter_generation_tokens( dtype: str, max_tokens: int, ) -> None: - vllm_model = vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.model.get_tokenizer() - stat_logger = vllm_model.model.llm_engine.stat_logger - metric_count = stat_logger.metrics.counter_generation_tokens.labels( - **stat_logger.labels)._value.get() - vllm_generation_count = 0 - for i in range(len(example_prompts)): - vllm_output_ids, vllm_output_str = vllm_outputs[i] - prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. - # We're interested only in the count of the generation tokens. 
- vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) + with vllm_runner(model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.4) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + tokenizer = vllm_model.model.get_tokenizer() + stat_logger = vllm_model.model.llm_engine.stat_logger + metric_count = stat_logger.metrics.counter_generation_tokens.labels( + **stat_logger.labels)._value.get() + vllm_generation_count = 0 + for i in range(len(example_prompts)): + vllm_output_ids, vllm_output_str = vllm_outputs[i] + prompt_ids = tokenizer.encode(example_prompts[i]) + # vllm_output_ids contains both prompt tokens and generation tokens. + # We're interested only in the count of the generation tokens. + vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) assert vllm_generation_count == metric_count, ( f"generation token count: {vllm_generation_count!r}\n" @@ -85,15 +87,13 @@ def test_metric_counter_generation_tokens( [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]]) def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, served_model_name: List[str]) -> None: - vllm_model = vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.3, - served_model_name=served_model_name) - stat_logger = vllm_model.model.llm_engine.stat_logger - metrics_tag_content = stat_logger.labels["model_name"] - - del vllm_model + with vllm_runner(model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.3, + served_model_name=served_model_name) as vllm_model: + stat_logger = vllm_model.model.llm_engine.stat_logger + metrics_tag_content = stat_logger.labels["model_name"] if served_model_name is None or served_model_name == []: assert metrics_tag_content == model, ( diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 85d74f7f5b03d..c4ecf846e633c 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -82,10 +82,9 @@ def test_models( num_logprobs: int, ) -> None: - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, - max_tokens, - num_logprobs) + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) # loop through the prompts to compare against the ground truth generations for prompt_idx in range(len(example_prompts)): diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index 4af9824e0cbed..ef78283731775 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -37,9 +37,8 @@ def test_models( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - del vllm_model + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -57,9 +56,8 @@ def test_model_print( model: str, dtype: str, ) -> None: - vllm_model = vllm_runner(model, dtype=dtype) - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model) - del vllm_model + with vllm_runner(model, dtype=dtype) as vllm_model: + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) diff --git a/tests/models/test_embedding.py b/tests/models/test_embedding.py index 8ad9ac2d4f59d..6556998b68a74 100644 --- a/tests/models/test_embedding.py +++ b/tests/models/test_embedding.py @@ -31,9 +31,8 @@ def test_models( with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model: hf_outputs = hf_model.encode(example_prompts) - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.encode(example_prompts) - del vllm_model + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.encode(example_prompts) similarities = compare_embeddings(hf_outputs, vllm_outputs) all_similarities = torch.stack(similarities) diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 814471b47763d..e957450cce97b 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -70,32 +70,29 @@ def test_models( model_name, revision = model # Run marlin. - gptq_marlin_model = vllm_runner(model_name=model_name, - revision=revision, - dtype=dtype, - quantization="marlin", - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=1) - - gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs( - example_prompts[:-1], max_tokens, num_logprobs) - del gptq_marlin_model + with vllm_runner(model_name=model_name, + revision=revision, + dtype=dtype, + quantization="marlin", + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=1) as gptq_marlin_model: + + gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs( + example_prompts[:-1], max_tokens, num_logprobs) _ROPE_DICT.clear() # clear rope cache to avoid rope dtype error # Run gptq. # The naive gptq kernel doesn't support bf16 yet. # Here we always compare fp16/bf16 gpt marlin kernel # to fp16 gptq kernel. 
- gptq_model = vllm_runner(model_name=model_name, - revision=revision, - dtype="half", - quantization="gptq", - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=1) - gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts[:-1], - max_tokens, - num_logprobs) - del gptq_model + with vllm_runner(model_name=model_name, + revision=revision, + dtype="half", + quantization="gptq", + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=1) as gptq_model: + gptq_outputs = gptq_model.generate_greedy_logprobs( + example_prompts[:-1], max_tokens, num_logprobs) check_logprobs_close( outputs_0_lst=gptq_outputs, diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index cc35ee803ff01..195c3e5b5863e 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -61,20 +61,16 @@ def test_models( max_tokens: int, num_logprobs: int, ) -> None: - marlin_24_model = vllm_runner(model_pair.model_marlin, - dtype=dtype, - quantization="gptq_marlin_24") - marlin_24_outputs = marlin_24_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - del marlin_24_model + with vllm_runner(model_pair.model_marlin, + dtype=dtype, + quantization="gptq_marlin_24") as marlin_24_model: + marlin_24_outputs = marlin_24_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) - gptq_model = vllm_runner(model_pair.model_gptq, - dtype=dtype, - quantization="gptq") - gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts, - max_tokens, - num_logprobs) - del gptq_model + with vllm_runner(model_pair.model_gptq, dtype=dtype, + quantization="gptq") as gptq_model: + gptq_outputs = gptq_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) check_logprobs_close( outputs_0_lst=gptq_outputs, diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index 1e7ee52832b13..1f446362167a1 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -94,14 +94,13 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, for p in HF_IMAGE_PROMPTS ] - vllm_model = vllm_runner(model_id, - dtype=dtype, - enforce_eager=True, - **vlm_config.as_cli_args_dict()) - vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, - max_tokens, - images=vllm_images) - del vllm_model + with vllm_runner(model_id, + dtype=dtype, + enforce_eager=True, + **vlm_config.as_cli_args_dict()) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, + max_tokens, + images=vllm_images) for i in range(len(HF_IMAGE_PROMPTS)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 8520b26718bf5..761ba6aa4d592 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -59,20 +59,16 @@ def test_models( max_tokens: int, num_logprobs: int, ) -> None: - marlin_model = vllm_runner(model_pair.model_marlin, - dtype=dtype, - quantization="marlin") - marlin_outputs = marlin_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - del marlin_model - - gptq_model = vllm_runner(model_pair.model_gptq, - dtype=dtype, - quantization="gptq") - gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts, - max_tokens, - num_logprobs) - del gptq_model + with vllm_runner(model_pair.model_marlin, + dtype=dtype, + quantization="marlin") as marlin_model: + marlin_outputs = marlin_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + with 
vllm_runner(model_pair.model_gptq, dtype=dtype, + quantization="gptq") as gptq_model: + gptq_outputs = gptq_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) check_logprobs_close( outputs_0_lst=gptq_outputs, diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 178d23d3382a0..6acc057fe588c 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -30,11 +30,9 @@ def test_models( hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, - max_tokens, - num_logprobs) - del vllm_model + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, diff --git a/tests/models/test_models.py b/tests/models/test_models.py index a80ac5d946259..71238d6909a69 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -37,9 +37,8 @@ def test_models( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - del vllm_model + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -57,9 +56,8 @@ def test_model_print( model: str, dtype: str, ) -> None: - vllm_model = vllm_runner(model, dtype=dtype) - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) - del vllm_model + with vllm_runner(model, dtype=dtype) as vllm_model: + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. 
+ model_runner.model) diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index 4e9feb3c48148..31e938d15a1f6 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -16,65 +16,65 @@ capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(), reason='bitsandbytes is not supported on this GPU type.') def test_load_bnb_model(vllm_runner) -> None: - llm = vllm_runner('huggyllama/llama-7b', - quantization='bitsandbytes', - load_format='bitsandbytes', - enforce_eager=True) - - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model - - # check the weights in MLP & SelfAttention are quantized to torch.uint8 - qweight = model.model.layers[0].mlp.gate_up_proj.qweight - assert qweight.dtype == torch.uint8, ( - f'Expected gate_up_proj dtype torch.uint8 but got {qweight.dtype}') - - qweight = model.model.layers[0].mlp.down_proj.qweight - assert qweight.dtype == torch.uint8, ( - f'Expected down_proj dtype torch.uint8 but got {qweight.dtype}') - - qweight = model.model.layers[0].self_attn.o_proj.qweight - assert qweight.dtype == torch.uint8, ( - f'Expected o_proj dtype torch.uint8 but got {qweight.dtype}') - - qweight = model.model.layers[0].self_attn.qkv_proj.qweight - assert qweight.dtype == torch.uint8, ( - f'Expected qkv_proj dtype torch.uint8 but got {qweight.dtype}') - - # some weights should not be quantized - weight = model.lm_head.weight - assert weight.dtype != torch.uint8, ( - 'lm_head weight dtype should not be torch.uint8') - - weight = model.model.embed_tokens.weight - assert weight.dtype != torch.uint8, ( - 'embed_tokens weight dtype should not be torch.uint8') - - weight = model.model.layers[0].input_layernorm.weight - assert weight.dtype != torch.uint8, ( - 'input_layernorm weight dtype should not be torch.uint8') - - weight = model.model.layers[0].post_attention_layernorm.weight - assert weight.dtype != torch.uint8, ( - 'input_layernorm weight dtype should not be torch.uint8') - - # check the output of the model is expected - sampling_params = SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=8) - - prompts = ['That which does not kill us', 'To be or not to be,'] - expected_outputs = [ - 'That which does not kill us makes us stronger.', - 'To be or not to be, that is the question.' 
- ] - outputs = llm.generate(prompts, sampling_params=sampling_params) - - assert len(outputs) == len(prompts) - - for index in range(len(outputs)): - # compare the first line of the output - actual_output = outputs[index][1][0].split('\n', 1)[0] - expected_output = expected_outputs[index].split('\n', 1)[0] - assert actual_output == expected_output, ( - f'Expected: {expected_output}, but got: {actual_output}') + with vllm_runner('huggyllama/llama-7b', + quantization='bitsandbytes', + load_format='bitsandbytes', + enforce_eager=True) as llm: + + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + + # check the weights in MLP & SelfAttention are quantized to torch.uint8 + qweight = model.model.layers[0].mlp.gate_up_proj.qweight + assert qweight.dtype == torch.uint8, ( + f'Expected gate_up_proj dtype torch.uint8 but got {qweight.dtype}') + + qweight = model.model.layers[0].mlp.down_proj.qweight + assert qweight.dtype == torch.uint8, ( + f'Expected down_proj dtype torch.uint8 but got {qweight.dtype}') + + qweight = model.model.layers[0].self_attn.o_proj.qweight + assert qweight.dtype == torch.uint8, ( + f'Expected o_proj dtype torch.uint8 but got {qweight.dtype}') + + qweight = model.model.layers[0].self_attn.qkv_proj.qweight + assert qweight.dtype == torch.uint8, ( + f'Expected qkv_proj dtype torch.uint8 but got {qweight.dtype}') + + # some weights should not be quantized + weight = model.lm_head.weight + assert weight.dtype != torch.uint8, ( + 'lm_head weight dtype should not be torch.uint8') + + weight = model.model.embed_tokens.weight + assert weight.dtype != torch.uint8, ( + 'embed_tokens weight dtype should not be torch.uint8') + + weight = model.model.layers[0].input_layernorm.weight + assert weight.dtype != torch.uint8, ( + 'input_layernorm weight dtype should not be torch.uint8') + + weight = model.model.layers[0].post_attention_layernorm.weight + assert weight.dtype != torch.uint8, ( + 'input_layernorm weight dtype should not be torch.uint8') + + # check the output of the model is expected + sampling_params = SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=8) + + prompts = ['That which does not kill us', 'To be or not to be,'] + expected_outputs = [ + 'That which does not kill us makes us stronger.', + 'To be or not to be, that is the question.' 
+ ] + outputs = llm.generate(prompts, sampling_params=sampling_params) + + assert len(outputs) == len(prompts) + + for index in range(len(outputs)): + # compare the first line of the output + actual_output = outputs[index][1][0].split('\n', 1)[0] + expected_output = expected_outputs[index].split('\n', 1)[0] + assert actual_output == expected_output, ( + f'Expected: {expected_output}, but got: {actual_output}') diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 8b48f418fe49f..9d94d2ecfb222 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -12,42 +12,45 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner): model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed" - llm = vllm_runner(model_path, quantization="sparseml", enforce_eager=True) - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model - layer = model.model.layers[0] + with vllm_runner(model_path, quantization="sparseml", + enforce_eager=True) as llm: + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - o_proj = layer.self_attn.o_proj - gate_up_proj = layer.mlp.gate_up_proj - down_proj = layer.mlp.down_proj + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(gate_up_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(down_proj.quant_method, + CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor) - assert qkv_proj.weight.dtype is torch.int8 - assert o_proj.weight.dtype is torch.int8 - assert gate_up_proj.weight.dtype is torch.int8 + assert qkv_proj.weight.dtype is torch.int8 + assert o_proj.weight.dtype is torch.int8 + assert gate_up_proj.weight.dtype is torch.int8 - assert qkv_proj.weight_scale.shard_splitter is not None - assert qkv_proj.weight_scale.logical_widths is not None - assert qkv_proj.input_scale.dtype is torch.float32 + assert qkv_proj.weight_scale.shard_splitter is not None + assert qkv_proj.weight_scale.logical_widths is not None + assert qkv_proj.input_scale.dtype is torch.float32 def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): model_path = "nm-testing/tinyllama-one-shot-dynamic-test" - llm = vllm_runner(model_path, - quantization="sparseml", - enforce_eager=True, - dtype=torch.float16) - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model - layer = model.model.layers[0] - - qkv_proj = layer.self_attn.qkv_proj - - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken) - assert qkv_proj.weight.dtype is torch.int8 + with vllm_runner(model_path, + quantization="sparseml", + enforce_eager=True, + dtype=torch.float16) as llm: + 
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken) + assert qkv_proj.weight.dtype is torch.int8 diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 607544a1c8394..fccce7f7b59a7 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -16,9 +16,9 @@ capability < QUANTIZATION_METHODS["fp8"].get_min_capability(), reason="FP8 is not supported on this GPU type.") def test_load_fp16_model(vllm_runner) -> None: - llm = vllm_runner("facebook/opt-125m", quantization="fp8") + with vllm_runner("facebook/opt-125m", quantization="fp8") as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model - fc1 = model.model.decoder.layers[0].fc1 - assert isinstance(fc1.quant_method, Fp8LinearMethod) - assert fc1.weight.dtype == torch.float8_e4m3fn + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + fc1 = model.model.decoder.layers[0].fc1 + assert isinstance(fc1.quant_method, Fp8LinearMethod) + assert fc1.weight.dtype == torch.float8_e4m3fn diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 2e373cb87cd19..64f3ce94b7a83 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -2,10 +2,8 @@ Run `pytest tests/samplers/test_beam_search.py`. """ -import gc import pytest -import torch # FIXME(zhuohan): The test can not pass if we: # 1. Increase max_tokens to 256. @@ -34,14 +32,9 @@ def test_beam_search_single_input( hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, max_tokens) - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, - max_tokens) - del vllm_model - # NOTE(woosuk): For some reason, the following GC is required to avoid - # GPU OOM errors in the following tests using `vllm_runner`. 
- gc.collect() - torch.cuda.empty_cache() + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_beam_search(example_prompts, + beam_width, max_tokens) for i in range(len(example_prompts)): hf_output_ids, _ = hf_outputs[i] diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 67b5168bea0e6..dc2482d85a91f 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -22,11 +22,12 @@ def test_ignore_eos( dtype: str, max_tokens: int, ) -> None: - vllm_model = vllm_runner(model, dtype=dtype) - sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) + with vllm_runner(model, dtype=dtype) as vllm_model: + sampling_params = SamplingParams(max_tokens=max_tokens, + ignore_eos=True) - for prompt in example_prompts: - ignore_eos_output = vllm_model.model.generate( - prompt, sampling_params=sampling_params) - output_length = len(ignore_eos_output[0].outputs[0].token_ids) - assert output_length == max_tokens + for prompt in example_prompts: + ignore_eos_output = vllm_model.model.generate( + prompt, sampling_params=sampling_params) + output_length = len(ignore_eos_output[0].outputs[0].token_ids) + assert output_length == max_tokens diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 0ccbabfff6403..2979470120710 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -14,46 +14,46 @@ def test_logits_processor_force_generate( model: str, dtype: str, ) -> None: - vllm_model = vllm_runner(model, dtype=dtype) - tokenizer = vllm_model.model.get_tokenizer() - repeat_times = 2 - enforced_answers = " vLLM" - vllm_token_ids = tokenizer.encode(enforced_answers, - add_special_tokens=False) - max_tokens = len(vllm_token_ids) * repeat_times - - def pick_vllm(token_ids, logits): - token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)] - logits[token_id] = torch.finfo(logits.dtype).max - return logits - - params_with_logprobs = SamplingParams( - logits_processors=[pick_vllm], - prompt_logprobs=3, - max_tokens=max_tokens, - ) - - # test logits_processors when prompt_logprobs is not None - vllm_model.model._add_request( - example_prompts[0], - params=params_with_logprobs, - ) - - # test prompt_logprobs is not None - vllm_model.model._add_request( - example_prompts[1], - params=SamplingParams( + with vllm_runner(model, dtype=dtype) as vllm_model: + tokenizer = vllm_model.model.get_tokenizer() + repeat_times = 2 + enforced_answers = " vLLM" + vllm_token_ids = tokenizer.encode(enforced_answers, + add_special_tokens=False) + max_tokens = len(vllm_token_ids) * repeat_times + + def pick_vllm(token_ids, logits): + token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)] + logits[token_id] = torch.finfo(logits.dtype).max + return logits + + params_with_logprobs = SamplingParams( + logits_processors=[pick_vllm], prompt_logprobs=3, max_tokens=max_tokens, - ), - ) - - # test grouped requests - vllm_model.model._add_request( - example_prompts[2], - params=SamplingParams(max_tokens=max_tokens), - ) - - outputs = vllm_model.model._run_engine(use_tqdm=False) - - assert outputs[0].outputs[0].text == enforced_answers * repeat_times + ) + + # test logits_processors when prompt_logprobs is not None + vllm_model.model._add_request( + example_prompts[0], + params=params_with_logprobs, + ) + + # test prompt_logprobs is not None + vllm_model.model._add_request( + example_prompts[1], + params=SamplingParams( + 
prompt_logprobs=3, + max_tokens=max_tokens, + ), + ) + + # test grouped requests + vllm_model.model._add_request( + example_prompts[2], + params=SamplingParams(max_tokens=max_tokens), + ) + + outputs = vllm_model.model._run_engine(use_tqdm=False) + + assert outputs[0].outputs[0].text == enforced_answers * repeat_times diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 25d59391ca983..233540cdc391f 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -38,21 +38,21 @@ def test_get_prompt_logprobs( max_tokens=max_tokens, ) - vllm_model = vllm_runner( - model, - dtype=dtype, - max_logprobs=num_top_logprobs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs, - ) - vllm_sampling_params = SamplingParams(max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_top_logprobs, - temperature=0.0, - detokenize=detokenize) - vllm_results = vllm_model.model.generate( - example_prompts, sampling_params=vllm_sampling_params) + with vllm_runner( + model, + dtype=dtype, + max_logprobs=num_top_logprobs, + enable_chunked_prefill=enable_chunked_prefill, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs, + ) as vllm_model: + vllm_sampling_params = SamplingParams(max_tokens=max_tokens, + logprobs=num_top_logprobs, + prompt_logprobs=num_top_logprobs, + temperature=0.0, + detokenize=detokenize) + vllm_results = vllm_model.model.generate( + example_prompts, sampling_params=vllm_sampling_params) # Test whether logprobs are included in the results. for result in vllm_results: diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index 5e93238d709ec..ed2fee1ae252e 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -17,16 +17,27 @@ def test_ranks( num_top_logprobs = 5 num_prompt_logprobs = 5 - vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs) - - ## Test greedy logprobs ranks - vllm_sampling_params = SamplingParams(temperature=0.0, - top_p=1.0, - max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_prompt_logprobs) - vllm_results = vllm_model.generate_w_logprobs(example_prompts, - vllm_sampling_params) + with vllm_runner(model, dtype=dtype, + max_logprobs=num_top_logprobs) as vllm_model: + + ## Test greedy logprobs ranks + vllm_sampling_params = SamplingParams( + temperature=0.0, + top_p=1.0, + max_tokens=max_tokens, + logprobs=num_top_logprobs, + prompt_logprobs=num_prompt_logprobs) + vllm_results = vllm_model.generate_w_logprobs(example_prompts, + vllm_sampling_params) + + ## Test non-greedy logprobs ranks + sampling_params = SamplingParams(temperature=1.0, + top_p=1.0, + max_tokens=max_tokens, + logprobs=num_top_logprobs, + prompt_logprobs=num_prompt_logprobs) + res = vllm_model.generate_w_logprobs(example_prompts, sampling_params) + for result in vllm_results: assert result[2] is not None assert len(result[2]) == len(result[0]) @@ -35,13 +46,6 @@ def test_ranks( assert token in logprobs assert logprobs[token].rank == 1 - ## Test non-greedy logprobs ranks - sampling_params = SamplingParams(temperature=1.0, - top_p=1.0, - max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_prompt_logprobs) - res = vllm_model.generate_w_logprobs(example_prompts, sampling_params) for result in res: assert result[2] is not None assert len(result[2]) == len(result[0]) diff --git a/tests/samplers/test_seeded_generate.py 
b/tests/samplers/test_seeded_generate.py index fef5ff3fb9e8e..88067f19c8f07 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -17,9 +17,8 @@ @pytest.fixture def vllm_model(vllm_runner): - vllm_model = vllm_runner(MODEL, dtype="half") - yield vllm_model - del vllm_model + with vllm_runner(MODEL, dtype="half") as vllm_model: + yield vllm_model @pytest.mark.parametrize("seed", RANDOM_SEEDS) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 648de4dbdccba..b558bfc6df21b 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -1,4 +1,3 @@ -import gc import json import os import subprocess @@ -7,7 +6,6 @@ import openai import pytest import ray -import torch from vllm import SamplingParams # yapf: disable @@ -71,47 +69,43 @@ def test_can_deserialize_s3(vllm_runner): model_ref = "EleutherAI/pythia-1.4b" tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors" - loaded_hf_model = vllm_runner(model_ref, + with vllm_runner(model_ref, load_format="tensorizer", model_loader_extra_config=TensorizerConfig( tensorizer_uri=tensorized_path, num_readers=1, s3_endpoint="object.ord1.coreweave.com", - )) + )) as loaded_hf_model: - deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params) + deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params) # noqa: E501 - assert deserialized_outputs + assert deserialized_outputs @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): - vllm_model = vllm_runner(model_ref) - model_path = tmp_path / (model_ref + ".tensors") - key_path = tmp_path / (model_ref + ".key") - outputs = vllm_model.generate(prompts, sampling_params) - - config_for_serializing = TensorizerConfig(tensorizer_uri=model_path) - serialize_vllm_model(vllm_model.model.llm_engine, - config_for_serializing, - encryption_key_path=key_path) + with vllm_runner(model_ref) as vllm_model: + model_path = tmp_path / (model_ref + ".tensors") + key_path = tmp_path / (model_ref + ".key") + outputs = vllm_model.generate(prompts, sampling_params) - del vllm_model - gc.collect() - torch.cuda.empty_cache() + config_for_serializing = TensorizerConfig(tensorizer_uri=model_path) + serialize_vllm_model(vllm_model.model.llm_engine, + config_for_serializing, + encryption_key_path=key_path) config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) - loaded_vllm_model = vllm_runner( + with vllm_runner( model_ref, load_format="tensorizer", - model_loader_extra_config=config_for_deserializing) + model_loader_extra_config=config_for_deserializing) as loaded_vllm_model: # noqa: E501 - deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) + deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) # noqa: E501 - assert outputs == deserialized_outputs + assert outputs == deserialized_outputs def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, @@ -124,17 +118,17 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, serializer = TensorSerializer(stream) serializer.write_module(hf_model.model) - loaded_hf_model = vllm_runner(model_ref, + with vllm_runner(model_ref, load_format="tensorizer", model_loader_extra_config=TensorizerConfig( tensorizer_uri=model_path, num_readers=1, - )) + )) as loaded_hf_model: - 
deserialized_outputs = loaded_hf_model.generate_greedy( - prompts, max_tokens=max_tokens) + deserialized_outputs = loaded_hf_model.generate_greedy( + prompts, max_tokens=max_tokens) - assert outputs == deserialized_outputs + assert outputs == deserialized_outputs def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): @@ -148,16 +142,13 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): test_prompts = create_test_prompts(lora_path) # Serialize model before deserializing and binding LoRA adapters - vllm_model = vllm_runner(model_ref, ) - model_path = tmp_path / (model_ref + ".tensors") + with vllm_runner(model_ref, ) as vllm_model: + model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(vllm_model.model.llm_engine, - TensorizerConfig(tensorizer_uri=model_path)) + serialize_vllm_model(vllm_model.model.llm_engine, + TensorizerConfig(tensorizer_uri=model_path)) - del vllm_model - gc.collect() - torch.cuda.empty_cache() - loaded_vllm_model = vllm_runner( + with vllm_runner( model_ref, load_format="tensorizer", model_loader_extra_config=TensorizerConfig( @@ -170,10 +161,10 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): max_cpu_loras=2, max_num_seqs=50, max_model_len=1000, - ) - process_requests(loaded_vllm_model.model.llm_engine, test_prompts) + ) as loaded_vllm_model: + process_requests(loaded_vllm_model.model.llm_engine, test_prompts) - assert loaded_vllm_model + assert loaded_vllm_model def test_load_without_tensorizer_load_format(vllm_runner): @@ -186,19 +177,15 @@ def test_load_without_tensorizer_load_format(vllm_runner): @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): ## Serialize model - vllm_model = vllm_runner(model_ref, ) - model_path = tmp_path / (model_ref + ".tensors") - - serialize_vllm_model(vllm_model.model.llm_engine, - TensorizerConfig(tensorizer_uri=model_path)) + with vllm_runner(model_ref, ) as vllm_model: + model_path = tmp_path / (model_ref + ".tensors") - model_loader_extra_config = { - "tensorizer_uri": str(model_path), - } + serialize_vllm_model(vllm_model.model.llm_engine, + TensorizerConfig(tensorizer_uri=model_path)) - del vllm_model - gc.collect() - torch.cuda.empty_cache() + model_loader_extra_config = { + "tensorizer_uri": str(model_path), + } ## Start OpenAI API server openai_args = [ @@ -260,18 +247,15 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): model_path = tmp_path / (model_ref + ".tensors") config = TensorizerConfig(tensorizer_uri=str(model_path)) - vllm_model = vllm_runner(model_ref) - outputs = vllm_model.generate(prompts, sampling_params) - serialize_vllm_model(vllm_model.model.llm_engine, config) + with vllm_runner(model_ref) as vllm_model: + outputs = vllm_model.generate(prompts, sampling_params) + serialize_vllm_model(vllm_model.model.llm_engine, config) - assert is_vllm_tensorized(config) - del vllm_model - gc.collect() - torch.cuda.empty_cache() + assert is_vllm_tensorized(config) - loaded_vllm_model = vllm_runner(model_ref, - load_format="tensorizer", - model_loader_extra_config=config) - deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) + with vllm_runner(model_ref, + load_format="tensorizer", + model_loader_extra_config=config) as loaded_vllm_model: + deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) # noqa: E501 - assert outputs == deserialized_outputs + assert outputs == deserialized_outputs From 
c09dade2a263b6f684d2fbf390c9c1c64761e953 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 8 Jun 2024 13:54:05 -0400 Subject: [PATCH 21/30] [Misc][Breaking] Change FP8 checkpoint format from act_scale -> input_scale (#5353) --- .../model_executor/layers/quantization/fp8.py | 30 +++++++++---------- vllm/model_executor/models/mixtral.py | 16 +++++----- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index de94bad7c38e6..0cf2bd927a800 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -171,10 +171,10 @@ def create_weights( output_partition_sizes=output_partition_sizes, **extra_weight_attrs) - # ACTIVATION SCALE + # INPUT ACTIVATION SCALE if self.quant_config.activation_scheme == "static": self._create_scale_param( - scale_name="act_scale", + scale_name="input_scale", layer=layer, output_partition_sizes=output_partition_sizes, **extra_weight_attrs) @@ -207,7 +207,7 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.weight = Parameter(qweight.t(), requires_grad=False) layer.weight_scale = Parameter(weight_scale, requires_grad=False) layer.logical_widths = None - layer.act_scale = None + layer.input_scale = None return # If checkpoint is fp8, requantize the separately quantized logical @@ -232,18 +232,18 @@ def process_weights_after_loading(self, layer: Module) -> None: weight = layer.weight layer.weight = Parameter(weight.t(), requires_grad=False) - # ACT_SCALE + # INPUT ACTIVATION SCALE # Dynamic: set to None (required input to ops.scaled_fp8_quant). - # Static: set to max of the act_scales (since they are equal). + # Static: set to max of the input_scales (since they are equal). if self.quant_config.activation_scheme == "dynamic": - layer.act_scale = None + layer.input_scale = None elif self.quant_config.activation_scheme == "static": - if not all_close_1d(layer.act_scale): + if not all_close_1d(layer.input_scale): raise ValueError( - "All the act_scales for the logical weights of a layer " - f"must be equal. But got {layer.act_scale}") - layer.act_scale = Parameter(layer.act_scale.max(), - requires_grad=False) + "All the input_scales for the logical weights of a " + f"layer must be equal. But got {layer.input_scale}") + layer.input_scale = Parameter(layer.input_scale.max(), + requires_grad=False) else: raise ValueError( f"Unknown scheme {self.quant_config.activation_scheme}") @@ -254,11 +254,11 @@ def apply(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: # ops.scaled_fp8_quant supports both dynamic and static quant. - # If dynamic, layer.act_scale is None and x_scale computed from x. - # If static, layer.act_scale is scalar and x_scale set to act_scale. + # If dynamic, layer.input_scale is None and x_scale computed from x. + # If static, layer.input_scale is scalar and x_scale is input_scale. 
if bias is None and self.cutlass_fp8_supported: - qinput, x_scale = ops.scaled_fp8_quant(x, layer.act_scale) + qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale) # Fused GEMM_DQ output = ops.cutlass_scaled_mm_dq( @@ -271,7 +271,7 @@ def apply(self, else: qinput, x_scale = ops.scaled_fp8_quant(x, - layer.act_scale, + layer.input_scale, batch_dim_padding=17) # Fused GEMM_DQ -- note we padded the input above because diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0f82549780ba4..3faf54d292b99 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -147,7 +147,7 @@ def __init__( "weight_loader": self.weight_loader, }) - # ACT_SCALE (for fp8) + # INPUT_SCALE (for fp8) if quant_config.activation_scheme == "static": if not quant_config.is_checkpoint_fp8_serialized: raise ValueError( @@ -182,11 +182,11 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, param_data[expert_id, :, :] = loaded_weight[:, shard] # Loading scales - if "act_scale" in weight_name or "w2.weight_scale" in weight_name: + if "input_scale" in weight_name or "w2.weight_scale" in weight_name: if param_data[expert_id] != 1 and (param_data[expert_id] - loaded_weight).abs() > 1e-5: raise ValueError( - "act_scales of w1 and w3 of a layer " + "input_scales of w1 and w3 of a layer " f"must be equal. But got {param_data[expert_id]} " f"vs. {loaded_weight}") param_data[expert_id] = loaded_weight @@ -225,9 +225,9 @@ def process_weights_after_loading(self): self.w2_weight = nn.Parameter(w2_weight, requires_grad=False) else: - # If checkpoint is fp8 + static, cleanup act_scales. - # Since state_dict has an act_scale per expert but our kernels - # are passed one act_scale shared across all experts. + # If checkpoint is fp8 + static, cleanup input_scales. + # Since state_dict has an input_scale per expert but our kernels + # are passed one input_scale shared across all experts. if self.quant_config.activation_scheme == "static": if self.a13_scale is None or self.a2_scale is None: raise ValueError( @@ -237,7 +237,7 @@ def process_weights_after_loading(self): if (not all_close_1d(self.a13_scale) or not all_close_1d(self.a2_scale)): print_warning_once( - "Found act_scales that are not equal for " + "Found input_scales that are not equal for " "fp8 MoE layer. Using the maximum across experts " "for each layer. 
") @@ -576,7 +576,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # These are the activation scales for the experts # (param_name, weight_name, expert_id) ("a13_scale" if weight_name in ["w1", "w3"] else "a2_scale", - f"experts.{expert_id}.{weight_name}.act_scale", expert_id) + f"experts.{expert_id}.{weight_name}.input_scale", expert_id) for expert_id in range(self.config.num_local_experts) for weight_name in ["w1", "w2", "w3"] ] From 0373e1837e1a85c595fa9fc67c775bc6cbe105a2 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 8 Jun 2024 19:14:43 -0700 Subject: [PATCH 22/30] [Core][CUDA Graph] add output buffer for cudagraph (#5074) [Core][CUDA Graph] add output buffer for cudagraph to reduce memory footprint (#5074) --- vllm/worker/model_runner.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index c59288b4f73c6..7879a5de5b7bd 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,3 +1,4 @@ +import gc import time import warnings from collections import defaultdict @@ -894,6 +895,10 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: seq_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda() block_tables = torch.from_numpy(self.graph_block_tables).cuda() + # Prepare buffer for outputs. These will be reused for all batch sizes. + # It will be filled after the first graph capture. + hidden_states: Optional[torch.Tensor] = None + graph_batch_size = _get_graph_batch_size( self.scheduler_config.max_num_seqs) batch_size_capture_list = [ @@ -930,9 +935,11 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: self.set_active_loras(set(), lora_mapping) graph_runner = CUDAGraphRunner(self.model) - graph_runner.capture( + hidden_states = graph_runner.capture( input_tokens[:batch_size], input_positions[:batch_size], + hidden_states[:batch_size] + if hidden_states is not None else None, kv_caches, attn_metadata, memory_pool=self.graph_memory_pool, @@ -969,12 +976,13 @@ def capture( self, input_ids: torch.Tensor, positions: torch.Tensor, + hidden_states: Optional[torch.Tensor], kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, memory_pool: Optional[Tuple[int, int]], stream: torch.cuda.Stream, **kwargs, - ) -> None: + ) -> torch.Tensor: assert self._graph is None # Run the model a few times without capturing the graph. # This is to make sure that the captured graph does not include the @@ -993,13 +1001,21 @@ def capture( # Capture the graph. self._graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream): - hidden_states = self.model( + output_hidden_states = self.model( input_ids, positions, kv_caches, attn_metadata, **kwargs, ) + if hidden_states is not None: + hidden_states.copy_(output_hidden_states) + else: + hidden_states = output_hidden_states + del output_hidden_states + # make sure `output_hidden_states` is deleted + # in the graph's memory pool + gc.collect() torch.cuda.synchronize() # Save the input and output buffers. 
@@ -1012,7 +1028,7 @@ def capture( "block_tables": attn_metadata.decode_metadata.block_tables, } self.output_buffers = {"hidden_states": hidden_states} - return + return hidden_states def forward( self, From 5d7e3d0176e0dbcf144c64b7d14d996c55e36c50 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 8 Jun 2024 20:50:14 -0700 Subject: [PATCH 23/30] [mis][ci/test] fix flaky test in test_sharded_state_loader.py (#5361) [mis][ci/test] fix flaky test in tests/test_sharded_state_loader.py (#5361) --- tests/test_sharded_state_loader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index 022fb36b346f4..de79c3b945d4d 100644 --- a/tests/test_sharded_state_loader.py +++ b/tests/test_sharded_state_loader.py @@ -39,7 +39,8 @@ def test_filter_subtensors(): filtered_state_dict = ShardedStateLoader._filter_subtensors(state_dict) assert tuple(filtered_state_dict.keys()) == ("a", "b", "c") for key, tensor in filtered_state_dict.items(): - assert tensor.equal(state_dict[key]) + # NOTE: don't use `euqal` here, as the tensor might contain NaNs + assert tensor is state_dict[key] @pytest.fixture(scope="module") From 5467ac319636245ded483b31967ac43e543c5fa3 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Sun, 9 Jun 2024 16:23:30 -0400 Subject: [PATCH 24/30] [Kernel][Misc] Use TORCH_LIBRARY instead of PYBIND11_MODULE for custom ops (#5047) --- CMakeLists.txt | 22 +- Dockerfile.rocm | 6 +- cmake/cpu_extension.cmake | 12 +- cmake/utils.cmake | 11 +- csrc/activation_kernels.cu | 2 +- csrc/attention/attention_kernels.cu | 34 ++- csrc/cache.h | 14 +- csrc/cache_kernels.cu | 13 +- csrc/cpu/attention.cpp | 26 +- csrc/cpu/cache.cpp | 13 +- csrc/cpu/cpu_types.hpp | 2 +- csrc/cpu/layernorm.cpp | 4 +- csrc/cpu/pos_encoding.cpp | 2 +- csrc/cpu/pybind.cpp | 43 --- csrc/cpu/torch_bindings.cpp | 106 +++++++ csrc/cuda_utils.h | 6 +- csrc/cuda_utils_kernels.cu | 6 +- csrc/custom_all_reduce.cu | 22 +- csrc/dispatch_utils.h | 2 +- csrc/layernorm_kernels.cu | 6 +- csrc/moe/moe_ops.cpp | 8 - csrc/moe/moe_ops.h | 2 +- csrc/moe/topk_softmax_kernels.cu | 2 +- csrc/moe/torch_bindings.cpp | 12 + csrc/moe_align_block_size_kernels.cu | 6 +- csrc/ops.h | 68 +++-- csrc/pos_encoding_kernels.cu | 12 +- csrc/punica/punica_ops.cu | 6 +- csrc/punica/punica_ops.h | 6 +- csrc/punica/punica_pybind.cpp | 13 - csrc/punica/torch_bindings.cpp | 18 ++ csrc/pybind.cpp | 114 ------- csrc/quantization/aqlm/gemm_kernels.cu | 2 +- csrc/quantization/awq/gemm_kernels.cu | 8 +- .../compressed_tensors/int8_quant_kernels.cu | 2 +- .../cutlass_w8a8/scaled_mm_dq_c2x.cu | 2 +- .../cutlass_w8a8/scaled_mm_dq_c3x.cu | 2 +- .../cutlass_w8a8/scaled_mm_dq_entry.cu | 2 +- csrc/quantization/fp8/common.cu | 2 +- csrc/quantization/gptq/q_gemm.cu | 6 +- csrc/quantization/gptq_marlin/gptq_marlin.cu | 2 +- csrc/quantization/gptq_marlin/gptq_marlin.cuh | 2 +- .../marlin/dense/marlin_cuda_kernel.cu | 2 +- .../marlin/sparse/marlin_24_cuda_kernel.cu | 2 +- .../squeezellm/quant_cuda_kernel.cu | 1 - csrc/registration.h | 22 ++ csrc/torch_bindings.cpp | 283 ++++++++++++++++++ setup.py | 2 +- tests/kernels/test_int8_quant.py | 7 +- vllm/_custom_ops.py | 217 ++++++++++---- vllm/attention/backends/flash_attn.py | 10 +- .../device_communicators/custom_all_reduce.py | 34 ++- vllm/lora/punica.py | 45 ++- .../layers/fused_moe/fused_moe.py | 3 +- vllm/utils.py | 7 +- 55 files changed, 833 insertions(+), 451 deletions(-) delete mode 100644 csrc/cpu/pybind.cpp 
create mode 100644 csrc/cpu/torch_bindings.cpp delete mode 100644 csrc/moe/moe_ops.cpp create mode 100644 csrc/moe/torch_bindings.cpp delete mode 100644 csrc/punica/punica_pybind.cpp create mode 100644 csrc/punica/torch_bindings.cpp delete mode 100644 csrc/pybind.cpp create mode 100644 csrc/registration.h create mode 100644 csrc/torch_bindings.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index a197063f33601..ad6736c47f459 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,19 +66,6 @@ endif() # find_package(Torch REQUIRED) -# -# Normally `torch.utils.cpp_extension.CUDAExtension` would add -# `libtorch_python.so` for linking against an extension. Torch's cmake -# configuration does not include this library (presumably since the cmake -# config is used for standalone C++ binaries that link against torch). -# The `libtorch_python.so` library defines some of the glue code between -# torch/python via pybind and is required by VLLM extensions for this -# reason. So, add it by manually with `find_library` using torch's -# installed library path. -# -find_library(torch_python_LIBRARY torch_python PATHS - "${TORCH_INSTALL_PREFIX}/lib") - # # Forward the non-CUDA device extensions to external CMake scripts. # @@ -171,7 +158,7 @@ set(VLLM_EXT_SRC "csrc/quantization/fp8/common.cu" "csrc/cuda_utils_kernels.cu" "csrc/moe_align_block_size_kernels.cu" - "csrc/pybind.cpp") + "csrc/torch_bindings.cpp") if(VLLM_GPU_LANG STREQUAL "CUDA") include(FetchContent) @@ -218,6 +205,7 @@ define_gpu_extension_target( COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} + USE_SABI 3 WITH_SOABI) # @@ -225,7 +213,7 @@ define_gpu_extension_target( # set(VLLM_MOE_EXT_SRC - "csrc/moe/moe_ops.cpp" + "csrc/moe/torch_bindings.cpp" "csrc/moe/topk_softmax_kernels.cu") define_gpu_extension_target( @@ -235,6 +223,7 @@ define_gpu_extension_target( SOURCES ${VLLM_MOE_EXT_SRC} COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} + USE_SABI 3 WITH_SOABI) # @@ -249,7 +238,7 @@ set(VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu" "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu" "csrc/punica/punica_ops.cu" - "csrc/punica/punica_pybind.cpp") + "csrc/punica/torch_bindings.cpp") # # Copy GPU compilation flags+update for punica @@ -286,6 +275,7 @@ if (VLLM_PUNICA_GPU_ARCHES) SOURCES ${VLLM_PUNICA_EXT_SRC} COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS} ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES} + USE_SABI 3 WITH_SOABI) else() message(WARNING "Unable to create _punica_C target because none of the " diff --git a/Dockerfile.rocm b/Dockerfile.rocm index e30a2aaf30209..954958df88fc0 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -106,9 +106,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \ pip install -U -r requirements-rocm.txt \ && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \ && python3 setup.py install \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.cpython-39-x86_64-linux-gnu.so vllm/ \ + && cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \ + && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \ + && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \ && cd .. 
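From here the commit replaces the pybind11 glue with TORCH_LIBRARY bindings: each kernel is declared once with a schema string, given a per-backend implementation, and then reached from Python through torch.ops rather than a module attribute, which is what lets the build above target the stable ABI (USE_SABI 3 and the .abi3.so suffixes). The short sketch below mirrors that registration flow using torch.library, the Python counterpart of the same machinery; the demo_ops namespace and the scale_add op are invented for illustration and are not part of vLLM.

import torch
from torch.library import Library

demo_lib = Library("demo_ops", "DEF")      # roughly TORCH_LIBRARY(demo_ops, ops) in C++

# Schema string in the same style as the new bindings, e.g.
#   ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()")
demo_lib.define("scale_add(Tensor! out, Tensor input, float alpha) -> ()")

def scale_add_cpu(out: torch.Tensor, input: torch.Tensor, alpha: float) -> None:
    # In-place reference kernel standing in for a real C++/CUDA implementation.
    out.copy_(input * alpha)

demo_lib.impl("scale_add", scale_add_cpu, "CPU")   # like ops.impl("scale_add", torch::kCPU, &scale_add)

x = torch.arange(4, dtype=torch.float32)
out = torch.empty_like(x)
torch.ops.demo_ops.scale_add(out, x, 2.0)          # dispatched through the shared op registry
print(out)                                          # tensor([0., 2., 4., 6.])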
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 0cf37769a6960..61d4843838ba0 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -12,7 +12,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc") # # Check the compile flags # -list(APPEND CXX_COMPILE_FLAGS +list(APPEND CXX_COMPILE_FLAGS "-fopenmp" "-DVLLM_CPU_EXTENSION") @@ -44,8 +44,8 @@ if (AVX512_FOUND) find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND) if (AVX512BF16_FOUND OR ENABLE_AVX512BF16) - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND - CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16") else() message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3") @@ -73,7 +73,7 @@ set(VLLM_EXT_SRC "csrc/cpu/cache.cpp" "csrc/cpu/layernorm.cpp" "csrc/cpu/pos_encoding.cpp" - "csrc/cpu/pybind.cpp") + "csrc/cpu/torch_bindings.cpp") define_gpu_extension_target( _C @@ -81,10 +81,10 @@ define_gpu_extension_target( LANGUAGE CXX SOURCES ${VLLM_EXT_SRC} COMPILE_FLAGS ${CXX_COMPILE_FLAGS} - WITH_SOABI + USE_SABI 3 + WITH_SOABI ) add_custom_target(default) message(STATUS "Enabling C extension.") add_dependencies(default _C) - diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 00c81e4d00ad8..f3c1286dd8498 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -5,7 +5,7 @@ macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) file(REAL_PATH ${EXECUTABLE} EXECUTABLE) set(Python_EXECUTABLE ${EXECUTABLE}) - find_package(Python COMPONENTS Interpreter Development.Module) + find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule) if (NOT Python_FOUND) message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") endif() @@ -294,6 +294,7 @@ endmacro() # INCLUDE_DIRECTORIES - Extra include directories. # LIBRARIES - Extra link libraries. # WITH_SOABI - Generate library with python SOABI suffix name. +# USE_SABI - Use python stable api # # Note: optimization level/debug info is set via cmake build type. # @@ -301,7 +302,7 @@ function (define_gpu_extension_target GPU_MOD_NAME) cmake_parse_arguments(PARSE_ARGV 1 GPU "WITH_SOABI" - "DESTINATION;LANGUAGE" + "DESTINATION;LANGUAGE;USE_SABI" "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES") # Add hipify preprocessing step when building with HIP/ROCm. @@ -315,7 +316,11 @@ function (define_gpu_extension_target GPU_MOD_NAME) set(GPU_WITH_SOABI) endif() - Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI}) + if (GPU_USE_SABI) + Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}") + else() + Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}") + endif() if (GPU_LANGUAGE STREQUAL "HIP") # Make this target dependent on the hipify preprocessor step. diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 867f63f12de4b..86ac2e75e78ee 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -1,5 +1,5 @@ #include -#include +#include #include #include diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 8f89f89786c3b..91083481705cb 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -17,7 +17,7 @@ * limitations under the License. 
*/ -#include +#include #include #include #include @@ -808,16 +808,17 @@ void paged_attention_v1( torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] torch::Tensor& - value_cache, // [num_blocks, num_heads, head_size, block_size] - int num_kv_heads, // [num_heads] - float scale, + value_cache, // [num_blocks, num_heads, head_size, block_size] + int64_t num_kv_heads, // [num_heads] + double scale, torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& seq_lens, // [num_seqs] - int block_size, int max_seq_len, + int64_t block_size, int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, float kv_scale, const int tp_rank, - const int blocksparse_local_blocks, const int blocksparse_vert_stride, - const int blocksparse_block_size, const int blocksparse_head_sliding_step) { + const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { const bool is_block_sparse = (blocksparse_vert_stride > 1); DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, @@ -972,16 +973,17 @@ void paged_attention_v2( torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] torch::Tensor& - value_cache, // [num_blocks, num_heads, head_size, block_size] - int num_kv_heads, // [num_heads] - float scale, + value_cache, // [num_blocks, num_heads, head_size, block_size] + int64_t num_kv_heads, // [num_heads] + double scale, torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& seq_lens, // [num_seqs] - int block_size, int max_seq_len, + int64_t block_size, int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, float kv_scale, const int tp_rank, - const int blocksparse_local_blocks, const int blocksparse_vert_stride, - const int blocksparse_block_size, const int blocksparse_head_sliding_step) { + const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { const bool is_block_sparse = (blocksparse_vert_stride > 1); DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, CALL_V2_LAUNCHER_BLOCK_SIZE) @@ -990,4 +992,4 @@ void paged_attention_v2( #undef WARP_SIZE #undef MAX #undef MIN -#undef DIVIDE_ROUND_UP \ No newline at end of file +#undef DIVIDE_ROUND_UP diff --git a/csrc/cache.h b/csrc/cache.h index 435ae3e57f555..86caa9345361d 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -8,14 +8,18 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst, const torch::Tensor& block_mapping); -void copy_blocks(std::vector& key_caches, - std::vector& value_caches, +// Note: the key_caches and value_caches vectors are constant but +// not the Tensors they contain. The vectors need to be const refs +// in order to satisfy pytorch's C++ operator registration code. 
+void copy_blocks(std::vector const& key_caches, + std::vector const& value_caches, const torch::Tensor& block_mapping); void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, const float kv_scale); + const std::string& kv_cache_dtype, + const double kv_scale); void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, @@ -25,4 +29,4 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, // Just for unittest void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, - const float scale, const std::string& kv_cache_dtype); + const double scale, const std::string& kv_cache_dtype); diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index d924ac39b89ca..72041076ae009 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -95,8 +95,11 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs, } // namespace vllm -void copy_blocks(std::vector& key_caches, - std::vector& value_caches, +// Note: the key_caches and value_caches vectors are constant but +// not the Tensors they contain. The vectors need to be const refs +// in order to satisfy pytorch's C++ operator registration code. +void copy_blocks(std::vector const& key_caches, + std::vector const& value_caches, const torch::Tensor& block_mapping) { int num_layers = key_caches.size(); TORCH_CHECK(num_layers == value_caches.size()); @@ -255,7 +258,7 @@ void reshape_and_cache( torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] torch::Tensor& slot_mapping, // [num_tokens] - const std::string& kv_cache_dtype, const float kv_scale) { + const std::string& kv_cache_dtype, const double kv_scale) { int num_tokens = key.size(0); int num_heads = key.size(1); int head_size = key.size(2); @@ -334,7 +337,7 @@ __global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache, // Only for testing. 
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, - const float kv_scale, const std::string& kv_cache_dtype) { + const double kv_scale, const std::string& kv_cache_dtype) { torch::Device src_device = src_cache.device(); torch::Device dst_device = dst_cache.device(); TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU") diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index ed8cfbd421f0f..8367093325314 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -420,12 +420,13 @@ void paged_attention_v1_impl_launcher( void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, - int max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, float kv_scale, const int tp_rank, - const int blocksparse_local_blocks, const int blocksparse_vert_stride, - const int blocksparse_block_size, const int blocksparse_head_sliding_step) { + torch::Tensor& value_cache, int64_t num_kv_heads, double scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, + int64_t max_seq_len, const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { TORCH_CHECK(kv_scale == 1.0f); TORCH_CHECK(blocksparse_vert_stride <= 1, "CPU backend does not support blocksparse attention yet."); @@ -738,12 +739,13 @@ void paged_attention_v2_impl_launcher( void paged_attention_v2( torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, - int max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, float kv_scale, const int tp_rank, - const int blocksparse_local_blocks, const int blocksparse_vert_stride, - const int blocksparse_block_size, const int blocksparse_head_sliding_step) { + torch::Tensor& value_cache, int64_t num_kv_heads, double scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, + int64_t max_seq_len, const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { TORCH_CHECK(kv_scale == 1.0f); TORCH_CHECK(blocksparse_vert_stride <= 1, "CPU backend does not support blocksparse attention yet."); diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp index 2890ba6e2bb32..2b5c3bd6ee70b 100644 --- a/csrc/cpu/cache.cpp +++ b/csrc/cpu/cache.cpp @@ -5,8 +5,8 @@ namespace { template -void copy_blocks_cpu_impl(std::vector& key_caches, - std::vector& value_caches, +void copy_blocks_cpu_impl(std::vector const& key_caches, + std::vector const& value_caches, const torch::Tensor& mapping_pairs, const int element_num_per_block, const int layer_num) { @@ -82,8 +82,11 @@ void reshape_and_cache_cpu_impl( } }; // namespace -void copy_blocks(std::vector& key_caches, - std::vector& value_caches, +// Note: the key_caches and value_caches vectors are constant but +// not the Tensors they 
contain. The vectors need to be const refs +// in order to satisfy pytorch's C++ operator registration code. +void copy_blocks(std::vector const& key_caches, + std::vector const& value_caches, const torch::Tensor& block_mapping) { unsigned num_layers = key_caches.size(); TORCH_CHECK(num_layers == value_caches.size()); @@ -104,7 +107,7 @@ void copy_blocks(std::vector& key_caches, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, float kv_scale) { + const std::string& kv_cache_dtype, double kv_scale) { TORCH_CHECK(kv_scale == 1.0f); int num_tokens = key.size(0); diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index c1d3ec058b991..034c406a532d5 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -3,7 +3,7 @@ #define CPU_TYPES_HPP #include -#include +#include namespace vec_op { diff --git a/csrc/cpu/layernorm.cpp b/csrc/cpu/layernorm.cpp index 65d3ddcec5709..a76ad08928a2c 100644 --- a/csrc/cpu/layernorm.cpp +++ b/csrc/cpu/layernorm.cpp @@ -88,7 +88,7 @@ void fused_add_rms_norm_impl(scalar_t* __restrict__ input, } // namespace void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, - float epsilon) { + double epsilon) { int hidden_size = input.size(-1); int num_tokens = input.numel() / hidden_size; @@ -102,7 +102,7 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, } void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, - torch::Tensor& weight, float epsilon) { + torch::Tensor& weight, double epsilon) { int hidden_size = input.size(-1); int num_tokens = input.numel() / hidden_size; diff --git a/csrc/cpu/pos_encoding.cpp b/csrc/cpu/pos_encoding.cpp index e8aead17ae5a7..96bce7dda0132 100644 --- a/csrc/cpu/pos_encoding.cpp +++ b/csrc/cpu/pos_encoding.cpp @@ -168,7 +168,7 @@ void rotary_embedding_gptj_impl( }; // namespace void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, - torch::Tensor& key, int head_size, + torch::Tensor& key, int64_t head_size, torch::Tensor& cos_sin_cache, bool is_neox) { int num_tokens = query.numel() / query.size(-1); int rot_dim = cos_sin_cache.size(1); diff --git a/csrc/cpu/pybind.cpp b/csrc/cpu/pybind.cpp deleted file mode 100644 index e5b2ce4f30113..0000000000000 --- a/csrc/cpu/pybind.cpp +++ /dev/null @@ -1,43 +0,0 @@ -#include "cache.h" -#include "ops.h" -#include - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - // vLLM custom ops - pybind11::module ops = m.def_submodule("ops", "vLLM custom operators"); - - // Attention ops - ops.def("paged_attention_v1", &paged_attention_v1, - "Compute the attention between an input query and the cached " - "keys/values using PagedAttention."); - ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2."); - - // Activation ops - ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU."); - ops.def("gelu_and_mul", &gelu_and_mul, - "Activation function used in GeGLU with `none` approximation."); - ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul, - "Activation function used in GeGLU with `tanh` approximation."); - ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2."); - ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation."); - - // Layernorm - ops.def("rms_norm", &rms_norm, - "Apply Root Mean Square (RMS) Normalization to the input tensor."); - - ops.def("fused_add_rms_norm", &fused_add_rms_norm, - "In-place fused Add and RMS 
Normalization"); - - // Rotary embedding - ops.def("rotary_embedding", &rotary_embedding, - "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); - - // Cache ops - pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); - cache_ops.def("swap_blocks", &swap_blocks, - "Swap in (out) the cache blocks from src to dst"); - cache_ops.def("copy_blocks", ©_blocks, - "Copy the cache blocks from src to dst"); - cache_ops.def("reshape_and_cache", &reshape_and_cache, - "Reshape the key and value tensors and cache them"); -} diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp new file mode 100644 index 0000000000000..a2bf0d49adba5 --- /dev/null +++ b/csrc/cpu/torch_bindings.cpp @@ -0,0 +1,106 @@ +#include "cache.h" +#include "ops.h" +#include "registration.h" + +#include + +TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { + // vLLM custom ops + + // Attention ops + // Compute the attention between an input query and the cached keys/values + // using PagedAttention. + ops.def( + "paged_attention_v1(" + " Tensor! out, Tensor query, Tensor key_cache," + " Tensor value_cache, int num_kv_heads, float scale," + " Tensor block_tables, Tensor seq_lens, int block_size," + " int max_seq_len, Tensor? alibi_slopes," + " str kv_cache_dtype, float kv_scale, int tp_rank," + " int blocksparse_local_blocks," + " int blocksparse_vert_stride, int blocksparse_block_size," + " int blocksparse_head_sliding_step) -> ()"); + ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1); + + // PagedAttention V2. + ops.def( + "paged_attention_v2(" + " Tensor! out, Tensor exp_sums, Tensor max_logits," + " Tensor tmp_out, Tensor query, Tensor key_cache," + " Tensor value_cache, int num_kv_heads, float scale," + " Tensor block_tables, Tensor seq_lens, int block_size," + " int max_seq_len, Tensor? alibi_slopes," + " str kv_cache_dtype, float kv_scale, int tp_rank," + " int blocksparse_local_blocks," + " int blocksparse_vert_stride, int blocksparse_block_size," + " int blocksparse_head_sliding_step) -> ()"); + ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2); + + // Activation ops + + // Activation function used in SwiGLU. + ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()"); + ops.impl("silu_and_mul", torch::kCPU, &silu_and_mul); + + // Activation function used in GeGLU with `none` approximation. + ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_and_mul", torch::kCPU, &gelu_and_mul); + + // Activation function used in GeGLU with `tanh` approximation. + ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_tanh_and_mul", torch::kCPU, &gelu_tanh_and_mul); + + // GELU implementation used in GPT-2. + ops.def("gelu_new(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_new", torch::kCPU, &gelu_new); + + // Approximate GELU implementation. + ops.def("gelu_fast(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_fast", torch::kCPU, &gelu_fast); + + // Layernorm + // Apply Root Mean Square (RMS) Normalization to the input tensor. + ops.def( + "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> " + "()"); + ops.impl("rms_norm", torch::kCPU, &rms_norm); + + // In-place fused Add and RMS Normalization. + ops.def( + "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, " + "float epsilon) -> ()"); + ops.impl("fused_add_rms_norm", torch::kCPU, &fused_add_rms_norm); + + // Rotary embedding + // Apply GPT-NeoX or GPT-J style rotary embedding to query and key. 
+ ops.def( + "rotary_embedding(Tensor positions, Tensor! query," + " Tensor! key, int head_size," + " Tensor cos_sin_cache, bool is_neox) -> ()"); + ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding); +} + +TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { + // Cache ops + // Swap in (out) the cache blocks from src to dst. + cache_ops.def( + "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()"); + cache_ops.impl("swap_blocks", torch::kCPU, &swap_blocks); + + // Copy the cache blocks from src to dst. + cache_ops.def( + "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor " + "block_mapping) -> ()"); + cache_ops.impl("copy_blocks", torch::kCPU, ©_blocks); + + // Reshape the key and value tensors and cache them. + cache_ops.def( + "reshape_and_cache(Tensor key, Tensor value," + " Tensor! key_cache, Tensor! value_cache," + " Tensor slot_mapping," + " str kv_cache_dtype," + " float kv_scale) -> ()"); + cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache); +} + +REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/csrc/cuda_utils.h b/csrc/cuda_utils.h index 2ba49b339e148..73944f4c14890 100644 --- a/csrc/cuda_utils.h +++ b/csrc/cuda_utils.h @@ -1,7 +1,5 @@ #pragma once -#include +int64_t get_device_attribute(int64_t attribute, int64_t device_id); -int get_device_attribute(int attribute, int device_id); - -int get_max_shared_memory_per_block_device_attribute(int device_id); +int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id); diff --git a/csrc/cuda_utils_kernels.cu b/csrc/cuda_utils_kernels.cu index 7d8e2e19720fa..d6f9eb646fad5 100644 --- a/csrc/cuda_utils_kernels.cu +++ b/csrc/cuda_utils_kernels.cu @@ -2,7 +2,7 @@ #include #include #endif -int get_device_attribute(int attribute, int device_id) { +int64_t get_device_attribute(int64_t attribute, int64_t device_id) { int device, value; if (device_id < 0) { cudaGetDevice(&device); @@ -14,8 +14,8 @@ int get_device_attribute(int attribute, int device_id) { return value; } -int get_max_shared_memory_per_block_device_attribute(int device_id) { - int attribute; +int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) { + int64_t attribute; // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 diff --git a/csrc/custom_all_reduce.cu b/csrc/custom_all_reduce.cu index 0b1d95848525a..82a3563979f16 100644 --- a/csrc/custom_all_reduce.cu +++ b/csrc/custom_all_reduce.cu @@ -1,17 +1,17 @@ #include #include #include -#include +#include #include "custom_all_reduce.cuh" -// fake pointer type -using fptr_t = uint64_t; +// fake pointer type, must match fptr_t type in ops.h +using fptr_t = int64_t; static_assert(sizeof(void*) == sizeof(fptr_t)); fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data, const std::vector& handles, - const std::vector& offsets, int rank, + const std::vector& offsets, int64_t rank, bool full_nvlink) { int world_size = offsets.size(); if (world_size > 8) @@ -55,7 +55,7 @@ bool _is_weak_contiguous(torch::Tensor& t) { t.numel() * t.element_size()); } -bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size, +bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size, bool full_nvlink) { auto inp_size = inp.numel() * inp.element_size(); // custom allreduce requires input byte size to be multiples of 16 @@ -125,7 +125,7 @@ void dispose(fptr_t _fa) { delete fa; } -int meta_size() { 
return sizeof(vllm::Signal); } +int64_t meta_size() { return sizeof(vllm::Signal); } void register_buffer(fptr_t _fa, torch::Tensor& t, const std::vector& handles, @@ -134,10 +134,16 @@ void register_buffer(fptr_t _fa, torch::Tensor& t, fa->register_buffer(handles, offsets, t.data_ptr()); } -std::pair, std::vector> get_graph_buffer_ipc_meta( +std::tuple> get_graph_buffer_ipc_meta( fptr_t _fa) { auto fa = reinterpret_cast(_fa); - return fa->get_graph_buffer_ipc_meta(); + auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta(); + auto options = + torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU); + auto handles = + torch::empty({static_cast(handle_bytes.size())}, options); + std::memcpy(handles.data_ptr(), handle_bytes.data(), handle_bytes.size()); + return {handles, std::move(offsets)}; } void register_graph_buffers(fptr_t _fa, const std::vector& handles, diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index 3ecea03242f06..a634e1c3d4886 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -4,7 +4,7 @@ */ #pragma once -#include +#include #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index 70a2b3b0a07b1..ca1c04bd880d9 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -291,7 +291,7 @@ fused_add_rms_norm_kernel( void rms_norm(torch::Tensor& out, // [..., hidden_size] torch::Tensor& input, // [..., hidden_size] torch::Tensor& weight, // [hidden_size] - float epsilon) { + double epsilon) { int hidden_size = input.size(-1); int num_tokens = input.numel() / hidden_size; @@ -319,7 +319,7 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size] void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size] torch::Tensor& residual, // [..., hidden_size] torch::Tensor& weight, // [hidden_size] - float epsilon) { + double epsilon) { int hidden_size = input.size(-1); int num_tokens = input.numel() / hidden_size; diff --git a/csrc/moe/moe_ops.cpp b/csrc/moe/moe_ops.cpp deleted file mode 100644 index 4122f7630d7c7..0000000000000 --- a/csrc/moe/moe_ops.cpp +++ /dev/null @@ -1,8 +0,0 @@ -#include "moe_ops.h" - -#include - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("topk_softmax", &topk_softmax, - "Apply topk softmax to the gating outputs."); -} diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 93e7844ac1993..a251730aa765a 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -1,6 +1,6 @@ #pragma once -#include +#include void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices, torch::Tensor& token_expert_indices, diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 6ba4fcdb3a3f2..de9747b602524 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include +#include #include #include #include "../cuda_compat.h" diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp new file mode 100644 index 0000000000000..243752b9a9e8c --- /dev/null +++ b/csrc/moe/torch_bindings.cpp @@ -0,0 +1,12 @@ +#include "registration.h" +#include "moe_ops.h" + +TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { + // Apply topk softmax to the gating outputs. + m.def( + "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! 
" + "token_expert_indices, Tensor gating_output) -> ()"); + m.impl("topk_softmax", torch::kCUDA, &topk_softmax); +} + +REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/csrc/moe_align_block_size_kernels.cu b/csrc/moe_align_block_size_kernels.cu index edc441d121029..1f8d75da83bb8 100644 --- a/csrc/moe_align_block_size_kernels.cu +++ b/csrc/moe_align_block_size_kernels.cu @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -108,8 +108,8 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, } } // namespace vllm -void moe_align_block_size(torch::Tensor topk_ids, int num_experts, - int block_size, torch::Tensor sorted_token_ids, +void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, + int64_t block_size, torch::Tensor sorted_token_ids, torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad) { const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); diff --git a/csrc/ops.h b/csrc/ops.h index 06b60e748886f..0c270a78c331f 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -1,40 +1,42 @@ #pragma once -#include +#include void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, - int max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, float kv_scale, const int tp_rank, - const int blocksparse_local_blocks, const int blocksparse_vert_stride, - const int blocksparse_block_size, const int blocksparse_head_sliding_step); + torch::Tensor& value_cache, int64_t num_kv_heads, double scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, + int64_t max_seq_len, const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step); void paged_attention_v2( torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, - int max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, float kv_scale, const int tp_rank, - const int blocksparse_local_blocks, const int blocksparse_vert_stride, - const int blocksparse_block_size, const int blocksparse_head_sliding_step); + torch::Tensor& value_cache, int64_t num_kv_heads, double scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, + int64_t max_seq_len, const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step); void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, - float epsilon); + double epsilon); void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, - torch::Tensor& weight, float epsilon); + torch::Tensor& weight, double epsilon); void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, - torch::Tensor& key, int head_size, + torch::Tensor& key, int64_t head_size, torch::Tensor& cos_sin_cache, bool is_neox); void 
batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query, - torch::Tensor& key, int head_size, + torch::Tensor& key, int64_t head_size, torch::Tensor& cos_sin_cache, bool is_neox, - int rot_dim, + int64_t rot_dim, torch::Tensor& cos_sin_cache_offsets); void silu_and_mul(torch::Tensor& out, torch::Tensor& input); @@ -60,12 +62,12 @@ torch::Tensor aqlm_dequant(const torch::Tensor& codes, torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scaling_factors, torch::Tensor _zeros, - int split_k_iters); + int64_t split_k_iters); torch::Tensor awq_dequantize(torch::Tensor _kernel, torch::Tensor _scaling_factors, - torch::Tensor _zeros, int split_k_iters, int thx, - int thy); + torch::Tensor _zeros, int64_t split_k_iters, + int64_t thx, int64_t thy); torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& b_scales, torch::Tensor& workspace, @@ -88,9 +90,9 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, int64_t num_bits); -int cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales); +void cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, torch::Tensor const& a_scales, + torch::Tensor const& b_scales); #endif @@ -106,9 +108,9 @@ void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, torch::Tensor b_gptq_qzeros, torch::Tensor b_gptq_scales, torch::Tensor b_g_idx, - bool use_exllama, int bit); + bool use_exllama, int64_t bit); -void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int bit); +void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit); void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& scale); @@ -116,28 +118,28 @@ void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input, void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& scale); -void moe_align_block_size(torch::Tensor topk_ids, int num_experts, - int block_size, torch::Tensor sorted_token_ids, +void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, + int64_t block_size, torch::Tensor sorted_token_ids, torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad); #ifndef USE_ROCM -using fptr_t = uint64_t; +using fptr_t = int64_t; fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data, const std::vector& handles, - const std::vector& offsets, int rank, + const std::vector& offsets, int64_t rank, bool full_nvlink); -bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size, +bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size, bool full_nvlink); void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out); void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer, torch::Tensor& out); void dispose(fptr_t _fa); -int meta_size(); +int64_t meta_size(); void register_buffer(fptr_t _fa, torch::Tensor& t, const std::vector& handles, const std::vector& offsets); -std::pair, std::vector> get_graph_buffer_ipc_meta( +std::tuple> get_graph_buffer_ipc_meta( fptr_t _fa); void register_graph_buffers(fptr_t _fa, const std::vector& handles, const std::vector>& offsets); diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu index 
69d6dae1c26bc..97184a8735593 100644
--- a/csrc/pos_encoding_kernels.cu
+++ b/csrc/pos_encoding_kernels.cu
@@ -1,4 +1,4 @@
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 
@@ -127,7 +127,7 @@ void rotary_embedding(
                          // [num_tokens, num_heads * head_size]
     torch::Tensor& key,  // [batch_size, seq_len, num_kv_heads * head_size] or
                          // [num_tokens, num_kv_heads * head_size]
-    int head_size,
+    int64_t head_size,
     torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
     bool is_neox) {
   int64_t num_tokens = query.numel() / query.size(-1);
@@ -138,7 +138,7 @@ void rotary_embedding(
   int64_t key_stride = key.stride(-2);
 
   dim3 grid(num_tokens);
-  dim3 block(std::min(num_heads * rot_dim / 2, 512));
+  dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
@@ -168,9 +168,9 @@ void batched_rotary_embedding(
                          // [num_tokens, num_heads * head_size]
     torch::Tensor& key,  // [batch_size, seq_len, num_kv_heads * head_size] or
                          // [num_tokens, num_kv_heads * head_size]
-    int head_size,
+    int64_t head_size,
     torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
-    bool is_neox, int rot_dim,
+    bool is_neox, int64_t rot_dim,
     torch::Tensor& cos_sin_cache_offsets  // [num_tokens]
 ) {
   int64_t num_tokens = cos_sin_cache_offsets.size(0);
@@ -180,7 +180,7 @@ void batched_rotary_embedding(
   int64_t key_stride = key.stride(-2);
 
   dim3 grid(num_tokens);
-  dim3 block(std::min(num_heads * rot_dim / 2, 512));
+  dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
diff --git a/csrc/punica/punica_ops.cu b/csrc/punica/punica_ops.cu
index 61de3b37937cc..dd29820144b34 100644
--- a/csrc/punica/punica_ops.cu
+++ b/csrc/punica/punica_ops.cu
@@ -1,4 +1,4 @@
-#include <torch/extension.h>
+#include <torch/all.h>
 #include
 #include
 
@@ -88,7 +88,7 @@ inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W,
 }
 
 void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
-                   torch::Tensor indicies, int64_t layer_idx, float scale) {
+                   torch::Tensor indicies, int64_t layer_idx, double scale) {
   CHECK_INPUT(y);
   CHECK_INPUT(x);
   CHECK_INPUT(w);
@@ -320,7 +320,7 @@ void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
 
 void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
                              torch::Tensor indicies, int64_t layer_idx,
-                             float scale, int64_t h_in, int64_t h_out,
+                             double scale, int64_t h_in, int64_t h_out,
                              int64_t y_offset) {
   CHECK_INPUT(y);
   CHECK_INPUT(x);
diff --git a/csrc/punica/punica_ops.h b/csrc/punica/punica_ops.h
index 937e2d1d25d4a..5d625d0564f75 100644
--- a/csrc/punica/punica_ops.h
+++ b/csrc/punica/punica_ops.h
@@ -1,11 +1,11 @@
 #pragma once
 
-#include <torch/extension.h>
+#include <torch/all.h>
 
 void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
-                   torch::Tensor indicies, int64_t layer_idx, float scale);
+                   torch::Tensor indicies, int64_t layer_idx, double scale);
 
 void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
                              torch::Tensor indicies, int64_t layer_idx,
-                             float scale, int64_t h_in, int64_t h_out,
+                             double scale, int64_t h_in, int64_t h_out,
                              int64_t y_offset);
diff --git a/csrc/punica/punica_pybind.cpp b/csrc/punica/punica_pybind.cpp
deleted file mode 100644
index 9490ad59cdd5f..0000000000000
--- a/csrc/punica/punica_pybind.cpp
+++ /dev/null @@ -1,13 +0,0 @@ -#include - -#include "punica_ops.h" - -//====== pybind ====== - -#define DEFINE_pybind(name) m.def(#name, &name, #name); - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("dispatch_bgmv", &dispatch_bgmv, "dispatch_bgmv"); - m.def("dispatch_bgmv_low_level", &dispatch_bgmv_low_level, - "dispatch_bgmv_low_level"); -} diff --git a/csrc/punica/torch_bindings.cpp b/csrc/punica/torch_bindings.cpp new file mode 100644 index 0000000000000..894e229b6d9db --- /dev/null +++ b/csrc/punica/torch_bindings.cpp @@ -0,0 +1,18 @@ +#include "registration.h" +#include "punica_ops.h" + +TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { + m.def( + "dispatch_bgmv(Tensor! y, Tensor x, Tensor w, Tensor indicies, int " + "layer_idx, float scale) -> ()"); + m.impl("dispatch_bgmv", torch::kCUDA, &dispatch_bgmv); + + m.def( + "dispatch_bgmv_low_level(Tensor! y, Tensor x, Tensor w," + "Tensor indicies, int layer_idx," + "float scale, int h_in, int h_out," + "int y_offset) -> ()"); + m.impl("dispatch_bgmv_low_level", torch::kCUDA, &dispatch_bgmv_low_level); +} + +REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp deleted file mode 100644 index 547823aa1b04e..0000000000000 --- a/csrc/pybind.cpp +++ /dev/null @@ -1,114 +0,0 @@ -#include "cache.h" -#include "cuda_utils.h" -#include "ops.h" -#include - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - // vLLM custom ops - pybind11::module ops = m.def_submodule("ops", "vLLM custom operators"); - - // Attention ops - ops.def("paged_attention_v1", &paged_attention_v1, - "Compute the attention between an input query and the cached " - "keys/values using PagedAttention."); - ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2."); - - // Activation ops - ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU."); - ops.def("gelu_and_mul", &gelu_and_mul, - "Activation function used in GeGLU with `none` approximation."); - ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul, - "Activation function used in GeGLU with `tanh` approximation."); - ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2."); - ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation."); - - // Layernorm - ops.def("rms_norm", &rms_norm, - "Apply Root Mean Square (RMS) Normalization to the input tensor."); - - ops.def("fused_add_rms_norm", &fused_add_rms_norm, - "In-place fused Add and RMS Normalization"); - - // Rotary embedding - ops.def("rotary_embedding", &rotary_embedding, - "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); - - ops.def("batched_rotary_embedding", &batched_rotary_embedding, - "Apply GPT-NeoX or GPT-J style rotary embedding to query and key " - "(supports multiple loras)"); - -// Quantization ops -#ifndef USE_ROCM - ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM"); - ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM"); - ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); - ops.def("marlin_gemm", &marlin_gemm, - "Marlin (Dense) Optimized Quantized GEMM for GPTQ"); - ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm, - "Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ"); - ops.def("gptq_marlin_gemm", &gptq_marlin_gemm, - "gptq_marlin Optimized Quantized GEMM for GPTQ"); - ops.def("gptq_marlin_repack", &gptq_marlin_repack, - "gptq_marlin repack from GPTQ"); - ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ"); - ops.def("cutlass_scaled_mm_dq", &cutlass_scaled_mm_dq, - "CUTLASS w8a8 GEMM, 
supporting symmetric per-tensor or " - "per-row/column quantization."); -#endif - - ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ"); - ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ"); - ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); - ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant, - "Compute FP8 quantized tensor for given scaling factor"); - ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant, - "Compute FP8 quantized tensor and scaling factor"); - ops.def("moe_align_block_size", &moe_align_block_size, - "Aligning the number of tokens to be processed by each expert such " - "that it is divisible by the block size."); - - ops.def("static_scaled_int8_quant", &static_scaled_int8_quant, - "Compute int8 quantized tensor for given scaling factor"); - - ops.def("dynamic_scaled_int8_quant", &dynamic_scaled_int8_quant, - "Compute int8 quantized tensor and scaling factor"); - - // Cache ops - pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); - cache_ops.def("swap_blocks", &swap_blocks, - "Swap in (out) the cache blocks from src to dst"); - cache_ops.def("copy_blocks", ©_blocks, - "Copy the cache blocks from src to dst"); - cache_ops.def("reshape_and_cache", &reshape_and_cache, - "Reshape the key and value tensors and cache them"); - cache_ops.def("reshape_and_cache_flash", &reshape_and_cache_flash, - "Reshape the key and value tensors and cache them"); - cache_ops.def("convert_fp8", &convert_fp8, - "Convert the key and value cache to fp8 data type"); - - // Cuda utils - pybind11::module cuda_utils = - m.def_submodule("cuda_utils", "vLLM cuda utils"); - cuda_utils.def("get_device_attribute", &get_device_attribute, - "Gets the specified device attribute."); - - cuda_utils.def("get_max_shared_memory_per_block_device_attribute", - &get_max_shared_memory_per_block_device_attribute, - "Gets the maximum shared memory per block device attribute."); - -#ifndef USE_ROCM - // Custom all-reduce kernels - pybind11::module custom_ar = m.def_submodule("custom_ar", "custom allreduce"); - custom_ar.def("init_custom_ar", &init_custom_ar, "init_custom_ar"); - custom_ar.def("should_custom_ar", &should_custom_ar, "should_custom_ar"); - custom_ar.def("all_reduce_reg", &all_reduce_reg, "all_reduce_reg"); - custom_ar.def("all_reduce_unreg", &all_reduce_unreg, "all_reduce_unreg"); - custom_ar.def("dispose", &dispose, "dispose"); - custom_ar.def("meta_size", &meta_size, "meta_size"); - custom_ar.def("register_buffer", ®ister_buffer, "register_buffer"); - custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta, - "get_graph_buffer_ipc_meta"); - custom_ar.def("register_graph_buffers", ®ister_graph_buffers, - "register_graph_buffers"); -#endif -} diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu index 255844eec56d4..8fb9856800867 100644 --- a/csrc/quantization/aqlm/gemm_kernels.cu +++ b/csrc/quantization/aqlm/gemm_kernels.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/csrc/quantization/awq/gemm_kernels.cu b/csrc/quantization/awq/gemm_kernels.cu index bb8e5bbb23d7f..6d6da5f3d8746 100644 --- a/csrc/quantization/awq/gemm_kernels.cu +++ b/csrc/quantization/awq/gemm_kernels.cu @@ -7,7 +7,7 @@ Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023} } */ -#include +#include #include #include "dequantize.cuh" @@ -435,8 +435,8 @@ __global__ void __launch_bounds__(64) torch::Tensor 
awq_dequantize(torch::Tensor _kernel, torch::Tensor _scaling_factors, - torch::Tensor _zeros, int split_k_iters, int thx, - int thy) { + torch::Tensor _zeros, int64_t split_k_iters, + int64_t thx, int64_t thy) { int in_c = _kernel.size(0); int qout_c = _kernel.size(1); int out_c = qout_c * 8; @@ -491,7 +491,7 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel, torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scaling_factors, torch::Tensor _zeros, - int split_k_iters) { + int64_t split_k_iters) { int num_in_feats = _in_feats.size(0); int num_in_channels = _in_feats.size(1); const at::cuda::OptionalCUDAGuard device_guard(device_of(_in_feats)); diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index 280b0327111da..aa9511daa2772 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -1,5 +1,5 @@ #include -#include +#include #include #include "../../dispatch_utils.h" diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu index 088fee4783faa..23a8b4070b70e 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu @@ -1,5 +1,5 @@ #include -#include +#include #include diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu index 8fc4ba662ecdd..a99802153643a 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu @@ -4,7 +4,7 @@ #if defined CUDA_VERSION && CUDA_VERSION >= 12000 -#include +#include #include diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu index eb532f2ac7a9b..423e64a4932e2 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu @@ -1,7 +1,7 @@ #include #include -#include +#include void cutlass_scaled_mm_dq_sm75(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 55be3305a9b8c..8c5b693bf6ed7 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -1,5 +1,5 @@ #include -#include +#include #include #include diff --git a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu index 480c4986c3821..785f1a09c1900 100644 --- a/csrc/quantization/gptq/q_gemm.cu +++ b/csrc/quantization/gptq/q_gemm.cu @@ -6,7 +6,7 @@ https://github.com/qwopqwop200/GPTQ-for-LLaMa #include #include -#include +#include #include #include #include @@ -1823,7 +1823,7 @@ void shuffle_exllama_weight(uint32_t* q_weight, int* q_perm, int height, torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, torch::Tensor b_gptq_qzeros, torch::Tensor b_gptq_scales, torch::Tensor b_g_idx, - bool use_exllama, int bit) { + bool use_exllama, int64_t bit) { const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options); @@ -1845,7 +1845,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, return c; } -void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int bit) { +void gptq_shuffle(torch::Tensor q_weight, torch::Tensor 
q_perm, int64_t bit) { const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight)); vllm::gptq::shuffle_exllama_weight( (uint32_t*)q_weight.data_ptr(), diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index c573b9041065b..0beb9de14c687 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -1867,4 +1867,4 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, return c; } -#endif \ No newline at end of file +#endif diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cuh b/csrc/quantization/gptq_marlin/gptq_marlin.cuh index ba5368ea8835f..42af44951efda 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cuh +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cuh @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu index 03d66cecedf1f..d124c0149912d 100644 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu @@ -15,7 +15,7 @@ * limitations under the License. */ -#include +#include #include #include diff --git a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu index 686dd7851e6af..b5effc3055441 100644 --- a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu +++ b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include +#include #include #include diff --git a/csrc/quantization/squeezellm/quant_cuda_kernel.cu b/csrc/quantization/squeezellm/quant_cuda_kernel.cu index 1b339fa4b392b..40baac6108695 100644 --- a/csrc/quantization/squeezellm/quant_cuda_kernel.cu +++ b/csrc/quantization/squeezellm/quant_cuda_kernel.cu @@ -1,5 +1,4 @@ #include -#include #include #include #include diff --git a/csrc/registration.h b/csrc/registration.h new file mode 100644 index 0000000000000..e5396e9a8b137 --- /dev/null +++ b/csrc/registration.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +#define _CONCAT(A, B) A##B +#define CONCAT(A, B) _CONCAT(A, B) + +#define _STRINGIFY(A) #A +#define STRINGIFY(A) _STRINGIFY(A) + +// A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME +// could be a macro instead of a literal token. +#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE) + +// REGISTER_EXTENSION allows the shared library to be loaded and initialized +// via python's import statement. +#define REGISTER_EXTENSION(NAME) \ + PyMODINIT_FUNC CONCAT(PyInit_, NAME)() { \ + static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, \ + STRINGIFY(NAME), nullptr, 0, nullptr}; \ + return PyModule_Create(&module); \ + } diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp new file mode 100644 index 0000000000000..df2603544c85a --- /dev/null +++ b/csrc/torch_bindings.cpp @@ -0,0 +1,283 @@ +#include "cache.h" +#include "cuda_utils.h" +#include "ops.h" +#include "registration.h" + +#include + +// Note on op signatures: +// The X_meta signatures are for the meta functions corresponding to op X. +// They must be kept in sync with the signature for X. Generally, only +// functions that return Tensors require a meta function. +// +// See the following links for detailed docs on op registration and function +// schemas. 
+// https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit#heading=h.ptttacy8y1u9 +// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md#annotations + +TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { + // vLLM custom ops + + // Attention ops + // Compute the attention between an input query and the cached + // keys/values using PagedAttention. + ops.def( + "paged_attention_v1(" + " Tensor! out, Tensor query, Tensor key_cache," + " Tensor value_cache, int num_kv_heads, float scale," + " Tensor block_tables, Tensor seq_lens, int block_size," + " int max_seq_len, Tensor? alibi_slopes," + " str kv_cache_dtype, float kv_scale, int tp_rank," + " int blocksparse_local_blocks," + " int blocksparse_vert_stride, int blocksparse_block_size," + " int blocksparse_head_sliding_step) -> ()"); + ops.impl("paged_attention_v1", torch::kCUDA, &paged_attention_v1); + + // PagedAttention V2. + ops.def( + "paged_attention_v2(" + " Tensor! out, Tensor exp_sums, Tensor max_logits," + " Tensor tmp_out, Tensor query, Tensor key_cache," + " Tensor value_cache, int num_kv_heads, float scale," + " Tensor block_tables, Tensor seq_lens, int block_size," + " int max_seq_len, Tensor? alibi_slopes," + " str kv_cache_dtype, float kv_scale, int tp_rank," + " int blocksparse_local_blocks," + " int blocksparse_vert_stride, int blocksparse_block_size," + " int blocksparse_head_sliding_step) -> ()"); + ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2); + + // Activation ops + // Activation function used in SwiGLU. + ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()"); + ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul); + + // Activation function used in GeGLU with `none` approximation. + ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul); + + // Activation function used in GeGLU with `tanh` approximation. + ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_tanh_and_mul", torch::kCUDA, &gelu_tanh_and_mul); + + // GELU implementation used in GPT-2. + ops.def("gelu_new(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_new", torch::kCUDA, &gelu_new); + + // Approximate GELU implementation. + ops.def("gelu_fast(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_fast", torch::kCUDA, &gelu_fast); + + // Layernorm + // Apply Root Mean Square (RMS) Normalization to the input tensor. + ops.def( + "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> " + "()"); + ops.impl("rms_norm", torch::kCUDA, &rms_norm); + + // In-place fused Add and RMS Normalization. + ops.def( + "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, " + "float epsilon) -> ()"); + ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm); + + // Rotary embedding + // Apply GPT-NeoX or GPT-J style rotary embedding to query and key. + ops.def( + "rotary_embedding(Tensor positions, Tensor! query," + " Tensor! key, int head_size," + " Tensor cos_sin_cache, bool is_neox) -> ()"); + ops.impl("rotary_embedding", torch::kCUDA, &rotary_embedding); + + // Apply GPT-NeoX or GPT-J style rotary embedding to query and key + // (supports multiple loras). + ops.def( + "batched_rotary_embedding(Tensor positions, Tensor! query," + " Tensor! 
key, int head_size," + " Tensor cos_sin_cache, bool is_neox," + " int rot_dim," + " Tensor cos_sin_cache_offsets) -> ()"); + ops.impl("batched_rotary_embedding", torch::kCUDA, &batched_rotary_embedding); + + // Quantization ops +#ifndef USE_ROCM + // Quantized GEMM for AQLM. + ops.def("aqlm_gemm", &aqlm_gemm); + ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm); + + // Decompression method for AQLM. + ops.def("aqlm_dequant", &aqlm_dequant); + ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant); + + // Quantized GEMM for AWQ. + ops.def("awq_gemm", &awq_gemm); + ops.impl("awq_gemm", torch::kCUDA, &awq_gemm); + + // Dequantization for AWQ. + ops.def("awq_dequantize", &awq_dequantize); + ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize); + + // Marlin (Dense) Optimized Quantized GEMM for GPTQ. + ops.def("marlin_gemm", &marlin_gemm); + ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm); + + // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ. + ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm); + ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm); + + // gptq_marlin Optimized Quantized GEMM for GPTQ. + ops.def("gptq_marlin_gemm", &gptq_marlin_gemm); + ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm); + + // gptq_marlin repack from GPTQ. + ops.def("gptq_marlin_repack", &gptq_marlin_repack); + ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack); + + // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column + // quantization. + ops.def( + "cutlass_scaled_mm_dq(Tensor! out, Tensor a," + " Tensor b, Tensor a_scales," + " Tensor b_scales) -> ()"); + ops.impl("cutlass_scaled_mm_dq", torch::kCUDA, &cutlass_scaled_mm_dq); +#endif + + // Quantized GEMM for GPTQ. + ops.def("gptq_gemm", &gptq_gemm); + ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm); + + // Post processing for GPTQ. + ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()"); + ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle); + + // Quantized GEMM for SqueezeLLM. + ops.def( + "squeezellm_gemm(Tensor vec, Tensor mat, Tensor! mul, Tensor " + "lookup_table) -> ()"); + ops.impl("squeezellm_gemm", torch::kCUDA, &squeezellm_gemm); + + // Compute FP8 quantized tensor for given scaling factor. + ops.def( + "static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()"); + ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant); + + // Compute FP8 quantized tensor and scaling factor. + ops.def( + "dynamic_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! scale) -> " + "()"); + ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant); + + // Aligning the number of tokens to be processed by each expert such + // that it is divisible by the block size. + ops.def( + "moe_align_block_size(Tensor topk_ids, int num_experts," + " int block_size, Tensor! sorted_token_ids," + " Tensor! experts_ids," + " Tensor! num_tokens_post_pad) -> ()"); + ops.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); + + // Compute int8 quantized tensor for given scaling factor. + ops.def( + "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> " + "()"); + ops.impl("static_scaled_int8_quant", torch::kCUDA, &static_scaled_int8_quant); + + // Compute int8 quantized tensor and scaling factor + ops.def( + "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! 
scale) -> " + "()"); + ops.impl("dynamic_scaled_int8_quant", torch::kCUDA, + &dynamic_scaled_int8_quant); +} + +TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { + // Cache ops + // Swap in (out) the cache blocks from src to dst. + cache_ops.def( + "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()"); + cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks); + + // Copy the cache blocks from src to dst. + cache_ops.def( + "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor " + "block_mapping) -> ()"); + cache_ops.impl("copy_blocks", torch::kCUDA, ©_blocks); + + // Reshape the key and value tensors and cache them. + cache_ops.def( + "reshape_and_cache(Tensor key, Tensor value," + " Tensor! key_cache, Tensor! value_cache," + " Tensor slot_mapping," + " str kv_cache_dtype," + " float kv_scale) -> ()"); + cache_ops.impl("reshape_and_cache", torch::kCUDA, &reshape_and_cache); + + // Reshape the key and value tensors and cache them. + cache_ops.def( + "reshape_and_cache_flash(Tensor key, Tensor value," + " Tensor! key_cache," + " Tensor! value_cache," + " Tensor slot_mapping," + " str kv_cache_dtype) -> ()"); + cache_ops.impl("reshape_and_cache_flash", torch::kCUDA, + &reshape_and_cache_flash); + + // Convert the key and value cache to fp8 data type. + cache_ops.def( + "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, str " + "kv_cache_dtype) -> ()"); + cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8); +} + +TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) { + // Cuda utils + + // Gets the specified device attribute. + cuda_utils.def("get_device_attribute", &get_device_attribute); + cuda_utils.impl("get_device_attribute", torch::kCUDA, &get_device_attribute); + + // Gets the maximum shared memory per block device attribute. + cuda_utils.def("get_max_shared_memory_per_block_device_attribute", + &get_max_shared_memory_per_block_device_attribute); + cuda_utils.impl("get_max_shared_memory_per_block_device_attribute", + torch::kCUDA, + &get_max_shared_memory_per_block_device_attribute); +} + +#ifndef USE_ROCM +TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { + // Custom all-reduce kernels + custom_ar.def("init_custom_ar", &init_custom_ar); + custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar); + + custom_ar.def("should_custom_ar", &should_custom_ar); + custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar); + + custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()"); + custom_ar.impl("all_reduce_reg", torch::kCUDA, &all_reduce_reg); + + custom_ar.def( + "all_reduce_unreg(int fa, Tensor inp, Tensor reg_buffer, Tensor! 
out) -> " + "()"); + custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg); + + custom_ar.def("dispose", &dispose); + custom_ar.impl("dispose", torch::kCPU, &dispose); + + custom_ar.def("meta_size", &meta_size); + custom_ar.impl("meta_size", torch::kCPU, &meta_size); + + custom_ar.def("register_buffer", ®ister_buffer); + custom_ar.impl("register_buffer", torch::kCUDA, ®ister_buffer); + + custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta); + custom_ar.impl("get_graph_buffer_ipc_meta", torch::kCPU, + &get_graph_buffer_ipc_meta); + + custom_ar.def("register_graph_buffers", ®ister_graph_buffers); + custom_ar.impl("register_graph_buffers", torch::kCPU, + ®ister_graph_buffers); +} +#endif + +REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/setup.py b/setup.py index f7d465b60c153..339b0ad6de2d1 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,7 @@ def remove_prefix(text, prefix): class CMakeExtension(Extension): def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None: - super().__init__(name, sources=[], **kwa) + super().__init__(name, sources=[], py_limited_api=True, **kwa) self.cmake_lists_dir = os.path.abspath(cmake_lists_dir) diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index aab7af9d2cbf6..0daf7439468aa 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -1,7 +1,8 @@ import pytest import torch -from vllm._C import ops +# ruff: noqa: F401 +import vllm._C DTYPES = [torch.half, torch.bfloat16, torch.float] HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192, @@ -33,7 +34,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, ops_out = torch.empty_like(x, dtype=torch.int8, device="cuda") scales_out = torch.empty_like(scales, dtype=torch.float32, device="cuda") - ops.dynamic_scaled_int8_quant(ops_out, x, scales_out) + torch.ops._C.dynamic_scaled_int8_quant(ops_out, x, scales_out) assert torch.allclose(scales_out, scales) assert torch.allclose(torch_out, ops_out, @@ -60,6 +61,6 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, out2 = torch.empty_like(x, dtype=torch.int8) scale_argument = torch.tensor([scale], dtype=torch.float32, device="cuda") - ops.static_scaled_int8_quant(out2, x, scale_argument) + torch.ops._C.static_scaled_int8_quant(out2, x, scale_argument) assert torch.allclose(out1, out2, atol=1) # big atol to account for rounding errors diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 7e12f1ba14cde..440b0e8afa99a 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1,35 +1,47 @@ -from typing import Optional, Tuple, Type +import contextlib +from typing import List, Optional, Tuple, Type import torch try: - from vllm._C import cache_ops as vllm_cache_ops - from vllm._C import ops as vllm_ops + import vllm._C except ImportError as e: from vllm.logger import init_logger logger = init_logger(__name__) logger.warning("Failed to import from vllm._C with %r", e) +with contextlib.suppress(ImportError): + import vllm._moe_C + +with contextlib.suppress(ImportError): + # ruff: noqa: F401 + import vllm._punica_C + + +def is_custom_op_supported(op_name: str) -> bool: + op, overloads = torch._C._jit_get_operation(op_name) + return op is not None + # activation ops def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - vllm_ops.silu_and_mul(out, x) + torch.ops._C.silu_and_mul(out, x) def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - vllm_ops.gelu_and_mul(out, x) + 
torch.ops._C.gelu_and_mul(out, x) def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - vllm_ops.gelu_tanh_and_mul(out, x) + torch.ops._C.gelu_tanh_and_mul(out, x) def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: - vllm_ops.gelu_fast(out, x) + torch.ops._C.gelu_fast(out, x) def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: - vllm_ops.gelu_new(out, x) + torch.ops._C.gelu_new(out, x) # page attention ops @@ -53,7 +65,7 @@ def paged_attention_v1( blocksparse_block_size: int = 64, blocksparse_head_sliding_step: int = 0, ) -> None: - vllm_ops.paged_attention_v1( + torch.ops._C.paged_attention_v1( out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype, kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, @@ -83,7 +95,7 @@ def paged_attention_v2( blocksparse_block_size: int = 64, blocksparse_head_sliding_step: int = 0, ) -> None: - vllm_ops.paged_attention_v2( + torch.ops._C.paged_attention_v2( out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype, kv_scale, tp_rank, @@ -100,8 +112,8 @@ def rotary_embedding( cos_sin_cache: torch.Tensor, is_neox: bool, ) -> None: - vllm_ops.rotary_embedding(positions, query, key, head_size, cos_sin_cache, - is_neox) + torch.ops._C.rotary_embedding(positions, query, key, head_size, + cos_sin_cache, is_neox) def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, @@ -109,20 +121,20 @@ def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, cos_sin_cache: torch.Tensor, is_neox: bool, rot_dim: int, cos_sin_cache_offsets: torch.Tensor) -> None: - vllm_ops.batched_rotary_embedding(positions, query, key, head_size, - cos_sin_cache, is_neox, rot_dim, - cos_sin_cache_offsets) + torch.ops._C.batched_rotary_embedding(positions, query, key, head_size, + cos_sin_cache, is_neox, rot_dim, + cos_sin_cache_offsets) # layer norm ops def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, epsilon: float) -> None: - vllm_ops.rms_norm(out, input, weight, epsilon) + torch.ops._C.rms_norm(out, input, weight, epsilon) def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, epsilon: float) -> None: - vllm_ops.fused_add_rms_norm(input, residual, weight, epsilon) + torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon) # quantization ops @@ -130,13 +142,13 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, def awq_dequantize(qweight: torch.Tensor, scales: torch.Tensor, zeros: torch.Tensor, split_k_iters: int, thx: int, thy: int) -> torch.Tensor: - return vllm_ops.awq_dequantize(qweight, scales, zeros, split_k_iters, thx, - thy) + return torch.ops._C.awq_dequantize(qweight, scales, zeros, split_k_iters, + thx, thy) def awq_gemm(input: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor, scales: torch.Tensor, split_k_iters: int) -> torch.Tensor: - return vllm_ops.awq_gemm(input, qweight, qzeros, scales, split_k_iters) + return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters) # gptq @@ -144,27 +156,27 @@ def gptq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_gptq_qzeros: torch.Tensor, b_gptq_scales: torch.Tensor, b_g_idx: torch.Tensor, use_exllama: bool, bit: int) -> torch.Tensor: - return vllm_ops.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, - b_g_idx, use_exllama, bit) + return 
torch.ops._C.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, + b_g_idx, use_exllama, bit) def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor, bit: int) -> None: - vllm_ops.gptq_shuffle(q_weight, q_perm, bit) + torch.ops._C.gptq_shuffle(q_weight, q_perm, bit) # squeezellm def squeezellm_gemm(vec: torch.Tensor, mat: torch.Tensor, mul: torch.Tensor, lookup_table: torch.Tensor) -> None: - vllm_ops.squeezellm_gemm(vec, mat, mul, lookup_table) + torch.ops._C.squeezellm_gemm(vec, mat, mul, lookup_table) # marlin def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int, size_n: int, size_k: int) -> torch.Tensor: - return vllm_ops.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m, - size_n, size_k) + return torch.ops._C.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m, + size_n, size_k) # marlin_24 @@ -172,9 +184,9 @@ def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_meta: torch.Tensor, b_scales: torch.Tensor, workspace: torch.Tensor, num_bits: int, size_m: int, size_n: int, size_k: int) -> torch.Tensor: - return vllm_ops.gptq_marlin_24_gemm(a, b_q_weight, b_meta, b_scales, - workspace, num_bits, size_m, size_n, - size_k) + return torch.ops._C.gptq_marlin_24_gemm(a, b_q_weight, b_meta, b_scales, + workspace, num_bits, size_m, + size_n, size_k) # cutlass @@ -188,7 +200,7 @@ def cutlass_scaled_mm_dq(a: torch.Tensor, b: torch.Tensor, n = b.shape[1] out = torch.empty((m, n), dtype=out_dtype, device=a.device) - vllm_ops.cutlass_scaled_mm_dq(out, a, b, scale_a, scale_b) + torch.ops._C.cutlass_scaled_mm_dq(out, a, b, scale_a, scale_b) return out @@ -198,21 +210,22 @@ def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor, codebooks: torch.Tensor, scales: torch.Tensor, codebook_partition_sizes: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: - return vllm_ops.aqlm_gemm(input, codes, codebooks, scales, - codebook_partition_sizes, bias) + return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales, + codebook_partition_sizes, bias) def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor, codebook_partition_sizes: torch.Tensor) -> torch.Tensor: - return vllm_ops.aqlm_dequant(codes, codebooks, codebook_partition_sizes) + return torch.ops._C.aqlm_dequant(codes, codebooks, + codebook_partition_sizes) # gptq_marlin def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> torch.Tensor: - return vllm_ops.gptq_marlin_repack(b_q_weight, perm, size_k, size_n, - num_bits) + return torch.ops._C.gptq_marlin_repack(b_q_weight, perm, size_k, size_n, + num_bits) def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, @@ -220,9 +233,9 @@ def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, perm: torch.Tensor, workspace: torch.Tensor, num_bits: int, size_m: int, size_n: int, size_k: int, is_k_full: bool) -> torch.Tensor: - return vllm_ops.gptq_marlin_gemm(a, b_q_weight, b_scales, g_idx, perm, - workspace, num_bits, size_m, size_n, - size_k, is_k_full) + return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, g_idx, perm, + workspace, num_bits, size_m, size_n, + size_k, is_k_full) # fp8 @@ -259,9 +272,9 @@ def scaled_fp8_quant( output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: scale = torch.zeros(1, device=input.device, dtype=torch.float32) - vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) + torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) else: - 
vllm_ops.static_scaled_fp8_quant(output, input, scale) + torch.ops._C.static_scaled_fp8_quant(output, input, scale) return output, scale @@ -284,14 +297,14 @@ def scaled_int8_quant( output = torch.empty_like(input, dtype=torch.int8) if scale is not None: # static-per-tensor quantization. - vllm_ops.static_scaled_int8_quant(output, input, scale) + torch.ops._C.static_scaled_int8_quant(output, input, scale) return output, scale # dynamic-per-token quantization. input_scales = torch.empty((input.numel() // input.shape[-1], 1), device=input.device, dtype=torch.float32) - vllm_ops.dynamic_scaled_int8_quant(output, input, input_scales) + torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales) return output, input_scales @@ -300,9 +313,16 @@ def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int, block_size: int, sorted_token_ids: torch.Tensor, experts_ids: torch.Tensor, num_tokens_post_pad: torch.Tensor) -> None: - vllm_ops.moe_align_block_size(topk_ids, num_experts, block_size, - sorted_token_ids, experts_ids, - num_tokens_post_pad) + torch.ops._C.moe_align_block_size(topk_ids, num_experts, block_size, + sorted_token_ids, experts_ids, + num_tokens_post_pad) + + +def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float) -> None: + torch.ops._moe_C.topk_softmax(topk_weights, topk_ids, + token_expert_indicies, gating_output) def reshape_and_cache( @@ -314,8 +334,9 @@ def reshape_and_cache( kv_cache_dtype: str, kv_scale: float, ) -> None: - vllm_cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping, kv_cache_dtype, kv_scale) + torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, + value_cache, slot_mapping, + kv_cache_dtype, kv_scale) def reshape_and_cache_flash( @@ -326,25 +347,115 @@ def reshape_and_cache_flash( slot_mapping: torch.Tensor, kv_cache_dtype: str, ) -> None: - vllm_cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache, - slot_mapping, kv_cache_dtype) + torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache, + value_cache, slot_mapping, + kv_cache_dtype) def copy_blocks(key_caches: torch.Tensor, value_caches: torch.Tensor, block_mapping: torch.Tensor) -> None: - vllm_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) + torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) def swap_blocks(src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor) -> None: - vllm_cache_ops.swap_blocks(src, dst, block_mapping) + torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping) def convert_fp8(output: torch.Tensor, input: torch.Tensor, scale: float = 1.0, kv_dtype: str = "fp8") -> None: - vllm_cache_ops.convert_fp8(output, input, scale, kv_dtype) + torch.ops._C_cache_ops.convert_fp8(output, input, scale, kv_dtype) + + +def get_device_attribute(attribute: int, device: int) -> int: + return torch.ops._C_cuda_utils.get_device_attribute(attribute, device) + + +def get_max_shared_memory_per_block_device_attribute(device: int) -> int: + # ruff: noqa: E501 + return torch.ops._C_cuda_utils.get_max_shared_memory_per_block_device_attribute( + device) + + +# custom ar +def init_custom_ar(meta: torch.Tensor, rank_data: torch.Tensor, + handles: List[str], offsets: List[int], rank: int, + full_nvlink: bool) -> int: + return torch.ops._C_custom_ar.init_custom_ar(meta, rank_data, handles, + offsets, rank, full_nvlink) + + +def should_custom_ar(inp: torch.Tensor, max_size: int, world_size: int, + full_nvlink: 
bool) -> bool: + return torch.ops._C_custom_ar.should_custom_ar(inp, max_size, world_size, + full_nvlink) + + +def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None: + torch.ops._C_custom_ar.all_reduce_reg(fa, inp, out) + +def all_reduce_unreg(fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, + out: torch.Tensor) -> None: + torch.ops._C_custom_ar.all_reduce_unreg(fa, inp, reg_buffer, out) -#TODO: cuda_utils, custom_ar + +def dispose(fa: int) -> None: + torch.ops._C_custom_ar.dispose(fa) + + +def meta_size() -> int: + return torch.ops._C_custom_ar.meta_size() + + +def register_buffer(fa: int, t: torch.Tensor, handles: List[str], + offsets: List[int]) -> None: + return torch.ops._C_custom_ar.register_buffer(fa, t, handles, offsets) + + +def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[str], List[int]]: + return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa) + + +def register_graph_buffers(fa: int, handles: List[str], + offsets: List[List[int]]) -> None: + torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets) + + +# punica +def dispatch_bgmv( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + indicies: torch.Tensor, + layer_idx: int, + scale: float, +) -> None: + torch.ops._punica_C.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, + scale) + + +def dispatch_bgmv_low_level( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + indicies: torch.Tensor, + layer_idx: int, + scale: float, + h_in: int, + h_out: int, + y_offset: int, +) -> None: + torch.ops._punica_C.dispatch_bgmv_low_level( + y, + x, + w_t_all, + indicies, + layer_idx, + scale, + h_in, + h_out, + y_offset, + ) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 070c074e511bc..8c64c2bfdeb8f 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -5,7 +5,7 @@ import torch from vllm_flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache -from vllm._C import cache_ops +from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata) @@ -47,11 +47,11 @@ def swap_blocks( ) -> None: src_key_cache = src_kv_cache[0] dst_key_cache = dst_kv_cache[0] - cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) + ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) src_value_cache = src_kv_cache[1] dst_value_cache = dst_kv_cache[1] - cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) + ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) @staticmethod def copy_blocks( @@ -60,7 +60,7 @@ def copy_blocks( ) -> None: key_caches = [kv_cache[0] for kv_cache in kv_caches] value_caches = [kv_cache[1] for kv_cache in kv_caches] - cache_ops.copy_blocks(key_caches, value_caches, src_to_dists) + ops.copy_blocks(key_caches, value_caches, src_to_dists) @dataclass @@ -285,7 +285,7 @@ def forward( # Reshape the input keys and values and store them in the cache. # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory profiling run. 
- cache_ops.reshape_and_cache_flash( + ops.reshape_and_cache_flash( key, value, key_cache, diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index a3902aecb3793..4a0e19bc0c159 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -6,6 +6,7 @@ from torch.distributed import ProcessGroup import vllm.envs as envs +from vllm import _custom_ops as ops from vllm.distributed.device_communicators.custom_all_reduce_utils import ( gpu_p2p_access_check) from vllm.distributed.parallel_state import ( @@ -15,7 +16,11 @@ try: import pynvml - from vllm._C import custom_ar + # Simulate ImportError if custom_ar ops are not supported. + if not ops.is_custom_op_supported("_C_custom_ar::meta_size"): + raise ImportError("custom_ar", __file__) + + custom_ar = True @contextmanager def _nvml(): @@ -27,7 +32,7 @@ def _nvml(): except ImportError: # For AMD GPUs - custom_ar = None + custom_ar = False pynvml = None @contextmanager @@ -97,7 +102,7 @@ def __init__(self, self._IS_CAPTURING = False self.disabled = True - if custom_ar is None: + if not custom_ar: # disable because of missing custom allreduce library # e.g. in a non-cuda environment return @@ -175,7 +180,7 @@ def __init__(self, # meta data composes of two parts: meta data for synchronization # (256 bytes) and a temporary buffer for storing intermediate # allreduce results. - self.meta = torch.zeros(custom_ar.meta_size() + max_size, + self.meta = torch.zeros(ops.meta_size() + max_size, dtype=torch.uint8, device=self.device) # This is a pre-registered IPC buffer. In eager mode, input tensors @@ -196,9 +201,8 @@ def __init__(self, self.world_size = world_size handles, offsets = self._get_ipc_meta(self.meta) self.full_nvlink = full_nvlink - self._ptr = custom_ar.init_custom_ar(self.meta, self.rank_data, - handles, offsets, rank, - self.full_nvlink) + self._ptr = ops.init_custom_ar(self.meta, self.rank_data, handles, + offsets, rank, self.full_nvlink) self.register_buffer(self.buffer) @contextmanager @@ -252,31 +256,31 @@ def _gather_ipc_meta(self, shard_data): def register_buffer(self, inp: torch.Tensor): handles, offsets = self._get_ipc_meta(inp) - custom_ar.register_buffer(self._ptr, inp, handles, offsets) + ops.register_buffer(self._ptr, inp, handles, offsets) def register_graph_buffers(self): - handle, offset = custom_ar.get_graph_buffer_ipc_meta(self._ptr) + handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr) handles, offsets = self._gather_ipc_meta((bytes(handle), offset)) logger.info("Registering %d cuda graph addresses", len(offset)) - custom_ar.register_graph_buffers(self._ptr, handles, offsets) + ops.register_graph_buffers(self._ptr, handles, offsets) def should_custom_ar(self, inp: torch.Tensor): - return custom_ar.should_custom_ar(inp, self.max_size, self.world_size, - self.full_nvlink) + return ops.should_custom_ar(inp, self.max_size, self.world_size, + self.full_nvlink) # all reduce, assuming inp tensor is IPC registered with register_buffer, # or, in the context of cuda graphs, register_graph_buffers def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None): if out is None: out = torch.empty_like(inp) - custom_ar.all_reduce_reg(self._ptr, inp, out) + ops.all_reduce_reg(self._ptr, inp, out) return out # all reduce, assuming inp tensor is NOT IPC registered def all_reduce_unreg(self, inp: torch.Tensor, out: torch.Tensor = None): if out is None: out = 
torch.empty_like(inp) - custom_ar.all_reduce_unreg(self._ptr, inp, self.buffer, out) + ops.all_reduce_unreg(self._ptr, inp, self.buffer, out) return out def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]: @@ -304,7 +308,7 @@ def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]: def close(self): if not self.disabled and self._ptr: - custom_ar.dispose(self._ptr) + ops.dispose(self._ptr) self._ptr = 0 def __del__(self): diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index c87bed54726fc..7ecaa450f1758 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -4,16 +4,21 @@ import torch +from vllm import _custom_ops as ops + + +def _check_punica_support(): + if ops.is_custom_op_supported("_punica_C::dispatch_bgmv"): + return -def _raise_import_error(e): if torch.cuda.get_device_capability() < (8, 0): raise ImportError( - "punica LoRA kernels require compute capability >= 8.0") from e + "punica LoRA kernels require compute capability >= 8.0") else: raise ImportError( "punica LoRA kernels could not be imported. If you built vLLM " "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " - "was set.") from e + "was set.") def bgmv( @@ -41,12 +46,9 @@ def bgmv( layer_idx: Layer index of the weight matrices. scale: Scaling factor. """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) + _check_punica_support() - punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) + ops.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) def dispatch_bgmv_low_level(y: torch.Tensor, x: torch.Tensor, @@ -75,11 +77,9 @@ def dispatch_bgmv_low_level(y: torch.Tensor, x: torch.Tensor, y_offset: Offset to apply to the starting column of y. y_slice_size: Size of the y column slice. """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - punica_kernels.dispatch_bgmv_low_level( + _check_punica_support() + + ops.dispatch_bgmv_low_level( y, x, w_t_all, @@ -122,10 +122,7 @@ def add_lora(y: torch.Tensor, scale: Scaling factor. buffer: Optional. Shape: `[B, R]`. Temporary buffer. """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) + _check_punica_support() r = wb_t_all.size(-1) if buffer is None: @@ -135,9 +132,8 @@ def add_lora(y: torch.Tensor, buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) - punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) - punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, - scale) + ops.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) + ops.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, scale) def add_lora_slice(y: torch.Tensor, @@ -176,10 +172,7 @@ def add_lora_slice(y: torch.Tensor, y_offset: Offset to apply to the starting column of y. y_slice_size: Size of the y column slice. 
""" - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) + _check_punica_support() r = wb_t_all.size(-1) if buffer is None: @@ -189,7 +182,7 @@ def add_lora_slice(y: torch.Tensor, buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) - punica_kernels.dispatch_bgmv_low_level( + ops.dispatch_bgmv_low_level( buffer, x, wa_t_all, @@ -200,7 +193,7 @@ def add_lora_slice(y: torch.Tensor, buffer.size(1), 0, ) - punica_kernels.dispatch_bgmv_low_level( + ops.dispatch_bgmv_low_level( y, buffer, wb_t_all, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 1c6947137a1c9..4d0160ff296a0 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -8,7 +8,6 @@ import triton import triton.language as tl -import vllm._moe_C as moe_kernels from vllm import _custom_ops as ops from vllm.logger import init_logger @@ -355,7 +354,7 @@ def fused_topk( topk, dtype=torch.int32, device=hidden_states.device) - moe_kernels.topk_softmax( + ops.topk_softmax( topk_weights, topk_ids, token_expert_indicies, diff --git a/vllm/utils.py b/vllm/utils.py index 2bd24d086f690..54d446b23350a 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -22,6 +22,7 @@ import torch import vllm.envs as envs +from vllm import _custom_ops as ops from vllm.logger import enable_trace_function_call, init_logger T = TypeVar("T") @@ -148,12 +149,8 @@ def is_neuron() -> bool: @lru_cache(maxsize=None) def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" - # NOTE: This import statement should be executed lazily since - # the Neuron-X backend does not have the `cuda_utils` module. 
- from vllm._C import cuda_utils - max_shared_mem = ( - cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu)) + ops.get_max_shared_memory_per_block_device_attribute(gpu)) # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py # will fail assert max_shared_mem > 0, "max_shared_mem can not be zero" From 45f92c00cf1752ae27b4e8a08a560abf08cc6cd2 Mon Sep 17 00:00:00 2001 From: Bla_ckB <50193121+BlackBird-Coding@users.noreply.github.com> Date: Mon, 10 Jun 2024 06:23:14 +0700 Subject: [PATCH 25/30] [Bugfix] Fix KeyError: 1 When Using LoRA adapters (#5164) --- vllm/core/scheduler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 0159053b4dc6a..bb37c5f313617 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -423,7 +423,9 @@ def _schedule_running( num_running_seqs = seq_group.get_max_num_running_seqs() budget.subtract_num_seqs(seq_group.request_id, num_running_seqs) - if curr_loras is not None and seq_group.lora_int_id > 0: + + if (curr_loras is not None and seq_group.lora_int_id > 0 + and seq_group.lora_int_id in curr_loras): curr_loras.remove(seq_group.lora_int_id) if running_queue: From 5884c2b454d9a6e16646e949d7308a4cfae3ac12 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Sun, 9 Jun 2024 23:49:46 -0400 Subject: [PATCH 26/30] [Misc] Update to comply with the new `compressed-tensors` config (#5350) Co-authored-by: Michael Goin --- tests/quantization/test_compressed_tensors.py | 20 ++++++++++++------- vllm/config.py | 8 ++------ .../layers/quantization/__init__.py | 2 +- .../model_loader/weight_utils.py | 9 +++------ 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 9d94d2ecfb222..e6d8218b41372 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -5,15 +5,15 @@ import torch +from vllm import SamplingParams from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensorsLinearMethod, CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor) def test_compressed_tensors_w8a8_static_setup(vllm_runner): - model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed" - with vllm_runner(model_path, quantization="sparseml", - enforce_eager=True) as llm: + model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2" + with vllm_runner(model_path, enforce_eager=True) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] @@ -40,11 +40,17 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner): assert qkv_proj.input_scale.dtype is torch.float32 +def test_compressed_tensors_no_enforce_eager(vllm_runner): + model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2" + with vllm_runner(model_path) as llm: + sampling_params = SamplingParams() + output = llm.generate("Hello world!", sampling_params=sampling_params) + assert output + + def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): - model_path = "nm-testing/tinyllama-one-shot-dynamic-test" - with vllm_runner(model_path, - quantization="sparseml", - enforce_eager=True, + model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2" + with vllm_runner(model_path, enforce_eager=True, dtype=torch.float16) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: 
E501 layer = model.model.layers[0] diff --git a/vllm/config.py b/vllm/config.py index a980168190adc..fa296cd626f17 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -164,12 +164,8 @@ def _verify_embedding_mode(self) -> None: def _parse_quant_hf_config(self): quant_cfg = getattr(self.hf_config, "quantization_config", None) if quant_cfg is None: - # SparseML uses a "compression_config" with a "quantization_config". - compression_cfg = getattr(self.hf_config, "compression_config", - None) - if compression_cfg is not None: - quant_cfg = compression_cfg.get("quantization_config", None) - + # compress-tensors uses a "compression_config" key + quant_cfg = getattr(self.hf_config, "compression_config", None) return quant_cfg def _verify_quantization(self) -> None: diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 0bc42beb66257..40b0df75a69a6 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -31,7 +31,7 @@ "gptq_marlin": GPTQMarlinConfig, "gptq": GPTQConfig, "squeezellm": SqueezeLLMConfig, - "sparseml": CompressedTensorsConfig, + "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, } diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 6174f0a974712..827591b227a2b 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -122,12 +122,9 @@ def get_quant_config(model_config: ModelConfig, hf_quant_config = getattr(model_config.hf_config, "quantization_config", None) if hf_quant_config is None: - compression_config = getattr(model_config.hf_config, - "compression_config", None) - if compression_config is not None: - hf_quant_config = compression_config.get("quantization_config", - None) - + # compressed-tensors uses a compressions_config + hf_quant_config = getattr(model_config.hf_config, "compression_config", + None) if hf_quant_config is not None: return quant_cls.from_config(hf_quant_config) # In case of bitsandbytes/QLoRA, get quant config from the adapter model. From 68bc81703e2c41f38b1696269ce912cacb384e60 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Mon, 10 Jun 2024 02:13:39 -0700 Subject: [PATCH 27/30] [Frontend][Misc] Enforce Pixel Values as Input Type for VLMs in API Server (#5374) --- vllm/entrypoints/openai/api_server.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 95417718b51fe..e7503b9655830 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -183,6 +183,16 @@ async def authentication(request: Request, call_next): served_model_names = [args.model] engine_args = AsyncEngineArgs.from_cli_args(args) + + # Enforce pixel values as image input type for vision language models + # when serving with API server + if engine_args.image_input_type is not None and \ + engine_args.image_input_type.upper() != "PIXEL_VALUES": + raise ValueError( + f"Invalid image_input_type: {engine_args.image_input_type}. 
" + "Only --image-input-type 'pixel_values' is supported for serving " + "vision language models with the vLLM API server.") + engine = AsyncLLMEngine.from_engine_args( engine_args, usage_context=UsageContext.OPENAI_API_SERVER) From c81da5f56dd613d5378078866e4810452d50d6ed Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 10 Jun 2024 02:51:02 -0700 Subject: [PATCH 28/30] [misc][typo] fix typo (#5372) --- .../distributed/device_communicators/custom_all_reduce_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 24ef3cb45b19d..4b89a23dfc463 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -166,7 +166,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool: and (not os.path.exists(path))): # only the local master process (with local_rank == 0) can # enter this block to calculate the cache - logger.info("generating GPU P2P access cache for in %s", path) + logger.info("generating GPU P2P access cache in %s", path) cache = {} for _i in range(num_dev): for _j in range(num_dev): From 0bfa1c4f133737a59bcb94e85ca80f2f4cd68038 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 10 Jun 2024 19:38:49 +0800 Subject: [PATCH 29/30] [Misc] Improve error message when LoRA parsing fails (#5194) --- tests/lora/test_utils.py | 14 +++++++++++++- vllm/lora/utils.py | 15 +++++++-------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index 892f6081e2aaa..4ff9715b4ca8d 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -1,12 +1,13 @@ from collections import OrderedDict +import pytest from torch import nn from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule from vllm.utils import LRUCache -def test_parse_fine_tuned_lora_name(): +def test_parse_fine_tuned_lora_name_valid(): fixture = { ("base_model.model.lm_head.lora_A.weight", "lm_head", True), ("base_model.model.lm_head.lora_B.weight", "lm_head", False), @@ -35,6 +36,17 @@ def test_parse_fine_tuned_lora_name(): assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(name) +def test_parse_fine_tuned_lora_name_invalid(): + fixture = { + "weight", + "base_model.weight", + "base_model.model.weight", + } + for name in fixture: + with pytest.raises(ValueError, match="unsupported LoRA weight"): + parse_fine_tuned_lora_name(name) + + def test_replace_submodule(): model = nn.Sequential( OrderedDict([ diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index b0198a50b1c52..4a86c16cf64db 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -94,13 +94,12 @@ def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]: is_lora_a whether the tensor is lora_a or lora_b. 
""" parts = name.split(".") - assert parts[0] == "base_model" - assert parts[1] == "model" - if parts[-1] == "weight": - assert parts[-2] == "lora_A" or parts[-2] == "lora_B" - return ".".join(parts[2:-2]), parts[-2] == "lora_A" - if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": - return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A" + if len(parts) >= 2 and parts[0] == "base_model" and parts[1] == "model": + if parts[-1] == "weight": + if parts[-2] == "lora_A" or parts[-2] == "lora_B": + return ".".join(parts[2:-2]), parts[-2] == "lora_A" + elif parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": + return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A" - raise ValueError(f"{name} is unsupported format") + raise ValueError(f"{name} is unsupported LoRA weight") From 6b29d6fe709b91346be102a697bfaea386ed9107 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 10 Jun 2024 20:47:15 +0800 Subject: [PATCH 30/30] [Model] Initial support for LLaVA-NeXT (#4199) Co-authored-by: Roger Wang --- docs/source/models/supported_models.rst | 6 +- tests/models/test_llava.py | 2 - tests/models/test_llava_next.py | 123 +++++++ tests/multimodal/test_processor.py | 62 +++- vllm/model_executor/models/__init__.py | 2 + vllm/model_executor/models/llava.py | 18 +- vllm/model_executor/models/llava_next.py | 445 +++++++++++++++++++++++ 7 files changed, 640 insertions(+), 18 deletions(-) create mode 100644 tests/models/test_llava_next.py create mode 100644 vllm/model_executor/models/llava_next.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 24fa83df7d751..5d3f55be1271f 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -89,7 +89,11 @@ Alongside each architecture, we include some popular models that use it. - ✅︎ * - :code:`LlavaForConditionalGeneration` - LLaVA-1.5 - - :code:`llava-hf/llava-1.5-7b-hf`\*, :code:`llava-hf/llava-1.5-13b-hf`\*, etc. + - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc. + - + * - :code:`LlavaNextForConditionalGeneration` + - LLaVA-NeXT + - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - * - :code:`MiniCPMForCausalLM` - MiniCPM diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index 1f446362167a1..a1f0cff1cc0e5 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -39,8 +39,6 @@ def iter_llava_configs(model_name: str): model_and_vl_config = [ *iter_llava_configs("llava-hf/llava-1.5-7b-hf"), - # Not enough memory - # *iter_llava_configs("llava-hf/llava-1.5-13b-hf"), ] diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py new file mode 100644 index 0000000000000..aa6ee268ae588 --- /dev/null +++ b/tests/models/test_llava_next.py @@ -0,0 +1,123 @@ +from typing import List, Tuple + +import pytest +from transformers import AutoTokenizer + +from vllm.config import VisionLanguageConfig + +from ..conftest import IMAGE_FILES + +pytestmark = pytest.mark.llava + +_PREFACE = ( + "A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's " + "questions.") + +# The image token is placed before "user" on purpose so that the test can pass +HF_IMAGE_PROMPTS = [ + f"{_PREFACE} \nUSER: What's the content of the image? ASSISTANT:", + f"{_PREFACE} \nUSER: What is the season? 
ASSISTANT:", +] + +assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) + + +def iter_llava_next_configs(model_name: str): + image_hw_to_feature_size = { + (336, 336): 1176, + (672, 672): 2928, + (1344, 336): 1944, + (336, 1344): 1890, + } + + for (h, w), f in image_hw_to_feature_size.items(): + for input_type, input_shape in [ + (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), + ]: + yield (model_name, + VisionLanguageConfig(image_input_type=input_type, + image_feature_size=f, + image_token_id=32000, + image_input_shape=input_shape, + image_processor=model_name, + image_processor_revision=None)) + + +model_and_vl_config = [ + *iter_llava_next_configs("llava-hf/llava-v1.6-vicuna-7b-hf"), +] + + +def vllm_to_hf_output(vllm_output: Tuple[List[int], str], + vlm_config: VisionLanguageConfig, model_id: str): + """Sanitize vllm output to be comparable with hf output. + The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, + x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... + It also reduces `output_str` from "bla" to "bla". + """ + input_ids, output_str = vllm_output + image_token_id = vlm_config.image_token_id + + tokenizer = AutoTokenizer.from_pretrained(model_id) + image_token_str = tokenizer.decode(image_token_id) + + hf_input_ids = [ + input_id for idx, input_id in enumerate(input_ids) + if input_id != image_token_id or input_ids[idx - 1] != image_token_id + ] + hf_output_str = output_str \ + .replace(image_token_str * vlm_config.image_feature_size, " ") + + return hf_input_ids, hf_output_str + + +@pytest.mark.xfail( + reason="Inconsistent image processor being used due to lack " + "of support for dynamic image token replacement") +@pytest.mark.parametrize("model_and_config", model_and_vl_config) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +def test_models(hf_runner, vllm_runner, hf_images, vllm_images, + model_and_config, dtype: str, max_tokens: int) -> None: + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalData objects and corresponding + vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. 
+ """ + model_id, vlm_config = model_and_config + + with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: + hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, + max_tokens, + images=hf_images) + + vllm_image_prompts = [ + p.replace("", "" * vlm_config.image_feature_size) + for p in HF_IMAGE_PROMPTS + ] + + with vllm_runner( + model_id, + dtype=dtype, + # should be greater than image_feature_size + max_model_len=4096, + enforce_eager=True, + **vlm_config.as_cli_args_dict(), + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, + max_tokens, + images=vllm_images) + + for i in range(len(HF_IMAGE_PROMPTS)): + hf_output_ids, hf_output_str = hf_outputs[i] + vllm_output_ids, vllm_output_str = vllm_to_hf_output( + vllm_outputs[i], vlm_config, model_id) + assert hf_output_str == vllm_output_str, ( + f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") + assert hf_output_ids == vllm_output_ids, ( + f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") diff --git a/tests/multimodal/test_processor.py b/tests/multimodal/test_processor.py index 3df28e782dd89..51c352361702a 100644 --- a/tests/multimodal/test_processor.py +++ b/tests/multimodal/test_processor.py @@ -1,6 +1,6 @@ import numpy as np import pytest -from transformers import CLIPImageProcessor +from transformers import CLIPImageProcessor, LlavaNextImageProcessor from vllm.config import ModelConfig, VisionLanguageConfig from vllm.multimodal import MULTIMODAL_REGISTRY @@ -12,7 +12,7 @@ @pytest.mark.parametrize("dtype", ["half", "float"]) def test_clip_image_processor(hf_images, dtype): MODEL_NAME = "llava-hf/llava-1.5-7b-hf" - IMAGE_HEIGHT = IMAGE_WIDTH = 33 + IMAGE_HEIGHT = IMAGE_WIDTH = 560 hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME) assert isinstance(hf_processor, CLIPImageProcessor) @@ -55,10 +55,61 @@ def test_clip_image_processor(hf_images, dtype): assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}" +@pytest.mark.xfail( + reason="Inconsistent image processor being used due to lack " + "of support for dynamic image token replacement") +@pytest.mark.parametrize("dtype", ["half", "float"]) +def test_llava_next_image_processor(hf_images, dtype): + MODEL_NAME = "llava-hf/llava-v1.6-34b-hf" + IMAGE_HEIGHT = IMAGE_WIDTH = 560 + + hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME) + assert isinstance(hf_processor, LlavaNextImageProcessor) + + model_config = ModelConfig( + model=MODEL_NAME, + tokenizer=MODEL_NAME, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype=dtype, + revision=None, + ) + vlm_config = VisionLanguageConfig( + image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES, + image_token_id=64000, + image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH), + image_feature_size=2928, + image_processor=MODEL_NAME, + image_processor_revision=None, + ) + + for image in hf_images: + hf_result = hf_processor.preprocess( + image, + return_tensors="pt", + ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype]) + vllm_result = MULTIMODAL_REGISTRY.process_input( + ImagePixelData(image), + model_config=model_config, + vlm_config=vlm_config, + ) + + assert hf_result.keys() == vllm_result.keys() + for key, hf_tensor in hf_result.items(): + hf_arr: np.ndarray = hf_tensor.numpy() + vllm_arr: np.ndarray = vllm_result[key].numpy() + + assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}" + assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}" + + +@pytest.mark.xfail( + reason="Example image pixels were not 
processed using HuggingFace") @pytest.mark.parametrize("dtype", ["float"]) def test_image_pixel_types(hf_images, vllm_image_tensors, dtype): MODEL_NAME = "llava-hf/llava-1.5-7b-hf" - IMAGE_HEIGHT = IMAGE_WIDTH = 33 + IMAGE_HEIGHT = IMAGE_WIDTH = 560 model_config = ModelConfig( model=MODEL_NAME, @@ -95,7 +146,4 @@ def test_image_pixel_types(hf_images, vllm_image_tensors, dtype): tensor_arr: np.ndarray = tensor_result[key].numpy() assert image_arr.shape == tensor_arr.shape, f"Failed for key={key}" - - # The examples in PR#3042 have slightly different preprocessing from - # HuggingFace's LlavaProcessor, causing the test to fail. - # assert np.allclose(image_arr, tensor_arr), f"Failed for key={key}" + assert np.allclose(image_arr, tensor_arr), f"Failed for key={key}" diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index a92abe6b5b8dc..4446914c67c8e 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -33,6 +33,8 @@ "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), + "LlavaNextForConditionalGeneration": + ("llava_next", "LlavaNextForConditionalGeneration"), # For decapoda-research/llama-* "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 3332bcc578460..67b32a08833b6 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,7 +1,7 @@ from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union import torch -from torch import nn +import torch.nn as nn # TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on # transformers' impl. from transformers import CLIPVisionModel, LlavaConfig @@ -51,10 +51,10 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: return hidden_states -def _merge_vision_embeddings(input_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - vision_embeddings: torch.Tensor, - image_token_id: int) -> torch.Tensor: +def merge_vision_embeddings(input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + vision_embeddings: torch.Tensor, + image_token_id: int) -> torch.Tensor: """In place merges in vision_embeddings with inputs_embeds.""" mask = (input_ids == image_token_id) @@ -151,7 +151,8 @@ def _parse_and_validate_image_input( return None if not isinstance(pixel_values, torch.Tensor): - raise ValueError("Incorrect type of pixel values") + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") return LlavaImagePixelInputs( type="pixel_values", @@ -166,7 +167,8 @@ def _parse_and_validate_image_input( return None if not isinstance(image_features, torch.Tensor): - raise ValueError("Incorrect type of image features") + raise ValueError("Incorrect type of image features. 
" + f"Got type: {type(image_features)}") return LlavaImageFeatureInputs( type="image_features", @@ -268,7 +270,7 @@ def forward( vision_embeddings = self._process_image_input(image_input) inputs_embeds = self.language_model.get_input_embeddings(input_ids) - inputs_embeds = _merge_vision_embeddings( + inputs_embeds = merge_vision_embeddings( input_ids, inputs_embeds, vision_embeddings, self.vision_language_config.image_token_id) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py new file mode 100644 index 0000000000000..bb15dcb8ed917 --- /dev/null +++ b/vllm/model_executor/models/llava_next.py @@ -0,0 +1,445 @@ +from typing import (Dict, Iterable, List, Literal, Optional, Tuple, TypedDict, + Union) + +import torch +import torch.nn as nn +from PIL import Image +# TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on +# transformers' impl. +from transformers import CLIPVisionModel, LlavaNextConfig +from transformers.models.llava_next.modeling_llava_next import ( + get_anyres_image_grid_shape, unpad_image) +from typing_extensions import NotRequired + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig +from vllm.logger import init_logger +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.llama import LlamaModel +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData +from vllm.multimodal.image import ImagePixelData, get_dummy_image_data +from vllm.sequence import SamplerOutput, SequenceData + +from .llava import LlavaMultiModalProjector, merge_vision_embeddings +from .vlm_base import VisionLanguageModelBase + +logger = init_logger(__name__) + +_KEYS_TO_MODIFY_MAPPING = { + "language_model.lm_head": "lm_head", + "language_model.model": "language_model", +} + + +class LlavaNextImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: (batch_size, 1 + num_patches, num_channels, height, width)""" + + image_sizes: NotRequired[torch.Tensor] + """Shape: (batch_size, 2)""" + + +class LlavaNextImageFeatureInputs(TypedDict): + type: Literal["image_features"] + data: torch.Tensor + """Shape: (batch_size, 1 + num_patches, image_feature_size, hidden_size)""" + + image_sizes: NotRequired[torch.Tensor] + """Shape: (batch_size, 2)""" + + +LlavaNextImageInputs = Union[LlavaNextImagePixelInputs, + LlavaNextImageFeatureInputs] + + +def _get_dummy_image_data( + seq_len: int, + model_config: ModelConfig, + vlm_config: VisionLanguageConfig, +) -> Tuple[SequenceData, MultiModalData]: + seq_data, fake_mm_data = get_dummy_image_data(seq_len, model_config, + vlm_config) + + config_input_type = vlm_config.image_input_type + ImageInputType = VisionLanguageConfig.ImageInputType + + if config_input_type == ImageInputType.PIXEL_VALUES: + _, c, h, w = vlm_config.image_input_shape + mode = {1: "L", 3: "RGB"}[c] + fake_mm_data = ImagePixelData(Image.new(mode, (w, h), color=0)) + + return seq_data, fake_mm_data + + +def _image_pixel_processor( + data: ImagePixelData, + model_config: ModelConfig, + vlm_config: VisionLanguageConfig, 
+) -> Dict[str, torch.Tensor]: + image = data.image + + if isinstance(image, torch.Tensor): + pixel_values = image.to(model_config.dtype) + batch_size, _, _, h, w = pixel_values.shape + image_sizes = torch.tensor([(w, h) for _ in range(batch_size)]) + + return {"pixel_values": pixel_values, "image_sizes": image_sizes} + + # Temporary patch before dynamic number of image tokens is supported + _, _, h, w = vlm_config.image_input_shape + if (w, h) != (image.width, image.height): + logger.warning( + "Dynamic image shape is currently not supported. " + "Resizing input image to (%d, %d).", w, h) + + data.image = image.resize((w, h)) + + return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \ + ._default_input_processor(data, model_config, vlm_config) + + +@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_pixel_processor) +@MULTIMODAL_REGISTRY.register_dummy_data(_get_dummy_image_data) +class LlavaNextForConditionalGeneration(VisionLanguageModelBase): + """ + Args to `forward()`: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + pixel_values: For PIXEL_VALUES, expects a batch with shape + [1, num_patches, 3, 336, 336]. + image_features: For IMAGE_FEATURES, expects a batch with shape + [1, num_patches, 1176, 1024]. + """ + + def __init__(self, + config: LlavaNextConfig, + vision_language_config: VisionLanguageConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + super().__init__(vision_language_config) + + # Update the type annotation from that of its superclass + self.config = config + + if self.vision_language_config.image_input_type == ( + VisionLanguageConfig.ImageInputType.PIXEL_VALUES): + self.vision_tower = CLIPVisionModel(config.vision_config) + else: + raise TypeError("Image features are not supported by LLaVA-NeXT") + + self.multi_modal_projector = LlavaMultiModalProjector( + vision_hidden_size=config.vision_config.hidden_size, + text_hidden_size=config.text_config.hidden_size, + projector_hidden_act=config.projector_hidden_act) + + self.quant_config = quant_config + self.language_model = LlamaModel(config.text_config, cache_config, + quant_config) + self.unpadded_vocab_size = config.text_config.vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.text_config.hidden_size, + org_num_embeddings=self.language_model.org_vocab_size) + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, logit_scale) + self.sampler = Sampler() + + self.image_newline = nn.Parameter( + torch.empty(config.text_config.hidden_size)) + + def _validate_image_pixels(self, data: torch.Tensor) -> torch.Tensor: + _, num_channels, _, _ = self.vision_language_config.image_input_shape + + # Note that this is different from that of vLLM vision_language_config + # since the image is resized by the HuggingFace preprocessor + height = width = self.config.vision_config.image_size + + if list(data.shape[2:]) != [num_channels, height, width]: + raise ValueError( + f"The expected image tensor shape is batch dimension plus " + f"num_patches plus {[num_channels, height, width]}. " + f"You supplied {data.shape}. 
" + f"If you are using vLLM's entrypoint, make sure your " + f"supplied image input is consistent with " + f"image_input_shape in engine args.") + + return data + + def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: + if list(data.shape[1:]) != [2]: + raise ValueError( + f"The expected image sizes shape is batch dimension plus " + f"{[2]}. You supplied {data.shape}.") + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[LlavaNextImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_sizes = kwargs.pop("image_sizes", None) + image_features = kwargs.pop("image_features", None) + + expected_input_type = self.vision_language_config.image_input_type + ImageInputType = VisionLanguageConfig.ImageInputType + + if expected_input_type == ImageInputType.PIXEL_VALUES: + if image_features is not None: + raise ValueError( + "Expected pixel values but got image features") + if pixel_values is None: + return None + + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + if not isinstance(image_sizes, torch.Tensor): + raise ValueError("Incorrect type of image sizes. " + f"Got type: {type(image_sizes)}") + + return LlavaNextImagePixelInputs( + type="pixel_values", + data=self._validate_image_pixels(pixel_values), + image_sizes=self._validate_image_sizes(image_sizes), + ) + + assert expected_input_type != ImageInputType.IMAGE_FEATURES, ( + "Failed to validate this at initialization time") + + return None + + def _merge_image_patch_embeddings(self, image_size: torch.Tensor, + patch_embeddings: torch.Tensor, *, + strategy: str) -> torch.Tensor: + # Based on: https://github.com/haotian-liu/LLaVA/blob/main/llava/model/llava_arch.py + if strategy == "flat": + return patch_embeddings.flatten(0, 1) + + if strategy.startswith("spatial"): + orig_width, orig_height = image_size + height = width = self.config.vision_config.image_size \ + // self.config.vision_config.patch_size + + base_patch_embeds = patch_embeddings[0] + if height * width != base_patch_embeds.shape[0]: + raise ValueError( + "The number of patches is not consistent with the " + "image size.") + + if patch_embeddings.shape[0] > 1: + other_patch_embeds = patch_embeddings[1:] + + # image_aspect_ratio == "anyres" + num_patch_width, num_patch_height = get_anyres_image_grid_shape( + (orig_width, orig_height), + self.config.image_grid_pinpoints, + self.config.vision_config.image_size, + ) + other_patch_embeds = other_patch_embeds \ + .view(num_patch_width, num_patch_height, height, width, -1) + + if "unpad" in strategy: + other_patch_embeds = other_patch_embeds \ + .permute(4, 0, 2, 1, 3).contiguous() \ + .flatten(1, 2).flatten(2, 3) + other_patch_embeds = unpad_image(other_patch_embeds, + image_size) + other_patch_embeds = torch.cat(( + other_patch_embeds, + self.image_newline[:, None, None] \ + .expand(*other_patch_embeds.shape[:-1], 1) \ + .to(other_patch_embeds.device), + ), dim=-1) + other_patch_embeds = other_patch_embeds \ + .flatten(1, 2).transpose(0, 1) + else: + other_patch_embeds = other_patch_embeds \ + .permute(0, 2, 1, 3, 4).contiguous() \ + .flatten(0, 3) + + merged_patch_embeddings = torch.cat( + (base_patch_embeds, other_patch_embeds), dim=0) + else: + if "unpad" in strategy: + merged_patch_embeddings = torch.cat( + (base_patch_embeds, + self.image_newline[None] \ + .to(base_patch_embeds.device) + ), dim=0) + else: + merged_patch_embeddings = base_patch_embeds + + return 
merged_patch_embeddings + + raise ValueError(f"Unexpected patch merge strategy: {strategy}") + + def _process_image_pixels( + self, inputs: LlavaNextImagePixelInputs) -> torch.Tensor: + assert self.vision_tower is not None + + pixel_values = inputs["data"] + + b, num_patches, c, h, w = pixel_values.shape + stacked_pixel_values = pixel_values.view(b * num_patches, c, h, w) + + stacked_image_features = self._image_pixels_to_features( + self.vision_tower, stacked_pixel_values) + + return stacked_image_features.view(b, num_patches, + *stacked_image_features.shape[-2:]) + + def _process_image_input( + self, image_input: LlavaNextImageInputs) -> torch.Tensor: + if image_input["type"] == "pixel_values": + assert self.vision_tower is not None + image_features = self._process_image_pixels(image_input) + else: + image_features = image_input["data"] + + patch_embeddings = self.multi_modal_projector(image_features) + + image_sizes = image_input.get("image_sizes") + if image_sizes is None: + batch_size = image_input["data"].shape[0] + vision_config = self.config.vision_config + default_width = default_height = vision_config.image_size + image_sizes = torch.as_tensor([[default_width, default_height] + for _ in range(batch_size)]) + + merged_patch_embeddings = [ + self._merge_image_patch_embeddings(image_sizes[i], + patch_features, + strategy="spatial_unpad") + for i, patch_features in enumerate(patch_embeddings) + ] + + return torch.stack(merged_patch_embeddings, dim=0) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + **kwargs: object, + ) -> SamplerOutput: + """Run forward pass for Llava 1.5. + + One key thing to understand is the `input_ids` already accounts for the + positions of the to-be-inserted image embeddings. + Concretely, consider a text prompt: + "\nUSER: What's the content of the image?\nASSISTANT:". + Tokenizer outputs: + [1, 32000, 29871, 13, 11889, 29901, 1724, 29915, 29879, 278, + 2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566, 29901]. + The to-be-inserted image has a size of 576 (24 * 24) along the context + length dimension. + `input_ids` is thus [1, 32000, ..., 32000, 29871, 13, 11889, 29901, + 1724, 29915, 29879, 278, 2793, 310, 278, 1967, 29973, 13, 22933, + 9047, 13566, 29901]. + There will be 576 `32000` in the `input_ids`. + (32000 is the token id for ``.) + + This way, the `positions` and `attn_metadata` are consistent + with the `input_ids`. + + The model takes two types of image inputs: + PIXEL_VALUES and IMAGE_FEATURES. + The following shows how each maps to huggingface implementation. + PIXEL_VALUES: + - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353 + IMAGE_FEATURES: + - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430 + before going through the multi modal projector. + + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + pixel_values: For PIXEL_VALUES, expects a batch with shape + [1, 3, 336, 336]. + image_features: For IMAGE_FEATURES, expects a batch with shape + [1, 576, 1024]. 
+ """ + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input is not None: + vision_embeddings = self._process_image_input(image_input) + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + inputs_embeds = merge_vision_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.vision_language_config.image_token_id) + + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.language_model(input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds) + + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head.weight, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + # only doing this for language model part for now. + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in name: + name = name.replace(key_to_modify, new_key) + use_default_weight_loading = False + if "vision" in name: + if self.vision_tower is not None: + # We only do sharding for language model and + # not vision model for now. + use_default_weight_loading = True + else: + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + use_default_weight_loading = True + if use_default_weight_loading: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight)
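
The merging step shared by `llava.py` and `llava_next.py` can be pictured with a small, self-contained sketch. This is a toy illustration only, assuming a made-up six-token prompt, a hidden size of 4, and a hypothetical `toy_merge` helper; the real implementation is `merge_vision_embeddings` in `vllm/model_executor/models/llava.py`. The idea, taken from the `forward()` docstring and the mask line above, is that every position of `input_ids` holding the image token id (32000 here) is overwritten with one row of the projected vision embeddings, while text positions keep their embeddings.

import torch

def toy_merge(input_ids: torch.Tensor, inputs_embeds: torch.Tensor,
              vision_embeddings: torch.Tensor,
              image_token_id: int) -> torch.Tensor:
    # Boolean mask of the placeholder positions, mirroring the patch:
    # mask = (input_ids == image_token_id)
    mask = input_ids == image_token_id
    # Overwrite those rows in place with the flattened vision embeddings.
    inputs_embeds[mask] = vision_embeddings.reshape(-1, inputs_embeds.shape[-1])
    return inputs_embeds

# Toy prompt: two text tokens, three image placeholders (id 32000), one text token.
input_ids = torch.tensor([1, 7, 32000, 32000, 32000, 9])
inputs_embeds = torch.zeros(6, 4)        # hypothetical hidden size of 4
vision_embeddings = torch.ones(1, 3, 4)  # one image producing 3 patch features

merged = toy_merge(input_ids, inputs_embeds, vision_embeddings, image_token_id=32000)
assert merged[2:5].eq(1).all()           # placeholder rows now carry image features
assert merged[[0, 1, 5]].eq(0).all()     # text rows are untouched

With 576 placeholder tokens per image, as in the docstring above, the same boolean indexing fills all 24 x 24 patch positions in a single assignment.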