[Hardware][CPU] Support AWQ for CPU backend #7515

Open · wants to merge 6 commits into base: main
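For context, a minimal sketch of how the new CPU AWQ path could be exercised once this change lands, using the model and dtype from the new test added below. This is an assumption based on the diff, not part of the PR itself; on the CPU backend the AWQ checkpoint should be routed to the IPEX method automatically by IPEXConfig.override_quantization_method.

from vllm import LLM, SamplingParams

# On a CPU-only build, the AWQ checkpoint should be picked up and routed to
# the new IPEX linear method via IPEXConfig.override_quantization_method.
llm = LLM(model="casperhansen/llama-3-8b-instruct-awq", dtype="bfloat16")

outputs = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)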
10 changes: 8 additions & 2 deletions .buildkite/run-cpu-test.sh
@@ -27,13 +27,19 @@ docker exec cpu-test bash -c "
  pytest -v -s tests/models/decoder_only/language \
    --ignore=tests/models/test_fp8.py \
    --ignore=tests/models/decoder_only/language/test_jamba.py \
+    --ignore=tests/models/decoder_only/language/test_granitemoe.py \
    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported

# Run compressed-tensor test
+# docker exec cpu-test bash -c "
+#   pytest -s -v \
+#   tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+#   tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
+
+# Run AWQ test
docker exec cpu-test bash -c "
  pytest -s -v \
-  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
+  tests/quantization/test_ipex_quant.py"

# online inference
docker exec cpu-test bash -c "
2 changes: 1 addition & 1 deletion Dockerfile.cpu
@@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li

RUN echo 'ulimit -c 0' >> ~/.bashrc

-RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
+RUN pip install intel_extension_for_pytorch==2.4.0

WORKDIR /workspace
4 changes: 2 additions & 2 deletions docs/source/quantization/supported_hardware.rst
@@ -28,7 +28,7 @@ The table below shows the compatibility of various quantization implementations
      - ✅︎
      - ✗
      - ✗
-      -
+      - ✅︎
      - ✗
      - ✗
    * - GPTQ
@@ -61,7 +61,7 @@ The table below shows the compatibility of various quantization implementations
      - ✅︎
      - ✗
      - ✗
-      -
+      - ✅︎
      - ✗
      - ✗
    * - FP8 (W8A8)
28 changes: 28 additions & 0 deletions tests/quantization/test_ipex_quant.py
@@ -0,0 +1,28 @@
"""Test model set-up and inference for quantized HF models supported
on the CPU backend using IPEX (including AWQ).

Validating the configuration and printing results for manual checking.

Run `pytest tests/quantization/test_ipex_quant.py`.
"""

import pytest

from vllm.platforms import current_platform

MODELS = [
"casperhansen/llama-3-8b-instruct-awq",
]
DTYPE = ["bfloat16"]


@pytest.mark.skipif(not current_platform.is_cpu(),
reason="only supports the CPU backend.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", DTYPE)
def test_ipex_quant(vllm_runner, model, dtype):
with vllm_runner(model, dtype=dtype) as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=32)
assert output
print(output)
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/linear.py
@@ -27,7 +27,7 @@
    "AWQLinearMethod", "GPTQMarlinLinearMethod", "Fp8LinearMethod",
    "MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod",
    "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod",
-    "ModelOptFp8LinearMethod"
+    "ModelOptFp8LinearMethod", "IPEXAWQLinearMethod"
]
2 changes: 2 additions & 0 deletions vllm/model_executor/layers/quantization/__init__.py
@@ -21,6 +21,7 @@
    GPTQMarlinConfig)
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
    GPTQMarlin24Config)
+from vllm.model_executor.layers.quantization.ipex_quant import IPEXConfig
from vllm.model_executor.layers.quantization.marlin import MarlinConfig
from vllm.model_executor.layers.quantization.modelopt import ModelOptFp8Config
from vllm.model_executor.layers.quantization.neuron_quant import (
@@ -49,6 +50,7 @@
    "qqq": QQQConfig,
    "experts_int8": ExpertsInt8Config,
    "neuron_quant": NeuronQuantConfig,
+    "ipex": IPEXConfig,
}
4 changes: 4 additions & 0 deletions vllm/model_executor/layers/quantization/awq_marlin.py
@@ -20,6 +20,7 @@
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.parameter import (GroupQuantScaleParameter,
                                           PackedvLLMParameter)
+from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types

logger = init_logger(__name__)
@@ -123,6 +124,9 @@ def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]):
        group_size = quant_config.get("group_size")
        has_zp = quant_config.get("zero_point")

+        if not current_platform.is_cuda():
+            return False
+
        if quant_method != "awq":
            return False
155 changes: 155 additions & 0 deletions vllm/model_executor/layers/quantization/ipex_quant.py
@@ -0,0 +1,155 @@
from typing import Any, Dict, List, Optional

import torch

from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
from vllm.model_executor.layers.quantization.awq import AWQLinearMethod
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
from vllm.platforms import current_platform


class IPEXConfig(QuantizationConfig):
"""INT8 quantization config class using IPEX for the CPU backend,
including AWQ.
"""

IPEX_QUANT_METHOD_MAP = {
"awq": 1,
"gptq": 2,
}

    def __init__(
        self,
        method: str,
        weight_bits: int,
        group_size: int,
    ) -> None:
        self.method = method
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.pack_factor = 32 // self.weight_bits

        if self.weight_bits not in [4]:
            raise ValueError(f"IPEX quantization supports weight bits [4], "
                             f"but got {self.weight_bits}.")

        if self.method == "awq":
            self.quant_method = IPEXAWQLinearMethod
        else:
            raise ValueError(f"IPEX quantization supports [awq], "
                             f"but got {self.method}.")

    def __repr__(self) -> str:
        return (f"IPEXConfig(method={self.method}, "
                f"weight_bits={self.weight_bits}, "
                f"group_size={self.group_size})")

    def get_ipex_quant_method_id(self) -> int:
        return IPEXConfig.IPEX_QUANT_METHOD_MAP[self.method]

    @classmethod
    def get_name(cls) -> str:
        return "ipex"

    @classmethod
    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
        return [torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        return -1

    @staticmethod
    def get_config_filenames() -> List[str]:
        return [
            "quant_config.json",
            "quantize_config.json",
        ]

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "IPEXConfig":
        method = cls.get_from_keys(config, ["quant_method"]).lower()
        weight_bits = cls.get_from_keys(config, ["w_bit", "bits"])
        group_size = cls.get_from_keys(config, ["q_group_size", "group_size"])
        return cls(method, weight_bits, group_size)
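For reference, a small illustration (not part of the diff) of the config shape from_config accepts; the key names follow the lists above, and the values are hypothetical:

# Typical AutoAWQ-style quantize_config (hypothetical values).
awq_cfg = {"quant_method": "awq", "w_bit": 4, "q_group_size": 128}
config = IPEXConfig.from_config(awq_cfg)
# -> IPEXConfig(method=awq, weight_bits=4, group_size=128)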

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg,
                                     user_quant) -> Optional[str]:
        if not current_platform.is_cpu():
            return None

        quant_method = hf_quant_cfg.get("quant_method", "").lower()

        if quant_method in ["awq"]:
            return cls.get_name()

        return None

    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional["LinearMethodBase"]:
        if isinstance(layer, LinearBase):
            return self.quant_method(self)
        return None

    def get_scaled_act_names(self) -> List[str]:
        if self.method == "awq":
            return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"]
        else:
            return []


class IPEXAWQLinearMethod(AWQLinearMethod):
    """AWQ linear method using IPEX for the CPU backend.
    """

    def __init__(self, quant_config: IPEXConfig):
        self.quant_config = quant_config  # type: ignore

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        super().process_weights_after_loading(layer=layer)

        bias = layer.bias if not layer.skip_bias_add else None

        import intel_extension_for_pytorch as ipex
Review comment (Sponsor Collaborator):

Could you put a try-catch around the ipex import to provide a better instructive error message to the user? It is also important to enforce versioning if needed.
Something like this deepspeed backend:

try:
    import deepspeed
    if deepspeed.__version__ < "0.14.2":
        raise ImportError("deepspeed version is wrong. Please "
                          "install deepspeed>=0.14.2.")
    from deepspeed.ops.fp_quantizer import FP_Quantize
except ImportError as err:
    raise ImportError("Please install deepspeed>=0.14.2 via "
                      "`pip install deepspeed>=0.14.2` to use "
                      "deepspeedfp quantizer.") from err


        # Using the compute dtype (lowp_mode) as INT8 to leverage instructions
        # with better performance.
        lowp_mode = ipex.quantization.WoqLowpMode.INT8
        # The weight will be de-packed from INT4 to INT8.
        weight_dtype = ipex.quantization.WoqWeightDtype.INT4
        # The float activation will be quantized (dynamic, per-token) to INT8.
        act_quant_mode = ipex.quantization.WoqActQuantMode.PER_BATCH

        qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping(
            weight_dtype=weight_dtype,
            lowp_mode=lowp_mode,
            act_quant_mode=act_quant_mode,
            group_size=self.quant_config.group_size,
        )

        layer.ipex_output_size = layer.qweight.size(
            1) * self.quant_config.pack_factor
        layer.ipex_qlinear = ipex.nn.modules.weight_only_quantization.\
            WeightOnlyQuantizedLinear.from_weight(
                layer.qweight,
                layer.scales,
                layer.qzeros,
                layer.qweight.size(0),
                layer.ipex_output_size,
                qconfig=qconfig,
                bias=bias,
                group_size=self.quant_config.group_size,
                quant_method=self.quant_config.get_ipex_quant_method_id()  # type: ignore
            )

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        reshaped_x = x.reshape(-1, x.shape[-1])
        out = layer.ipex_qlinear(reshaped_x)

        return out.reshape(x.shape[:-1] + (layer.ipex_output_size, ))
3 changes: 2 additions & 1 deletion vllm/worker/cpu_worker.py
@@ -215,7 +215,8 @@ def _is_encoder_decoder_model(self):
    def init_device(self) -> None:
        if self.local_omp_cpuid != "all":
            ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid)
-            logger.info(ret)
+            if ret:
+                logger.info(ret)

        self.init_distributed_environment()
        # Set random seed.