From e286fa9251f5e4b3e09d5daa14f450bede0b4676 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:11:27 -0400 Subject: [PATCH 1/9] Switch backend with llm-compressor --- auto_fp8/__init__.py | 3 +- auto_fp8/config.py | 42 -------- auto_fp8/modeling.py | 214 +++++++++++++---------------------------- auto_fp8/quantize.py | 167 ++++++++++++++++++++++++++++++++ example_dataset.py | 25 +++-- tests/test_auto_fp8.py | 108 +++++++++++++++++++++ 6 files changed, 358 insertions(+), 201 deletions(-) delete mode 100644 auto_fp8/config.py diff --git a/auto_fp8/__init__.py b/auto_fp8/__init__.py index ea4fbb6..d463cc8 100644 --- a/auto_fp8/__init__.py +++ b/auto_fp8/__init__.py @@ -1,5 +1,4 @@ -from .config import BaseQuantizeConfig -from .modeling import AutoFP8ForCausalLM +from .modeling import AutoFP8ForCausalLM, BaseQuantizeConfig __all__ = [ "AutoFP8ForCausalLM", diff --git a/auto_fp8/config.py b/auto_fp8/config.py deleted file mode 100644 index 24c6200..0000000 --- a/auto_fp8/config.py +++ /dev/null @@ -1,42 +0,0 @@ -from typing import List, Optional, Tuple - - -class BaseQuantizeConfig: - """Configuration for model quantization. - - Args: - quant_method: Type/precision of quantization method to use. - At the moment, this is just "fp8" which specifically means - the fp8_e4m3 format in pytorch. - activation_scheme: Choice of either "dynamic" or "static" quantization - of activtions. If "static", then calibration samples are required - during quantization to produce accurate per-tensor scales for - activations of Linear modules. - ignore_patterns: List of patterns used to ignore layers. If a string - starts with "re:", then everything afterwards is used as python - regex style matching i.e. re.search(), for each Linear layer. - By default, "re:.*lm_head" is included to ignore the embedding - Linear layer usually at the end of decoder LLMs - kv_cache_quant_targets: Tuple of Linear module names to target for - calibration of the output scales for KV cache quantization. - Usually, these should be `("k_proj", "v_proj")`. - """ - - def __init__( - self, - quant_method: str = "fp8", - activation_scheme: str = "static", - ignore_patterns: List[str] = ["re:.*lm_head"], - kv_cache_quant_targets: Optional[Tuple[str]] = None, - ): - if quant_method != "fp8": - raise ValueError("Only FP8 quantization is supported.") - if activation_scheme not in ["static", "dynamic"]: - raise ValueError( - "Invalid activation_scheme. Choose either 'static' or 'dynamic'." - ) - self.quant_method = quant_method - self.activation_scheme = activation_scheme - self.ignore_patterns = ignore_patterns - self.kv_cache_quant_targets = kv_cache_quant_targets - self.ignored_layers = [] diff --git a/auto_fp8/modeling.py b/auto_fp8/modeling.py index 04a9e71..eb4d2ba 100644 --- a/auto_fp8/modeling.py +++ b/auto_fp8/modeling.py @@ -1,42 +1,49 @@ -import re -from typing import List, Optional, Tuple +import os +from typing import List, Optional + +from transformers import AutoConfig, AutoTokenizer +from datasets import Dataset +from llmcompressor.transformers import SparseAutoModelForCausalLM +from llmcompressor.transformers import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier + + +class BaseQuantizeConfig: + """Configuration for model quantization. + + Args: + quant_method: Type/precision of quantization method to use. + At the moment, this is just "fp8" which specifically means + the fp8_e4m3 format in pytorch. + activation_scheme: Choice of either "dynamic" or "static" quantization + of activtions. 
If "static", then calibration samples are required + during quantization to produce accurate per-tensor scales for + activations of Linear modules. + ignore_patterns: List of patterns used to ignore layers. If a string + starts with "re:", then everything afterwards is used as python + regex style matching i.e. re.search(), for each Linear layer. + By default, "lm_head" is included to ignore the embedding + Linear layer usually at the end of decoder LLMs + """ -import torch -from transformers import AutoModelForCausalLM - -from auto_fp8.config import BaseQuantizeConfig -from auto_fp8.quantize import ( - quantize_activations, - quantize_weights, - save_quantized_model, -) + def __init__( + self, + quant_method: str = "fp8", + activation_scheme: str = "static", + ignore_patterns: List[str] = ["lm_head"], + ): + self.quant_method = quant_method + self.activation_scheme = activation_scheme + self.ignore_patterns = ignore_patterns class AutoFP8ForCausalLM: def __init__( - self, - model: AutoModelForCausalLM, - quantize_config: BaseQuantizeConfig, + self, model: SparseAutoModelForCausalLM, quantize_config: BaseQuantizeConfig ): self.model = model self.model_type = self.model.config.model_type self.config = self.model.config - - # Gather the Linear module names that we want to ignore - quantize_config.ignored_layers = get_layers_to_ignore( - self.model, quantize_config.ignore_patterns - ) - - if quantize_config.kv_cache_quant_targets: - kv_cache_quant_layers = get_kv_cache_quant_layers( - self.model, quantize_config.kv_cache_quant_targets - ) - if len(kv_cache_quant_layers) == 0: - raise ValueError( - f"Could not find any kv cache layers using kv_cache_quant_targets={quantize_config.kv_cache_quant_targets}, please fix your argument." - ) - quantize_config.kv_cache_quant_layers = kv_cache_quant_layers - self.quantize_config = quantize_config @classmethod @@ -44,130 +51,41 @@ def from_pretrained( cls, pretrained_model_name_or_path: str, quantize_config: BaseQuantizeConfig, - **model_init_kwargs, + **kwargs, ): - """Load the un-quantized pretrained model""" - - def skip(*args, **kwargs): - pass - - torch.nn.init.kaiming_uniform_ = skip - torch.nn.init.uniform_ = skip - torch.nn.init.normal_ = skip - - # Parameters related to loading from Hugging Face Hub - cache_dir = model_init_kwargs.pop("cache_dir", None) - force_download = model_init_kwargs.pop("force_download", False) - resume_download = model_init_kwargs.pop("resume_download", False) - proxies = model_init_kwargs.pop("proxies", None) - local_files_only = model_init_kwargs.pop("local_files_only", False) - use_auth_token = model_init_kwargs.pop("use_auth_token", None) - revision = model_init_kwargs.pop("revision", None) - subfolder = model_init_kwargs.pop("subfolder", "") - commit_hash = model_init_kwargs.pop("_commit_hash", None) - - cached_file_kwargs = { - "cache_dir": cache_dir, - "force_download": force_download, - "proxies": proxies, - "resume_download": resume_download, - "local_files_only": local_files_only, - "use_auth_token": use_auth_token, - "revision": revision, - "subfolder": subfolder, - "_commit_hash": commit_hash, - } - - torch.cuda.empty_cache() - - # Important defaults - if "torch_dtype" not in model_init_kwargs: - model_init_kwargs["torch_dtype"] = "auto" - - if "device_map" not in model_init_kwargs: - model_init_kwargs["device_map"] = "auto" - - merged_kwargs = {**model_init_kwargs, **cached_file_kwargs} - print("Loading model with the following kwargs:", merged_kwargs) - model = AutoModelForCausalLM.from_pretrained( - 
pretrained_model_name_or_path, **merged_kwargs + config = AutoConfig.from_pretrained(pretrained_model_name_or_path) + model = SparseAutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path, + config=config, + device_map="auto", + torch_dtype="auto", + **kwargs, ) - - model_config = model.config.to_dict() - seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"] - if any(k in model_config for k in seq_len_keys): - for key in seq_len_keys: - if key in model_config: - model.seqlen = model_config[key] - break - else: - print("Can't get model's sequence length, setting to 2048.") - model.seqlen = 2048 - model.eval() - return cls(model, quantize_config) - def quantize(self, calibration_tokens: Optional[torch.Tensor] = None): - - # Always quantize the weights as they do not require calibration data - quantize_weights(self.model, self.quantize_config) - - if self.quantize_config.activation_scheme == "static": - assert ( - calibration_tokens is not None - ), "Calibration tokens required for activation quantization" - - - def _prepare_calibration_data(calibration_tokens): - if hasattr(calibration_tokens, "input_ids"): - return calibration_tokens.input_ids - return calibration_tokens - - quantize_activations( - self.model, - self.quantize_config, - _prepare_calibration_data(calibration_tokens), - ) + def quantize(self, dataset: Optional[Dataset] = None): + assert ( + self.quantize_config.activation_scheme == "static" + ), "Dynamic isn't supported yet" + assert ( + dataset is not None + ), "Calibration tokens required for static activation quantization" - def save_quantized(self, save_dir): - save_quantized_model( - self.model, - quant_config=self.quantize_config, - save_dir=save_dir, + recipe = QuantizationModifier( + targets="Linear", scheme="FP8", ignore=self.quantize_config.ignore_patterns ) + oneshot( + model=self.model, + dataset=dataset, + recipe=recipe, + ) -def get_layers_to_ignore(model, ignore_patterns) -> List[str]: - ignored_layers = set() - - for name, linear in model.named_modules(): - if not isinstance(linear, torch.nn.Linear): - continue - - for ignore_pattern in ignore_patterns: - regex_prefix = "re:" - if ignore_pattern.startswith(regex_prefix): - # check if name matches regex and add to set if true - regex_pattern = ignore_pattern[len(regex_prefix) :] - if re.search(regex_pattern, name): - ignored_layers.add(name) - else: - # else, exact match - if ignore_pattern == name: - ignored_layers.add(name) - - return list(ignored_layers) - - -def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]: - kv_cache_quant_layers = [] - - for name, linear in model.named_modules(): - if not isinstance(linear, torch.nn.Linear): - continue - - for output_quant_target in kv_cache_quant_targets: - if name.endswith(output_quant_target): - kv_cache_quant_layers.append(name) + def save_quantized(self, save_directory: str): + self.save_pretrained(save_directory, save_compressed=True) - return kv_cache_quant_layers + def save_pretrained(self, save_directory: str, save_compressed: bool = True): + self.model.save_pretrained(save_directory, save_compressed=save_compressed) + tokenizer = AutoTokenizer.from_pretrained(self.model.config._name_or_path) + tokenizer.save_pretrained(save_directory) + print(f"Saved final checkpoint to {os.path.abspath(save_directory)}") \ No newline at end of file diff --git a/auto_fp8/quantize.py b/auto_fp8/quantize.py index 38a4de6..0237bc2 100644 --- a/auto_fp8/quantize.py +++ b/auto_fp8/quantize.py @@ -72,11 +72,25 @@ def 
fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype): # Deal with empty tensors (triggeted by empty MoE experts) return torch.empty(size=(0, B.shape[0]), dtype=out_dtype, device=A.device) +<<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> 959bdbc (Add comment) # TODO: Disable native fp8 gemm for now, always just dequantize # native_fp8_support = ( # torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9) # ) native_fp8_support = False +<<<<<<< HEAD +======= + native_fp8_support = ( + torch.cuda.is_available() + and torch.cuda.get_device_capability() >= (8, 9) + and False + ) +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= +>>>>>>> 959bdbc (Add comment) if native_fp8_support: need_reshape = A.dim() == 3 if need_reshape: @@ -108,6 +122,7 @@ def fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype): # Class responsible for quantizing weights class FP8DynamicLinear(torch.nn.Module): +<<<<<<< HEAD def __init__( self, weight: torch.Tensor, @@ -125,10 +140,112 @@ def forward(self, x): A=qinput, A_scale=x_scale, B=self.weight, +======= + def __init__( + self, + weight: torch.Tensor, + weight_scale: torch.Tensor, + bias: torch.nn.Parameter, + ): + super().__init__() + self.weight = torch.nn.Parameter(weight, requires_grad=False) + self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) + self.bias = bias + + def forward(self, x): + qinput, x_scale = per_tensor_quantize(x) + output = fp8_gemm( + A=qinput, + A_scale=x_scale, + B=self.weight, + B_scale=self.weight_scale, + bias=self.bias, + out_dtype=x.dtype, + ) + return output + + +# Module responsible for taking already quantized weights, and recording input scales (and possibly output scales) using an activation observer +class FP8StaticLinearQuantizer(torch.nn.Module): + def __init__( + self, + weight: torch.Tensor, + weight_scale: torch.Tensor, + bias: torch.nn.Parameter, + quantize_output: bool = False, + ): + super().__init__() + self.weight = torch.nn.Parameter(weight, requires_grad=False) + self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) + self.bias = bias + self.input_scale = None + self.output_scale = None + self.quantize_output = quantize_output + + def forward(self, x): + qinput, x_input_scale = per_tensor_quantize(x) + if self.input_scale is None: + self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False) + elif x_input_scale > self.input_scale: + self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False) + output = fp8_gemm( + A=qinput, + A_scale=self.input_scale, + B=self.weight, + B_scale=self.weight_scale, + bias=self.bias, + out_dtype=x.dtype, + ) + + # Optionally, quantize output and record scale + if self.quantize_output: + qoutput, output_scale = per_tensor_quantize(output) + if self.output_scale is None: + self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False) + elif output_scale > self.output_scale: + self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False) + output = qoutput.to(output.dtype) * output_scale + + return output + + +# Module responsible for representing the final checkpoint representation +class FP8StaticLinear(torch.nn.Module): + def __init__( + self, + weight: torch.nn.Parameter, + weight_scale: torch.nn.Parameter, + bias: torch.nn.Parameter, + input_scale: torch.nn.Parameter, + output_scale: Optional[torch.nn.Parameter] = None, + ): + super().__init__() + self.weight = weight + self.weight_scale = weight_scale + self.bias = bias + self.input_scale = input_scale + 
self.output_scale = output_scale + + def forward(self, x): + qinput = static_per_tensor_quantize(x, self.input_scale) + output = fp8_gemm( + A=qinput, + A_scale=self.input_scale, +<<<<<<< HEAD + B=self.qweight, +>>>>>>> 3ee9283 (Support calibrating kv cache scales) B_scale=self.weight_scale, bias=self.bias, out_dtype=x.dtype, ) +<<<<<<< HEAD +======= + + if self.output_scale: + qoutput = static_per_tensor_quantize(output, self.output_scale) + output = qoutput.to(output.dtype) * self.output_scale + +>>>>>>> 3ee9283 (Support calibrating kv cache scales) return output @@ -198,6 +315,8 @@ def forward(self, x): output = fp8_gemm( A=qinput, A_scale=self.input_scale, +======= +>>>>>>> def2049 (Fix weight name) B=self.weight, B_scale=self.weight_scale, bias=self.bias, @@ -237,7 +356,15 @@ def quantize_weights( quant_weight, weight_scale = per_tensor_quantize(linear.weight) bias = copy.deepcopy(linear.bias) if linear.bias is not None else None quant_linear = FP8DynamicLinear( +<<<<<<< HEAD +<<<<<<< HEAD weight=quant_weight, weight_scale=weight_scale, bias=bias +======= + qweight=quant_weight, weight_scale=weight_scale, bias=bias +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= + weight=quant_weight, weight_scale=weight_scale, bias=bias +>>>>>>> def2049 (Fix weight name) ) replace_module(model, name, quant_linear) del linear.weight @@ -259,7 +386,15 @@ def quantize_activations( ): continue quantizer = FP8StaticLinearQuantizer( +<<<<<<< HEAD +<<<<<<< HEAD + weight=dynamic_quant_linear.weight, +======= + qweight=dynamic_quant_linear.qweight, +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= weight=dynamic_quant_linear.weight, +>>>>>>> def2049 (Fix weight name) weight_scale=dynamic_quant_linear.weight_scale, bias=dynamic_quant_linear.bias, quantize_output=( @@ -272,12 +407,36 @@ def quantize_activations( cleanup_memory() # Pass through calibration data to measure activation scales +<<<<<<< HEAD +<<<<<<< HEAD with torch.inference_mode(): with tqdm.tqdm(total=calibration_tokens.shape[0], desc="Calibrating activation scales") as pbar: for row_idx in range(calibration_tokens.shape[0]): model(calibration_tokens[row_idx].reshape(1, -1)) cleanup_memory() pbar.update(1) +======= +======= +>>>>>>> 57c31bb (Use `torch.inference_mode()` for lower memory usage during calibration (#20)) + with tqdm.tqdm( + total=calibration_tokens.shape[0], desc="Calibrating activation scales" + ) as pbar: + for row_idx in range(calibration_tokens.shape[0]): + model(calibration_tokens[row_idx].reshape(1, -1)) + cleanup_memory() + pbar.update(1) +<<<<<<< HEAD +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= +======= + with torch.inference_mode(): + with tqdm.tqdm(total=calibration_tokens.shape[0], desc="Calibrating activation scales") as pbar: + for row_idx in range(calibration_tokens.shape[0]): + model(calibration_tokens[row_idx].reshape(1, -1)) + cleanup_memory() + pbar.update(1) +>>>>>>> b1c6ad6 (Use `torch.inference_mode()` for lower memory usage during calibration (#20)) +>>>>>>> 57c31bb (Use `torch.inference_mode()` for lower memory usage during calibration (#20)) # Replace dynamic quantizer observer with StaticLinear for export for name, quantizer in model.named_modules(): @@ -287,7 +446,15 @@ def quantize_activations( ): continue static_proj = FP8StaticLinear( +<<<<<<< HEAD +<<<<<<< HEAD + weight=quantizer.weight, +======= + qweight=quantizer.qweight, +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= weight=quantizer.weight, +>>>>>>> def2049 (Fix weight name) 
weight_scale=quantizer.weight_scale, bias=quantizer.bias, input_scale=quantizer.input_scale, diff --git a/example_dataset.py b/example_dataset.py index 204345f..82d336e 100644 --- a/example_dataset.py +++ b/example_dataset.py @@ -3,20 +3,27 @@ from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig -pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" -quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8" +pretrained_model_dir = "facebook/opt-125m" +quantized_model_dir = "opt-125m-FP8" tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) tokenizer.pad_token = tokenizer.eos_token -ds = load_dataset("mgoin/ultrachat_2k", split="train_sft") -examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds] -examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda") +MAX_SEQUENCE_LENGTH = 2048 +ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512)) +def preprocess(example): + example = tokenizer.apply_chat_template(example["messages"], tokenize=False) + return tokenizer( + example, + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) +ds = ds.map(preprocess, remove_columns=ds.column_names) quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") -model = AutoFP8ForCausalLM.from_pretrained( - pretrained_model_dir, quantize_config=quantize_config -) -model.quantize(examples) +model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) +model.quantize(ds) model.save_quantized(quantized_model_dir) diff --git a/tests/test_auto_fp8.py b/tests/test_auto_fp8.py index 6045d84..bb852d9 100644 --- a/tests/test_auto_fp8.py +++ b/tests/test_auto_fp8.py @@ -1,20 +1,52 @@ import os import shutil +<<<<<<< HEAD +<<<<<<< HEAD import pytest +======= +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= +import pytest +>>>>>>> 2739d61 (Add Qwen test) import safetensors.torch from transformers import AutoTokenizer from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig MODELS = [ +<<<<<<< HEAD +<<<<<<< HEAD + ("facebook/opt-125m", 160), + ("Qwen/Qwen2-0.5B-Instruct", 620), +] + +<<<<<<< HEAD +@pytest.mark.parametrize("model_id,target_size", MODELS) +def test_dynamic_quantization(model_id, target_size): + quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic" +======= +def test_dynamic_quantization(): + model_id = "facebook/opt-125m" + quantized_model_dir = "opt-125m-fp8-dynamic" +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= + "facebook/opt-125m", + "Qwen/Qwen2-0.5B-Instruct", +======= ("facebook/opt-125m", 160), +<<<<<<< HEAD + ("Qwen/Qwen2-0.5B-Instruct", 600), +>>>>>>> 415c0b7 (Add fixed target sizes) +======= ("Qwen/Qwen2-0.5B-Instruct", 620), +>>>>>>> 93c0d54 (Fix proj linear count) ] @pytest.mark.parametrize("model_id,target_size", MODELS) def test_dynamic_quantization(model_id, target_size): quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic" +>>>>>>> 2739d61 (Add Qwen test) quantize_config = BaseQuantizeConfig( quant_method="fp8", activation_scheme="dynamic" @@ -30,6 +62,11 @@ def test_dynamic_quantization(model_id, target_size): model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") shutil.rmtree(quantized_model_dir) +<<<<<<< HEAD +<<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> c3acdee (Switch from output_scale to kv_scale) # We expect the quantized model to be a certain size target_size = target_size * (1024 * 1024) 
assert model_size < target_size @@ -38,6 +75,31 @@ def test_dynamic_quantization(model_id, target_size): @pytest.mark.parametrize("model_id,target_size", MODELS) def test_static_quantization(model_id, target_size): quantized_model_dir = model_id.split("/")[-1] + "-fp8-static" +======= + # We expect the model to be < 160MB + target_size = 160 * (1024 * 1024) + assert model_size < target_size + + +<<<<<<< HEAD +def test_static_quantization(): + model_id = "facebook/opt-125m" + quantized_model_dir = "opt-125m-fp8-static" +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= +@pytest.mark.parametrize("model_id", MODELS) +def test_static_quantization(model_id): +======= + # We expect the model to be a certain size + target_size = target_size * (1024 * 1024) + assert model_size < target_size + + +@pytest.mark.parametrize("model_id,target_size", MODELS) +def test_static_quantization(model_id, target_size): +>>>>>>> 415c0b7 (Add fixed target sizes) + quantized_model_dir = model_id.split("/")[-1] + "-fp8-static" +>>>>>>> 2739d61 (Add Qwen test) tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) examples = ["auto-fp8 is an easy-to-use model quantization library"] @@ -55,7 +117,53 @@ def test_static_quantization(model_id, target_size): model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") shutil.rmtree(quantized_model_dir) +<<<<<<< HEAD +<<<<<<< HEAD + # We expect the quantized model to be a certain size + target_size = target_size * (1024 * 1024) + assert model_size < target_size + +@pytest.mark.parametrize("model_id,target_size", MODELS) +def test_kv_cache_static_quantization(model_id, target_size): + quantized_model_dir = model_id.split("/")[-1] + "-fp8-static-kv" + + tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) + examples = ["auto-fp8 is an easy-to-use model quantization library"] + examples = tokenizer(examples, return_tensors="pt") + + quantize_config = BaseQuantizeConfig( + quant_method="fp8", + activation_scheme="static", + kv_cache_quant_targets=("k_proj", "v_proj"), + ) + + model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) + model.model.to("cpu") + + model.quantize(examples) + model.save_quantized(quantized_model_dir) + + tensors = safetensors.torch.load_file(f"{quantized_model_dir}/model.safetensors") + proj_linear_count = 0 + kv_scale_count = 0 + for name, _ in tensors.items(): + if name.endswith("k_proj.weight") or name.endswith("v_proj.weight"): + proj_linear_count += 1 + if name.endswith("kv_scale"): + kv_scale_count += 1 + assert proj_linear_count // 2 == kv_scale_count + + # Measure checkpoint size and cleanup + model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") + shutil.rmtree(quantized_model_dir) + + # We expect the quantized model to be a certain size +======= + # We expect the model to be < 160MB +>>>>>>> 415c0b7 (Add fixed target sizes) +======= # We expect the quantized model to be a certain size +>>>>>>> c3acdee (Switch from output_scale to kv_scale) target_size = target_size * (1024 * 1024) assert model_size < target_size From 6d508ae267323c46bbf4fcf6919c10ca3515045e Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:12:08 -0400 Subject: [PATCH 2/9] Remove quantize --- auto_fp8/quantize.py | 511 ------------------------------------------- 1 file changed, 511 deletions(-) delete mode 100644 auto_fp8/quantize.py diff --git a/auto_fp8/quantize.py b/auto_fp8/quantize.py deleted file mode 100644 index 0237bc2..0000000 --- a/auto_fp8/quantize.py +++ /dev/null @@ 
-1,511 +0,0 @@ -import gc -import re -from typing import Optional, Tuple -import copy - -import torch -import tqdm -import transformers -from transformers import AutoModelForCausalLM, AutoTokenizer - -from .config import BaseQuantizeConfig - - -# HACK: Override the dtype_byte_size function in transformers to support float8 types -# Fix is posted upstream https://github.com/huggingface/transformers/pull/30488 -def new_dtype_byte_size(dtype): - if dtype == torch.bool: - return 1 / 8 - bit_search = re.search(r"[^\d](\d+)_?", str(dtype)) - if bit_search is None: - raise ValueError(f"`dtype` is not a valid dtype: {dtype}.") - bit_size = int(bit_search.groups()[0]) - return bit_size // 8 - - -transformers.modeling_utils.dtype_byte_size = new_dtype_byte_size - - -def cleanup_memory(): - gc.collect() - torch.cuda.empty_cache() - - -def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]: - """Quantize a tensor using per-tensor static scaling factor. - Args: - tensor: The input tensor. - """ - finfo = torch.finfo(torch.float8_e4m3fn) - # Calculate the scale as dtype max divided by absmax. - # Since .abs() creates a new tensor, we use aminmax to get - # the min and max first and then calculate the absmax. - if tensor.numel() == 0: - # Deal with empty tensors (triggered by empty MoE experts) - min_val, max_val = ( - torch.tensor(-16.0, dtype=tensor.dtype), - torch.tensor(16.0, dtype=tensor.dtype), - ) - else: - min_val, max_val = tensor.aminmax() - amax = torch.maximum(min_val.abs(), max_val.abs()) - scale = finfo.max / amax.clamp(min=1e-12) - # scale and clamp the tensor to bring it to - # the representative range of float8 data type - # (as default cast is unsaturated) - qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max) - # Return both float8 data and the inverse scale (as float), - # as both required as inputs to torch._scaled_mm - qweight = qweight.to(torch.float8_e4m3fn) - scale = scale.float().reciprocal() - return qweight, scale - - -def static_per_tensor_quantize(tensor: torch.Tensor, inv_scale: float) -> torch.Tensor: - finfo = torch.finfo(torch.float8_e4m3fn) - qweight = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max) - return qweight.to(torch.float8_e4m3fn) - - -def fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype): - if A.numel() == 0: - # Deal with empty tensors (triggeted by empty MoE experts) - return torch.empty(size=(0, B.shape[0]), dtype=out_dtype, device=A.device) - -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> 959bdbc (Add comment) - # TODO: Disable native fp8 gemm for now, always just dequantize - # native_fp8_support = ( - # torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9) - # ) - native_fp8_support = False -<<<<<<< HEAD -======= - native_fp8_support = ( - torch.cuda.is_available() - and torch.cuda.get_device_capability() >= (8, 9) - and False - ) ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= ->>>>>>> 959bdbc (Add comment) - if native_fp8_support: - need_reshape = A.dim() == 3 - if need_reshape: - batch_size = A.shape[0] - A_input = A.reshape(-1, A.shape[-1]) - else: - batch_size = None - A_input = A - output, _ = torch._scaled_mm( - A_input, - B.t(), - out_dtype=out_dtype, - scale_a=A_scale, - scale_b=B_scale, - bias=bias, - ) - if need_reshape: - output = output.reshape( - batch_size, output.shape[0] // batch_size, output.shape[1] - ) - else: - output = torch.nn.functional.linear( - A.to(out_dtype) * A_scale, - B.to(out_dtype) * B_scale.to(out_dtype), - bias=bias, - ) - return output - - -# 
Class responsible for quantizing weights -class FP8DynamicLinear(torch.nn.Module): -<<<<<<< HEAD - def __init__( - self, - weight: torch.Tensor, - weight_scale: torch.Tensor, - bias: torch.nn.Parameter, - ): - super().__init__() - self.weight = torch.nn.Parameter(weight, requires_grad=False) - self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) - self.bias = bias - - def forward(self, x): - qinput, x_scale = per_tensor_quantize(x) - output = fp8_gemm( - A=qinput, - A_scale=x_scale, - B=self.weight, -======= - def __init__( - self, - weight: torch.Tensor, - weight_scale: torch.Tensor, - bias: torch.nn.Parameter, - ): - super().__init__() - self.weight = torch.nn.Parameter(weight, requires_grad=False) - self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) - self.bias = bias - - def forward(self, x): - qinput, x_scale = per_tensor_quantize(x) - output = fp8_gemm( - A=qinput, - A_scale=x_scale, - B=self.weight, - B_scale=self.weight_scale, - bias=self.bias, - out_dtype=x.dtype, - ) - return output - - -# Module responsible for taking already quantized weights, and recording input scales (and possibly output scales) using an activation observer -class FP8StaticLinearQuantizer(torch.nn.Module): - def __init__( - self, - weight: torch.Tensor, - weight_scale: torch.Tensor, - bias: torch.nn.Parameter, - quantize_output: bool = False, - ): - super().__init__() - self.weight = torch.nn.Parameter(weight, requires_grad=False) - self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) - self.bias = bias - self.input_scale = None - self.output_scale = None - self.quantize_output = quantize_output - - def forward(self, x): - qinput, x_input_scale = per_tensor_quantize(x) - if self.input_scale is None: - self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False) - elif x_input_scale > self.input_scale: - self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False) - output = fp8_gemm( - A=qinput, - A_scale=self.input_scale, - B=self.weight, - B_scale=self.weight_scale, - bias=self.bias, - out_dtype=x.dtype, - ) - - # Optionally, quantize output and record scale - if self.quantize_output: - qoutput, output_scale = per_tensor_quantize(output) - if self.output_scale is None: - self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False) - elif output_scale > self.output_scale: - self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False) - output = qoutput.to(output.dtype) * output_scale - - return output - - -# Module responsible for representing the final checkpoint representation -class FP8StaticLinear(torch.nn.Module): - def __init__( - self, - weight: torch.nn.Parameter, - weight_scale: torch.nn.Parameter, - bias: torch.nn.Parameter, - input_scale: torch.nn.Parameter, - output_scale: Optional[torch.nn.Parameter] = None, - ): - super().__init__() - self.weight = weight - self.weight_scale = weight_scale - self.bias = bias - self.input_scale = input_scale - self.output_scale = output_scale - - def forward(self, x): - qinput = static_per_tensor_quantize(x, self.input_scale) - output = fp8_gemm( - A=qinput, - A_scale=self.input_scale, -<<<<<<< HEAD - B=self.qweight, ->>>>>>> 3ee9283 (Support calibrating kv cache scales) - B_scale=self.weight_scale, - bias=self.bias, - out_dtype=x.dtype, - ) -<<<<<<< HEAD -======= - - if self.output_scale: - qoutput = static_per_tensor_quantize(output, self.output_scale) - output = qoutput.to(output.dtype) * self.output_scale - ->>>>>>> 3ee9283 (Support 
calibrating kv cache scales) - return output - - -# Module responsible for taking already quantized weights, and recording input scales (and possibly output scales) using an activation observer -class FP8StaticLinearQuantizer(torch.nn.Module): - def __init__( - self, - weight: torch.Tensor, - weight_scale: torch.Tensor, - bias: torch.nn.Parameter, - quantize_output: bool = False, - ): - super().__init__() - self.weight = torch.nn.Parameter(weight, requires_grad=False) - self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) - self.bias = bias - self.input_scale = None - self.output_scale = None - self.quantize_output = quantize_output - - def forward(self, x): - qinput, x_input_scale = per_tensor_quantize(x) - if self.input_scale is None: - self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False) - elif x_input_scale > self.input_scale: - self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False) - output = fp8_gemm( - A=qinput, - A_scale=self.input_scale, - B=self.weight, - B_scale=self.weight_scale, - bias=self.bias, - out_dtype=x.dtype, - ) - - # Optionally, quantize output and record scale - if self.quantize_output: - qoutput, output_scale = per_tensor_quantize(output) - if self.output_scale is None: - self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False) - elif output_scale > self.output_scale: - self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False) - output = qoutput.to(output.dtype) * output_scale - - return output - - -# Module responsible for representing the final checkpoint representation -class FP8StaticLinear(torch.nn.Module): - def __init__( - self, - weight: torch.nn.Parameter, - weight_scale: torch.nn.Parameter, - bias: torch.nn.Parameter, - input_scale: torch.nn.Parameter, - output_scale: Optional[torch.nn.Parameter] = None, - ): - super().__init__() - self.weight = weight - self.weight_scale = weight_scale - self.bias = bias - self.input_scale = input_scale - self.output_scale = output_scale - - def forward(self, x): - qinput = static_per_tensor_quantize(x, self.input_scale) - output = fp8_gemm( - A=qinput, - A_scale=self.input_scale, -======= ->>>>>>> def2049 (Fix weight name) - B=self.weight, - B_scale=self.weight_scale, - bias=self.bias, - out_dtype=x.dtype, - ) - - if self.output_scale: - qoutput = static_per_tensor_quantize(output, self.output_scale) - output = qoutput.to(output.dtype) * self.output_scale - - return output - - -def replace_module(model: AutoModelForCausalLM, name: str, new_module: torch.nn.Module): - if "." 
in name: - parent_name = name.rsplit(".", 1)[0] - child_name = name[len(parent_name) + 1 :] - parent = model.get_submodule(parent_name) - else: - parent_name = "" - parent = model - child_name = name - setattr(parent, child_name, new_module) - - -def quantize_weights( - model: AutoModelForCausalLM, - quantize_config: BaseQuantizeConfig, -): - named_modules = list(model.named_modules()) - for name, linear in tqdm.tqdm(named_modules, desc="Quantizing weights"): - if ( - not isinstance(linear, torch.nn.Linear) - or name in quantize_config.ignored_layers - ): - continue - quant_weight, weight_scale = per_tensor_quantize(linear.weight) - bias = copy.deepcopy(linear.bias) if linear.bias is not None else None - quant_linear = FP8DynamicLinear( -<<<<<<< HEAD -<<<<<<< HEAD - weight=quant_weight, weight_scale=weight_scale, bias=bias -======= - qweight=quant_weight, weight_scale=weight_scale, bias=bias ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= - weight=quant_weight, weight_scale=weight_scale, bias=bias ->>>>>>> def2049 (Fix weight name) - ) - replace_module(model, name, quant_linear) - del linear.weight - del linear.bias - del linear - cleanup_memory() - - -def quantize_activations( - model: AutoModelForCausalLM, - quantize_config: BaseQuantizeConfig, - calibration_tokens, -): - # Replace weight quantizer with a dynamic activation quantizer observer - for name, dynamic_quant_linear in model.named_modules(): - if ( - not isinstance(dynamic_quant_linear, FP8DynamicLinear) - or name in quantize_config.ignored_layers - ): - continue - quantizer = FP8StaticLinearQuantizer( -<<<<<<< HEAD -<<<<<<< HEAD - weight=dynamic_quant_linear.weight, -======= - qweight=dynamic_quant_linear.qweight, ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= - weight=dynamic_quant_linear.weight, ->>>>>>> def2049 (Fix weight name) - weight_scale=dynamic_quant_linear.weight_scale, - bias=dynamic_quant_linear.bias, - quantize_output=( - hasattr(quantize_config, "kv_cache_quant_layers") - and name in quantize_config.kv_cache_quant_layers - ), - ) - replace_module(model, name, quantizer) - del dynamic_quant_linear - cleanup_memory() - - # Pass through calibration data to measure activation scales -<<<<<<< HEAD -<<<<<<< HEAD - with torch.inference_mode(): - with tqdm.tqdm(total=calibration_tokens.shape[0], desc="Calibrating activation scales") as pbar: - for row_idx in range(calibration_tokens.shape[0]): - model(calibration_tokens[row_idx].reshape(1, -1)) - cleanup_memory() - pbar.update(1) -======= -======= ->>>>>>> 57c31bb (Use `torch.inference_mode()` for lower memory usage during calibration (#20)) - with tqdm.tqdm( - total=calibration_tokens.shape[0], desc="Calibrating activation scales" - ) as pbar: - for row_idx in range(calibration_tokens.shape[0]): - model(calibration_tokens[row_idx].reshape(1, -1)) - cleanup_memory() - pbar.update(1) -<<<<<<< HEAD ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= -======= - with torch.inference_mode(): - with tqdm.tqdm(total=calibration_tokens.shape[0], desc="Calibrating activation scales") as pbar: - for row_idx in range(calibration_tokens.shape[0]): - model(calibration_tokens[row_idx].reshape(1, -1)) - cleanup_memory() - pbar.update(1) ->>>>>>> b1c6ad6 (Use `torch.inference_mode()` for lower memory usage during calibration (#20)) ->>>>>>> 57c31bb (Use `torch.inference_mode()` for lower memory usage during calibration (#20)) - - # Replace dynamic quantizer observer with StaticLinear for export - for name, quantizer in model.named_modules(): 
- if ( - not isinstance(quantizer, FP8StaticLinearQuantizer) - or name in quantize_config.ignored_layers - ): - continue - static_proj = FP8StaticLinear( -<<<<<<< HEAD -<<<<<<< HEAD - weight=quantizer.weight, -======= - qweight=quantizer.qweight, ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= - weight=quantizer.weight, ->>>>>>> def2049 (Fix weight name) - weight_scale=quantizer.weight_scale, - bias=quantizer.bias, - input_scale=quantizer.input_scale, - output_scale=quantizer.output_scale, - ) - replace_module(model, name, static_proj) - del quantizer - cleanup_memory() - - # Post-process step for kv cache scales to take the k/v module - # `output_scale` parameters, take the max of them, and store them in - # the parent attention module as `kv_scale` - # NOTE: if we want to switch to the `output_scale` representation, we can simply remove this block - if hasattr(quantize_config, "kv_cache_quant_layers"): - # Assumes that list is ordered such that [layer0.k_proj, layer0.v_proj, layer1.k_proj, layer1.v_proj, ...] - # so we make a list of tuples [(layer0.k_proj, layer0.v_proj), (layer1.k_proj, layer1.v_proj), ...] - kv_proj_pairs = zip(*[iter(quantize_config.kv_cache_quant_layers)]*2) - for k_proj_name, v_proj_name in kv_proj_pairs: - parent_module_name = ".".join(k_proj_name.split(".")[:-1]) - assert parent_module_name == ".".join(v_proj_name.split(".")[:-1]) - parent_module = dict(model.named_modules())[parent_module_name] - - k_proj = dict(model.named_modules())[k_proj_name] - v_proj = dict(model.named_modules())[v_proj_name] - - kv_scale = max(k_proj.output_scale, v_proj.output_scale) - parent_module.kv_scale = torch.nn.Parameter(kv_scale, requires_grad=False) - - # Remove output_scale from k_proj and v_proj - k_proj.output_scale = None - v_proj.output_scale = None - cleanup_memory() - - -def save_quantized_model( - model: AutoModelForCausalLM, - quant_config: BaseQuantizeConfig, - save_dir: str, -): - print(model) - print(f"Saving the model to {save_dir}") - static_q_dict = { - "quantization_config": { - "quant_method": "fp8", - "activation_scheme": quant_config.activation_scheme, - "ignored_layers": quant_config.ignored_layers, - } - } - if hasattr(quant_config, "kv_cache_quant_layers"): - static_q_dict["quantization_config"]["kv_cache_scheme"] = "static" - model.config.update(static_q_dict) - model.save_pretrained(save_dir) - tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path) - tokenizer.save_pretrained(save_dir) From bbf352f58c1fbaffa825066d5fd1d666f255a5f5 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:36:01 -0400 Subject: [PATCH 3/9] Fix test --- tests/test_auto_fp8.py | 230 ++++++++++++----------------------------- 1 file changed, 66 insertions(+), 164 deletions(-) diff --git a/tests/test_auto_fp8.py b/tests/test_auto_fp8.py index bb852d9..dfe6e61 100644 --- a/tests/test_auto_fp8.py +++ b/tests/test_auto_fp8.py @@ -1,206 +1,108 @@ import os import shutil -<<<<<<< HEAD -<<<<<<< HEAD import pytest -======= ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= -import pytest ->>>>>>> 2739d61 (Add Qwen test) import safetensors.torch +from datasets import load_dataset from transformers import AutoTokenizer from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig MODELS = [ -<<<<<<< HEAD -<<<<<<< HEAD - ("facebook/opt-125m", 160), - ("Qwen/Qwen2-0.5B-Instruct", 620), -] - -<<<<<<< HEAD -@pytest.mark.parametrize("model_id,target_size", MODELS) -def test_dynamic_quantization(model_id, target_size): - quantized_model_dir = 
model_id.split("/")[-1] + "-fp8-dynamic" -======= -def test_dynamic_quantization(): - model_id = "facebook/opt-125m" - quantized_model_dir = "opt-125m-fp8-dynamic" ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= - "facebook/opt-125m", - "Qwen/Qwen2-0.5B-Instruct", -======= ("facebook/opt-125m", 160), -<<<<<<< HEAD - ("Qwen/Qwen2-0.5B-Instruct", 600), ->>>>>>> 415c0b7 (Add fixed target sizes) -======= ("Qwen/Qwen2-0.5B-Instruct", 620), ->>>>>>> 93c0d54 (Fix proj linear count) ] -@pytest.mark.parametrize("model_id,target_size", MODELS) -def test_dynamic_quantization(model_id, target_size): - quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic" ->>>>>>> 2739d61 (Add Qwen test) - - quantize_config = BaseQuantizeConfig( - quant_method="fp8", activation_scheme="dynamic" - ) - - model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) - model.model.to("cpu") - - model.quantize() - model.save_quantized(quantized_model_dir) - - # Measure checkpoint size and cleanup - model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") - shutil.rmtree(quantized_model_dir) +# @pytest.mark.parametrize("model_id,target_size", MODELS) +# def test_dynamic_quantization(model_id, target_size): +# quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic" -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> c3acdee (Switch from output_scale to kv_scale) - # We expect the quantized model to be a certain size - target_size = target_size * (1024 * 1024) - assert model_size < target_size +# quantize_config = BaseQuantizeConfig( +# quant_method="fp8", activation_scheme="dynamic" +# ) +# model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) +# model.model.to("cpu") -@pytest.mark.parametrize("model_id,target_size", MODELS) -def test_static_quantization(model_id, target_size): - quantized_model_dir = model_id.split("/")[-1] + "-fp8-static" -======= - # We expect the model to be < 160MB - target_size = 160 * (1024 * 1024) - assert model_size < target_size +# model.quantize() +# model.save_quantized(quantized_model_dir) +# # Measure checkpoint size and cleanup +# model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") +# shutil.rmtree(quantized_model_dir) -<<<<<<< HEAD -def test_static_quantization(): - model_id = "facebook/opt-125m" - quantized_model_dir = "opt-125m-fp8-static" ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= -@pytest.mark.parametrize("model_id", MODELS) -def test_static_quantization(model_id): -======= - # We expect the model to be a certain size - target_size = target_size * (1024 * 1024) - assert model_size < target_size +# # We expect the quantized model to be a certain size +# target_size = target_size * (1024 * 1024) +# assert model_size < target_size @pytest.mark.parametrize("model_id,target_size", MODELS) def test_static_quantization(model_id, target_size): ->>>>>>> 415c0b7 (Add fixed target sizes) quantized_model_dir = model_id.split("/")[-1] + "-fp8-static" ->>>>>>> 2739d61 (Add Qwen test) tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) - examples = ["auto-fp8 is an easy-to-use model quantization library"] - examples = tokenizer(examples, return_tensors="pt") + ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(2)) + def preprocess(example): + example = tokenizer.apply_chat_template(example["messages"], tokenize=False) + return tokenizer( + example, + padding=False, + max_length=32, + truncation=True, + add_special_tokens=False, + ) + ds = 
ds.map(preprocess, remove_columns=ds.column_names) quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) model.model.to("cpu") - model.quantize(examples) - model.save_quantized(quantized_model_dir) - - # Measure checkpoint size and cleanup - model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") - shutil.rmtree(quantized_model_dir) - -<<<<<<< HEAD -<<<<<<< HEAD - # We expect the quantized model to be a certain size - target_size = target_size * (1024 * 1024) - assert model_size < target_size - -@pytest.mark.parametrize("model_id,target_size", MODELS) -def test_kv_cache_static_quantization(model_id, target_size): - quantized_model_dir = model_id.split("/")[-1] + "-fp8-static-kv" - - tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) - examples = ["auto-fp8 is an easy-to-use model quantization library"] - examples = tokenizer(examples, return_tensors="pt") - - quantize_config = BaseQuantizeConfig( - quant_method="fp8", - activation_scheme="static", - kv_cache_quant_targets=("k_proj", "v_proj"), - ) - - model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) - model.model.to("cpu") - - model.quantize(examples) + model.quantize(ds) model.save_quantized(quantized_model_dir) - tensors = safetensors.torch.load_file(f"{quantized_model_dir}/model.safetensors") - proj_linear_count = 0 - kv_scale_count = 0 - for name, _ in tensors.items(): - if name.endswith("k_proj.weight") or name.endswith("v_proj.weight"): - proj_linear_count += 1 - if name.endswith("kv_scale"): - kv_scale_count += 1 - assert proj_linear_count // 2 == kv_scale_count - # Measure checkpoint size and cleanup model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") shutil.rmtree(quantized_model_dir) # We expect the quantized model to be a certain size -======= - # We expect the model to be < 160MB ->>>>>>> 415c0b7 (Add fixed target sizes) -======= - # We expect the quantized model to be a certain size ->>>>>>> c3acdee (Switch from output_scale to kv_scale) target_size = target_size * (1024 * 1024) assert model_size < target_size -@pytest.mark.parametrize("model_id,target_size", MODELS) -def test_kv_cache_static_quantization(model_id, target_size): - quantized_model_dir = model_id.split("/")[-1] + "-fp8-static-kv" - - tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) - examples = ["auto-fp8 is an easy-to-use model quantization library"] - examples = tokenizer(examples, return_tensors="pt") - - quantize_config = BaseQuantizeConfig( - quant_method="fp8", - activation_scheme="static", - kv_cache_quant_targets=("k_proj", "v_proj"), - ) - - model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) - model.model.to("cpu") - - model.quantize(examples) - model.save_quantized(quantized_model_dir) - - tensors = safetensors.torch.load_file(f"{quantized_model_dir}/model.safetensors") - proj_linear_count = 0 - kv_scale_count = 0 - for name, _ in tensors.items(): - if name.endswith("k_proj.weight") or name.endswith("v_proj.weight"): - proj_linear_count += 1 - if name.endswith("kv_scale"): - kv_scale_count += 1 - assert proj_linear_count // 2 == kv_scale_count - - # Measure checkpoint size and cleanup - model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") - shutil.rmtree(quantized_model_dir) - - # We expect the quantized model to be a certain size - target_size = target_size * (1024 * 1024) - assert model_size < target_size +# 
@pytest.mark.parametrize("model_id,target_size", MODELS) +# def test_kv_cache_static_quantization(model_id, target_size): +# quantized_model_dir = model_id.split("/")[-1] + "-fp8-static-kv" + +# tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) +# examples = ["auto-fp8 is an easy-to-use model quantization library"] +# examples = tokenizer(examples, return_tensors="pt") + +# quantize_config = BaseQuantizeConfig( +# quant_method="fp8", +# activation_scheme="static", +# kv_cache_quant_targets=("k_proj", "v_proj"), +# ) + +# model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) +# model.model.to("cpu") + +# model.quantize(examples) +# model.save_quantized(quantized_model_dir) + +# tensors = safetensors.torch.load_file(f"{quantized_model_dir}/model.safetensors") +# proj_linear_count = 0 +# kv_scale_count = 0 +# for name, _ in tensors.items(): +# if name.endswith("k_proj.weight") or name.endswith("v_proj.weight"): +# proj_linear_count += 1 +# if name.endswith("kv_scale"): +# kv_scale_count += 1 +# assert proj_linear_count // 2 == kv_scale_count + +# # Measure checkpoint size and cleanup +# model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") +# shutil.rmtree(quantized_model_dir) + +# # We expect the quantized model to be a certain size +# target_size = target_size * (1024 * 1024) +# assert model_size < target_size \ No newline at end of file From ab3dad3eba0663046a8e291b6984505352c64428 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:38:02 -0400 Subject: [PATCH 4/9] Add to requirements --- requirements.txt | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index f40dfeb..3c01461 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ transformers datasets accelerate tqdm +llm-compressor @ git+https://github.com/vllm-project/llm-compressor.git diff --git a/setup.py b/setup.py index 7417754..c2b015d 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,7 @@ "datasets", "accelerate", "tqdm", + "llm-compressor @ git+https://github.com/vllm-project/llm-compressor.git" ], classifiers=[ "Programming Language :: Python :: 3", From be6eef2ea7fd37b2d189fd832cc825bcb661f594 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:40:21 -0400 Subject: [PATCH 5/9] Update example --- example_dataset.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/example_dataset.py b/example_dataset.py index 82d336e..bf6b6fd 100644 --- a/example_dataset.py +++ b/example_dataset.py @@ -9,17 +9,10 @@ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) tokenizer.pad_token = tokenizer.eos_token -MAX_SEQUENCE_LENGTH = 2048 ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512)) def preprocess(example): example = tokenizer.apply_chat_template(example["messages"], tokenize=False) - return tokenizer( - example, - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) + return tokenizer(example, max_length=2048, truncation=True, add_special_tokens=False) ds = ds.map(preprocess, remove_columns=ds.column_names) quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") From b4f830dbdfd86c2bd3fc338296c11902c6ab6181 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:41:32 -0400 Subject: [PATCH 6/9] Fix requirement --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt 
b/requirements.txt index 3c01461..191d853 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ transformers datasets accelerate tqdm -llm-compressor @ git+https://github.com/vllm-project/llm-compressor.git +llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git diff --git a/setup.py b/setup.py index c2b015d..3dcd85f 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ "datasets", "accelerate", "tqdm", - "llm-compressor @ git+https://github.com/vllm-project/llm-compressor.git" + "llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git" ], classifiers=[ "Programming Language :: Python :: 3", From af8f5a0f2ab4034501e4e6b37d5e90a9002ea040 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:54:06 -0400 Subject: [PATCH 7/9] Fix test --- auto_fp8/modeling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/auto_fp8/modeling.py b/auto_fp8/modeling.py index eb4d2ba..0e4e8cc 100644 --- a/auto_fp8/modeling.py +++ b/auto_fp8/modeling.py @@ -79,6 +79,7 @@ def quantize(self, dataset: Optional[Dataset] = None): model=self.model, dataset=dataset, recipe=recipe, + num_calibration_samples=dataset.shape[0], ) def save_quantized(self, save_directory: str): From 3063398f7c89d9f25dc86e6b77bc0ecf40b1ce7c Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:57:36 -0400 Subject: [PATCH 8/9] Test --- tests/test_auto_fp8.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_auto_fp8.py b/tests/test_auto_fp8.py index dfe6e61..6717ae1 100644 --- a/tests/test_auto_fp8.py +++ b/tests/test_auto_fp8.py @@ -56,8 +56,6 @@ def preprocess(example): quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) - model.model.to("cpu") - model.quantize(ds) model.save_quantized(quantized_model_dir) From 3f683f8617b8baaace7bd21f6c6ed36fa3ee7f0a Mon Sep 17 00:00:00 2001 From: mgoin Date: Fri, 19 Jul 2024 10:30:19 -0400 Subject: [PATCH 9/9] Add support for dynamic activation --- auto_fp8/modeling.py | 92 ++++++++++++++++++++++++++++++++++-------- tests/test_auto_fp8.py | 34 ++++++++-------- 2 files changed, 91 insertions(+), 35 deletions(-) diff --git a/auto_fp8/modeling.py b/auto_fp8/modeling.py index 0e4e8cc..79d80b2 100644 --- a/auto_fp8/modeling.py +++ b/auto_fp8/modeling.py @@ -6,6 +6,12 @@ from llmcompressor.transformers import SparseAutoModelForCausalLM from llmcompressor.transformers import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from compressed_tensors.quantization import ( + QuantizationArgs, + QuantizationType, + QuantizationScheme, + QuantizationStrategy, +) class BaseQuantizeConfig: @@ -64,23 +70,75 @@ def from_pretrained( return cls(model, quantize_config) def quantize(self, dataset: Optional[Dataset] = None): - assert ( - self.quantize_config.activation_scheme == "static" - ), "Dynamic isn't supported yet" - assert ( - dataset is not None - ), "Calibration tokens required for static activation quantization" - - recipe = QuantizationModifier( - targets="Linear", scheme="FP8", ignore=self.quantize_config.ignore_patterns - ) + if self.quantize_config.activation_scheme == "dynamic": + if dataset is None: + # For dynamic activations, we don't care about calibration data + # being provided. 
However, we need to pass something + # TODO(mgoin): Remove once llmcompressor allows no dataset + from datasets import load_dataset + dataset = load_dataset("openai/openai_humaneval", split="test").select(range(1)) + dataset = dataset.rename_column("prompt", "text") - oneshot( - model=self.model, - dataset=dataset, - recipe=recipe, - num_calibration_samples=dataset.shape[0], - ) + FP8_W8 = QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs( + num_bits=8, + type=QuantizationType.FLOAT, + strategy=QuantizationStrategy.TENSOR, + symmetric=True, + dynamic=False, + ), + ) + + recipe = QuantizationModifier( + config_groups={"group_0": FP8_W8}, + ignore=self.quantize_config.ignore_patterns, + ) + + oneshot( + model=self.model, + dataset=dataset, + recipe=recipe, + num_calibration_samples=dataset.shape[0], + ) + elif self.quantize_config.activation_scheme == "static": + assert ( + dataset is not None + ), "Calibration tokens required for static activation quantization" + + FP8_W8A8 = QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs( + num_bits=8, + type=QuantizationType.FLOAT, + strategy=QuantizationStrategy.TENSOR, + symmetric=True, + dynamic=False, + ), + input_activations=QuantizationArgs( + num_bits=8, + type=QuantizationType.FLOAT, + strategy=QuantizationStrategy.TENSOR, + symmetric=True, + dynamic=False, + ), + ) + + recipe = QuantizationModifier( + config_groups={"group_0": FP8_W8A8}, + ignore=self.quantize_config.ignore_patterns, + ) + + oneshot( + model=self.model, + dataset=dataset, + recipe=recipe, + num_calibration_samples=dataset.shape[0], + ) + else: + raise ValueError( + f"Unsupported activation_scheme={self.quantize_config.activation_scheme}" + ) def save_quantized(self, save_directory: str): self.save_pretrained(save_directory, save_compressed=True) @@ -89,4 +147,4 @@ def save_pretrained(self, save_directory: str, save_compressed: bool = True): self.model.save_pretrained(save_directory, save_compressed=save_compressed) tokenizer = AutoTokenizer.from_pretrained(self.model.config._name_or_path) tokenizer.save_pretrained(save_directory) - print(f"Saved final checkpoint to {os.path.abspath(save_directory)}") \ No newline at end of file + print(f"Saved final checkpoint to {os.path.abspath(save_directory)}") diff --git a/tests/test_auto_fp8.py b/tests/test_auto_fp8.py index 6717ae1..0322c2d 100644 --- a/tests/test_auto_fp8.py +++ b/tests/test_auto_fp8.py @@ -10,30 +10,28 @@ MODELS = [ ("facebook/opt-125m", 160), - ("Qwen/Qwen2-0.5B-Instruct", 620), + # ("Qwen/Qwen2-0.5B-Instruct", 620), ] -# @pytest.mark.parametrize("model_id,target_size", MODELS) -# def test_dynamic_quantization(model_id, target_size): -# quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic" +@pytest.mark.parametrize("model_id,target_size", MODELS) +def test_dynamic_quantization(model_id, target_size): + quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic" -# quantize_config = BaseQuantizeConfig( -# quant_method="fp8", activation_scheme="dynamic" -# ) + quantize_config = BaseQuantizeConfig( + quant_method="fp8", activation_scheme="dynamic" + ) -# model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) -# model.model.to("cpu") - -# model.quantize() -# model.save_quantized(quantized_model_dir) + model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) + model.quantize() + model.save_quantized(quantized_model_dir) -# # Measure checkpoint size and cleanup -# model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") -# 
shutil.rmtree(quantized_model_dir) + # Measure checkpoint size and cleanup + model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") + shutil.rmtree(quantized_model_dir) -# # We expect the quantized model to be a certain size -# target_size = target_size * (1024 * 1024) -# assert model_size < target_size + # We expect the quantized model to be a certain size + target_size = target_size * (1024 * 1024) + assert model_size < target_size @pytest.mark.parametrize("model_id,target_size", MODELS)
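
For reference, the end-to-end flow after this series looks roughly like the sketch below. It is a minimal example assembled from example_dataset.py and tests/test_auto_fp8.py in these patches; the model ID ("facebook/opt-125m"), the calibration dataset ("mgoin/ultrachat_2k"), and the sequence length are the same placeholders used there and are assumptions for illustration, not requirements of the API.

# Sketch of the llm-compressor-backed AutoFP8 flow introduced in this series.
# Model ID, dataset, and max_length mirror example_dataset.py above; adjust for a real run.
from datasets import load_dataset
from transformers import AutoTokenizer

from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

pretrained_model_dir = "facebook/opt-125m"
quantized_model_dir = "opt-125m-FP8"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# Static activation scales need calibration data; the "dynamic" scheme added in
# PATCH 9/9 quantizes weights only and can be run with no dataset at all.
ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512))

def preprocess(example):
    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return tokenizer(text, max_length=2048, truncation=True, add_special_tokens=False)

ds = ds.map(preprocess, remove_columns=ds.column_names)

quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static")

model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
model.quantize(ds)  # runs llmcompressor's oneshot() with an FP8 QuantizationModifier recipe
model.save_quantized(quantized_model_dir)  # saves compressed weights plus the tokenizer

With activation_scheme="dynamic", the same calls apply but model.quantize() may be invoked without a dataset, matching the weight-only FP8 scheme (FP8_W8) configured in PATCH 9/9.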