From e286fa9251f5e4b3e09d5daa14f450bede0b4676 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:11:27 -0400 Subject: [PATCH 1/9] Switch backend with llm-compressor --- auto_fp8/__init__.py | 3 +- auto_fp8/config.py | 42 -------- auto_fp8/modeling.py | 214 +++++++++++++---------------------------- auto_fp8/quantize.py | 167 ++++++++++++++++++++++++++++++++ example_dataset.py | 25 +++-- tests/test_auto_fp8.py | 108 +++++++++++++++++++++ 6 files changed, 358 insertions(+), 201 deletions(-) delete mode 100644 auto_fp8/config.py diff --git a/auto_fp8/__init__.py b/auto_fp8/__init__.py index ea4fbb6..d463cc8 100644 --- a/auto_fp8/__init__.py +++ b/auto_fp8/__init__.py @@ -1,5 +1,4 @@ -from .config import BaseQuantizeConfig -from .modeling import AutoFP8ForCausalLM +from .modeling import AutoFP8ForCausalLM, BaseQuantizeConfig __all__ = [ "AutoFP8ForCausalLM", diff --git a/auto_fp8/config.py b/auto_fp8/config.py deleted file mode 100644 index 24c6200..0000000 --- a/auto_fp8/config.py +++ /dev/null @@ -1,42 +0,0 @@ -from typing import List, Optional, Tuple - - -class BaseQuantizeConfig: - """Configuration for model quantization. - - Args: - quant_method: Type/precision of quantization method to use. - At the moment, this is just "fp8" which specifically means - the fp8_e4m3 format in pytorch. - activation_scheme: Choice of either "dynamic" or "static" quantization - of activtions. If "static", then calibration samples are required - during quantization to produce accurate per-tensor scales for - activations of Linear modules. - ignore_patterns: List of patterns used to ignore layers. If a string - starts with "re:", then everything afterwards is used as python - regex style matching i.e. re.search(), for each Linear layer. - By default, "re:.*lm_head" is included to ignore the embedding - Linear layer usually at the end of decoder LLMs - kv_cache_quant_targets: Tuple of Linear module names to target for - calibration of the output scales for KV cache quantization. - Usually, these should be `("k_proj", "v_proj")`. - """ - - def __init__( - self, - quant_method: str = "fp8", - activation_scheme: str = "static", - ignore_patterns: List[str] = ["re:.*lm_head"], - kv_cache_quant_targets: Optional[Tuple[str]] = None, - ): - if quant_method != "fp8": - raise ValueError("Only FP8 quantization is supported.") - if activation_scheme not in ["static", "dynamic"]: - raise ValueError( - "Invalid activation_scheme. Choose either 'static' or 'dynamic'." - ) - self.quant_method = quant_method - self.activation_scheme = activation_scheme - self.ignore_patterns = ignore_patterns - self.kv_cache_quant_targets = kv_cache_quant_targets - self.ignored_layers = [] diff --git a/auto_fp8/modeling.py b/auto_fp8/modeling.py index 04a9e71..eb4d2ba 100644 --- a/auto_fp8/modeling.py +++ b/auto_fp8/modeling.py @@ -1,42 +1,49 @@ -import re -from typing import List, Optional, Tuple +import os +from typing import List, Optional + +from transformers import AutoConfig, AutoTokenizer +from datasets import Dataset +from llmcompressor.transformers import SparseAutoModelForCausalLM +from llmcompressor.transformers import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier + + +class BaseQuantizeConfig: + """Configuration for model quantization. + + Args: + quant_method: Type/precision of quantization method to use. + At the moment, this is just "fp8" which specifically means + the fp8_e4m3 format in pytorch. + activation_scheme: Choice of either "dynamic" or "static" quantization + of activtions. 
If "static", then calibration samples are required + during quantization to produce accurate per-tensor scales for + activations of Linear modules. + ignore_patterns: List of patterns used to ignore layers. If a string + starts with "re:", then everything afterwards is used as python + regex style matching i.e. re.search(), for each Linear layer. + By default, "lm_head" is included to ignore the embedding + Linear layer usually at the end of decoder LLMs + """ -import torch -from transformers import AutoModelForCausalLM - -from auto_fp8.config import BaseQuantizeConfig -from auto_fp8.quantize import ( - quantize_activations, - quantize_weights, - save_quantized_model, -) + def __init__( + self, + quant_method: str = "fp8", + activation_scheme: str = "static", + ignore_patterns: List[str] = ["lm_head"], + ): + self.quant_method = quant_method + self.activation_scheme = activation_scheme + self.ignore_patterns = ignore_patterns class AutoFP8ForCausalLM: def __init__( - self, - model: AutoModelForCausalLM, - quantize_config: BaseQuantizeConfig, + self, model: SparseAutoModelForCausalLM, quantize_config: BaseQuantizeConfig ): self.model = model self.model_type = self.model.config.model_type self.config = self.model.config - - # Gather the Linear module names that we want to ignore - quantize_config.ignored_layers = get_layers_to_ignore( - self.model, quantize_config.ignore_patterns - ) - - if quantize_config.kv_cache_quant_targets: - kv_cache_quant_layers = get_kv_cache_quant_layers( - self.model, quantize_config.kv_cache_quant_targets - ) - if len(kv_cache_quant_layers) == 0: - raise ValueError( - f"Could not find any kv cache layers using kv_cache_quant_targets={quantize_config.kv_cache_quant_targets}, please fix your argument." - ) - quantize_config.kv_cache_quant_layers = kv_cache_quant_layers - self.quantize_config = quantize_config @classmethod @@ -44,130 +51,41 @@ def from_pretrained( cls, pretrained_model_name_or_path: str, quantize_config: BaseQuantizeConfig, - **model_init_kwargs, + **kwargs, ): - """Load the un-quantized pretrained model""" - - def skip(*args, **kwargs): - pass - - torch.nn.init.kaiming_uniform_ = skip - torch.nn.init.uniform_ = skip - torch.nn.init.normal_ = skip - - # Parameters related to loading from Hugging Face Hub - cache_dir = model_init_kwargs.pop("cache_dir", None) - force_download = model_init_kwargs.pop("force_download", False) - resume_download = model_init_kwargs.pop("resume_download", False) - proxies = model_init_kwargs.pop("proxies", None) - local_files_only = model_init_kwargs.pop("local_files_only", False) - use_auth_token = model_init_kwargs.pop("use_auth_token", None) - revision = model_init_kwargs.pop("revision", None) - subfolder = model_init_kwargs.pop("subfolder", "") - commit_hash = model_init_kwargs.pop("_commit_hash", None) - - cached_file_kwargs = { - "cache_dir": cache_dir, - "force_download": force_download, - "proxies": proxies, - "resume_download": resume_download, - "local_files_only": local_files_only, - "use_auth_token": use_auth_token, - "revision": revision, - "subfolder": subfolder, - "_commit_hash": commit_hash, - } - - torch.cuda.empty_cache() - - # Important defaults - if "torch_dtype" not in model_init_kwargs: - model_init_kwargs["torch_dtype"] = "auto" - - if "device_map" not in model_init_kwargs: - model_init_kwargs["device_map"] = "auto" - - merged_kwargs = {**model_init_kwargs, **cached_file_kwargs} - print("Loading model with the following kwargs:", merged_kwargs) - model = AutoModelForCausalLM.from_pretrained( - 
pretrained_model_name_or_path, **merged_kwargs + config = AutoConfig.from_pretrained(pretrained_model_name_or_path) + model = SparseAutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path, + config=config, + device_map="auto", + torch_dtype="auto", + **kwargs, ) - - model_config = model.config.to_dict() - seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"] - if any(k in model_config for k in seq_len_keys): - for key in seq_len_keys: - if key in model_config: - model.seqlen = model_config[key] - break - else: - print("Can't get model's sequence length, setting to 2048.") - model.seqlen = 2048 - model.eval() - return cls(model, quantize_config) - def quantize(self, calibration_tokens: Optional[torch.Tensor] = None): - - # Always quantize the weights as they do not require calibration data - quantize_weights(self.model, self.quantize_config) - - if self.quantize_config.activation_scheme == "static": - assert ( - calibration_tokens is not None - ), "Calibration tokens required for activation quantization" - - - def _prepare_calibration_data(calibration_tokens): - if hasattr(calibration_tokens, "input_ids"): - return calibration_tokens.input_ids - return calibration_tokens - - quantize_activations( - self.model, - self.quantize_config, - _prepare_calibration_data(calibration_tokens), - ) + def quantize(self, dataset: Optional[Dataset] = None): + assert ( + self.quantize_config.activation_scheme == "static" + ), "Dynamic isn't supported yet" + assert ( + dataset is not None + ), "Calibration tokens required for static activation quantization" - def save_quantized(self, save_dir): - save_quantized_model( - self.model, - quant_config=self.quantize_config, - save_dir=save_dir, + recipe = QuantizationModifier( + targets="Linear", scheme="FP8", ignore=self.quantize_config.ignore_patterns ) + oneshot( + model=self.model, + dataset=dataset, + recipe=recipe, + ) -def get_layers_to_ignore(model, ignore_patterns) -> List[str]: - ignored_layers = set() - - for name, linear in model.named_modules(): - if not isinstance(linear, torch.nn.Linear): - continue - - for ignore_pattern in ignore_patterns: - regex_prefix = "re:" - if ignore_pattern.startswith(regex_prefix): - # check if name matches regex and add to set if true - regex_pattern = ignore_pattern[len(regex_prefix) :] - if re.search(regex_pattern, name): - ignored_layers.add(name) - else: - # else, exact match - if ignore_pattern == name: - ignored_layers.add(name) - - return list(ignored_layers) - - -def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]: - kv_cache_quant_layers = [] - - for name, linear in model.named_modules(): - if not isinstance(linear, torch.nn.Linear): - continue - - for output_quant_target in kv_cache_quant_targets: - if name.endswith(output_quant_target): - kv_cache_quant_layers.append(name) + def save_quantized(self, save_directory: str): + self.save_pretrained(save_directory, save_compressed=True) - return kv_cache_quant_layers + def save_pretrained(self, save_directory: str, save_compressed: bool = True): + self.model.save_pretrained(save_directory, save_compressed=save_compressed) + tokenizer = AutoTokenizer.from_pretrained(self.model.config._name_or_path) + tokenizer.save_pretrained(save_directory) + print(f"Saved final checkpoint to {os.path.abspath(save_directory)}") \ No newline at end of file diff --git a/auto_fp8/quantize.py b/auto_fp8/quantize.py index 38a4de6..0237bc2 100644 --- a/auto_fp8/quantize.py +++ b/auto_fp8/quantize.py @@ -72,11 +72,25 @@ def 
fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype): # Deal with empty tensors (triggeted by empty MoE experts) return torch.empty(size=(0, B.shape[0]), dtype=out_dtype, device=A.device) +<<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> 959bdbc (Add comment) # TODO: Disable native fp8 gemm for now, always just dequantize # native_fp8_support = ( # torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9) # ) native_fp8_support = False +<<<<<<< HEAD +======= + native_fp8_support = ( + torch.cuda.is_available() + and torch.cuda.get_device_capability() >= (8, 9) + and False + ) +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= +>>>>>>> 959bdbc (Add comment) if native_fp8_support: need_reshape = A.dim() == 3 if need_reshape: @@ -108,6 +122,7 @@ def fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype): # Class responsible for quantizing weights class FP8DynamicLinear(torch.nn.Module): +<<<<<<< HEAD def __init__( self, weight: torch.Tensor, @@ -125,10 +140,112 @@ def forward(self, x): A=qinput, A_scale=x_scale, B=self.weight, +======= + def __init__( + self, + weight: torch.Tensor, + weight_scale: torch.Tensor, + bias: torch.nn.Parameter, + ): + super().__init__() + self.weight = torch.nn.Parameter(weight, requires_grad=False) + self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) + self.bias = bias + + def forward(self, x): + qinput, x_scale = per_tensor_quantize(x) + output = fp8_gemm( + A=qinput, + A_scale=x_scale, + B=self.weight, + B_scale=self.weight_scale, + bias=self.bias, + out_dtype=x.dtype, + ) + return output + + +# Module responsible for taking already quantized weights, and recording input scales (and possibly output scales) using an activation observer +class FP8StaticLinearQuantizer(torch.nn.Module): + def __init__( + self, + weight: torch.Tensor, + weight_scale: torch.Tensor, + bias: torch.nn.Parameter, + quantize_output: bool = False, + ): + super().__init__() + self.weight = torch.nn.Parameter(weight, requires_grad=False) + self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) + self.bias = bias + self.input_scale = None + self.output_scale = None + self.quantize_output = quantize_output + + def forward(self, x): + qinput, x_input_scale = per_tensor_quantize(x) + if self.input_scale is None: + self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False) + elif x_input_scale > self.input_scale: + self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False) + output = fp8_gemm( + A=qinput, + A_scale=self.input_scale, + B=self.weight, + B_scale=self.weight_scale, + bias=self.bias, + out_dtype=x.dtype, + ) + + # Optionally, quantize output and record scale + if self.quantize_output: + qoutput, output_scale = per_tensor_quantize(output) + if self.output_scale is None: + self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False) + elif output_scale > self.output_scale: + self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False) + output = qoutput.to(output.dtype) * output_scale + + return output + + +# Module responsible for representing the final checkpoint representation +class FP8StaticLinear(torch.nn.Module): + def __init__( + self, + weight: torch.nn.Parameter, + weight_scale: torch.nn.Parameter, + bias: torch.nn.Parameter, + input_scale: torch.nn.Parameter, + output_scale: Optional[torch.nn.Parameter] = None, + ): + super().__init__() + self.weight = weight + self.weight_scale = weight_scale + self.bias = bias + self.input_scale = input_scale + 
self.output_scale = output_scale + + def forward(self, x): + qinput = static_per_tensor_quantize(x, self.input_scale) + output = fp8_gemm( + A=qinput, + A_scale=self.input_scale, +<<<<<<< HEAD + B=self.qweight, +>>>>>>> 3ee9283 (Support calibrating kv cache scales) B_scale=self.weight_scale, bias=self.bias, out_dtype=x.dtype, ) +<<<<<<< HEAD +======= + + if self.output_scale: + qoutput = static_per_tensor_quantize(output, self.output_scale) + output = qoutput.to(output.dtype) * self.output_scale + +>>>>>>> 3ee9283 (Support calibrating kv cache scales) return output @@ -198,6 +315,8 @@ def forward(self, x): output = fp8_gemm( A=qinput, A_scale=self.input_scale, +======= +>>>>>>> def2049 (Fix weight name) B=self.weight, B_scale=self.weight_scale, bias=self.bias, @@ -237,7 +356,15 @@ def quantize_weights( quant_weight, weight_scale = per_tensor_quantize(linear.weight) bias = copy.deepcopy(linear.bias) if linear.bias is not None else None quant_linear = FP8DynamicLinear( +<<<<<<< HEAD +<<<<<<< HEAD weight=quant_weight, weight_scale=weight_scale, bias=bias +======= + qweight=quant_weight, weight_scale=weight_scale, bias=bias +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= + weight=quant_weight, weight_scale=weight_scale, bias=bias +>>>>>>> def2049 (Fix weight name) ) replace_module(model, name, quant_linear) del linear.weight @@ -259,7 +386,15 @@ def quantize_activations( ): continue quantizer = FP8StaticLinearQuantizer( +<<<<<<< HEAD +<<<<<<< HEAD + weight=dynamic_quant_linear.weight, +======= + qweight=dynamic_quant_linear.qweight, +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= weight=dynamic_quant_linear.weight, +>>>>>>> def2049 (Fix weight name) weight_scale=dynamic_quant_linear.weight_scale, bias=dynamic_quant_linear.bias, quantize_output=( @@ -272,12 +407,36 @@ def quantize_activations( cleanup_memory() # Pass through calibration data to measure activation scales +<<<<<<< HEAD +<<<<<<< HEAD with torch.inference_mode(): with tqdm.tqdm(total=calibration_tokens.shape[0], desc="Calibrating activation scales") as pbar: for row_idx in range(calibration_tokens.shape[0]): model(calibration_tokens[row_idx].reshape(1, -1)) cleanup_memory() pbar.update(1) +======= +======= +>>>>>>> 57c31bb (Use `torch.inference_mode()` for lower memory usage during calibration (#20)) + with tqdm.tqdm( + total=calibration_tokens.shape[0], desc="Calibrating activation scales" + ) as pbar: + for row_idx in range(calibration_tokens.shape[0]): + model(calibration_tokens[row_idx].reshape(1, -1)) + cleanup_memory() + pbar.update(1) +<<<<<<< HEAD +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= +======= + with torch.inference_mode(): + with tqdm.tqdm(total=calibration_tokens.shape[0], desc="Calibrating activation scales") as pbar: + for row_idx in range(calibration_tokens.shape[0]): + model(calibration_tokens[row_idx].reshape(1, -1)) + cleanup_memory() + pbar.update(1) +>>>>>>> b1c6ad6 (Use `torch.inference_mode()` for lower memory usage during calibration (#20)) +>>>>>>> 57c31bb (Use `torch.inference_mode()` for lower memory usage during calibration (#20)) # Replace dynamic quantizer observer with StaticLinear for export for name, quantizer in model.named_modules(): @@ -287,7 +446,15 @@ def quantize_activations( ): continue static_proj = FP8StaticLinear( +<<<<<<< HEAD +<<<<<<< HEAD + weight=quantizer.weight, +======= + qweight=quantizer.qweight, +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= weight=quantizer.weight, +>>>>>>> def2049 (Fix weight name) 
weight_scale=quantizer.weight_scale, bias=quantizer.bias, input_scale=quantizer.input_scale, diff --git a/example_dataset.py b/example_dataset.py index 204345f..82d336e 100644 --- a/example_dataset.py +++ b/example_dataset.py @@ -3,20 +3,27 @@ from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig -pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" -quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8" +pretrained_model_dir = "facebook/opt-125m" +quantized_model_dir = "opt-125m-FP8" tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) tokenizer.pad_token = tokenizer.eos_token -ds = load_dataset("mgoin/ultrachat_2k", split="train_sft") -examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds] -examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda") +MAX_SEQUENCE_LENGTH = 2048 +ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512)) +def preprocess(example): + example = tokenizer.apply_chat_template(example["messages"], tokenize=False) + return tokenizer( + example, + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) +ds = ds.map(preprocess, remove_columns=ds.column_names) quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") -model = AutoFP8ForCausalLM.from_pretrained( - pretrained_model_dir, quantize_config=quantize_config -) -model.quantize(examples) +model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) +model.quantize(ds) model.save_quantized(quantized_model_dir) diff --git a/tests/test_auto_fp8.py b/tests/test_auto_fp8.py index 6045d84..bb852d9 100644 --- a/tests/test_auto_fp8.py +++ b/tests/test_auto_fp8.py @@ -1,20 +1,52 @@ import os import shutil +<<<<<<< HEAD +<<<<<<< HEAD import pytest +======= +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= +import pytest +>>>>>>> 2739d61 (Add Qwen test) import safetensors.torch from transformers import AutoTokenizer from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig MODELS = [ +<<<<<<< HEAD +<<<<<<< HEAD + ("facebook/opt-125m", 160), + ("Qwen/Qwen2-0.5B-Instruct", 620), +] + +<<<<<<< HEAD +@pytest.mark.parametrize("model_id,target_size", MODELS) +def test_dynamic_quantization(model_id, target_size): + quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic" +======= +def test_dynamic_quantization(): + model_id = "facebook/opt-125m" + quantized_model_dir = "opt-125m-fp8-dynamic" +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= + "facebook/opt-125m", + "Qwen/Qwen2-0.5B-Instruct", +======= ("facebook/opt-125m", 160), +<<<<<<< HEAD + ("Qwen/Qwen2-0.5B-Instruct", 600), +>>>>>>> 415c0b7 (Add fixed target sizes) +======= ("Qwen/Qwen2-0.5B-Instruct", 620), +>>>>>>> 93c0d54 (Fix proj linear count) ] @pytest.mark.parametrize("model_id,target_size", MODELS) def test_dynamic_quantization(model_id, target_size): quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic" +>>>>>>> 2739d61 (Add Qwen test) quantize_config = BaseQuantizeConfig( quant_method="fp8", activation_scheme="dynamic" @@ -30,6 +62,11 @@ def test_dynamic_quantization(model_id, target_size): model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") shutil.rmtree(quantized_model_dir) +<<<<<<< HEAD +<<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> c3acdee (Switch from output_scale to kv_scale) # We expect the quantized model to be a certain size target_size = target_size * (1024 * 1024) 
assert model_size < target_size @@ -38,6 +75,31 @@ def test_dynamic_quantization(model_id, target_size): @pytest.mark.parametrize("model_id,target_size", MODELS) def test_static_quantization(model_id, target_size): quantized_model_dir = model_id.split("/")[-1] + "-fp8-static" +======= + # We expect the model to be < 160MB + target_size = 160 * (1024 * 1024) + assert model_size < target_size + + +<<<<<<< HEAD +def test_static_quantization(): + model_id = "facebook/opt-125m" + quantized_model_dir = "opt-125m-fp8-static" +>>>>>>> 3ee9283 (Support calibrating kv cache scales) +======= +@pytest.mark.parametrize("model_id", MODELS) +def test_static_quantization(model_id): +======= + # We expect the model to be a certain size + target_size = target_size * (1024 * 1024) + assert model_size < target_size + + +@pytest.mark.parametrize("model_id,target_size", MODELS) +def test_static_quantization(model_id, target_size): +>>>>>>> 415c0b7 (Add fixed target sizes) + quantized_model_dir = model_id.split("/")[-1] + "-fp8-static" +>>>>>>> 2739d61 (Add Qwen test) tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) examples = ["auto-fp8 is an easy-to-use model quantization library"] @@ -55,7 +117,53 @@ def test_static_quantization(model_id, target_size): model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") shutil.rmtree(quantized_model_dir) +<<<<<<< HEAD +<<<<<<< HEAD + # We expect the quantized model to be a certain size + target_size = target_size * (1024 * 1024) + assert model_size < target_size + +@pytest.mark.parametrize("model_id,target_size", MODELS) +def test_kv_cache_static_quantization(model_id, target_size): + quantized_model_dir = model_id.split("/")[-1] + "-fp8-static-kv" + + tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) + examples = ["auto-fp8 is an easy-to-use model quantization library"] + examples = tokenizer(examples, return_tensors="pt") + + quantize_config = BaseQuantizeConfig( + quant_method="fp8", + activation_scheme="static", + kv_cache_quant_targets=("k_proj", "v_proj"), + ) + + model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) + model.model.to("cpu") + + model.quantize(examples) + model.save_quantized(quantized_model_dir) + + tensors = safetensors.torch.load_file(f"{quantized_model_dir}/model.safetensors") + proj_linear_count = 0 + kv_scale_count = 0 + for name, _ in tensors.items(): + if name.endswith("k_proj.weight") or name.endswith("v_proj.weight"): + proj_linear_count += 1 + if name.endswith("kv_scale"): + kv_scale_count += 1 + assert proj_linear_count // 2 == kv_scale_count + + # Measure checkpoint size and cleanup + model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") + shutil.rmtree(quantized_model_dir) + + # We expect the quantized model to be a certain size +======= + # We expect the model to be < 160MB +>>>>>>> 415c0b7 (Add fixed target sizes) +======= # We expect the quantized model to be a certain size +>>>>>>> c3acdee (Switch from output_scale to kv_scale) target_size = target_size * (1024 * 1024) assert model_size < target_size From 6d508ae267323c46bbf4fcf6919c10ca3515045e Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:12:08 -0400 Subject: [PATCH 2/9] Remove quantize --- auto_fp8/quantize.py | 511 ------------------------------------------- 1 file changed, 511 deletions(-) delete mode 100644 auto_fp8/quantize.py diff --git a/auto_fp8/quantize.py b/auto_fp8/quantize.py deleted file mode 100644 index 0237bc2..0000000 --- a/auto_fp8/quantize.py +++ /dev/null @@ 
-1,511 +0,0 @@ -import gc -import re -from typing import Optional, Tuple -import copy - -import torch -import tqdm -import transformers -from transformers import AutoModelForCausalLM, AutoTokenizer - -from .config import BaseQuantizeConfig - - -# HACK: Override the dtype_byte_size function in transformers to support float8 types -# Fix is posted upstream https://github.com/huggingface/transformers/pull/30488 -def new_dtype_byte_size(dtype): - if dtype == torch.bool: - return 1 / 8 - bit_search = re.search(r"[^\d](\d+)_?", str(dtype)) - if bit_search is None: - raise ValueError(f"`dtype` is not a valid dtype: {dtype}.") - bit_size = int(bit_search.groups()[0]) - return bit_size // 8 - - -transformers.modeling_utils.dtype_byte_size = new_dtype_byte_size - - -def cleanup_memory(): - gc.collect() - torch.cuda.empty_cache() - - -def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]: - """Quantize a tensor using per-tensor static scaling factor. - Args: - tensor: The input tensor. - """ - finfo = torch.finfo(torch.float8_e4m3fn) - # Calculate the scale as dtype max divided by absmax. - # Since .abs() creates a new tensor, we use aminmax to get - # the min and max first and then calculate the absmax. - if tensor.numel() == 0: - # Deal with empty tensors (triggered by empty MoE experts) - min_val, max_val = ( - torch.tensor(-16.0, dtype=tensor.dtype), - torch.tensor(16.0, dtype=tensor.dtype), - ) - else: - min_val, max_val = tensor.aminmax() - amax = torch.maximum(min_val.abs(), max_val.abs()) - scale = finfo.max / amax.clamp(min=1e-12) - # scale and clamp the tensor to bring it to - # the representative range of float8 data type - # (as default cast is unsaturated) - qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max) - # Return both float8 data and the inverse scale (as float), - # as both required as inputs to torch._scaled_mm - qweight = qweight.to(torch.float8_e4m3fn) - scale = scale.float().reciprocal() - return qweight, scale - - -def static_per_tensor_quantize(tensor: torch.Tensor, inv_scale: float) -> torch.Tensor: - finfo = torch.finfo(torch.float8_e4m3fn) - qweight = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max) - return qweight.to(torch.float8_e4m3fn) - - -def fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype): - if A.numel() == 0: - # Deal with empty tensors (triggeted by empty MoE experts) - return torch.empty(size=(0, B.shape[0]), dtype=out_dtype, device=A.device) - -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> 959bdbc (Add comment) - # TODO: Disable native fp8 gemm for now, always just dequantize - # native_fp8_support = ( - # torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9) - # ) - native_fp8_support = False -<<<<<<< HEAD -======= - native_fp8_support = ( - torch.cuda.is_available() - and torch.cuda.get_device_capability() >= (8, 9) - and False - ) ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= ->>>>>>> 959bdbc (Add comment) - if native_fp8_support: - need_reshape = A.dim() == 3 - if need_reshape: - batch_size = A.shape[0] - A_input = A.reshape(-1, A.shape[-1]) - else: - batch_size = None - A_input = A - output, _ = torch._scaled_mm( - A_input, - B.t(), - out_dtype=out_dtype, - scale_a=A_scale, - scale_b=B_scale, - bias=bias, - ) - if need_reshape: - output = output.reshape( - batch_size, output.shape[0] // batch_size, output.shape[1] - ) - else: - output = torch.nn.functional.linear( - A.to(out_dtype) * A_scale, - B.to(out_dtype) * B_scale.to(out_dtype), - bias=bias, - ) - return output - - -# 
Class responsible for quantizing weights -class FP8DynamicLinear(torch.nn.Module): -<<<<<<< HEAD - def __init__( - self, - weight: torch.Tensor, - weight_scale: torch.Tensor, - bias: torch.nn.Parameter, - ): - super().__init__() - self.weight = torch.nn.Parameter(weight, requires_grad=False) - self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) - self.bias = bias - - def forward(self, x): - qinput, x_scale = per_tensor_quantize(x) - output = fp8_gemm( - A=qinput, - A_scale=x_scale, - B=self.weight, -======= - def __init__( - self, - weight: torch.Tensor, - weight_scale: torch.Tensor, - bias: torch.nn.Parameter, - ): - super().__init__() - self.weight = torch.nn.Parameter(weight, requires_grad=False) - self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) - self.bias = bias - - def forward(self, x): - qinput, x_scale = per_tensor_quantize(x) - output = fp8_gemm( - A=qinput, - A_scale=x_scale, - B=self.weight, - B_scale=self.weight_scale, - bias=self.bias, - out_dtype=x.dtype, - ) - return output - - -# Module responsible for taking already quantized weights, and recording input scales (and possibly output scales) using an activation observer -class FP8StaticLinearQuantizer(torch.nn.Module): - def __init__( - self, - weight: torch.Tensor, - weight_scale: torch.Tensor, - bias: torch.nn.Parameter, - quantize_output: bool = False, - ): - super().__init__() - self.weight = torch.nn.Parameter(weight, requires_grad=False) - self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) - self.bias = bias - self.input_scale = None - self.output_scale = None - self.quantize_output = quantize_output - - def forward(self, x): - qinput, x_input_scale = per_tensor_quantize(x) - if self.input_scale is None: - self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False) - elif x_input_scale > self.input_scale: - self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False) - output = fp8_gemm( - A=qinput, - A_scale=self.input_scale, - B=self.weight, - B_scale=self.weight_scale, - bias=self.bias, - out_dtype=x.dtype, - ) - - # Optionally, quantize output and record scale - if self.quantize_output: - qoutput, output_scale = per_tensor_quantize(output) - if self.output_scale is None: - self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False) - elif output_scale > self.output_scale: - self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False) - output = qoutput.to(output.dtype) * output_scale - - return output - - -# Module responsible for representing the final checkpoint representation -class FP8StaticLinear(torch.nn.Module): - def __init__( - self, - weight: torch.nn.Parameter, - weight_scale: torch.nn.Parameter, - bias: torch.nn.Parameter, - input_scale: torch.nn.Parameter, - output_scale: Optional[torch.nn.Parameter] = None, - ): - super().__init__() - self.weight = weight - self.weight_scale = weight_scale - self.bias = bias - self.input_scale = input_scale - self.output_scale = output_scale - - def forward(self, x): - qinput = static_per_tensor_quantize(x, self.input_scale) - output = fp8_gemm( - A=qinput, - A_scale=self.input_scale, -<<<<<<< HEAD - B=self.qweight, ->>>>>>> 3ee9283 (Support calibrating kv cache scales) - B_scale=self.weight_scale, - bias=self.bias, - out_dtype=x.dtype, - ) -<<<<<<< HEAD -======= - - if self.output_scale: - qoutput = static_per_tensor_quantize(output, self.output_scale) - output = qoutput.to(output.dtype) * self.output_scale - ->>>>>>> 3ee9283 (Support 
calibrating kv cache scales) - return output - - -# Module responsible for taking already quantized weights, and recording input scales (and possibly output scales) using an activation observer -class FP8StaticLinearQuantizer(torch.nn.Module): - def __init__( - self, - weight: torch.Tensor, - weight_scale: torch.Tensor, - bias: torch.nn.Parameter, - quantize_output: bool = False, - ): - super().__init__() - self.weight = torch.nn.Parameter(weight, requires_grad=False) - self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) - self.bias = bias - self.input_scale = None - self.output_scale = None - self.quantize_output = quantize_output - - def forward(self, x): - qinput, x_input_scale = per_tensor_quantize(x) - if self.input_scale is None: - self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False) - elif x_input_scale > self.input_scale: - self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False) - output = fp8_gemm( - A=qinput, - A_scale=self.input_scale, - B=self.weight, - B_scale=self.weight_scale, - bias=self.bias, - out_dtype=x.dtype, - ) - - # Optionally, quantize output and record scale - if self.quantize_output: - qoutput, output_scale = per_tensor_quantize(output) - if self.output_scale is None: - self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False) - elif output_scale > self.output_scale: - self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False) - output = qoutput.to(output.dtype) * output_scale - - return output - - -# Module responsible for representing the final checkpoint representation -class FP8StaticLinear(torch.nn.Module): - def __init__( - self, - weight: torch.nn.Parameter, - weight_scale: torch.nn.Parameter, - bias: torch.nn.Parameter, - input_scale: torch.nn.Parameter, - output_scale: Optional[torch.nn.Parameter] = None, - ): - super().__init__() - self.weight = weight - self.weight_scale = weight_scale - self.bias = bias - self.input_scale = input_scale - self.output_scale = output_scale - - def forward(self, x): - qinput = static_per_tensor_quantize(x, self.input_scale) - output = fp8_gemm( - A=qinput, - A_scale=self.input_scale, -======= ->>>>>>> def2049 (Fix weight name) - B=self.weight, - B_scale=self.weight_scale, - bias=self.bias, - out_dtype=x.dtype, - ) - - if self.output_scale: - qoutput = static_per_tensor_quantize(output, self.output_scale) - output = qoutput.to(output.dtype) * self.output_scale - - return output - - -def replace_module(model: AutoModelForCausalLM, name: str, new_module: torch.nn.Module): - if "." 
in name: - parent_name = name.rsplit(".", 1)[0] - child_name = name[len(parent_name) + 1 :] - parent = model.get_submodule(parent_name) - else: - parent_name = "" - parent = model - child_name = name - setattr(parent, child_name, new_module) - - -def quantize_weights( - model: AutoModelForCausalLM, - quantize_config: BaseQuantizeConfig, -): - named_modules = list(model.named_modules()) - for name, linear in tqdm.tqdm(named_modules, desc="Quantizing weights"): - if ( - not isinstance(linear, torch.nn.Linear) - or name in quantize_config.ignored_layers - ): - continue - quant_weight, weight_scale = per_tensor_quantize(linear.weight) - bias = copy.deepcopy(linear.bias) if linear.bias is not None else None - quant_linear = FP8DynamicLinear( -<<<<<<< HEAD -<<<<<<< HEAD - weight=quant_weight, weight_scale=weight_scale, bias=bias -======= - qweight=quant_weight, weight_scale=weight_scale, bias=bias ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= - weight=quant_weight, weight_scale=weight_scale, bias=bias ->>>>>>> def2049 (Fix weight name) - ) - replace_module(model, name, quant_linear) - del linear.weight - del linear.bias - del linear - cleanup_memory() - - -def quantize_activations( - model: AutoModelForCausalLM, - quantize_config: BaseQuantizeConfig, - calibration_tokens, -): - # Replace weight quantizer with a dynamic activation quantizer observer - for name, dynamic_quant_linear in model.named_modules(): - if ( - not isinstance(dynamic_quant_linear, FP8DynamicLinear) - or name in quantize_config.ignored_layers - ): - continue - quantizer = FP8StaticLinearQuantizer( -<<<<<<< HEAD -<<<<<<< HEAD - weight=dynamic_quant_linear.weight, -======= - qweight=dynamic_quant_linear.qweight, ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= - weight=dynamic_quant_linear.weight, ->>>>>>> def2049 (Fix weight name) - weight_scale=dynamic_quant_linear.weight_scale, - bias=dynamic_quant_linear.bias, - quantize_output=( - hasattr(quantize_config, "kv_cache_quant_layers") - and name in quantize_config.kv_cache_quant_layers - ), - ) - replace_module(model, name, quantizer) - del dynamic_quant_linear - cleanup_memory() - - # Pass through calibration data to measure activation scales -<<<<<<< HEAD -<<<<<<< HEAD - with torch.inference_mode(): - with tqdm.tqdm(total=calibration_tokens.shape[0], desc="Calibrating activation scales") as pbar: - for row_idx in range(calibration_tokens.shape[0]): - model(calibration_tokens[row_idx].reshape(1, -1)) - cleanup_memory() - pbar.update(1) -======= -======= ->>>>>>> 57c31bb (Use `torch.inference_mode()` for lower memory usage during calibration (#20)) - with tqdm.tqdm( - total=calibration_tokens.shape[0], desc="Calibrating activation scales" - ) as pbar: - for row_idx in range(calibration_tokens.shape[0]): - model(calibration_tokens[row_idx].reshape(1, -1)) - cleanup_memory() - pbar.update(1) -<<<<<<< HEAD ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= -======= - with torch.inference_mode(): - with tqdm.tqdm(total=calibration_tokens.shape[0], desc="Calibrating activation scales") as pbar: - for row_idx in range(calibration_tokens.shape[0]): - model(calibration_tokens[row_idx].reshape(1, -1)) - cleanup_memory() - pbar.update(1) ->>>>>>> b1c6ad6 (Use `torch.inference_mode()` for lower memory usage during calibration (#20)) ->>>>>>> 57c31bb (Use `torch.inference_mode()` for lower memory usage during calibration (#20)) - - # Replace dynamic quantizer observer with StaticLinear for export - for name, quantizer in model.named_modules(): 
- if ( - not isinstance(quantizer, FP8StaticLinearQuantizer) - or name in quantize_config.ignored_layers - ): - continue - static_proj = FP8StaticLinear( -<<<<<<< HEAD -<<<<<<< HEAD - weight=quantizer.weight, -======= - qweight=quantizer.qweight, ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= - weight=quantizer.weight, ->>>>>>> def2049 (Fix weight name) - weight_scale=quantizer.weight_scale, - bias=quantizer.bias, - input_scale=quantizer.input_scale, - output_scale=quantizer.output_scale, - ) - replace_module(model, name, static_proj) - del quantizer - cleanup_memory() - - # Post-process step for kv cache scales to take the k/v module - # `output_scale` parameters, take the max of them, and store them in - # the parent attention module as `kv_scale` - # NOTE: if we want to switch to the `output_scale` representation, we can simply remove this block - if hasattr(quantize_config, "kv_cache_quant_layers"): - # Assumes that list is ordered such that [layer0.k_proj, layer0.v_proj, layer1.k_proj, layer1.v_proj, ...] - # so we make a list of tuples [(layer0.k_proj, layer0.v_proj), (layer1.k_proj, layer1.v_proj), ...] - kv_proj_pairs = zip(*[iter(quantize_config.kv_cache_quant_layers)]*2) - for k_proj_name, v_proj_name in kv_proj_pairs: - parent_module_name = ".".join(k_proj_name.split(".")[:-1]) - assert parent_module_name == ".".join(v_proj_name.split(".")[:-1]) - parent_module = dict(model.named_modules())[parent_module_name] - - k_proj = dict(model.named_modules())[k_proj_name] - v_proj = dict(model.named_modules())[v_proj_name] - - kv_scale = max(k_proj.output_scale, v_proj.output_scale) - parent_module.kv_scale = torch.nn.Parameter(kv_scale, requires_grad=False) - - # Remove output_scale from k_proj and v_proj - k_proj.output_scale = None - v_proj.output_scale = None - cleanup_memory() - - -def save_quantized_model( - model: AutoModelForCausalLM, - quant_config: BaseQuantizeConfig, - save_dir: str, -): - print(model) - print(f"Saving the model to {save_dir}") - static_q_dict = { - "quantization_config": { - "quant_method": "fp8", - "activation_scheme": quant_config.activation_scheme, - "ignored_layers": quant_config.ignored_layers, - } - } - if hasattr(quant_config, "kv_cache_quant_layers"): - static_q_dict["quantization_config"]["kv_cache_scheme"] = "static" - model.config.update(static_q_dict) - model.save_pretrained(save_dir) - tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path) - tokenizer.save_pretrained(save_dir) From bbf352f58c1fbaffa825066d5fd1d666f255a5f5 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:36:01 -0400 Subject: [PATCH 3/9] Fix test --- tests/test_auto_fp8.py | 230 ++++++++++++----------------------------- 1 file changed, 66 insertions(+), 164 deletions(-) diff --git a/tests/test_auto_fp8.py b/tests/test_auto_fp8.py index bb852d9..dfe6e61 100644 --- a/tests/test_auto_fp8.py +++ b/tests/test_auto_fp8.py @@ -1,206 +1,108 @@ import os import shutil -<<<<<<< HEAD -<<<<<<< HEAD import pytest -======= ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= -import pytest ->>>>>>> 2739d61 (Add Qwen test) import safetensors.torch +from datasets import load_dataset from transformers import AutoTokenizer from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig MODELS = [ -<<<<<<< HEAD -<<<<<<< HEAD - ("facebook/opt-125m", 160), - ("Qwen/Qwen2-0.5B-Instruct", 620), -] - -<<<<<<< HEAD -@pytest.mark.parametrize("model_id,target_size", MODELS) -def test_dynamic_quantization(model_id, target_size): - quantized_model_dir = 
model_id.split("/")[-1] + "-fp8-dynamic" -======= -def test_dynamic_quantization(): - model_id = "facebook/opt-125m" - quantized_model_dir = "opt-125m-fp8-dynamic" ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= - "facebook/opt-125m", - "Qwen/Qwen2-0.5B-Instruct", -======= ("facebook/opt-125m", 160), -<<<<<<< HEAD - ("Qwen/Qwen2-0.5B-Instruct", 600), ->>>>>>> 415c0b7 (Add fixed target sizes) -======= ("Qwen/Qwen2-0.5B-Instruct", 620), ->>>>>>> 93c0d54 (Fix proj linear count) ] -@pytest.mark.parametrize("model_id,target_size", MODELS) -def test_dynamic_quantization(model_id, target_size): - quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic" ->>>>>>> 2739d61 (Add Qwen test) - - quantize_config = BaseQuantizeConfig( - quant_method="fp8", activation_scheme="dynamic" - ) - - model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) - model.model.to("cpu") - - model.quantize() - model.save_quantized(quantized_model_dir) - - # Measure checkpoint size and cleanup - model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") - shutil.rmtree(quantized_model_dir) +# @pytest.mark.parametrize("model_id,target_size", MODELS) +# def test_dynamic_quantization(model_id, target_size): +# quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic" -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> c3acdee (Switch from output_scale to kv_scale) - # We expect the quantized model to be a certain size - target_size = target_size * (1024 * 1024) - assert model_size < target_size +# quantize_config = BaseQuantizeConfig( +# quant_method="fp8", activation_scheme="dynamic" +# ) +# model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) +# model.model.to("cpu") -@pytest.mark.parametrize("model_id,target_size", MODELS) -def test_static_quantization(model_id, target_size): - quantized_model_dir = model_id.split("/")[-1] + "-fp8-static" -======= - # We expect the model to be < 160MB - target_size = 160 * (1024 * 1024) - assert model_size < target_size +# model.quantize() +# model.save_quantized(quantized_model_dir) +# # Measure checkpoint size and cleanup +# model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") +# shutil.rmtree(quantized_model_dir) -<<<<<<< HEAD -def test_static_quantization(): - model_id = "facebook/opt-125m" - quantized_model_dir = "opt-125m-fp8-static" ->>>>>>> 3ee9283 (Support calibrating kv cache scales) -======= -@pytest.mark.parametrize("model_id", MODELS) -def test_static_quantization(model_id): -======= - # We expect the model to be a certain size - target_size = target_size * (1024 * 1024) - assert model_size < target_size +# # We expect the quantized model to be a certain size +# target_size = target_size * (1024 * 1024) +# assert model_size < target_size @pytest.mark.parametrize("model_id,target_size", MODELS) def test_static_quantization(model_id, target_size): ->>>>>>> 415c0b7 (Add fixed target sizes) quantized_model_dir = model_id.split("/")[-1] + "-fp8-static" ->>>>>>> 2739d61 (Add Qwen test) tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) - examples = ["auto-fp8 is an easy-to-use model quantization library"] - examples = tokenizer(examples, return_tensors="pt") + ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(2)) + def preprocess(example): + example = tokenizer.apply_chat_template(example["messages"], tokenize=False) + return tokenizer( + example, + padding=False, + max_length=32, + truncation=True, + add_special_tokens=False, + ) + ds = 
ds.map(preprocess, remove_columns=ds.column_names) quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) model.model.to("cpu") - model.quantize(examples) - model.save_quantized(quantized_model_dir) - - # Measure checkpoint size and cleanup - model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") - shutil.rmtree(quantized_model_dir) - -<<<<<<< HEAD -<<<<<<< HEAD - # We expect the quantized model to be a certain size - target_size = target_size * (1024 * 1024) - assert model_size < target_size - -@pytest.mark.parametrize("model_id,target_size", MODELS) -def test_kv_cache_static_quantization(model_id, target_size): - quantized_model_dir = model_id.split("/")[-1] + "-fp8-static-kv" - - tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) - examples = ["auto-fp8 is an easy-to-use model quantization library"] - examples = tokenizer(examples, return_tensors="pt") - - quantize_config = BaseQuantizeConfig( - quant_method="fp8", - activation_scheme="static", - kv_cache_quant_targets=("k_proj", "v_proj"), - ) - - model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) - model.model.to("cpu") - - model.quantize(examples) + model.quantize(ds) model.save_quantized(quantized_model_dir) - tensors = safetensors.torch.load_file(f"{quantized_model_dir}/model.safetensors") - proj_linear_count = 0 - kv_scale_count = 0 - for name, _ in tensors.items(): - if name.endswith("k_proj.weight") or name.endswith("v_proj.weight"): - proj_linear_count += 1 - if name.endswith("kv_scale"): - kv_scale_count += 1 - assert proj_linear_count // 2 == kv_scale_count - # Measure checkpoint size and cleanup model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") shutil.rmtree(quantized_model_dir) # We expect the quantized model to be a certain size -======= - # We expect the model to be < 160MB ->>>>>>> 415c0b7 (Add fixed target sizes) -======= - # We expect the quantized model to be a certain size ->>>>>>> c3acdee (Switch from output_scale to kv_scale) target_size = target_size * (1024 * 1024) assert model_size < target_size -@pytest.mark.parametrize("model_id,target_size", MODELS) -def test_kv_cache_static_quantization(model_id, target_size): - quantized_model_dir = model_id.split("/")[-1] + "-fp8-static-kv" - - tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) - examples = ["auto-fp8 is an easy-to-use model quantization library"] - examples = tokenizer(examples, return_tensors="pt") - - quantize_config = BaseQuantizeConfig( - quant_method="fp8", - activation_scheme="static", - kv_cache_quant_targets=("k_proj", "v_proj"), - ) - - model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) - model.model.to("cpu") - - model.quantize(examples) - model.save_quantized(quantized_model_dir) - - tensors = safetensors.torch.load_file(f"{quantized_model_dir}/model.safetensors") - proj_linear_count = 0 - kv_scale_count = 0 - for name, _ in tensors.items(): - if name.endswith("k_proj.weight") or name.endswith("v_proj.weight"): - proj_linear_count += 1 - if name.endswith("kv_scale"): - kv_scale_count += 1 - assert proj_linear_count // 2 == kv_scale_count - - # Measure checkpoint size and cleanup - model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") - shutil.rmtree(quantized_model_dir) - - # We expect the quantized model to be a certain size - target_size = target_size * (1024 * 1024) - assert model_size < target_size +# 
@pytest.mark.parametrize("model_id,target_size", MODELS) +# def test_kv_cache_static_quantization(model_id, target_size): +# quantized_model_dir = model_id.split("/")[-1] + "-fp8-static-kv" + +# tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) +# examples = ["auto-fp8 is an easy-to-use model quantization library"] +# examples = tokenizer(examples, return_tensors="pt") + +# quantize_config = BaseQuantizeConfig( +# quant_method="fp8", +# activation_scheme="static", +# kv_cache_quant_targets=("k_proj", "v_proj"), +# ) + +# model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) +# model.model.to("cpu") + +# model.quantize(examples) +# model.save_quantized(quantized_model_dir) + +# tensors = safetensors.torch.load_file(f"{quantized_model_dir}/model.safetensors") +# proj_linear_count = 0 +# kv_scale_count = 0 +# for name, _ in tensors.items(): +# if name.endswith("k_proj.weight") or name.endswith("v_proj.weight"): +# proj_linear_count += 1 +# if name.endswith("kv_scale"): +# kv_scale_count += 1 +# assert proj_linear_count // 2 == kv_scale_count + +# # Measure checkpoint size and cleanup +# model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") +# shutil.rmtree(quantized_model_dir) + +# # We expect the quantized model to be a certain size +# target_size = target_size * (1024 * 1024) +# assert model_size < target_size \ No newline at end of file From ab3dad3eba0663046a8e291b6984505352c64428 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:38:02 -0400 Subject: [PATCH 4/9] Add to requirements --- requirements.txt | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index f40dfeb..3c01461 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ transformers datasets accelerate tqdm +llm-compressor @ git+https://github.com/vllm-project/llm-compressor.git diff --git a/setup.py b/setup.py index 7417754..c2b015d 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,7 @@ "datasets", "accelerate", "tqdm", + "llm-compressor @ git+https://github.com/vllm-project/llm-compressor.git" ], classifiers=[ "Programming Language :: Python :: 3", From be6eef2ea7fd37b2d189fd832cc825bcb661f594 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:40:21 -0400 Subject: [PATCH 5/9] Update example --- example_dataset.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/example_dataset.py b/example_dataset.py index 82d336e..bf6b6fd 100644 --- a/example_dataset.py +++ b/example_dataset.py @@ -9,17 +9,10 @@ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) tokenizer.pad_token = tokenizer.eos_token -MAX_SEQUENCE_LENGTH = 2048 ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512)) def preprocess(example): example = tokenizer.apply_chat_template(example["messages"], tokenize=False) - return tokenizer( - example, - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) + return tokenizer(example, max_length=2048, truncation=True, add_special_tokens=False) ds = ds.map(preprocess, remove_columns=ds.column_names) quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") From b4f830dbdfd86c2bd3fc338296c11902c6ab6181 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:41:32 -0400 Subject: [PATCH 6/9] Fix requirement --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt 
b/requirements.txt index 3c01461..191d853 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ transformers datasets accelerate tqdm -llm-compressor @ git+https://github.com/vllm-project/llm-compressor.git +llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git diff --git a/setup.py b/setup.py index c2b015d..3dcd85f 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ "datasets", "accelerate", "tqdm", - "llm-compressor @ git+https://github.com/vllm-project/llm-compressor.git" + "llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git" ], classifiers=[ "Programming Language :: Python :: 3", From af8f5a0f2ab4034501e4e6b37d5e90a9002ea040 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:54:06 -0400 Subject: [PATCH 7/9] Fix test --- auto_fp8/modeling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/auto_fp8/modeling.py b/auto_fp8/modeling.py index eb4d2ba..0e4e8cc 100644 --- a/auto_fp8/modeling.py +++ b/auto_fp8/modeling.py @@ -79,6 +79,7 @@ def quantize(self, dataset: Optional[Dataset] = None): model=self.model, dataset=dataset, recipe=recipe, + num_calibration_samples=dataset.shape[0], ) def save_quantized(self, save_directory: str): From 3063398f7c89d9f25dc86e6b77bc0ecf40b1ce7c Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:57:36 -0400 Subject: [PATCH 8/9] Test --- tests/test_auto_fp8.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_auto_fp8.py b/tests/test_auto_fp8.py index dfe6e61..6717ae1 100644 --- a/tests/test_auto_fp8.py +++ b/tests/test_auto_fp8.py @@ -56,8 +56,6 @@ def preprocess(example): quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) - model.model.to("cpu") - model.quantize(ds) model.save_quantized(quantized_model_dir) From 3f683f8617b8baaace7bd21f6c6ed36fa3ee7f0a Mon Sep 17 00:00:00 2001 From: mgoin Date: Fri, 19 Jul 2024 10:30:19 -0400 Subject: [PATCH 9/9] Add support for dynamic activation --- auto_fp8/modeling.py | 92 ++++++++++++++++++++++++++++++++++-------- tests/test_auto_fp8.py | 34 ++++++++-------- 2 files changed, 91 insertions(+), 35 deletions(-) diff --git a/auto_fp8/modeling.py b/auto_fp8/modeling.py index 0e4e8cc..79d80b2 100644 --- a/auto_fp8/modeling.py +++ b/auto_fp8/modeling.py @@ -6,6 +6,12 @@ from llmcompressor.transformers import SparseAutoModelForCausalLM from llmcompressor.transformers import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from compressed_tensors.quantization import ( + QuantizationArgs, + QuantizationType, + QuantizationScheme, + QuantizationStrategy, +) class BaseQuantizeConfig: @@ -64,23 +70,75 @@ def from_pretrained( return cls(model, quantize_config) def quantize(self, dataset: Optional[Dataset] = None): - assert ( - self.quantize_config.activation_scheme == "static" - ), "Dynamic isn't supported yet" - assert ( - dataset is not None - ), "Calibration tokens required for static activation quantization" - - recipe = QuantizationModifier( - targets="Linear", scheme="FP8", ignore=self.quantize_config.ignore_patterns - ) + if self.quantize_config.activation_scheme == "dynamic": + if dataset is None: + # For dynamic activations, we don't care about calibration data + # being provided. 
However, we need to pass something + # TODO(mgoin): Remove once llmcompressor allows no dataset + from datasets import load_dataset + dataset = load_dataset("openai/openai_humaneval", split="test").select(range(1)) + dataset = dataset.rename_column("prompt", "text") - oneshot( - model=self.model, - dataset=dataset, - recipe=recipe, - num_calibration_samples=dataset.shape[0], - ) + FP8_W8 = QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs( + num_bits=8, + type=QuantizationType.FLOAT, + strategy=QuantizationStrategy.TENSOR, + symmetric=True, + dynamic=False, + ), + ) + + recipe = QuantizationModifier( + config_groups={"group_0": FP8_W8}, + ignore=self.quantize_config.ignore_patterns, + ) + + oneshot( + model=self.model, + dataset=dataset, + recipe=recipe, + num_calibration_samples=dataset.shape[0], + ) + elif self.quantize_config.activation_scheme == "static": + assert ( + dataset is not None + ), "Calibration tokens required for static activation quantization" + + FP8_W8A8 = QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs( + num_bits=8, + type=QuantizationType.FLOAT, + strategy=QuantizationStrategy.TENSOR, + symmetric=True, + dynamic=False, + ), + input_activations=QuantizationArgs( + num_bits=8, + type=QuantizationType.FLOAT, + strategy=QuantizationStrategy.TENSOR, + symmetric=True, + dynamic=False, + ), + ) + + recipe = QuantizationModifier( + config_groups={"group_0": FP8_W8A8}, + ignore=self.quantize_config.ignore_patterns, + ) + + oneshot( + model=self.model, + dataset=dataset, + recipe=recipe, + num_calibration_samples=dataset.shape[0], + ) + else: + raise ValueError( + f"Unsupported activation_scheme={self.quantize_config.activation_scheme}" + ) def save_quantized(self, save_directory: str): self.save_pretrained(save_directory, save_compressed=True) @@ -89,4 +147,4 @@ def save_pretrained(self, save_directory: str, save_compressed: bool = True): self.model.save_pretrained(save_directory, save_compressed=save_compressed) tokenizer = AutoTokenizer.from_pretrained(self.model.config._name_or_path) tokenizer.save_pretrained(save_directory) - print(f"Saved final checkpoint to {os.path.abspath(save_directory)}") \ No newline at end of file + print(f"Saved final checkpoint to {os.path.abspath(save_directory)}") diff --git a/tests/test_auto_fp8.py b/tests/test_auto_fp8.py index 6717ae1..0322c2d 100644 --- a/tests/test_auto_fp8.py +++ b/tests/test_auto_fp8.py @@ -10,30 +10,28 @@ MODELS = [ ("facebook/opt-125m", 160), - ("Qwen/Qwen2-0.5B-Instruct", 620), + # ("Qwen/Qwen2-0.5B-Instruct", 620), ] -# @pytest.mark.parametrize("model_id,target_size", MODELS) -# def test_dynamic_quantization(model_id, target_size): -# quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic" +@pytest.mark.parametrize("model_id,target_size", MODELS) +def test_dynamic_quantization(model_id, target_size): + quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic" -# quantize_config = BaseQuantizeConfig( -# quant_method="fp8", activation_scheme="dynamic" -# ) + quantize_config = BaseQuantizeConfig( + quant_method="fp8", activation_scheme="dynamic" + ) -# model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) -# model.model.to("cpu") - -# model.quantize() -# model.save_quantized(quantized_model_dir) + model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config) + model.quantize() + model.save_quantized(quantized_model_dir) -# # Measure checkpoint size and cleanup -# model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") -# 
shutil.rmtree(quantized_model_dir) + # Measure checkpoint size and cleanup + model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors") + shutil.rmtree(quantized_model_dir) -# # We expect the quantized model to be a certain size -# target_size = target_size * (1024 * 1024) -# assert model_size < target_size + # We expect the quantized model to be a certain size + target_size = target_size * (1024 * 1024) + assert model_size < target_size @pytest.mark.parametrize("model_id,target_size", MODELS)
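
For reference, the end-to-end flow after this series looks roughly like the sketch below. It is a minimal example assembled from example_dataset.py and tests/test_auto_fp8.py in these patches; the model ID ("facebook/opt-125m"), the calibration dataset ("mgoin/ultrachat_2k"), and the sequence length are the same placeholders used there and are assumptions for illustration, not requirements of the API.

# Sketch of the llm-compressor-backed AutoFP8 flow introduced in this series.
# Model ID, dataset, and max_length mirror example_dataset.py above; adjust for a real run.
from datasets import load_dataset
from transformers import AutoTokenizer

from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

pretrained_model_dir = "facebook/opt-125m"
quantized_model_dir = "opt-125m-FP8"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# Static activation scales need calibration data; the "dynamic" scheme added in
# PATCH 9/9 quantizes weights only and can be run with no dataset at all.
ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512))

def preprocess(example):
    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return tokenizer(text, max_length=2048, truncation=True, add_special_tokens=False)

ds = ds.map(preprocess, remove_columns=ds.column_names)

quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static")

model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
model.quantize(ds)  # runs llmcompressor's oneshot() with an FP8 QuantizationModifier recipe
model.save_quantized(quantized_model_dir)  # saves compressed weights plus the tokenizer

With activation_scheme="dynamic", the same calls apply but model.quantize() may be invoked without a dataset, matching the weight-only FP8 scheme (FP8_W8) configured in PATCH 9/9.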