From b244e0640e3ba83db412ca867e83c48b4835b076 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Mon, 13 Jan 2025 12:37:39 -0500
Subject: [PATCH 1/7] Remove old examples (#1062)

Summary
- No longer required
---
 examples/automodelforcausallm/README.md | 13 -------------
 .../run_automodelforcausallm.py         | 11 -----------
 2 files changed, 24 deletions(-)
 delete mode 100644 examples/automodelforcausallm/README.md
 delete mode 100644 examples/automodelforcausallm/run_automodelforcausallm.py

diff --git a/examples/automodelforcausallm/README.md b/examples/automodelforcausallm/README.md
deleted file mode 100644
index e40cb5c2a..000000000
--- a/examples/automodelforcausallm/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-# Loading models using `AutoModelForCausalLM`
-
-Models quantized through `llm-compressor` can be loaded directly through
-`AutoModelForCausalLM`. Note: this requires `transformers>=v4.45.0` and
-`compressed-tensors>v0.6.0`.
-
-```python
-from transformers import AutoModelForCausalLM
-
-MODEL_ID = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
-
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
-```
diff --git a/examples/automodelforcausallm/run_automodelforcausallm.py b/examples/automodelforcausallm/run_automodelforcausallm.py
deleted file mode 100644
index 791b4d3d5..000000000
--- a/examples/automodelforcausallm/run_automodelforcausallm.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-MODEL_ID = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
-
-# Use the AutoModelForCausalLM to run the model
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids
-output = model.generate(input_ids, max_new_tokens=100)
-print(tokenizer.decode(output[0]))

From a87734c835375e11d107f4150c67710bdd604284 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Mon, 13 Jan 2025 12:38:54 -0500
Subject: [PATCH 2/7] VLM: Fix typo bug in TraceableLlavaForConditionalGeneration (#1065)

## Purpose ##
* Fix bug in `maybe_install_metadata_inputs_embeds`

## Changes ##
* Rename `maybe_install_metadata_inputs_embeds` to `maybe_install_metadata_inputs_embeds_masked`
* Add TRACING comment

## Testing ##
* Llava example in #1064

---------

Signed-off-by: Kyle Sayers
---
 src/llmcompressor/transformers/tracing/llava.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/llmcompressor/transformers/tracing/llava.py b/src/llmcompressor/transformers/tracing/llava.py
index 0f993a356..cce636601 100644
--- a/src/llmcompressor/transformers/tracing/llava.py
+++ b/src/llmcompressor/transformers/tracing/llava.py
@@ -58,7 +58,7 @@ def maybe_install_metadata_image_features(
 
 
 # TRACING: The shape of inputs_embeds is known. This function compensates for
 # the fact that shape inference through `masked_scatter` is not implemented yet
-def maybe_install_metadata_inputs_embeds(
+def maybe_install_metadata_inputs_embeds_masked(
     inputs_embeds_masked: Union[torch.Tensor, HFProxy],
     inputs_embeds: Union[torch.Tensor, HFProxy],
     special_image_mask: Union[torch.Tensor, HFProxy],
@@ -70,7 +70,7 @@
         )
         inputs_embeds_masked.install_metadata(metadata)
 
-    return inputs_embeds
+    return inputs_embeds_masked
 
 
 # TRACING: override `__init__` and `forward`
@@ -153,6 +153,7 @@ def forward(
             vision_feature_select_strategy=vision_feature_select_strategy,
         )
 
+        # TRACING: install metadata
         image_features = maybe_install_metadata_image_features(
             image_features, pixel_values, self.config
         )
@@ -223,7 +224,7 @@ def forward(
             inputs_embeds_masked = inputs_embeds.masked_scatter(special_image_mask, image_features)
 
             # TRACING: install metadata
-            inputs_embeds_masked = maybe_install_metadata_inputs_embeds(inputs_embeds_masked, inputs_embeds, special_image_mask, image_features)
+            inputs_embeds_masked = maybe_install_metadata_inputs_embeds_masked(inputs_embeds_masked, inputs_embeds, special_image_mask, image_features)
             inputs_embeds = inputs_embeds_masked
 
         outputs = self.language_model(

From fcbadc78923d243892cadeb48cf3f52052e57882 Mon Sep 17 00:00:00 2001
From: Domenic Barbuzzi
Date: Mon, 13 Jan 2025 13:48:37 -0500
Subject: [PATCH 3/7] Add tests for "examples/sparse_2of4_[...]" (#1067)

SUMMARY:
Add tests for the recently added "sparse_2of4_quantization_fp8" examples
folder. This tests the `llama3_8b_2of4.py` example script in that folder when
run both with and without the `--fp8` flag.

TEST PLAN:

```shell
# using 1 GPU on beaker
$ pytest tests/examples/test_sparse_2of4_quantization_fp8.py
================================================= test session starts ==================================================
platform linux -- Python 3.9.20, pytest-8.3.4, pluggy-1.5.0 -- /home/domenic/code/llm-compressor/.venv/bin/python3
cachedir: .pytest_cache
rootdir: /home/domenic/code/llm-compressor
configfile: pyproject.toml
plugins: mock-3.14.0, rerunfailures-15.0
collected 2 items

tests/examples/test_sparse_2of4_quantization_fp8.py::TestSparse2of4QuantizationFP8::test_blah[flags0] PASSED
tests/examples/test_sparse_2of4_quantization_fp8.py::TestSparse2of4QuantizationFP8::test_blah[flags1] PASSED

============================================ 2 passed in 3393.56s (0:56:33) ============================================
```

---------

Signed-off-by: Domenic Barbuzzi
Co-authored-by: Dipika Sikka
---
 .../test_sparse_2of4_quantization_fp8.py | 33 +++++++++++++++++++
 tests/examples/utils.py                  |  7 +++-
 2 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 tests/examples/test_sparse_2of4_quantization_fp8.py

diff --git a/tests/examples/test_sparse_2of4_quantization_fp8.py b/tests/examples/test_sparse_2of4_quantization_fp8.py
new file mode 100644
index 000000000..410a4e44e
--- /dev/null
+++ b/tests/examples/test_sparse_2of4_quantization_fp8.py
@@ -0,0 +1,33 @@
+from pathlib import Path
+
+import pytest
+
+from tests.examples.utils import (
+    copy_and_run_script,
+    gen_cmd_fail_message,
+    requires_gpu_count,
+)
+
+
+@pytest.fixture
+def example_dir() -> str:
+    return "examples/sparse_2of4_quantization_fp8"
+
+
+@requires_gpu_count(1)
+class TestSparse2of4QuantizationFP8:
+    """
+    Tests for examples in the "sparse_2of4_quantization_fp8" example folder.
+    """
+
+    @pytest.mark.parametrize(("flags"), [[], ["--fp8"]])
+    def test_blah(self, example_dir: str, tmp_path: Path, flags: list[str]):
+        """
+        Tests for the "llama3_8b_2of4.py" example script.
+        """
+        script_filename = "llama3_8b_2of4.py"
+        command, result = copy_and_run_script(
+            tmp_path, example_dir, script_filename, flags=flags
+        )
+
+        assert result.returncode == 0, gen_cmd_fail_message(command, result)
diff --git a/tests/examples/utils.py b/tests/examples/utils.py
index 38ff98d64..29eba8dd4 100644
--- a/tests/examples/utils.py
+++ b/tests/examples/utils.py
@@ -68,7 +68,10 @@ def copy_and_run_command(
 
 
 def copy_and_run_script(
-    tmp_path: Path, example_dir: str, script_filename: str
+    tmp_path: Path,
+    example_dir: str,
+    script_filename: str,
+    flags: Optional[list[str]] = None,
 ) -> Tuple[List[str], CompletedProcess[str]]:
     """
     Copies the contents of example_dir (relative to the current working directory) to
@@ -81,6 +84,8 @@ def copy_and_run_script(
     :return: subprocess.CompletedProcess object
     """
     command = [sys.executable, script_filename]
+    if flags:
+        command.extend(flags)
 
     return command, copy_and_run_command(tmp_path, example_dir, command)
 

From 98688f82fe4a0e2b7b1ede4e7be0b79bf4ae1ba3 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Mon, 13 Jan 2025 17:28:11 -0500
Subject: [PATCH 4/7] VLM Image Examples (#1064)

## Purpose ##
* Provide sample generation with images in VLM examples

## Prerequisites ##
* #1065

## Changes ##
* Replace sample generation text prompt with one that also contains an image of a cat, similar to the example provided by [llava-1.5](https://huggingface.co/nm-testing/llava-1.5-7b-hf-FP8-dynamic/discussions/1)

## Testing ##
* The sample generation was run without oneshot in order to save time

Llava
```
========== SAMPLE GENERATION ==============
USER: Please describe the animal in this image
ASSISTANT: A white kitten is laying on a computer keyboard.
==========================================
```

Mllama
```
========== SAMPLE GENERATION ==============
user
Please describe the animal in this image
assistant
The image features a small white cat curled up on a computer keyboard, which is situated on a desk in front of a computer monitor. The cat's fur is short and white, and it is lying on its side, with its head resting on the keyboard and its body curled up behind it. The keyboard is white and has a standard QWERTY layout, with the cat positioned on the right-hand side. In the background, a computer monitor is visible, along with various office supplies and kn
==========================================
```

Pixtral
```
========== SAMPLE GENERATION ==============
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Please describe the animal in this image
The image features a white cat comfortably resting on a white computer keyboard. The cat appears relaxed and is lying down with its head resting on the keys. The keyboard is placed on a wooden desk. In the background, there are various items on the desk, including a computer monitor, a yellow toy, a small bowl, and a red apple. The overall scene suggests a cozy and homely environment where the cat feels at ease.
==========================================
```

---------

Signed-off-by: Kyle Sayers
Co-authored-by: Dipika Sikka
---
 examples/multimodal_vision/llava_example.py   | 23 +++++++++++++++----
 examples/multimodal_vision/mllama_example.py  | 21 ++++++++++++++---
 examples/multimodal_vision/pixtral_example.py | 23 +++++++++++++++----
 3 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py
index c86cf0dfe..6fb308e1c 100644
--- a/examples/multimodal_vision/llava_example.py
+++ b/examples/multimodal_vision/llava_example.py
@@ -1,3 +1,5 @@
+import requests
+from PIL import Image
 from transformers import AutoProcessor
 
 from llmcompressor.modifiers.quantization import GPTQModifier
@@ -23,8 +25,8 @@
     GPTQModifier(
         targets="Linear",
         scheme="W4A16",
-        ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
         sequential_targets=["LlamaDecoderLayer"],
+        ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
     ),
 ]
 
@@ -43,9 +45,22 @@
 
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
-input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
-output = model.generate(input_ids, max_new_tokens=20)
-print(processor.decode(output[0]))
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Please describe the animal in this image\n"},
+            {"type": "image"},
+        ],
+    },
+]
+prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
+raw_image = Image.open(requests.get(image_url, stream=True).raw)
+
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+output = model.generate(**inputs, max_new_tokens=100)
+print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
 
 # Save to disk compressed.
diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py
index 16c17f18e..a5f6e3921 100644
--- a/examples/multimodal_vision/mllama_example.py
+++ b/examples/multimodal_vision/mllama_example.py
@@ -1,3 +1,5 @@
+import requests
+from PIL import Image
 from transformers import AutoProcessor
 
 from llmcompressor.modifiers.quantization import GPTQModifier
@@ -42,9 +44,22 @@
 
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
-input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
-output = model.generate(input_ids, max_new_tokens=20)
-print(processor.decode(output[0]))
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Please describe the animal in this image\n"},
+            {"type": "image"},
+        ],
+    },
+]
+prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
+raw_image = Image.open(requests.get(image_url, stream=True).raw)
+
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+output = model.generate(**inputs, max_new_tokens=100)
+print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
 
 # Save to disk compressed.
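
The llava and mllama diffs above (and the pixtral diff below) share the same sample-generation block. The sketch below is a minimal standalone version of that flow, run against an already-quantized checkpoint rather than inside the calibration script; the model id is the llava-1.5 FP8 checkpoint linked in the PR description, used here purely for illustration, and the stock `LlavaForConditionalGeneration` class is assumed instead of the traceable wrapper used during calibration.

```python
import requests
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Illustrative checkpoint; substitute the output directory of the example script.
MODEL_ID = "nm-testing/llava-1.5-7b-hf-FP8-dynamic"

model = LlavaForConditionalGeneration.from_pretrained(MODEL_ID, device_map="auto")
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Build a chat-template prompt that interleaves text with one image placeholder.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Please describe the animal in this image\n"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

# Fetch the same COCO image used by the examples.
image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
raw_image = Image.open(requests.get(image_url, stream=True).raw)

inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
```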
diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py
index e068a6dc9..ecf85d8e1 100644
--- a/examples/multimodal_vision/pixtral_example.py
+++ b/examples/multimodal_vision/pixtral_example.py
@@ -1,3 +1,5 @@
+import requests
+from PIL import Image
 from transformers import AutoProcessor
 
 from llmcompressor.modifiers.quantization import GPTQModifier
@@ -23,8 +25,8 @@
     GPTQModifier(
         targets="Linear",
         scheme="W4A16",
-        ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
         sequential_targets=["MistralDecoderLayer"],
+        ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
     ),
 ]
 
@@ -43,9 +45,22 @@
 
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
-input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
-output = model.generate(input_ids, max_new_tokens=20)
-print(processor.decode(output[0]))
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Please describe the animal in this image\n"},
+            {"type": "image"},
+        ],
+    },
+]
+prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
+raw_image = Image.open(requests.get(image_url, stream=True).raw)
+
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+output = model.generate(**inputs, max_new_tokens=100)
+print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
 
 # Save to disk compressed.

From 625e127541867d98ee08bac502581a32d4018b63 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Tue, 14 Jan 2025 10:43:23 -0500
Subject: [PATCH 5/7] Add quick warning for DeepSeek with transformers 4.48.0 (#1066)

Summary
- Add warning about import error for DeepSeek with transformers 4.48.0

---------

Co-authored-by: Kyle Sayers
---
 examples/quantizing_moe/deepseek_moe_w4a16.py     | 4 ++++
 examples/quantizing_moe/deepseek_moe_w8a8_fp8.py  | 4 ++++
 examples/quantizing_moe/deepseek_moe_w8a8_int8.py | 4 ++++
 3 files changed, 12 insertions(+)

diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py
index 3d7d33099..55a7021b4 100644
--- a/examples/quantizing_moe/deepseek_moe_w4a16.py
+++ b/examples/quantizing_moe/deepseek_moe_w4a16.py
@@ -5,6 +5,10 @@
 from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
 
+# NOTE: transformers 4.48.0 has an import error with DeepSeek.
+# Please consider either downgrading your transformers version to a
+# previous version or upgrading to a version where this bug is fixed
+
 # select a Mixture of Experts model for quantization
 MODEL_ID = "deepseek-ai/DeepSeek-V2.5"
 
diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py
index 666da8f9a..cda202eb9 100644
--- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py
+++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py
@@ -4,6 +4,10 @@
 from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.transformers import oneshot
 
+# NOTE: transformers 4.48.0 has an import error with DeepSeek.
+# Please consider either downgrading your transformers version to a
+# previous version or upgrading to a version where this bug is fixed
+
 # select a Mixture of Experts model for quantization
 MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 
diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py
index ba215aa9e..289f4234f 100644
--- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py
+++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py
@@ -6,6 +6,10 @@
 from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
 
+# NOTE: transformers 4.48.0 has an import error with DeepSeek.
+# Please consider either downgrading your transformers version to a
+# previous version or upgrading to a version where this bug is fixed
+
 # select a Mixture of Experts model for quantization
 MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 

From 28fff75069edd8fffbe294c1562b5324e53202d8 Mon Sep 17 00:00:00 2001
From: George
Date: Tue, 14 Jan 2025 12:15:35 -0500
Subject: [PATCH 6/7] [KV Cache] kv-cache end to end unit tests (#141)

SUMMARY:
Tests for https://github.com/neuralmagic/compressed-tensors/pull/148

Note: Blocked on a transformers update that includes
https://github.com/huggingface/transformers/commit/181c962aabb4be59ddba82071d82adbbe3b3922d

---------

Co-authored-by: Dipika Sikka
---
 .../transformers/kv_cache/test_kv_cache.py | 252 ++++++++++++++++++
 1 file changed, 252 insertions(+)
 create mode 100644 tests/llmcompressor/transformers/kv_cache/test_kv_cache.py

diff --git a/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py b/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py
new file mode 100644
index 000000000..f98a06a91
--- /dev/null
+++ b/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py
@@ -0,0 +1,252 @@
+import os
+from pathlib import Path
+
+import pytest
+from accelerate import init_empty_weights
+from compressed_tensors.quantization.lifecycle import KVCacheScaleType
+from compressed_tensors.quantization.utils.helpers import iter_named_quantizable_modules
+from datasets import load_dataset
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor.core import reset_session
+from llmcompressor.transformers import oneshot
+
+NUM_CALIBRATION_SAMPLES = 16
+MAX_SEQUENCE_LENGTH = 512
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+
+MODEL_IDS = [
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "microsoft/Phi-3-mini-4k-instruct",
+]
+
+
+@pytest.fixture(scope="session")
+def oneshot_fixture():
+    def _oneshot_fixture(tmp_path: Path):
+        num_bits = 8
+        _type = "float"
+        strategy = "tensor"
+        dynamic = False
+        symmetric = True
+        recipe = f"""
+        quant_stage:
+            quant_modifiers:
+                QuantizationModifier:
+                    kv_cache_scheme:
+                        num_bits: {num_bits}
+                        type: {_type}
+                        strategy: {strategy}
+                        dynamic: {dynamic}
+                        symmetric: {symmetric}
+        """
+        used_args = dict(
+            num_bits=num_bits,
+            _type=_type,
+            strategy=strategy,
+            dynamic=dynamic,
+            symmetric=symmetric,
+        )
+        oneshot_args = dict(
+            dataset="open_platypus",
+            recipe=recipe,
+            num_calibration_samples=16,
+        )
+        for model_id in MODEL_IDS:
+            oneshot_args["output_dir"] = os.path.join(tmp_path, model_id)
+            used_args["output_dir"] = oneshot_args["output_dir"]
+            yield oneshot(model=model_id, **oneshot_args), used_args
+
+    return _oneshot_fixture
+
+
+@pytest.fixture(scope="session")
+def kv_cache_fixture():
+    def _kv_cache_fixture(recipe: str, tmp_path: Path):
+        num_bits = 8
+        _type = "float"
+        strategy = "tensor"
+        dynamic = False
+        symmetric = True
+
+        recipe = recipe.format(
+            num_bits=num_bits,
+            _type=_type,
+            strategy=strategy,
+            dynamic=dynamic,
+            symmetric=symmetric,
+        )
+
+        model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+
+        ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
+        ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+        def preprocess(example):
+            return {
+                "text": tokenizer.apply_chat_template(
+                    example["messages"],
+                    tokenize=False,
+                )
+            }
+
+        ds = ds.map(preprocess)
+
+        def tokenize(sample):
+            return tokenizer(
+                sample["text"],
+                padding=False,
+                max_length=MAX_SEQUENCE_LENGTH,
+                truncation=True,
+                add_special_tokens=False,
+            )
+
+        ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+        output_dir = os.path.join(tmp_path, model_id[-1].replace("-", "_"))
+
+        oneshot_args = dict(
+            model=model_id,
+            dataset=ds,
+            recipe=recipe,
+            max_seq_length=MAX_SEQUENCE_LENGTH,
+            num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+            output_dir=output_dir,
+        )
+
+        oneshot(**oneshot_args)
+        reset_session()
+
+        yield (
+            output_dir,
+            {
+                "num_bits": num_bits,
+                "_type": _type,
+                "strategy": strategy,
+                "dynamic": dynamic,
+                "symmetric": symmetric,
+                "output_dir": output_dir,
+            },
+        )
+
+    return _kv_cache_fixture
+
+
+def test_kv_cache_config_format(oneshot_fixture, tmp_path):
+    _, used_args = next(oneshot_fixture(tmp_path))
+    output_dir = used_args["output_dir"]
+    config = AutoConfig.from_pretrained(str(output_dir))
+    quant_config = config.quantization_config
+    assert quant_config is not None
+    assert quant_config["kv_cache_scheme"] is not None
+
+    kv_cache_scheme = quant_config["kv_cache_scheme"]
+    assert kv_cache_scheme["num_bits"] == used_args["num_bits"]
+    assert kv_cache_scheme["type"] == used_args["_type"]
+    assert kv_cache_scheme["strategy"] == used_args["strategy"]
+    assert kv_cache_scheme["dynamic"] == used_args["dynamic"]
+    assert kv_cache_scheme["symmetric"] == used_args["symmetric"]
+
+
+def test_kv_cache_model_state_dict_attr(oneshot_fixture, tmp_path):
+    model, used_args = next(oneshot_fixture(tmp_path))
+    output_dir = used_args["output_dir"]
+    with init_empty_weights():
+        model = AutoModelForCausalLM.from_pretrained(str(output_dir))
+
+    counts = 0
+    for name, submodule in iter_named_quantizable_modules(
+        model, include_children=False, include_attn=True
+    ):
+        counts += 1
+        assert "self_attn" in name
+        assert hasattr(submodule, KVCacheScaleType.VALUE.value)
+        assert hasattr(submodule, KVCacheScaleType.KEY.value)
+    assert counts > 0
+
+
+def test_kv_cache_gptq_config_format(kv_cache_fixture, tmp_path):
+    recipe = """
+    quant_stage:
+        quant_modifiers:
+            QuantizationModifier:
+                kv_cache_scheme:
+                    num_bits: {num_bits}
+                    type: {_type}
+                    strategy: {strategy}
+                    dynamic: {dynamic}
+                    symmetric: {symmetric}
+    """
+
+    output_dir, used_args = next(kv_cache_fixture(recipe, tmp_path))
+
+    config = AutoConfig.from_pretrained(output_dir)
+    quant_config = config.quantization_config
+    assert quant_config is not None
+    assert quant_config.get("kv_cache_scheme") is not None
+
+    kv_cache_scheme = quant_config["kv_cache_scheme"]
+    assert kv_cache_scheme["num_bits"] == used_args["num_bits"]
+    assert kv_cache_scheme["type"] == used_args["_type"]
+    assert kv_cache_scheme["strategy"] == used_args["strategy"]
+    assert kv_cache_scheme["dynamic"] == used_args["dynamic"]
+    assert kv_cache_scheme["symmetric"] == used_args["symmetric"]
+
+    with init_empty_weights():
+        model = AutoModelForCausalLM.from_pretrained(output_dir)
+
+    counts = 0
+    for name, submodule in iter_named_quantizable_modules(
+        model, include_children=False, include_attn=True
+    ):
+        counts += 1
+        assert "self_attn" in name
+        assert hasattr(submodule, KVCacheScaleType.VALUE.value)
+        assert hasattr(submodule, KVCacheScaleType.KEY.value)
+
+    assert counts > 0
+
+
+def test_kv_cache_gptq_model_state_dict_attr(kv_cache_fixture, tmp_path):
+    recipe = """
+    quant_stage:
+        quant_modifiers:
+            QuantizationModifier:
+                kv_cache_scheme:
+                    num_bits: {num_bits}
+                    type: {_type}
+                    strategy: {strategy}
+                    dynamic: {dynamic}
+                    symmetric: {symmetric}
+            GPTQModifier:
+                sequential_update: false
+                ignore: ["lm_head"]
+                config_groups:
+                    group_0:
+                        weights:
+                            num_bits: 4
+                            type: "int"
+                            symmetric: true
+                            strategy: "channel"
+                            actorder: False
+                        targets: ["Linear"]
+    """
+
+    output_dir, _ = next(kv_cache_fixture(recipe, tmp_path))
+
+    with init_empty_weights():
+        model = AutoModelForCausalLM.from_pretrained(output_dir)
+
+    counts = 0
+    for name, submodule in iter_named_quantizable_modules(
+        model, include_children=False, include_attn=True
+    ):
+        counts += 1
+        assert "self_attn" in name
+        assert hasattr(submodule, KVCacheScaleType.VALUE.value)
+        assert hasattr(submodule, KVCacheScaleType.KEY.value)
+
+    assert counts > 0

From fe318295271a4e001f0687acdcbdd5b698a12194 Mon Sep 17 00:00:00 2001
From: George
Date: Tue, 14 Jan 2025 12:54:05 -0500
Subject: [PATCH 7/7] [E2E Testing] Fix HF upload (#1061)

SUMMARY:
Fix for e2e tests for models that do not exist on HF. The error comes from the
target repo not existing on HF, so the test tries to push to a non-existent
repo.

Error from e2e:
https://github.com/neuralmagic/llm-compressor-testing/actions/runs/12729284591/job/35480943434#step:15:6775

TEST PLAN:
Verified that e2e works with one existing config and all of the new kv-cache
configs locally, pushing to nm-testing.

Note: [Custom test runners](https://github.com/neuralmagic/llm-compressor-testing#custom-test-runs) are down.

---------

Co-authored-by: Dipika Sikka
---
 tests/e2e/vLLM/test_vllm.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
index 3aeff6f7f..e554ad3ff 100644
--- a/tests/e2e/vLLM/test_vllm.py
+++ b/tests/e2e/vLLM/test_vllm.py
@@ -22,6 +22,7 @@
     logger.warning("vllm is not installed. This test will be skipped")
 
 HF_MODEL_HUB_NAME = "nm-testing"
+
 TEST_DATA_FILE = os.environ.get("TEST_DATA_FILE", "")
 
 EXPECTED_SAVED_FILES = [
@@ -129,8 +130,17 @@ def test_vllm(self):
 
         logger.info("================= UPLOADING TO HUB ======================")
 
+        stub = f"{HF_MODEL_HUB_NAME}/{self.save_dir}-e2e"
+
+        self.api.create_repo(
+            repo_id=stub,
+            exist_ok=True,
+            repo_type="model",
+            private=False,
+        )
+
         self.api.upload_folder(
-            repo_id=f"{HF_MODEL_HUB_NAME}/{self.save_dir}-e2e",
+            repo_id=stub,
             folder_path=self.save_dir,
         )
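
The upload flow introduced by this last patch follows the standard `huggingface_hub` create-then-upload pattern: create the target repository idempotently (with `exist_ok=True`) so the subsequent folder upload cannot fail on a missing repo. A minimal standalone sketch of that pattern is below; it assumes `self.api` in the test is an `HfApi` instance, and the repo id and local path are hypothetical placeholders rather than values taken from the test suite.

```python
from huggingface_hub import HfApi

api = HfApi()

# Hypothetical values; the e2e test derives these from HF_MODEL_HUB_NAME and save_dir.
stub = "nm-testing/example-quantized-model-e2e"
save_dir = "./example-quantized-model"

# Create the repo first; exist_ok=True makes this a no-op when the repo already
# exists, so the call is safe to run unconditionally before every upload.
api.create_repo(repo_id=stub, repo_type="model", private=False, exist_ok=True)

# Push the serialized model folder to the (newly or previously) created repo.
api.upload_folder(repo_id=stub, folder_path=save_dir)
```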