From 2d56538e4bf4a6b41f38fb1e8b326fecd2bf7c0e Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Tue, 14 Jan 2025 21:58:38 +0000
Subject: [PATCH] add sparsity testing with vllm; give option to not save
 compressed

---
 .../vLLM/configs/sparse2of4_fp8_dynamic.yaml  |  7 ++++++
 tests/e2e/vLLM/configs/sparse_24.yaml         |  8 ++++++
 .../Sparse_2of4/recipe_sparse_2of4.yaml       |  6 +++++
 .../recipe_sparse_2of4_fp8_dynamic.yaml       | 25 +++++++++++++++++++
 tests/e2e/vLLM/test_vllm.py                   | 11 ++++++--
 5 files changed, 55 insertions(+), 2 deletions(-)
 create mode 100644 tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic.yaml
 create mode 100644 tests/e2e/vLLM/configs/sparse_24.yaml
 create mode 100644 tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml
 create mode 100644 tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml

diff --git a/tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic.yaml b/tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic.yaml
new file mode 100644
index 000000000..e1785ce2c
--- /dev/null
+++ b/tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic.yaml
@@ -0,0 +1,7 @@
+cadence: "nightly"
+test_type: "regression"
+model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml
+scheme: sparse2of4_fp8_dynamic
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
diff --git a/tests/e2e/vLLM/configs/sparse_24.yaml b/tests/e2e/vLLM/configs/sparse_24.yaml
new file mode 100644
index 000000000..653168b97
--- /dev/null
+++ b/tests/e2e/vLLM/configs/sparse_24.yaml
@@ -0,0 +1,8 @@
+cadence: "nightly"
+test_type: "regression"
+model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml
+scheme: sparse2of4_only
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+save_compressed: False
diff --git a/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml b/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml
new file mode 100644
index 000000000..895e02450
--- /dev/null
+++ b/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml
@@ -0,0 +1,6 @@
+sparsity_stage:
+  sparsity_modifiers:
+    SparseGPTModifier:
+      sparsity: 0.5
+      mask_structure: "2:4"
+      sequential_update: false
diff --git a/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml b/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml
new file mode 100644
index 000000000..1e6b350e1
--- /dev/null
+++ b/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml
@@ -0,0 +1,25 @@
+sparsity_stage:
+  run_type: oneshot
+  sparsity_modifiers:
+    SparseGPTModifier:
+      sparsity: 0.5
+      mask_structure: "2:4"
+      sequential_update: false
+quantization_stage:
+  run_type: oneshot
+  quantization_modifiers:
+    ConstantPruningModifier:
+      targets: [
+        're:.*q_proj.weight',
+        're:.*k_proj.weight',
+        're:.*v_proj.weight',
+        're:.*o_proj.weight',
+        're:.*gate_proj.weight',
+        're:.*up_proj.weight',
+        're:.*down_proj.weight',
+      ]
+      start: 0
+    QuantizationModifier:
+      targets: ["Linear"]
+      ignore: ["lm_head"]
+      scheme: "FP8_DYNAMIC"
diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
index 3aeff6f7f..5b475a5b7 100644
--- a/tests/e2e/vLLM/test_vllm.py
+++ b/tests/e2e/vLLM/test_vllm.py
@@ -13,6 +13,7 @@
 from tests.e2e.e2e_utils import run_oneshot_for_e2e_testing
 from tests.examples.utils import requires_gpu_count
 
+"""
 try:
     from vllm import LLM, SamplingParams
 
@@ -20,6 +21,7 @@
 except ImportError:
     vllm_installed = False
     logger.warning("vllm is not installed. This test will be skipped")
+"""
 
 HF_MODEL_HUB_NAME = "nm-testing"
 TEST_DATA_FILE = os.environ.get("TEST_DATA_FILE", "")
@@ -41,7 +43,7 @@ def record_config_file(record_testsuite_property: Callable[[str, object], None])
 # Will run each test case in its own process through run_tests.sh
 # emulating vLLM CI testing
 @requires_gpu_count(1)
-@pytest.mark.skipif(not vllm_installed, reason="vLLM is not installed, skipping test")
+# @pytest.mark.skipif(not vllm_installed, reason="vLLM is not installed, skipping test")
 class TestvLLM:
     """
     The following test quantizes a model using a preset scheme or recipe,
@@ -73,6 +75,7 @@ def set_up(self):
         self.recipe = eval_config.get("recipe")
         self.quant_type = eval_config.get("quant_type")
         self.save_dir = eval_config.get("save_dir")
+        self.save_compressed = eval_config.get("save_compressed", True)
 
         logger.info("========== RUNNING ==============")
         logger.info(self.scheme)
@@ -112,7 +115,9 @@ def test_vllm(self):
         self._check_session_contains_recipe()
 
         logger.info("================= SAVING TO DISK ======================")
-        oneshot_model.save_pretrained(self.save_dir)
+        oneshot_model.save_pretrained(
+            self.save_dir, save_compressed=self.save_compressed
+        )
         tokenizer.save_pretrained(self.save_dir)
         recipe_path = os.path.join(self.save_dir, "recipe.yaml")
 
@@ -134,6 +139,7 @@ def test_vllm(self):
             folder_path=self.save_dir,
         )
 
+        """
         logger.info("================= RUNNING vLLM =========================")
 
         sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
@@ -156,6 +162,7 @@ def test_vllm(self):
             logger.info(generated_text)
 
         self.tear_down()
+        """
 
     def tear_down(self):
         if self.save_dir is not None: