From 2d56538e4bf4a6b41f38fb1e8b326fecd2bf7c0e Mon Sep 17 00:00:00 2001 From: Dipika <dipikasikka1@gmail.com> Date: Tue, 14 Jan 2025 21:58:38 +0000 Subject: [PATCH] add sparsity testing with vllm; give option to not save compressed --- .../vLLM/configs/sparse2of4_fp8_dynamic.yaml | 7 ++++++ tests/e2e/vLLM/configs/sparse_24.yaml | 8 ++++++ .../Sparse_2of4/recipe_sparse_2of4.yaml | 6 +++++ .../recipe_sparse_2of4_fp8_dynamic.yaml | 25 +++++++++++++++++++ tests/e2e/vLLM/test_vllm.py | 11 ++++++-- 5 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic.yaml create mode 100644 tests/e2e/vLLM/configs/sparse_24.yaml create mode 100644 tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml create mode 100644 tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml diff --git a/tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic.yaml b/tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic.yaml new file mode 100644 index 000000000..e1785ce2c --- /dev/null +++ b/tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic.yaml @@ -0,0 +1,7 @@ +cadence: "nightly" +test_type: "regression" +model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 +recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml +scheme: sparse2of4_fp8_dynamic +dataset_id: HuggingFaceH4/ultrachat_200k +dataset_split: train_sft \ No newline at end of file diff --git a/tests/e2e/vLLM/configs/sparse_24.yaml b/tests/e2e/vLLM/configs/sparse_24.yaml new file mode 100644 index 000000000..653168b97 --- /dev/null +++ b/tests/e2e/vLLM/configs/sparse_24.yaml @@ -0,0 +1,8 @@ +cadence: "nightly" +test_type: "regression" +model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 +recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml +scheme: sparse2of4_only +dataset_id: HuggingFaceH4/ultrachat_200k +dataset_split: train_sft +save_compressed: False \ No newline at end of file diff --git a/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml b/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml new file mode 100644 index 000000000..895e02450 --- /dev/null +++ b/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml @@ -0,0 +1,6 @@ +sparsity_stage: + sparsity_modifiers: + SparseGPTModifier: + sparsity: 0.5 + mask_structure: "2:4" + sequential_update: false diff --git a/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml b/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml new file mode 100644 index 000000000..1e6b350e1 --- /dev/null +++ b/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml @@ -0,0 +1,25 @@ +sparsity_stage: + run_type: oneshot + sparsity_modifiers: + SparseGPTModifier: + sparsity: 0.5 + mask_structure: "2:4" + sequential_update: false +quantization_stage: + run_type: oneshot + quantization_modifiers: + ConstantPruningModifier: + targets: [ + 're:.*q_proj.weight', + 're:.*k_proj.weight', + 're:.*v_proj.weight', + 're:.*o_proj.weight', + 're:.*gate_proj.weight', + 're:.*up_proj.weight', + 're:.*down_proj.weight', + ] + start: 0 + QuantizationModifier: + targets: ["Linear"] + ignore: ["lm_head"] + scheme: "FP8_DYNAMIC" diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py index 3aeff6f7f..5b475a5b7 100644 --- a/tests/e2e/vLLM/test_vllm.py +++ b/tests/e2e/vLLM/test_vllm.py @@ -13,6 +13,7 @@ from tests.e2e.e2e_utils import run_oneshot_for_e2e_testing from tests.examples.utils import requires_gpu_count +""" try: from vllm import LLM, SamplingParams @@ -20,6 +21,7 @@ except ImportError: vllm_installed = False logger.warning("vllm is not installed. This test will be skipped") +""" HF_MODEL_HUB_NAME = "nm-testing" TEST_DATA_FILE = os.environ.get("TEST_DATA_FILE", "") @@ -41,7 +43,7 @@ def record_config_file(record_testsuite_property: Callable[[str, object], None]) # Will run each test case in its own process through run_tests.sh # emulating vLLM CI testing @requires_gpu_count(1) -@pytest.mark.skipif(not vllm_installed, reason="vLLM is not installed, skipping test") +# @pytest.mark.skipif(not vllm_installed, reason="vLLM is not installed, skipping test") class TestvLLM: """ The following test quantizes a model using a preset scheme or recipe, @@ -73,6 +75,7 @@ def set_up(self): self.recipe = eval_config.get("recipe") self.quant_type = eval_config.get("quant_type") self.save_dir = eval_config.get("save_dir") + self.save_compressed = eval_config.get("save_compressed", True) logger.info("========== RUNNING ==============") logger.info(self.scheme) @@ -112,7 +115,9 @@ def test_vllm(self): self._check_session_contains_recipe() logger.info("================= SAVING TO DISK ======================") - oneshot_model.save_pretrained(self.save_dir) + oneshot_model.save_pretrained( + self.save_dir, save_compressed=self.save_compressed + ) tokenizer.save_pretrained(self.save_dir) recipe_path = os.path.join(self.save_dir, "recipe.yaml") @@ -134,6 +139,7 @@ def test_vllm(self): folder_path=self.save_dir, ) + """ logger.info("================= RUNNING vLLM =========================") sampling_params = SamplingParams(temperature=0.80, top_p=0.95) @@ -156,6 +162,7 @@ def test_vllm(self): logger.info(generated_text) self.tear_down() + """ def tear_down(self): if self.save_dir is not None: