|
| 1 | +import os |
| 2 | +import shutil |
| 3 | +from pathlib import Path |
| 4 | + |
| 5 | +import numpy |
| 6 | +import pytest |
| 7 | +import yaml |
| 8 | +from loguru import logger |
| 9 | + |
| 10 | +from llmcompressor.core import active_session |
| 11 | +from tests.e2e.e2e_utils import run_oneshot_for_e2e_testing |
| 12 | +from tests.examples.utils import requires_gpu_count |
| 13 | + |
# lm_eval is an optional dependency: record whether it could be imported so
# the test class below can be skipped (rather than fail at import time).
try:
    import lm_eval
except ImportError:
    lm_eval_installed = False
    logger.warning("lm_eval is not installed. This test will be skipped")
else:
    lm_eval_installed = True

# Path to the YAML config describing the test case to run; None when unset.
TEST_DATA_FILE = os.environ.get("TEST_DATA_FILE")
| 23 | + |
| 24 | + |
| 25 | +# Will run each test case in its own process through run_tests.sh |
| 26 | +# emulating vLLM CI testing |
@requires_gpu_count(1)
@pytest.mark.skipif(
    not lm_eval_installed, reason="lm eval is not installed, skipping test"
)
class TestLMEval:
    """
    The following test quantizes a model using a preset scheme or recipe,
    and then evaluates the model using LM Eval. Each test case is focused on a
    specific quantization type (e.g W4A16 with grouped quantization,
    W4A16 with channel quantization). To add a new test case, a new config has to be
    added to the lm_eval_configs folder. The tests run on a cadence defined by the
    `cadence` field. Each config defines the model to quantize. Optionally, a dataset
    id and split can be provided for calibration. Finally, all config files must list
    a scheme. The scheme can be a preset scheme from
    https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py
    or another identifier which can be used for the particular test case. If a recipe
    is not provided, it is assumed that the scheme provided is a preset scheme and will
    be used for quantization. Otherwise, the recipe will always be used if given.
    """  # noqa: E501

    def set_up(self):
        """Load the test-case config named by ``TEST_DATA_FILE`` onto ``self``.

        Skips the test when ``TEST_DATA_FILE`` is unset, or when the config's
        ``cadence`` differs from the ``CADENCE`` environment variable
        (which defaults to ``"commit"``).
        """
        # Without a config file there is nothing to run; skip with a clear
        # reason instead of failing on Path(None).
        if not TEST_DATA_FILE:
            pytest.skip("Skipping test; TEST_DATA_FILE is not set")

        eval_config = yaml.safe_load(Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

        if os.environ.get("CADENCE", "commit") != eval_config.get("cadence"):
            pytest.skip("Skipping test; cadence mismatch")

        # "model" is the only mandatory key; everything else may be None.
        self.model = eval_config["model"]
        self.scheme = eval_config.get("scheme")
        self.dataset_id = eval_config.get("dataset_id")
        self.dataset_config = eval_config.get("dataset_config")
        self.dataset_split = eval_config.get("dataset_split")
        self.recipe = eval_config.get("recipe")
        self.quant_type = eval_config.get("quant_type")
        self.save_dir = eval_config.get("save_dir")
        self.task = eval_config.get("task")
        self.num_fewshot = eval_config.get("num_fewshot")
        self.limit = eval_config.get("limit")
        # Expected scores, keyed in the config by the lm_eval metric names.
        self.exact_flex = eval_config.get("exact_match,flexible-extract")
        self.exact_strict = eval_config.get("exact_match,strict-match")

        logger.info("========== RUNNING ==============")
        logger.info(self.scheme)

        self.device = "cuda:0"
        self.num_calibration_samples = 256
        self.max_seq_length = 2048

    def test_lm_eval(self):
        """Quantize the configured model, save it, and check its LM Eval scores.

        The saved model directory is removed afterwards even if quantization,
        evaluation, or one of the score assertions fails.
        """
        self.set_up()
        if not self.save_dir:
            # Default to "<model name>-<scheme>"; Path(...).name also copes
            # with model ids that contain no "/" or more than one.
            self.save_dir = f"{Path(self.model).name}-{self.scheme}"

        try:
            # Quantize the model with the configured scheme/recipe.
            oneshot_model, tokenizer = run_oneshot_for_e2e_testing(
                model=self.model,
                device=self.device,
                num_calibration_samples=self.num_calibration_samples,
                max_seq_length=self.max_seq_length,
                scheme=self.scheme,
                dataset_id=self.dataset_id,
                dataset_config=self.dataset_config,
                dataset_split=self.dataset_split,
                recipe=self.recipe,
                quant_type=self.quant_type,
            )

            logger.info("================= SAVING TO DISK ======================")
            oneshot_model.save_pretrained(self.save_dir)
            tokenizer.save_pretrained(self.save_dir)
            recipe_path = os.path.join(self.save_dir, "recipe.yaml")

            # Use the session to fetch the recipe;
            # Reset session for next test case
            session = active_session()
            recipe_yaml_str = session.get_serialized_recipe()
            with open(recipe_path, "w", encoding="utf-8") as fp:
                fp.write(recipe_yaml_str)
            session.reset()

            logger.info("================= Running LM Eval ======================")

            # Evaluate the model that was just written to disk.
            model_args = f"pretrained={self.save_dir}"
            results = lm_eval.simple_evaluate(
                model="hf",
                model_args=model_args,
                tasks=[self.task],
                num_fewshot=self.num_fewshot,
                limit=self.limit,
                device=self.device,
                batch_size=100,
            )

            metrics = results["results"][self.task]
            exact_match_strict = metrics.get("exact_match,strict-match")
            exact_match_flex = metrics.get("exact_match,flexible-extract")
            logger.info("Exact Match, Strict")
            logger.info(exact_match_strict)
            logger.info("Exact Match, Flex")
            logger.info(exact_match_flex)
            self._assert_metric_close(
                "exact_match,strict-match", exact_match_strict, self.exact_strict
            )
            self._assert_metric_close(
                "exact_match,flexible-extract", exact_match_flex, self.exact_flex
            )
        finally:
            # Always clean up, otherwise a failing run leaves the saved
            # model on disk.
            self.tear_down()

    @staticmethod
    def _assert_metric_close(name, actual, expected, rtol=0.05):
        """Assert that ``actual`` is within ``rtol`` (relative) of ``expected``.

        Fails with an explicit message when either value is missing, rather
        than letting ``numpy.isclose`` raise on ``None``.
        """
        assert actual is not None, f"lm_eval did not report metric '{name}'"
        assert (
            expected is not None
        ), f"test config does not define an expected value for '{name}'"
        assert numpy.isclose(
            actual, expected, rtol=rtol
        ), f"{name}: got {actual}, expected {expected} (rtol={rtol})"

    def tear_down(self):
        """Remove the saved model directory, if one was created."""
        save_dir = getattr(self, "save_dir", None)
        # The directory may not exist if the run failed before saving; do not
        # let cleanup raise and mask the original error.
        if save_dir is not None and os.path.isdir(save_dir):
            shutil.rmtree(save_dir)
0 commit comments