
Commit 606aab2

Add LM Eval Testing (#945)
* allow LM Eval testing
* log metrics
* fix check
* remove upload for now
* fix docstring
* add config; add version
* update
1 parent 9f58887 commit 606aab2

File tree (6 files changed: +170 −11)

- setup.py
- two new LM Eval config YAMLs (filenames not captured in this view)
- tests/e2e/vLLM/run_tests.sh
- tests/e2e/vLLM/test_lmeval.py
- tests/e2e/vLLM/test_vllm.py

setup.py (+1)

@@ -71,6 +71,7 @@
     "pytest-mock>=3.6.0",
     "pytest-rerunfailures>=13.0",
     "parameterized",
+    "lm_eval==0.4.5",
     # example test dependencies
     "beautifulsoup4~=4.12.3",
     "cmarkgfm~=2024.1.14",
New file (+8). The config's filename is not captured in this view; per the test docstring below, LM Eval configs live in the lm_eval_configs folder.

cadence: "weekly"
model: meta-llama/Meta-Llama-3-8B-Instruct
scheme: FP8_DYNAMIC
num_fewshot: 5
limit: 1000
task: "gsm8k"
exact_match,flexible-extract: 0.753
exact_match,strict-match: 0.753
New file (+8). The INT8 counterpart; filename likewise not captured.

cadence: "weekly"
model: meta-llama/Meta-Llama-3-8B-Instruct
scheme: INT8
num_fewshot: 5
limit: 250
task: "gsm8k"
exact_match,flexible-extract: 0.728
exact_match,strict-match: 0.728
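Configs like these are read by test_lmeval.py through the TEST_DATA_FILE and CADENCE environment variables (see the test further down). A minimal sketch of exercising a single config directly, with the config path assumed since the new filenames are not visible here:

# Config path is hypothetical; this diff does not show where the INT8 config was saved.
CADENCE=weekly \
TEST_DATA_FILE=tests/e2e/vLLM/lm_eval_configs/int8.yaml \
pytest --capture=tee-sys tests/e2e/vLLM/test_lmeval.py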

tests/e2e/vLLM/run_tests.sh (+16 −4)

@@ -2,10 +2,22 @@
 
 SUCCESS=0
 
-# Parse list of configs.
-MODEL_CONFIGS="$PWD/tests/e2e/vLLM/configs"
+while getopts "c:t:" OPT; do
+  case ${OPT} in
+    c )
+      CONFIG="$OPTARG"
+      ;;
+    t )
+      TEST="$OPTARG"
+      ;;
+    \? )
+      exit 1
+      ;;
+  esac
+done
 
-for MODEL_CONFIG in "$MODEL_CONFIGS"/*
+# Parse list of configs.
+for MODEL_CONFIG in "$CONFIG"/*
 do
     LOCAL_SUCCESS=0
 
@@ -15,7 +27,7 @@ do
     pytest \
         --capture=tee-sys \
         --junitxml="test-results/e2e-$(date +%s).xml" \
-        "$PWD/tests/e2e/vLLM/test_vllm.py" || LOCAL_SUCCESS=$?
+        "$TEST" || LOCAL_SUCCESS=$?
 
     if [[ $LOCAL_SUCCESS == 0 ]]; then
         echo "=== PASSED MODEL: $MODEL_CONFIG ==="

tests/e2e/vLLM/test_lmeval.py (new file, +131)

import os
import shutil
from pathlib import Path

import numpy
import pytest
import yaml
from loguru import logger

from llmcompressor.core import active_session
from tests.e2e.e2e_utils import run_oneshot_for_e2e_testing
from tests.examples.utils import requires_gpu_count

try:
    import lm_eval

    lm_eval_installed = True
except ImportError:
    lm_eval_installed = False
    logger.warning("lm_eval is not installed. This test will be skipped")

TEST_DATA_FILE = os.environ.get("TEST_DATA_FILE", None)


# Each test case runs in its own process through run_tests.sh,
# emulating vLLM CI testing
@requires_gpu_count(1)
@pytest.mark.skipif(
    not lm_eval_installed, reason="lm eval is not installed, skipping test"
)
class TestLMEval:
    """
    The following test quantizes a model using a preset scheme or recipe,
    and then evaluates the model using LM Eval. Each test case is focused on a
    specific quantization type (e.g., W4A16 with grouped quantization,
    W4A16 with channel quantization). To add a new test case, a new config has to
    be added to the lm_eval_configs folder. The tests run on a cadence defined by
    the `cadence` field. Each config defines the model to quantize. Optionally, a
    dataset id and split can be provided for calibration. Finally, all config
    files must list a scheme. The scheme can be a preset scheme from
    https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py
    or another identifier which can be used for the particular test case. If a
    recipe is not provided, it is assumed that the scheme provided is a preset
    scheme and will be used for quantization. Otherwise, the recipe will always
    be used if given.
    """  # noqa: E501

    def set_up(self):
        eval_config = yaml.safe_load(Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

        # Only run configs whose cadence matches the current CADENCE env var
        if os.environ.get("CADENCE", "commit") != eval_config.get("cadence"):
            pytest.skip("Skipping test; cadence mismatch")

        self.model = eval_config["model"]
        self.scheme = eval_config.get("scheme")
        self.dataset_id = eval_config.get("dataset_id")
        self.dataset_config = eval_config.get("dataset_config")
        self.dataset_split = eval_config.get("dataset_split")
        self.recipe = eval_config.get("recipe")
        self.quant_type = eval_config.get("quant_type")
        self.save_dir = eval_config.get("save_dir")
        self.task = eval_config.get("task")
        self.num_fewshot = eval_config.get("num_fewshot")
        self.limit = eval_config.get("limit")
        self.exact_flex = eval_config.get("exact_match,flexible-extract")
        self.exact_strict = eval_config.get("exact_match,strict-match")

        logger.info("========== RUNNING ==============")
        logger.info(self.scheme)

        self.device = "cuda:0"
        self.num_calibration_samples = 256
        self.max_seq_length = 2048

    def test_lm_eval(self):
        # Load the config and quantize the model with the configured scheme/recipe
        self.set_up()
        if not self.save_dir:
            self.save_dir = self.model.split("/")[1] + f"-{self.scheme}"
        oneshot_model, tokenizer = run_oneshot_for_e2e_testing(
            model=self.model,
            device=self.device,
            num_calibration_samples=self.num_calibration_samples,
            max_seq_length=self.max_seq_length,
            scheme=self.scheme,
            dataset_id=self.dataset_id,
            dataset_config=self.dataset_config,
            dataset_split=self.dataset_split,
            recipe=self.recipe,
            quant_type=self.quant_type,
        )

        logger.info("================= SAVING TO DISK ======================")
        oneshot_model.save_pretrained(self.save_dir)
        tokenizer.save_pretrained(self.save_dir)
        recipe_path = os.path.join(self.save_dir, "recipe.yaml")

        # Use the session to fetch the recipe;
        # reset the session for the next test case
        session = active_session()
        recipe_yaml_str = session.get_serialized_recipe()
        with open(recipe_path, "w") as fp:
            fp.write(recipe_yaml_str)
        session.reset()

        logger.info("================= Running LM Eval ======================")

        model_args = f"pretrained={self.save_dir}"
        results = lm_eval.simple_evaluate(
            model="hf",
            model_args=model_args,
            tasks=[self.task],
            num_fewshot=self.num_fewshot,
            limit=self.limit,
            device="cuda:0",
            batch_size=100,
        )

        # Compare measured metrics against the expected values from the config,
        # within a 5% relative tolerance
        metrics = results["results"][self.task]
        exact_match_strict = metrics.get("exact_match,strict-match")
        exact_match_flex = metrics.get("exact_match,flexible-extract")
        logger.info("Exact Match, Strict")
        logger.info(exact_match_strict)
        logger.info("Exact Match, Flex")
        logger.info(exact_match_flex)
        assert numpy.isclose(exact_match_strict, self.exact_strict, rtol=0.05)
        assert numpy.isclose(exact_match_flex, self.exact_flex, rtol=0.05)
        self.tear_down()

    def tear_down(self):
        if self.save_dir is not None:
            shutil.rmtree(self.save_dir)
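For reference, the simple_evaluate call above corresponds roughly to this lm_eval CLI invocation (a sketch; the pretrained path follows the test's save_dir derivation for the FP8_DYNAMIC config):

lm_eval --model hf \
  --model_args pretrained=Meta-Llama-3-8B-Instruct-FP8_DYNAMIC \
  --tasks gsm8k \
  --num_fewshot 5 \
  --limit 1000 \
  --device cuda:0 \
  --batch_size 100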

tests/e2e/vLLM/test_vllm.py (+6 −7)

@@ -33,13 +33,12 @@ class TestvLLM:
     runs the model using vLLM, and then pushes the model to the hub for
     future use. Each test case is focused on a specific quantization type
     (e.g W4A16 with grouped quantization, W4N16 with channel quantization).
-    To add a new test case, a new config has to be added to one of the folders
-    listed in the `CONFIGS` folder. If the test case is for a data type not listed
-    in `CONFIGS`, a new folder can be created and added to the list. The tests
-    run on a cadence defined by the `cadence` field. Each config defines the model
-    to quantize. Optionally, a dataset id and split can be provided for calibration.
-    Finally, all config files must list a scheme. The scheme can be a preset scheme
-    from https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py
+    To add a new test case, a new config has to be added to the `configs` folder.
+    The tests run on a cadence defined by the `cadence` field. Each config defines
+    the model to quantize. Optionally, a dataset id and split can be provided for
+    calibration. Finally, all config files must list a scheme. The scheme can be a
+    preset scheme from
+    https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py
     or another identifier which can be used for the particular test case. If a recipe
     is not provided, it is assumed that the scheme provided is a preset scheme and will
     be used for quantization. Otherwise, the recipe will always be used if given.
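Per this docstring, adding a test case to the vLLM suite is a matter of dropping a YAML config into the configs folder. A minimal sketch of such a config, using only the documented required fields (the filename is hypothetical; the scheme name is one of the preset schemes linked above):

# Sketch: a hypothetical new test case for the vLLM suite.
cat > tests/e2e/vLLM/configs/w4a16_example.yaml <<'EOF'
cadence: "weekly"
model: meta-llama/Meta-Llama-3-8B-Instruct
scheme: W4A16
EOF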
