[Test Fix] Fix/update test_run_compressed (#970)
~~Contingent on merge of huggingface/transformers#34719~~ (the above has been merged, but not yet released)

SUMMARY: Update the run_compressed tests from decompression tests to actual run_compressed tests: verify that models loaded with run_compressed=True and run_compressed=False generate the same output. Add decompression tests that copy attributes from the model at the source directory path onto the target model.

TEST PLAN: Ran the tests using transformers main. tests/llmcompressor/transformers/compression/test_decompress.py and tests/llmcompressor/transformers/compression/test_run_compressed.py must pass.

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
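In outline, the run_compressed comparison described in the summary loads a checkpoint once with run_compressed=True and once with run_compressed=False, then asserts matching generations. A minimal sketch, not the test itself; the stub is one of the checkpoints referenced in the configs below:

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

stub = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed"
tokenizer = AutoTokenizer.from_pretrained(stub)

# run_compressed=True: execute directly in the compressed format
compressed = AutoModelForCausalLM.from_pretrained(
    stub, quantization_config=CompressedTensorsConfig(run_compressed=True)
)
# run_compressed=False: decompress on load via HFQuantizer
decompressed = AutoModelForCausalLM.from_pretrained(
    stub, quantization_config=CompressedTensorsConfig(run_compressed=False)
)

inputs = tokenizer("What is the capital of France?", return_tensors="pt")
assert tokenizer.batch_decode(
    compressed.generate(**inputs, max_length=50)
) == tokenizer.batch_decode(decompressed.generate(**inputs, max_length=50))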
Showing 11 changed files with 299 additions and 45 deletions.
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
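Each yaml in this directory parameterizes the new TestDecompression class via parse_params and parameterized_class (both visible in the test file below). Roughly, and assuming parse_params returns one dict of attributes per config file, the wiring looks like this hypothetical illustration:

import unittest

from parameterized import parameterized_class

from tests.testing_utils import parse_params

CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"


# Each config becomes one generated test class, with the yaml keys
# (compressed_model_stub, skeleton_model_stub, ...) attached as class attributes.
@parameterized_class(parse_params(CONFIG_DIR))
class ExampleConfigTest(unittest.TestCase):
    compressed_model_stub = None
    skeleton_model_stub = None

    def test_stubs_are_set(self):
        assert self.compressed_model_stub is not None
        assert self.skeleton_model_stub is not None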
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w8a16-dense"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 changes: 2 additions & 2 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
-model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-uncompressed
4 changes: 2 additions & 2 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
-model_stub: "nm-testing/tinyllama-w4a16-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-uncompressed
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed
uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-uncompressed
4 changes: 0 additions & 4 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml
This file was deleted.
4 changes: 2 additions & 2 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
-model_stub: "nm-testing/tinyllama-w8a8-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed
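The diff for test_run_compressed.py itself is not shown on this page. Per the commit summary, each stub pair above is presumably consumed along these lines; unlike the sketch under the commit message, which loads one stub in two modes, this pairs a compressed checkpoint with its pre-decompressed counterpart (a hedged sketch, not the actual test):

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

compressed_stub = "nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed"
uncompressed_stub = "nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed"

# The compressed checkpoint runs in the compressed format ...
compressed_model = AutoModelForCausalLM.from_pretrained(
    compressed_stub, quantization_config=CompressedTensorsConfig(run_compressed=True)
)
# ... while the uncompressed counterpart loads as a plain dense checkpoint
uncompressed_model = AutoModelForCausalLM.from_pretrained(uncompressed_stub)

tokenizer = AutoTokenizer.from_pretrained(compressed_stub)
inputs = tokenizer("def fibonacci(n):", return_tensors="pt")
assert tokenizer.batch_decode(
    compressed_model.generate(**inputs, max_length=50)
) == tokenizer.batch_decode(uncompressed_model.generate(**inputs, max_length=50))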
128 changes: 128 additions & 0 deletions
tests/llmcompressor/transformers/compression/test_decompress.py
@@ -0,0 +1,128 @@
import copy
import shutil
import tempfile
import unittest

import torch
from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from parameterized import parameterized_class
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

from tests.testing_utils import parse_params, requires_gpu

CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"


@requires_gpu
@parameterized_class(parse_params(CONFIG_DIR))
class TestDecompression(unittest.TestCase):
    """
    Check that HFQuantizer decompression is working as expected.
    Manually decompress a compressed model and compare the generations.

    Decompression:
    Given a skeleton model and the path to the optimized model,
    write the optimized model's safetensors into the skeleton model,
    then decompress (e.g. write weight_scale to the skeleton model
    and convert the weights from fp4 to fp16).
    """

    compressed_model_stub = None
    skeleton_model_stub = None

    SAMPLE_INPUTS = [
        "I love 4-bit quantization because",
        "What is the capital of France?",
        "def fibonacci(n):",
    ]

    @classmethod
    def setUpClass(self):
        self.test_dir = tempfile.mkdtemp()
        self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub)

        # Decompress using HFQuantizer from AutoModelForCausalLM
        self.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained(
            self.compressed_model_stub,
            torch_dtype="auto",
            device_map="auto",
            quantization_config=CompressedTensorsConfig(run_compressed=False),
        )

        # Manually decompress this model
        self.dense_model = AutoModelForCausalLM.from_pretrained(
            self.skeleton_model_stub,
            torch_dtype=self.decompressed_model_hf_quantizer.dtype,
            device_map=self.decompressed_model_hf_quantizer.device,
        )

        # The skeleton model should not carry any quantization parameters yet
        assert not hasattr(
            self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
        )

        config = AutoConfig.from_pretrained(self.compressed_model_stub)

        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
        self.compressor = ModelCompressor.from_compression_config(compression_config)
        self.compressor.quantization_config.quantization_status = (
            QuantizationStatus.FROZEN
        )

        # Keep a copy so we can verify decompression mutates the model in place
        dense_model = copy.deepcopy(self.dense_model)

        # Use the model_path to load the decompressed weights into dense_model,
        # overwriting the weights of the dense model
        self.compressor.decompress(
            model_path=self.compressed_model_stub,
            model=self.dense_model,
        )

        # self.dense_model should now be decompressed
        assert dense_model is not self.dense_model

        self.decompressed_model_manual = self.dense_model

        assert hasattr(
            self.decompressed_model_manual.model.layers[0].self_attn.q_proj,
            "weight_scale",
        )

    def test_hf_quantizer_decompress_match_manual_decompress(self):
        manual_device = self.decompressed_model_manual.device
        hf_quantizer_device = self.decompressed_model_hf_quantizer.device

        self.decompressed_model_manual = self.decompressed_model_manual.to(
            manual_device
        )
        self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to(
            hf_quantizer_device
        )

        for sample in self.SAMPLE_INPUTS:
            inputs = self.tokenizer(sample, return_tensors="pt", padding=True).to(
                self.decompressed_model_manual.device
            )

            decompressed_model_manual_output = self.tokenizer.batch_decode(
                self.decompressed_model_manual.generate(**inputs, max_length=50)
            )

            decompressed_model_hf_quantizer_out = self.tokenizer.batch_decode(
                self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50)
            )

            assert (
                decompressed_model_hf_quantizer_out == decompressed_model_manual_output
            )

    @classmethod
    def tearDownClass(self):
        shutil.rmtree(self.test_dir)
        del self.dense_model
        del self.decompressed_model_hf_quantizer
        del self.decompressed_model_manual
        torch.cuda.empty_cache()