From b7a968e98f9114fd756444b8b0497a9fa24aa200 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Tue, 10 Dec 2024 22:57:55 -0500 Subject: [PATCH 01/13] update test_run_compressed --- .../run_compressed_configs/fp8_dynamic.yaml | 4 +- .../run_compressed_configs/w4a16.yaml | 4 +- .../run_compressed_configs/w8a16_dense.yaml | 4 +- .../run_compressed_configs/w8a8.yaml | 4 +- .../compression/test_run_compressed.py | 51 +++++++------------ 5 files changed, 27 insertions(+), 40 deletions(-) diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml index d516616bf..ccd43c024 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed" -empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml index 7e9bc3f2f..b4f2849c0 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -model_stub: "nm-testing/tinyllama-w4a16-compressed" -empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml index af1e5df8b..e74e83a4d 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -model_stub: "nm-testing/tinyllama-w8a16-dense" -empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml index 086a67ed6..b428c4a54 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -model_stub: "nm-testing/tinyllama-w8a8-compressed" -empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed 
+uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py index 0c2a0ab0e..047d80e9c 100644 --- a/tests/llmcompressor/transformers/compression/test_run_compressed.py +++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py @@ -3,11 +3,9 @@ import unittest import torch -from compressed_tensors import QUANTIZATION_CONFIG_NAME -from compressed_tensors.compressors import ModelCompressor -from compressed_tensors.quantization import QuantizationStatus from parameterized import parameterized_class -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.utils.quantization_config import CompressedTensorsConfig from tests.testing_utils import parse_params, requires_gpu @@ -17,39 +15,28 @@ @requires_gpu @parameterized_class(parse_params(CONFIG_DIR)) class TestQuantizationMatches(unittest.TestCase): - model_stub = None - empty_model = None + compressed_model_stub = None + uncompressed_model_stub = None @classmethod def setUpClass(cls): cls.test_dir = tempfile.mkdtemp() - # TODO: Give option on HFQuantizer to run run_compressed True/False - # currently hardcoded to True - cls.compressed_model = AutoModelForCausalLM.from_pretrained( - cls.model_stub, + quantization_config = CompressedTensorsConfig(run_compressed=False) + cls.decompressed_model = AutoModelForCausalLM.from_pretrained( + cls.compressed_model_stub, torch_dtype="auto", device_map="auto", - # run_compressed=True, # TODO: Give option on HFQuantizer + quantization_config=quantization_config, ) - # TODO: Use ModelCompressor until decompression is supported through - # HFQuant/run_compressed can be turned off. 
- cls.uncompressed_model = AutoModelForCausalLM.from_pretrained( - cls.empty_model, - torch_dtype=cls.compressed_model.dtype, - device_map=cls.compressed_model.device, - ) - config = AutoConfig.from_pretrained(cls.model_stub) - compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None) - cls.compressor = ModelCompressor.from_compression_config(compression_config) - cls.compressor.quantization_config.quantization_status = ( - QuantizationStatus.FROZEN - ) - cls.compressor.decompress( - model_path=cls.model_stub, model=cls.uncompressed_model + + cls.non_comp_model = AutoModelForCausalLM.from_pretrained( + cls.uncompressed_model_stub, + torch_dtype=cls.decompressed_model.dtype, + device_map=cls.decompressed_model.device, ) - cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_stub) + cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub) def test_compressed_matches_uncompressed(self): SAMPLE_INPUT = [ @@ -59,13 +46,13 @@ def test_compressed_matches_uncompressed(self): ] inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to( - self.compressed_model.device + self.decompressed_model.device ) compressed_output = self.tokenizer.batch_decode( - self.compressed_model.generate(**inputs, max_length=50) + self.decompressed_model.generate(**inputs, max_length=50) ) uncompressed_output = self.tokenizer.batch_decode( - self.uncompressed_model.generate(**inputs, max_length=50) + self.non_comp_model.generate(**inputs, max_length=50) ) for idx in range(len(SAMPLE_INPUT)): @@ -74,6 +61,6 @@ def test_compressed_matches_uncompressed(self): @classmethod def tearDownClass(cls): shutil.rmtree(cls.test_dir) - del cls.compressed_model - del cls.uncompressed_model + del cls.decompressed_model + del cls.non_comp_model torch.cuda.empty_cache() From 7067ad0454bd5013a700e52b08136aae3093bfb6 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Tue, 10 Dec 2024 23:00:40 -0500 Subject: [PATCH 02/13] better var name --- .../transformers/compression/test_run_compressed.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py index 047d80e9c..4b99df761 100644 --- a/tests/llmcompressor/transformers/compression/test_run_compressed.py +++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py @@ -30,7 +30,7 @@ def setUpClass(cls): quantization_config=quantization_config, ) - cls.non_comp_model = AutoModelForCausalLM.from_pretrained( + cls.base_model = AutoModelForCausalLM.from_pretrained( cls.uncompressed_model_stub, torch_dtype=cls.decompressed_model.dtype, device_map=cls.decompressed_model.device, @@ -52,7 +52,7 @@ def test_compressed_matches_uncompressed(self): self.decompressed_model.generate(**inputs, max_length=50) ) uncompressed_output = self.tokenizer.batch_decode( - self.non_comp_model.generate(**inputs, max_length=50) + self.base_model.generate(**inputs, max_length=50) ) for idx in range(len(SAMPLE_INPUT)): @@ -62,5 +62,5 @@ def test_compressed_matches_uncompressed(self): def tearDownClass(cls): shutil.rmtree(cls.test_dir) del cls.decompressed_model - del cls.non_comp_model + del cls.base_model torch.cuda.empty_cache() From 126d3d5977fd6c51c992a1ed0f75803e643156fe Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Wed, 11 Dec 2024 13:09:05 -0500 Subject: [PATCH 03/13] add decompress tests --- .../decompression_configs/fp8_dynamic.yaml | 4 + .../decompression_configs/w4a16.yaml | 4 + 
.../decompression_configs/w8a16_dense.yaml | 4 + .../decompression_configs/w8a8.yaml | 4 + .../run_compressed_configs/fp8_dynamic.yaml | 4 +- .../run_compressed_configs/w4a16.yaml | 4 +- .../run_compressed_configs/w8a16_dense.yaml | 4 +- .../run_compressed_configs/w8a8.yaml | 4 +- .../compression/test_decompress.py | 98 +++++++++++++++++++ 9 files changed, 122 insertions(+), 8 deletions(-) create mode 100644 tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml create mode 100644 tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml create mode 100644 tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml create mode 100644 tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml create mode 100644 tests/llmcompressor/transformers/compression/test_decompress.py diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml new file mode 100644 index 000000000..ccd43c024 --- /dev/null +++ b/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml @@ -0,0 +1,4 @@ +cadence: "commit" +test_type: "regression" +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml new file mode 100644 index 000000000..b4f2849c0 --- /dev/null +++ b/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml @@ -0,0 +1,4 @@ +cadence: "commit" +test_type: "regression" +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml new file mode 100644 index 000000000..e74e83a4d --- /dev/null +++ b/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml @@ -0,0 +1,4 @@ +cadence: "commit" +test_type: "regression" +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml new file mode 100644 index 000000000..b428c4a54 --- /dev/null +++ b/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml @@ -0,0 +1,4 @@ +cadence: "commit" +test_type: "regression" +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml index ccd43c024..6685efb1e 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml +++ 
b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed -uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed \ No newline at end of file +compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed" +skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml index b4f2849c0..144044f28 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed -uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed \ No newline at end of file +compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed" +skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml index e74e83a4d..95e73b148 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed -uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed \ No newline at end of file +compressed_model_stub: "nm-testing/tinyllama-w8a16-dense" +skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml index b428c4a54..b5a846cbc 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed -uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed \ No newline at end of file +compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed" +skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/test_decompress.py b/tests/llmcompressor/transformers/compression/test_decompress.py new file mode 100644 index 000000000..3f3e38557 --- /dev/null +++ b/tests/llmcompressor/transformers/compression/test_decompress.py @@ -0,0 +1,98 @@ +import copy +import shutil +import tempfile +import unittest + +import torch +from compressed_tensors import QUANTIZATION_CONFIG_NAME +from compressed_tensors.compressors import ModelCompressor +from compressed_tensors.quantization import QuantizationStatus +from parameterized import parameterized_class +from transformers 
import AutoConfig, AutoModelForCausalLM, AutoTokenizer + +from tests.testing_utils import parse_params, requires_gpu + +CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs" + + +@requires_gpu +@parameterized_class(parse_params(CONFIG_DIR)) +class TestQuantizationMatches(unittest.TestCase): + compressed_model_stub = None + skeleton_model_stub = None + + SAMPLE_INPUTS = [ + "I love 4-bit quantization because", + "What is the capital of France?", + "def fibonacci(n):", + ] + + @classmethod + def setUpClass(self): + self.test_dir = tempfile.mkdtemp() + self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub) + + self.compressed_model = AutoModelForCausalLM.from_pretrained( + self.compressed_model_stub, + torch_dtype="auto", + device_map="auto", + ) + + self.dense_model = AutoModelForCausalLM.from_pretrained( + self.skeleton_model_stub, + torch_dtype=self.compressed_model.dtype, + device_map=self.compressed_model.device, + ) + + assert not hasattr( + self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale" + ) + + self.decompressed_model = None + config = AutoConfig.from_pretrained(self.compressed_model_stub) + + compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None) + self.compressor = ModelCompressor.from_compression_config(compression_config) + self.compressor.quantization_config.quantization_status = ( + QuantizationStatus.FROZEN + ) + + # use the model_path to load the decompressed weights into dense_model + dense_model = copy.deepcopy(self.dense_model) + + # overwrite the weights of the dense model + self.compressor.decompress( + model_path=self.compressed_model_stub, + model=self.dense_model, + ) + + # self.dense_model should be decompressed + assert dense_model is not self.dense_model + + self.decompressed_model = self.dense_model + + assert hasattr( + self.decompressed_model.model.layers[0].self_attn.q_proj, "weight_scale" + ) + + def test_compressed_matches_uncompressed(self): + for input in self.SAMPLE_INPUTS: + inputs = self.tokenizer(input, return_tensors="pt", padding=True).to( + self.compressed_model.device + ) + compressed_output = self.tokenizer.batch_decode( + self.compressed_model.generate(**inputs, max_length=50) + ) + uncompressed_output = self.tokenizer.batch_decode( + self.decompressed_model.generate(**inputs, max_length=50) + ) + + assert compressed_output == uncompressed_output + + @classmethod + def tearDownClass(self): + shutil.rmtree(self.test_dir) + del self.compressed_model + del self.dense_model + del self.decompressed_model + torch.cuda.empty_cache() From e01f31479d39e270836e4956ec5fb61cec079b7c Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Mon, 16 Dec 2024 18:32:58 -0500 Subject: [PATCH 04/13] fix names --- .../decompression_configs/fp8_dynamic.yaml | 4 ++-- .../compression/decompression_configs/w4a16.yaml | 4 ++-- .../decompression_configs/w8a16_dense.yaml | 4 ++-- .../compression/decompression_configs/w8a8.yaml | 4 ++-- .../run_compressed_configs/fp8_dynamic.yaml | 4 ++-- .../run_compressed_configs/w4a16.yaml | 4 ++-- .../run_compressed_configs/w8a16_dense.yaml | 4 ++-- .../compression/run_compressed_configs/w8a8.yaml | 4 ++-- .../transformers/compression/test_decompress.py | 7 ++++++- .../compression/test_run_compressed.py | 16 ++++++++++++---- 10 files changed, 34 insertions(+), 21 deletions(-) diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml index 
ccd43c024..6685efb1e 100644 --- a/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml +++ b/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed -uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed \ No newline at end of file +compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed" +skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml index b4f2849c0..144044f28 100644 --- a/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml +++ b/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed -uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed \ No newline at end of file +compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed" +skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml index e74e83a4d..95e73b148 100644 --- a/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml +++ b/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed -uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed \ No newline at end of file +compressed_model_stub: "nm-testing/tinyllama-w8a16-dense" +skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml index b428c4a54..b5a846cbc 100644 --- a/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml +++ b/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed -uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed \ No newline at end of file +compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed" +skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml index 6685efb1e..ccd43c024 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: 
"nm-testing/tinyllama-fp8-dynamic-compressed" -skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml index 144044f28..b4f2849c0 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed" -skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml index 95e73b148..e74e83a4d 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: "nm-testing/tinyllama-w8a16-dense" -skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml index b5a846cbc..b428c4a54 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed" -skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/test_decompress.py b/tests/llmcompressor/transformers/compression/test_decompress.py index 3f3e38557..45921e63c 100644 --- a/tests/llmcompressor/transformers/compression/test_decompress.py +++ b/tests/llmcompressor/transformers/compression/test_decompress.py @@ -12,12 +12,17 @@ from tests.testing_utils import parse_params, requires_gpu -CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs" +CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs" @requires_gpu @parameterized_class(parse_params(CONFIG_DIR)) class TestQuantizationMatches(unittest.TestCase): + """ + Test the decompression, which copies the attrs of compressed_model_stub's + safetensors to 
skeleton_model_stub and decompresses. Ex. fp4 -> fp16 + """ + compressed_model_stub = None skeleton_model_stub = None diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py index 4b99df761..352254ed9 100644 --- a/tests/llmcompressor/transformers/compression/test_run_compressed.py +++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py @@ -15,8 +15,16 @@ @requires_gpu @parameterized_class(parse_params(CONFIG_DIR)) class TestQuantizationMatches(unittest.TestCase): - compressed_model_stub = None - uncompressed_model_stub = None + """ + Test the run_compressed input arg to AutoModelForCausalLM, where HFQuantizer is + responsible for decompressing if model is compressed. + + Diagram flow https://tinyurl.com/2ynb6wbu + + """ + + compressed_model_stub = None # model was compressed on save + uncompressed_model_stub = None # model was not compressed on save @classmethod def setUpClass(cls): @@ -30,7 +38,7 @@ def setUpClass(cls): quantization_config=quantization_config, ) - cls.base_model = AutoModelForCausalLM.from_pretrained( + cls.uncompressed_model = AutoModelForCausalLM.from_pretrained( cls.uncompressed_model_stub, torch_dtype=cls.decompressed_model.dtype, device_map=cls.decompressed_model.device, @@ -52,7 +60,7 @@ def test_compressed_matches_uncompressed(self): self.decompressed_model.generate(**inputs, max_length=50) ) uncompressed_output = self.tokenizer.batch_decode( - self.base_model.generate(**inputs, max_length=50) + self.uncompressed_model.generate(**inputs, max_length=50) ) for idx in range(len(SAMPLE_INPUT)): From f36cbac05f61d0102311889a258ef26a6c402407 Mon Sep 17 00:00:00 2001 From: George Date: Mon, 16 Dec 2024 19:34:10 -0500 Subject: [PATCH 05/13] fix typo --- .../transformers/compression/test_run_compressed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py index 352254ed9..e4ea778a7 100644 --- a/tests/llmcompressor/transformers/compression/test_run_compressed.py +++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py @@ -70,5 +70,5 @@ def test_compressed_matches_uncompressed(self): def tearDownClass(cls): shutil.rmtree(cls.test_dir) del cls.decompressed_model - del cls.base_model + del cls.uncompressed_model torch.cuda.empty_cache() From b4c882806581ab56973825dce3b027113e8e5490 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Tue, 17 Dec 2024 12:26:34 -0500 Subject: [PATCH 06/13] add compressedlinear vs linear generation --- .../{ => decompression}/fp8_dynamic.yaml | 0 .../{ => decompression}/w4a16.yaml | 0 .../{ => decompression}/w8a16_dense.yaml | 0 .../{ => decompression}/w8a8.yaml | 0 .../run_compressed/fp8_dynamic.yaml | 4 + .../run_compressed/w4a16.yaml | 4 + .../run_compressed/w8a16_dense.yaml | 4 + .../run_compressed/w8a8.yaml | 4 + .../compression/test_decompress.py | 11 +- .../compression/test_run_compressed.py | 120 +++++++++++++++--- 10 files changed, 130 insertions(+), 17 deletions(-) rename tests/llmcompressor/transformers/compression/run_compressed_configs/{ => decompression}/fp8_dynamic.yaml (100%) rename tests/llmcompressor/transformers/compression/run_compressed_configs/{ => decompression}/w4a16.yaml (100%) rename tests/llmcompressor/transformers/compression/run_compressed_configs/{ => decompression}/w8a16_dense.yaml (100%) rename 
tests/llmcompressor/transformers/compression/run_compressed_configs/{ => decompression}/w8a8.yaml (100%) create mode 100644 tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/fp8_dynamic.yaml create mode 100644 tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w4a16.yaml create mode 100644 tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w8a16_dense.yaml create mode 100644 tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w8a8.yaml diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/fp8_dynamic.yaml similarity index 100% rename from tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml rename to tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/fp8_dynamic.yaml diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w4a16.yaml similarity index 100% rename from tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml rename to tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w4a16.yaml diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w8a16_dense.yaml similarity index 100% rename from tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml rename to tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w8a16_dense.yaml diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w8a8.yaml similarity index 100% rename from tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml rename to tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w8a8.yaml diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/fp8_dynamic.yaml new file mode 100644 index 000000000..ccd43c024 --- /dev/null +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/fp8_dynamic.yaml @@ -0,0 +1,4 @@ +cadence: "commit" +test_type: "regression" +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w4a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w4a16.yaml new file mode 100644 index 000000000..b4f2849c0 --- /dev/null +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w4a16.yaml @@ -0,0 +1,4 @@ +cadence: "commit" +test_type: "regression" +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed \ No newline at end of file diff --git 
a/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w8a16_dense.yaml new file mode 100644 index 000000000..e74e83a4d --- /dev/null +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w8a16_dense.yaml @@ -0,0 +1,4 @@ +cadence: "commit" +test_type: "regression" +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w8a8.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w8a8.yaml new file mode 100644 index 000000000..b428c4a54 --- /dev/null +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w8a8.yaml @@ -0,0 +1,4 @@ +cadence: "commit" +test_type: "regression" +compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed +uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/test_decompress.py b/tests/llmcompressor/transformers/compression/test_decompress.py index 3f3e38557..96014cf76 100644 --- a/tests/llmcompressor/transformers/compression/test_decompress.py +++ b/tests/llmcompressor/transformers/compression/test_decompress.py @@ -12,12 +12,21 @@ from tests.testing_utils import parse_params, requires_gpu -CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs" +CONFIG_DIR = ( + "tests/llmcompressor/transformers/compression/run_compressed_configs/decompression" +) @requires_gpu @parameterized_class(parse_params(CONFIG_DIR)) class TestQuantizationMatches(unittest.TestCase): + """ + Test decompression - given a skeleton model and path to the optimized model, + write the optimized model's safetensors to the skeleton model and decompress + Ex. write weight_scale to skeleton model and then fp4 -> fp16 + + """ + compressed_model_stub = None skeleton_model_stub = None diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py index 4b99df761..2c0136ecd 100644 --- a/tests/llmcompressor/transformers/compression/test_run_compressed.py +++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py @@ -3,41 +3,129 @@ import unittest import torch +from compressed_tensors.linear.compressed_linear import CompressedLinear +from compressed_tensors.quantization.utils import iter_named_leaf_modules from parameterized import parameterized_class from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.utils.quantization_config import CompressedTensorsConfig from tests.testing_utils import parse_params, requires_gpu -CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs" +COMPRESSED_LINEAR_CONFIG_DIR = ( + "tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed" +) + + +# @requires_gpu +# @parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR)) +# class TestRunCompressedDecompression(unittest.TestCase): +# """ +# Given an optimized model that was saved (uncompressed), +# and saved as run_compressed (compressed), decompress the compressed model +# and check the outputs. 
+ +# All modules should be linear, runs default forward calls + +# """ + +# compressed_model_stub = None +# uncompressed_model_stub = None + +# @classmethod +# def setUpClass(cls): +# cls.test_dir = tempfile.mkdtemp() + +# quantization_config = CompressedTensorsConfig(run_compressed=False) +# cls.decompressed_model = AutoModelForCausalLM.from_pretrained( +# cls.compressed_model_stub, +# torch_dtype="auto", +# device_map="auto", +# quantization_config=quantization_config, +# ) + +# cls.uncompressed_model = AutoModelForCausalLM.from_pretrained( +# cls.uncompressed_model_stub, +# torch_dtype=cls.decompressed_model.dtype, +# device_map=cls.decompressed_model.device, +# ) + +# cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub) + +# def test_compressed_matches_decompressed(self): +# SAMPLE_INPUT = [ +# "I love 4-bit quantization because", +# "What is the capital of France?", +# "def fibonacci(n):", +# ] + +# inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to( +# self.decompressed_model.device +# ) +# decompressed_output = self.tokenizer.batch_decode( +# self.decompressed_model.generate(**inputs, max_length=50) +# ) +# uncompressed_output = self.tokenizer.batch_decode( +# self.uncompressed_model.generate(**inputs, max_length=50) +# ) + +# for idx in range(len(SAMPLE_INPUT)): +# assert decompressed_output[idx] == uncompressed_output[idx] + +# @classmethod +# def tearDownClass(cls): +# shutil.rmtree(cls.test_dir) +# del cls.decompressed_model +# del cls.uncompressed_model +# torch.cuda.empty_cache() @requires_gpu -@parameterized_class(parse_params(CONFIG_DIR)) -class TestQuantizationMatches(unittest.TestCase): +@parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR)) +class TestRunCompressedForward(unittest.TestCase): + """ + Given an optimized model that was saved (uncompressed), + and saved as run_compressed (compressed), do not decompress the compressed model + and check the outputs. + + All compressed models should have CompressedLinear, which has its custom forward call + + """ + compressed_model_stub = None - uncompressed_model_stub = None @classmethod def setUpClass(cls): cls.test_dir = tempfile.mkdtemp() - quantization_config = CompressedTensorsConfig(run_compressed=False) - cls.decompressed_model = AutoModelForCausalLM.from_pretrained( + # Should have CompressedLinear modules + cls.compressed_model = AutoModelForCausalLM.from_pretrained( cls.compressed_model_stub, torch_dtype="auto", device_map="auto", - quantization_config=quantization_config, ) - cls.base_model = AutoModelForCausalLM.from_pretrained( - cls.uncompressed_model_stub, - torch_dtype=cls.decompressed_model.dtype, - device_map=cls.decompressed_model.device, + # Should just be linear modules + quantization_config = CompressedTensorsConfig(run_compressed=False) + cls.decompressed_model = AutoModelForCausalLM.from_pretrained( + cls.compressed_model_stub, + torch_dtype=cls.compressed_model.dtype, + device_map=cls.compressed_model.device, + quantization_config=quantization_config, ) cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub) + def test_compressed_linear_modules_exist(self): + compressed_linear_counts = 0 + for _, submodule in iter_named_leaf_modules( + self.compressed_model, + ): + if isinstance(submodule, CompressedLinear): + compressed_linear_counts += 1 + + # some linear models are not compressed - ex.
lm_head + assert compressed_linear_counts > 0 + def test_compressed_matches_uncompressed(self): SAMPLE_INPUT = [ "I love 4-bit quantization because", @@ -48,19 +136,19 @@ def test_compressed_matches_uncompressed(self): inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to( self.decompressed_model.device ) - compressed_output = self.tokenizer.batch_decode( + compressed_model_out = self.tokenizer.batch_decode( self.decompressed_model.generate(**inputs, max_length=50) ) - uncompressed_output = self.tokenizer.batch_decode( - self.base_model.generate(**inputs, max_length=50) + decompressed_model_out = self.tokenizer.batch_decode( + self.decompressed_model.generate(**inputs, max_length=50) ) for idx in range(len(SAMPLE_INPUT)): - assert compressed_output[idx] == uncompressed_output[idx] + assert compressed_model_out[idx] == decompressed_model_out[idx] @classmethod def tearDownClass(cls): shutil.rmtree(cls.test_dir) del cls.decompressed_model - del cls.base_model + del cls.compressed_model torch.cuda.empty_cache() From fd0745fabad2155141f86b4cb46bb38476eacf48 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Tue, 17 Dec 2024 12:57:26 -0500 Subject: [PATCH 07/13] revert folder struct for config --- .../run_compressed_configs/decompression/fp8_dynamic.yaml | 4 ---- .../run_compressed_configs/decompression/w4a16.yaml | 4 ---- .../run_compressed_configs/decompression/w8a16_dense.yaml | 4 ---- .../run_compressed_configs/decompression/w8a8.yaml | 4 ---- .../{run_compressed => }/fp8_dynamic.yaml | 0 .../run_compressed_configs/{run_compressed => }/w4a16.yaml | 0 .../{run_compressed => }/w8a16_dense.yaml | 0 .../run_compressed_configs/{run_compressed => }/w8a8.yaml | 0 .../llmcompressor/transformers/compression/test_decompress.py | 4 +--- .../transformers/compression/test_run_compressed.py | 2 +- 10 files changed, 2 insertions(+), 20 deletions(-) delete mode 100644 tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/fp8_dynamic.yaml delete mode 100644 tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w4a16.yaml delete mode 100644 tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w8a16_dense.yaml delete mode 100644 tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w8a8.yaml rename tests/llmcompressor/transformers/compression/run_compressed_configs/{run_compressed => }/fp8_dynamic.yaml (100%) rename tests/llmcompressor/transformers/compression/run_compressed_configs/{run_compressed => }/w4a16.yaml (100%) rename tests/llmcompressor/transformers/compression/run_compressed_configs/{run_compressed => }/w8a16_dense.yaml (100%) rename tests/llmcompressor/transformers/compression/run_compressed_configs/{run_compressed => }/w8a8.yaml (100%) diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/fp8_dynamic.yaml deleted file mode 100644 index 6685efb1e..000000000 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/fp8_dynamic.yaml +++ /dev/null @@ -1,4 +0,0 @@ -cadence: "commit" -test_type: "regression" -compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed" -skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w4a16.yaml 
b/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w4a16.yaml deleted file mode 100644 index 95e73b148..000000000 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w4a16.yaml +++ /dev/null @@ -1,4 +0,0 @@ -cadence: "commit" -test_type: "regression" -compressed_model_stub: "nm-testing/tinyllama-w8a16-dense" -skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w8a16_dense.yaml deleted file mode 100644 index 95e73b148..000000000 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w8a16_dense.yaml +++ /dev/null @@ -1,4 +0,0 @@ -cadence: "commit" -test_type: "regression" -compressed_model_stub: "nm-testing/tinyllama-w8a16-dense" -skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w8a8.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w8a8.yaml deleted file mode 100644 index b5a846cbc..000000000 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/decompression/w8a8.yaml +++ /dev/null @@ -1,4 +0,0 @@ -cadence: "commit" -test_type: "regression" -compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed" -skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml similarity index 100% rename from tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/fp8_dynamic.yaml rename to tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w4a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml similarity index 100% rename from tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w4a16.yaml rename to tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml similarity index 100% rename from tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w8a16_dense.yaml rename to tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w8a8.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml similarity index 100% rename from tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed/w8a8.yaml rename to tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml diff --git a/tests/llmcompressor/transformers/compression/test_decompress.py b/tests/llmcompressor/transformers/compression/test_decompress.py index 96014cf76..e6625ebd5 100644 --- 
a/tests/llmcompressor/transformers/compression/test_decompress.py +++ b/tests/llmcompressor/transformers/compression/test_decompress.py @@ -12,9 +12,7 @@ from tests.testing_utils import parse_params, requires_gpu -CONFIG_DIR = ( - "tests/llmcompressor/transformers/compression/run_compressed_configs/decompression" -) +CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs" @requires_gpu diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py index f95e6124c..9e2309108 100644 --- a/tests/llmcompressor/transformers/compression/test_run_compressed.py +++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py @@ -12,7 +12,7 @@ from tests.testing_utils import parse_params, requires_gpu COMPRESSED_LINEAR_CONFIG_DIR = ( - "tests/llmcompressor/transformers/compression/run_compressed_configs/run_compressed" + "tests/llmcompressor/transformers/compression/run_compressed_configs" ) From f1d4539c2f750a9a5ae86584507230ccb4b3c90a Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Thu, 9 Jan 2025 23:40:07 -0500 Subject: [PATCH 08/13] fix devices error --- .../compression/test_decompress.py | 10 ++++++ .../compression/test_run_compressed.py | 33 ++++++++++++++++--- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/tests/llmcompressor/transformers/compression/test_decompress.py b/tests/llmcompressor/transformers/compression/test_decompress.py index e6625ebd5..c451ce11b 100644 --- a/tests/llmcompressor/transformers/compression/test_decompress.py +++ b/tests/llmcompressor/transformers/compression/test_decompress.py @@ -83,13 +83,23 @@ def setUpClass(self): ) def test_compressed_matches_uncompressed(self): + decompressed_device = self.decompressed_model.device + compressed_device = self.compressed_model.device + + self.decompressed_model = self.decompressed_model.to(decompressed_device) + self.compressed_model = self.compressed_model.to(compressed_device) + for input in self.SAMPLE_INPUTS: inputs = self.tokenizer(input, return_tensors="pt", padding=True).to( self.compressed_model.device ) + compressed_output = self.tokenizer.batch_decode( self.compressed_model.generate(**inputs, max_length=50) ) + + inputs = inputs.to(self.decompressed_model.device) + uncompressed_output = self.tokenizer.batch_decode( self.decompressed_model.generate(**inputs, max_length=50) ) diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py index 9e2309108..f0c54631d 100644 --- a/tests/llmcompressor/transformers/compression/test_run_compressed.py +++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py @@ -69,12 +69,23 @@ def test_compressed_matches_decompressed(self): "def fibonacci(n):", ] + decompressed_device = self.decompressed_model.device + uncompressed_device = self.uncompressed_model.device + + # overwrite weights in cpu to cuda + self.decompressed_model = self.decompressed_model.to(decompressed_device) + self.uncompressed_model = self.uncompressed_model.to(uncompressed_device) + inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to( - self.decompressed_model.device + decompressed_device ) + decompressed_output = self.tokenizer.batch_decode( self.decompressed_model.generate(**inputs, max_length=50) ) + + inputs = inputs.to(uncompressed_device) + uncompressed_output = self.tokenizer.batch_decode( self.uncompressed_model.generate(**inputs, 
max_length=50) ) @@ -144,16 +155,28 @@ def test_compressed_matches_uncompressed(self): "def fibonacci(n):", ] + decompressed_device = self.decompressed_model.device + compressed_device = self.compressed_model.device + + # overwrite weights in cpu to cuda + self.decompressed_model = self.decompressed_model.to(decompressed_device) + self.compressed_model = self.compressed_model.to(compressed_device) + inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to( - self.decompressed_model.device - ) - compressed_model_out = self.tokenizer.batch_decode( - self.decompressed_model.generate(**inputs, max_length=50) + decompressed_device ) + decompressed_model_out = self.tokenizer.batch_decode( self.decompressed_model.generate(**inputs, max_length=50) ) + inputs = inputs.to(compressed_device) + + compressed_model_out = self.tokenizer.batch_decode( + self.compressed_model.generate(**inputs, max_length=50) + ) + + # Compare outputs for each input for idx in range(len(SAMPLE_INPUT)): assert compressed_model_out[idx] == decompressed_model_out[idx] From 1f2a5052bb8784aa98e603acf1287ece1561f893 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Thu, 9 Jan 2025 23:43:05 -0500 Subject: [PATCH 09/13] clear func name --- .../transformers/compression/test_run_compressed.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py index f0c54631d..5984d854a 100644 --- a/tests/llmcompressor/transformers/compression/test_run_compressed.py +++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py @@ -31,12 +31,6 @@ class TestRunCompressedDecompression(unittest.TestCase): All modules should be linear, runs default forward calls - Test the run_compressed input arg to AutoModelForCausalLM, where HFQuantizer is - responsible for decompressing if model is compressed. - - Diagram flow https://tinyurl.com/2ynb6wbu - - """ compressed_model_stub = None @@ -148,7 +142,7 @@ def test_compressed_linear_modules_exist(self): # some linear models are not compressed - ex. lm_head assert compressed_linear_counts > 0 - def test_compressed_matches_uncompressed(self): + def test_compressed_matches_decompressed__hf_quantizer(self): SAMPLE_INPUT = [ "I love 4-bit quantization because", "What is the capital of France?", From 82ac6bacdc06c067acd64b51af194598853056ba Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Fri, 10 Jan 2025 14:43:30 -0500 Subject: [PATCH 10/13] add automodelforcausallm decompression --- .../compression/test_decompress.py | 41 +++++++++++++++---- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/tests/llmcompressor/transformers/compression/test_decompress.py b/tests/llmcompressor/transformers/compression/test_decompress.py index c451ce11b..ca4996bc1 100644 --- a/tests/llmcompressor/transformers/compression/test_decompress.py +++ b/tests/llmcompressor/transformers/compression/test_decompress.py @@ -9,6 +9,7 @@ from compressed_tensors.quantization import QuantizationStatus from parameterized import parameterized_class from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from transformers.utils.quantization_config import CompressedTensorsConfig from tests.testing_utils import parse_params, requires_gpu @@ -23,6 +24,9 @@ class TestQuantizationMatches(unittest.TestCase): write the optimized model's safetensors to the skeleton model and decompress Ex.
write weight_scale to skeleton model and then fp4 -> fp16 + Check that HFQuantizer decompression and manual decompressed generates the + same output + """ compressed_model_stub = None @@ -44,7 +48,14 @@ def setUpClass(self): torch_dtype="auto", device_map="auto", ) + self.decompressed_model = AutoModelForCausalLM.from_pretrained( + self.compressed_model_stub, + torch_dtype="auto", + device_map="auto", + quantization_config=CompressedTensorsConfig(run_compressed=False), + ) + # manually decompress this model self.dense_model = AutoModelForCausalLM.from_pretrained( self.skeleton_model_stub, torch_dtype=self.compressed_model.dtype, @@ -55,7 +66,7 @@ def setUpClass(self): self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale" ) - self.decompressed_model = None + self.decompressed_model_manual = None config = AutoConfig.from_pretrained(self.compressed_model_stub) compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None) @@ -76,18 +87,23 @@ def setUpClass(self): # self.dense_model should be decompressed assert dense_model is not self.dense_model - self.decompressed_model = self.dense_model + self.decompressed_model_manual = self.dense_model assert hasattr( - self.decompressed_model.model.layers[0].self_attn.q_proj, "weight_scale" + self.decompressed_model_manual.model.layers[0].self_attn.q_proj, + "weight_scale", ) def test_compressed_matches_uncompressed(self): - decompressed_device = self.decompressed_model.device + decompressed_model_manual = self.decompressed_model_manual.device compressed_device = self.compressed_model.device + decompressed_model_device = self.decompressed_model.device - self.decompressed_model = self.decompressed_model.to(decompressed_device) + self.decompressed_model_manual = self.decompressed_model_manual.to( + decompressed_model_manual + ) self.compressed_model = self.compressed_model.to(compressed_device) + self.decompressed_model = self.decompressed_model.to(decompressed_model_device) for input in self.SAMPLE_INPUTS: inputs = self.tokenizer(input, return_tensors="pt", padding=True).to( @@ -98,13 +114,21 @@ def test_compressed_matches_uncompressed(self): self.compressed_model.generate(**inputs, max_length=50) ) - inputs = inputs.to(self.decompressed_model.device) + inputs = inputs.to(self.decompressed_model_manual.device) + + decompressed_model_manual_output = self.tokenizer.batch_decode( + self.decompressed_model_manual.generate(**inputs, max_length=50) + ) - uncompressed_output = self.tokenizer.batch_decode( + decompressed_model_out = self.tokenizer.batch_decode( self.decompressed_model.generate(**inputs, max_length=50) ) - assert compressed_output == uncompressed_output + assert ( + compressed_output + == decompressed_model_manual_output + == decompressed_model_out + ) @classmethod def tearDownClass(self): @@ -112,4 +136,5 @@ def tearDownClass(self): del self.compressed_model del self.dense_model del self.decompressed_model + del self.decompressed_model_manual torch.cuda.empty_cache() From 8e89c3b772182b2eba92b46dd1befa656bb422bb Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Mon, 13 Jan 2025 11:20:52 -0500 Subject: [PATCH 11/13] comment --- .../compression/test_decompress.py | 56 ++++++++----------- .../compression/test_run_compressed.py | 25 ++++----- 2 files changed, 34 insertions(+), 47 deletions(-) diff --git a/tests/llmcompressor/transformers/compression/test_decompress.py b/tests/llmcompressor/transformers/compression/test_decompress.py index ca4996bc1..b7d3bb883 100644 --- 
a/tests/llmcompressor/transformers/compression/test_decompress.py +++ b/tests/llmcompressor/transformers/compression/test_decompress.py @@ -18,14 +18,15 @@ @requires_gpu @parameterized_class(parse_params(CONFIG_DIR)) -class TestQuantizationMatches(unittest.TestCase): +class TestDecompression(unittest.TestCase): """ - Test decompression - given a skeleton model and path to the optimized model, - write the optimized model's safetensors to the skeleton model and decompress - Ex. write weight_scale to skeleton model and then fp4 -> fp16 + Check that HFQuantizer decompression is working as expected. + Manually decompress a compressed model and compare the generations - Check that HFQuantizer decompression and manual decompressed generates the - same output + Decompression: + Given a skeleton model and path to the optimized model, + write the optimized model's safetensors to the skeleton model and decompress + Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16 """ @@ -43,30 +44,25 @@ def setUpClass(self): self.test_dir = tempfile.mkdtemp() self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub) - self.compressed_model = AutoModelForCausalLM.from_pretrained( - self.compressed_model_stub, - torch_dtype="auto", - device_map="auto", - ) - self.decompressed_model = AutoModelForCausalLM.from_pretrained( + # Decompress using HFQuantizer from AutoModelForCausalLM + self.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained( self.compressed_model_stub, torch_dtype="auto", device_map="auto", quantization_config=CompressedTensorsConfig(run_compressed=False), ) - # manually decompress this model + # Manually decompress this model self.dense_model = AutoModelForCausalLM.from_pretrained( self.skeleton_model_stub, - torch_dtype=self.compressed_model.dtype, - device_map=self.compressed_model.device, + torch_dtype=self.decompressed_model_hf_quantizer.dtype, + device_map=self.decompressed_model_hf_quantizer.device, ) assert not hasattr( self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale" ) - self.decompressed_model_manual = None config = AutoConfig.from_pretrained(self.compressed_model_stub) compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None) @@ -94,47 +90,39 @@ def setUpClass(self): "weight_scale", ) - def test_compressed_matches_uncompressed(self): + def test_hf_quantizer_decompress_match_manual_decompress(self): decompressed_model_manual = self.decompressed_model_manual.device - compressed_device = self.compressed_model.device - decompressed_model_device = self.decompressed_model.device + decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.device self.decompressed_model_manual = self.decompressed_model_manual.to( decompressed_model_manual ) - self.compressed_model = self.compressed_model.to(compressed_device) - self.decompressed_model = self.decompressed_model.to(decompressed_model_device) + self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to( + decompressed_model_hf_quantizer + ) for input in self.SAMPLE_INPUTS: inputs = self.tokenizer(input, return_tensors="pt", padding=True).to( - self.compressed_model.device + self.decompressed_model_manual.device ) - - compressed_output = self.tokenizer.batch_decode( - self.compressed_model.generate(**inputs, max_length=50) - ) - inputs = inputs.to(self.decompressed_model_manual.device) decompressed_model_manual_output = self.tokenizer.batch_decode( self.decompressed_model_manual.generate(**inputs, max_length=50) ) - 
decompressed_model_out = self.tokenizer.batch_decode( - self.decompressed_model.generate(**inputs, max_length=50) + decompressed_model_hf_quantizer_out = self.tokenizer.batch_decode( + self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50) ) assert ( - compressed_output - == decompressed_model_manual_output - == decompressed_model_out + decompressed_model_hf_quantizer_out == decompressed_model_manual_output ) @classmethod def tearDownClass(self): shutil.rmtree(self.test_dir) - del self.compressed_model del self.dense_model - del self.decompressed_model + del self.decompressed_model_hf_quantizer del self.decompressed_model_manual torch.cuda.empty_cache() diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py index 5984d854a..aa4383b1e 100644 --- a/tests/llmcompressor/transformers/compression/test_run_compressed.py +++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py @@ -18,18 +18,15 @@ @requires_gpu @parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR)) -class TestRunCompressedDecompression(unittest.TestCase): +class TestUncompressedDecompressed(unittest.TestCase): """ - Test the run_compressed input arg to AutoModelForCausalLM, where HFQuantizer is - responsible for decompressing if model is compressed. + Uncompressed-decompressed check - Diagram flow https://tinyurl.com/2ynb6wbu + Uncompressed: Optimized model saved as run_compressed=False, no need to decompress + Decompressed: Optimized model saved as run_compressed=True, and decompressed using + AutoModelForCausalLM decompression - Given an optimized model that was saved (uncompressed), - and saved as run_compressed (compressed), decompress the compressed model - and check the outputs. - - All modules should be linear, runs default foward calls + AutoModelForCausalLM decompression diagram flow https://tinyurl.com/2ynb6wbu """ @@ -97,11 +94,13 @@ def tearDownClass(cls): @requires_gpu @parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR)) -class TestRunCompressedForward(unittest.TestCase): +class TestCompressedDecompressed(unittest.TestCase): """ - Given an optimized model that was saved (uncompressed), - and saved as run_compressed (compressed), do not decompressed the compressed model - and check the outputs. 
+ Compressed-decompressed check + + Compressed: Optimized model saved as run_compressed=True, no decompression + Decompressed: Optimized model saved as run_compressed=True, and decompressed using + AutoModelForCausalLM decompression All compressed model should have CompressedLinear, which has its custom forward call From fc35707941fcfa72c4865a5007ea6de1acf6fea7 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Tue, 14 Jan 2025 14:00:21 -0500 Subject: [PATCH 12/13] add more helpful comments --- .../run_compressed_configs/fp8_dynamic.yaml | 4 ++-- .../run_compressed_configs/w4a16.yaml | 4 ++-- .../run_compressed_configs/w8a16.yaml | 4 ++++ .../run_compressed_configs/w8a16_dense.yaml | 4 ---- .../compression/run_compressed_configs/w8a8.yaml | 4 ++-- .../compression/test_run_compressed.py | 16 ++++++++++++---- 6 files changed, 22 insertions(+), 14 deletions(-) create mode 100644 tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml delete mode 100644 tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml index ccd43c024..a8b773259 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed -uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed \ No newline at end of file +compressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed +uncompressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml index b4f2849c0..3a87b021c 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed -uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed \ No newline at end of file +compressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed +uncompressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml new file mode 100644 index 000000000..df791470f --- /dev/null +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml @@ -0,0 +1,4 @@ +cadence: "commit" +test_type: "regression" +compressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed +uncompressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml deleted file mode 100644 index 
e74e83a4d..000000000 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml +++ /dev/null @@ -1,4 +0,0 @@ -cadence: "commit" -test_type: "regression" -compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed -uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml index b428c4a54..750bc1cdf 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed -uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed \ No newline at end of file +compressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed +uncompressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py index aa4383b1e..36547d698 100644 --- a/tests/llmcompressor/transformers/compression/test_run_compressed.py +++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py @@ -18,9 +18,9 @@ @requires_gpu @parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR)) -class TestUncompressedDecompressed(unittest.TestCase): +class Test_Decompressed_Linear_Uncompressed_Linear(unittest.TestCase): """ - Uncompressed-decompressed check + Uncompressed-Linear-forward decompressed-Linear-foward check Uncompressed: Optimized model saved as run_compressed=False, no need to decompress Decompressed: Optimized model saved as run_compressed=True, and decompressed using @@ -38,6 +38,9 @@ def setUpClass(cls): cls.test_dir = tempfile.mkdtemp() quantization_config = CompressedTensorsConfig(run_compressed=False) + + # Decompressed using HFQuantizer + # Linear foward cls.decompressed_model = AutoModelForCausalLM.from_pretrained( cls.compressed_model_stub, torch_dtype="auto", @@ -45,11 +48,14 @@ def setUpClass(cls): quantization_config=quantization_config, ) + # Load model as is at the uncompressed state + # Linear forward cls.uncompressed_model = AutoModelForCausalLM.from_pretrained( cls.uncompressed_model_stub, torch_dtype=cls.decompressed_model.dtype, device_map=cls.decompressed_model.device, ) + breakpoint() cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub) @@ -94,9 +100,9 @@ def tearDownClass(cls): @requires_gpu @parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR)) -class TestCompressedDecompressed(unittest.TestCase): +class Test_Compressed_CompressedLinear_Decompressed_Linear(unittest.TestCase): """ - Compressed-decompressed check + Compressed-CompresesdLinear, Decompressed-Linear check Compressed: Optimized model saved as run_compressed=True, no decompression Decompressed: Optimized model saved as run_compressed=True, and decompressed using @@ -113,6 +119,7 @@ def setUpClass(cls): cls.test_dir = tempfile.mkdtemp() # Should have CompressedLinear modules + # Compressed Linear forward cls.compressed_model = AutoModelForCausalLM.from_pretrained( cls.compressed_model_stub, torch_dtype="auto", @@ 
-120,6 +127,7 @@ def setUpClass(cls): ) # Should just be linear modules + # Linear forward quantization_config = CompressedTensorsConfig(run_compressed=False) cls.decompressed_model = AutoModelForCausalLM.from_pretrained( cls.compressed_model_stub, From 21e6b73ea15c707149a956994f6fd0284b0c3803 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Tue, 14 Jan 2025 14:46:39 -0500 Subject: [PATCH 13/13] correct path --- .../compression/run_compressed_configs/fp8_dynamic.yaml | 4 ++-- .../compression/run_compressed_configs/w4a16.yaml | 4 ++-- .../compression/run_compressed_configs/w8a16.yaml | 4 ++-- .../transformers/compression/run_compressed_configs/w8a8.yaml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml index a8b773259..926c31ec3 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed -uncompressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed \ No newline at end of file +compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed +uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml index 3a87b021c..51d9ec25b 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed -uncompressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed \ No newline at end of file +compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed +uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml index df791470f..6521d66ec 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml +++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed -uncompressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed \ No newline at end of file +compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed +uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-uncompressed \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml index 750bc1cdf..3c1646b16 100644 --- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml +++ 
b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml @@ -1,4 +1,4 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed -uncompressed_model_stub: nm-testing/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed \ No newline at end of file +compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed +uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed \ No newline at end of file
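
Note: below is a minimal standalone sketch (not part of the patch series) of the two load paths these tests compare, assuming the nm-testing stubs referenced in the configs are reachable on the Hugging Face Hub and that the installed transformers build exposes CompressedTensorsConfig with the run_compressed flag. Only APIs already used in the tests above appear here.

# Sketch only: compare run_compressed=True (CompressedLinear forward) against
# run_compressed=False (decompressed to plain Linear) for one test stub.
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

stub = "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed"  # from w4a16.yaml

tokenizer = AutoTokenizer.from_pretrained(stub)

# Default load of a compressed checkpoint: weights stay compressed and the
# quantized layers run through CompressedLinear's custom forward.
compressed_model = AutoModelForCausalLM.from_pretrained(
    stub,
    torch_dtype="auto",
    device_map="auto",
)

# run_compressed=False: HFQuantizer decompresses at load time, leaving
# ordinary Linear modules with the default forward.
decompressed_model = AutoModelForCausalLM.from_pretrained(
    stub,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=CompressedTensorsConfig(run_compressed=False),
)

prompt = "I love 4-bit quantization because"
inputs = tokenizer(prompt, return_tensors="pt").to(compressed_model.device)
compressed_out = tokenizer.batch_decode(
    compressed_model.generate(**inputs, max_length=50)
)

inputs = inputs.to(decompressed_model.device)
decompressed_out = tokenizer.batch_decode(
    decompressed_model.generate(**inputs, max_length=50)
)

# Greedy decoding is deterministic, so the two load paths should generate
# identical text -- the same property the tests above assert.
assert compressed_out == decompressed_out

As the tests assume, loading a compressed checkpoint without a quantization_config keeps the weights compressed, while passing run_compressed=False asks HFQuantizer to decompress during load; matching generations indicate the two execution paths are equivalent.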