diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml
deleted file mode 100644
index 6685efb1e..000000000
--- a/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-cadence: "commit"
-test_type: "regression"
-compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
-skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml
deleted file mode 100644
index 144044f28..000000000
--- a/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-cadence: "commit"
-test_type: "regression"
-compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed"
-skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml
deleted file mode 100644
index 95e73b148..000000000
--- a/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-cadence: "commit"
-test_type: "regression"
-compressed_model_stub: "nm-testing/tinyllama-w8a16-dense"
-skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml
deleted file mode 100644
index b5a846cbc..000000000
--- a/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-cadence: "commit"
-test_type: "regression"
-compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed"
-skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml
index 926c31ec3..d516616bf 100644
--- a/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml
+++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed
-uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-uncompressed
\ No newline at end of file
+model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
+empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml
index 51d9ec25b..7e9bc3f2f 100644
--- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml
+++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed
-uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-uncompressed
\ No newline at end of file
+model_stub: "nm-testing/tinyllama-w4a16-compressed"
+empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml
deleted file mode 100644
index 6521d66ec..000000000
--- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-cadence: "commit"
-test_type: "regression"
-compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed
-uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-uncompressed
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml
new file mode 100644
index 000000000..af1e5df8b
--- /dev/null
+++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml
@@ -0,0 +1,4 @@
+cadence: "commit"
+test_type: "regression"
+model_stub: "nm-testing/tinyllama-w8a16-dense"
+empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml
index 3c1646b16..086a67ed6 100644
--- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml
+++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed
-uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed
\ No newline at end of file
+model_stub: "nm-testing/tinyllama-w8a8-compressed"
+empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/test_decompress.py b/tests/llmcompressor/transformers/compression/test_decompress.py
deleted file mode 100644
index b7d3bb883..000000000
--- a/tests/llmcompressor/transformers/compression/test_decompress.py
+++ /dev/null
@@ -1,128 +0,0 @@
-import copy
-import shutil
-import tempfile
-import unittest
-
-import torch
-from compressed_tensors import QUANTIZATION_CONFIG_NAME
-from compressed_tensors.compressors import ModelCompressor
-from compressed_tensors.quantization import QuantizationStatus
-from parameterized import parameterized_class
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
-from transformers.utils.quantization_config import CompressedTensorsConfig
-
-from tests.testing_utils import parse_params, requires_gpu
-
-CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"
-
-
-@requires_gpu
-@parameterized_class(parse_params(CONFIG_DIR))
-class TestDecompression(unittest.TestCase):
-    """
-    Check that HFQuantizer decompression is working as expected.
-    Manually decompress a compressed model and compare the generations
-
-    Decompression:
-    Given a skeleton model and path to the optimized model,
-    write the optimized model's safetensors to the skeleton model and decompress
-    Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16
-
-    """
-
-    compressed_model_stub = None
-    skeleton_model_stub = None
-
-    SAMPLE_INPUTS = [
-        "I love 4-bit quantization because",
-        "What is the capital of France?",
-        "def fibonacci(n):",
-    ]
-
-    @classmethod
-    def setUpClass(self):
-        self.test_dir = tempfile.mkdtemp()
-        self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub)
-
-        # Decompress using HFQuantizer from AutoModelForCausalLM
-        self.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained(
-            self.compressed_model_stub,
-            torch_dtype="auto",
-            device_map="auto",
-            quantization_config=CompressedTensorsConfig(run_compressed=False),
-        )
-
-        # Manually decompress this model
-        self.dense_model = AutoModelForCausalLM.from_pretrained(
-            self.skeleton_model_stub,
-            torch_dtype=self.decompressed_model_hf_quantizer.dtype,
-            device_map=self.decompressed_model_hf_quantizer.device,
-        )
-
-        assert not hasattr(
-            self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
-        )
-
-        config = AutoConfig.from_pretrained(self.compressed_model_stub)
-
-        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
-        self.compressor = ModelCompressor.from_compression_config(compression_config)
-        self.compressor.quantization_config.quantization_status = (
-            QuantizationStatus.FROZEN
-        )
-
-        # use the model_path to load the decompressed weights into dense_model
-        dense_model = copy.deepcopy(self.dense_model)
-
-        # overwrite the weights of the dense model
-        self.compressor.decompress(
-            model_path=self.compressed_model_stub,
-            model=self.dense_model,
-        )
-
-        # self.dense_model should be decompressed
-        assert dense_model is not self.dense_model
-
-        self.decompressed_model_manual = self.dense_model
-
-        assert hasattr(
-            self.decompressed_model_manual.model.layers[0].self_attn.q_proj,
-            "weight_scale",
-        )
-
-    def test_hf_quantizer_decompress_match_manual_decompress(self):
-        decompressed_model_manual = self.decompressed_model_manual.device
-        decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.device
-
-        self.decompressed_model_manual = self.decompressed_model_manual.to(
-            decompressed_model_manual
-        )
-        self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to(
-            decompressed_model_hf_quantizer
-        )
-
-        for input in self.SAMPLE_INPUTS:
-            inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(
-                self.decompressed_model_manual.device
-            )
-            inputs = inputs.to(self.decompressed_model_manual.device)
-
-            decompressed_model_manual_output = self.tokenizer.batch_decode(
-                self.decompressed_model_manual.generate(**inputs, max_length=50)
-            )
-
-            decompressed_model_hf_quantizer_out = self.tokenizer.batch_decode(
-                self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50)
-            )
-
-            assert (
-                decompressed_model_hf_quantizer_out == decompressed_model_manual_output
-            )
-
-    @classmethod
-    def tearDownClass(self):
-        shutil.rmtree(self.test_dir)
-        del self.dense_model
-        del self.decompressed_model_hf_quantizer
-        del self.decompressed_model_manual
-        torch.cuda.empty_cache()
diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py
index 36547d698..0c2a0ab0e 100644
--- a/tests/llmcompressor/transformers/compression/test_run_compressed.py
+++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py
@@ -3,187 +3,77 @@
 import unittest
 
 import torch
-from compressed_tensors.linear.compressed_linear import CompressedLinear
-from compressed_tensors.quantization.utils import iter_named_leaf_modules
+from compressed_tensors import QUANTIZATION_CONFIG_NAME
+from compressed_tensors.compressors import ModelCompressor
+from compressed_tensors.quantization import QuantizationStatus
 from parameterized import parameterized_class
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from transformers.utils.quantization_config import CompressedTensorsConfig
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 
 from tests.testing_utils import parse_params, requires_gpu
 
-COMPRESSED_LINEAR_CONFIG_DIR = (
-    "tests/llmcompressor/transformers/compression/run_compressed_configs"
-)
+CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs"
 
 
 @requires_gpu
-@parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR))
-class Test_Decompressed_Linear_Uncompressed_Linear(unittest.TestCase):
-    """
-    Uncompressed-Linear-forward decompressed-Linear-foward check
-
-    Uncompressed: Optimized model saved as run_compressed=False, no need to decompress
-    Decompressed: Optimized model saved as run_compressed=True, and decompressed using
-    AutoModelForCausalLM decompression
-
-    AutoModelForCausalLM decompression diagram flow https://tinyurl.com/2ynb6wbu
-
-    """
-
-    compressed_model_stub = None
-    uncompressed_model_stub = None
-
-    @classmethod
-    def setUpClass(cls):
-        cls.test_dir = tempfile.mkdtemp()
-
-        quantization_config = CompressedTensorsConfig(run_compressed=False)
-
-        # Decompressed using HFQuantizer
-        # Linear foward
-        cls.decompressed_model = AutoModelForCausalLM.from_pretrained(
-            cls.compressed_model_stub,
-            torch_dtype="auto",
-            device_map="auto",
-            quantization_config=quantization_config,
-        )
-
-        # Load model as is at the uncompressed state
-        # Linear forward
-        cls.uncompressed_model = AutoModelForCausalLM.from_pretrained(
-            cls.uncompressed_model_stub,
-            torch_dtype=cls.decompressed_model.dtype,
-            device_map=cls.decompressed_model.device,
-        )
-        breakpoint()
-
-        cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub)
-
-    def test_compressed_matches_decompressed(self):
-        SAMPLE_INPUT = [
-            "I love 4-bit quantization because",
-            "What is the capital of France?",
-            "def fibonacci(n):",
-        ]
-
-        decompressed_device = self.decompressed_model.device
-        uncompressed_device = self.uncompressed_model.device
-
-        # overwrite weights in cpu to cuda
-        self.decompressed_model = self.decompressed_model.to(decompressed_device)
-        self.uncompressed_model = self.uncompressed_model.to(uncompressed_device)
-
-        inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
-            decompressed_device
-        )
-
-        decompressed_output = self.tokenizer.batch_decode(
-            self.decompressed_model.generate(**inputs, max_length=50)
-        )
-
-        inputs = inputs.to(uncompressed_device)
-
-        uncompressed_output = self.tokenizer.batch_decode(
-            self.uncompressed_model.generate(**inputs, max_length=50)
-        )
-
-        for idx in range(len(SAMPLE_INPUT)):
-            assert decompressed_output[idx] == uncompressed_output[idx]
-
-    @classmethod
-    def tearDownClass(cls):
-        shutil.rmtree(cls.test_dir)
-        del cls.decompressed_model
-        del cls.uncompressed_model
-        torch.cuda.empty_cache()
-
-
-@requires_gpu
-@parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR))
-class Test_Compressed_CompressedLinear_Decompressed_Linear(unittest.TestCase):
-    """
-    Compressed-CompresesdLinear, Decompressed-Linear check
-
-    Compressed: Optimized model saved as run_compressed=True, no decompression
-    Decompressed: Optimized model saved as run_compressed=True, and decompressed using
-    AutoModelForCausalLM decompression
-
-    All compressed model should have CompressedLinear, which has its custom forward call
-
-    """
-
-    compressed_model_stub = None
+@parameterized_class(parse_params(CONFIG_DIR))
+class TestQuantizationMatches(unittest.TestCase):
+    model_stub = None
+    empty_model = None
 
     @classmethod
     def setUpClass(cls):
         cls.test_dir = tempfile.mkdtemp()
 
-        # Should have CompressedLinear modules
-        # Compressed Linear forward
+        # TODO: Give option on HFQuantizer to run run_compressed True/False
+        # currently hardcoded to True
         cls.compressed_model = AutoModelForCausalLM.from_pretrained(
-            cls.compressed_model_stub,
+            cls.model_stub,
             torch_dtype="auto",
            device_map="auto",
+            # run_compressed=True, # TODO: Give option on HFQuantizer
         )
-
-        # Should just be linear modules
-        # Linear forward
-        quantization_config = CompressedTensorsConfig(run_compressed=False)
-        cls.decompressed_model = AutoModelForCausalLM.from_pretrained(
-            cls.compressed_model_stub,
+        # TODO: Use ModelCompressor until decompression is supported through
+        # HFQuant/run_compressed can be turned off.
+        cls.uncompressed_model = AutoModelForCausalLM.from_pretrained(
+            cls.empty_model,
             torch_dtype=cls.compressed_model.dtype,
             device_map=cls.compressed_model.device,
-            quantization_config=quantization_config,
+        )
+        config = AutoConfig.from_pretrained(cls.model_stub)
+        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
+        cls.compressor = ModelCompressor.from_compression_config(compression_config)
+        cls.compressor.quantization_config.quantization_status = (
+            QuantizationStatus.FROZEN
+        )
+        cls.compressor.decompress(
+            model_path=cls.model_stub, model=cls.uncompressed_model
         )
 
-        cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub)
-
-    def test_compressed_linear_modules_exist(self):
-        compressed_linear_counts = 0
-        for _, submodule in iter_named_leaf_modules(
-            self.compressed_model,
-        ):
-            if isinstance(submodule, CompressedLinear):
-                compressed_linear_counts += 1
-
-        # some linear models are not compressed - ex. lm_head
-        assert compressed_linear_counts > 0
+        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_stub)
 
-    def test_compressed_matches_decompressed__hf_quantizer(self):
+    def test_compressed_matches_uncompressed(self):
         SAMPLE_INPUT = [
             "I love 4-bit quantization because",
             "What is the capital of France?",
             "def fibonacci(n):",
         ]
 
-        decompressed_device = self.decompressed_model.device
-        compressed_device = self.compressed_model.device
-
-        # overwrite weights in cpu to cuda
-        self.decompressed_model = self.decompressed_model.to(decompressed_device)
-        self.compressed_model = self.compressed_model.to(compressed_device)
-
         inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
-            decompressed_device
+            self.compressed_model.device
         )
-
-        decompressed_model_out = self.tokenizer.batch_decode(
-            self.decompressed_model.generate(**inputs, max_length=50)
-        )
-
-        inputs = inputs.to(compressed_device)
-
-        compressed_model_out = self.tokenizer.batch_decode(
+        compressed_output = self.tokenizer.batch_decode(
             self.compressed_model.generate(**inputs, max_length=50)
         )
+        uncompressed_output = self.tokenizer.batch_decode(
+            self.uncompressed_model.generate(**inputs, max_length=50)
+        )
 
-        # Compare outputs for each input
         for idx in range(len(SAMPLE_INPUT)):
-            assert compressed_model_out[idx] == decompressed_model_out[idx]
+            assert compressed_output[idx] == uncompressed_output[idx]
 
     @classmethod
     def tearDownClass(cls):
         shutil.rmtree(cls.test_dir)
-        del cls.decompressed_model
         del cls.compressed_model
+        del cls.uncompressed_model
         torch.cuda.empty_cache()