Showing 11 changed files with 112 additions and 150 deletions.
tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml (4 additions, 0 deletions)
@@ -0,0 +1,4 @@
+cadence: "commit"
+test_type: "regression"
+compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
+skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
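
Each decompression config pairs a compressed checkpoint with a dense "skeleton" of the same architecture. In the updated test file below, compressed_model_stub is loaded through HFQuantizer decompression, while skeleton_model_stub receives the decompressed weights manually. A minimal standalone sketch of the loading step, using the stubs from this config (illustrative only, not part of the commit):

from transformers import AutoModelForCausalLM
from transformers.utils.quantization_config import CompressedTensorsConfig

# Decompress on load via HFQuantizer: run_compressed=False yields plain Linear modules
decompressed = AutoModelForCausalLM.from_pretrained(
    "nm-testing/tinyllama-fp8-dynamic-compressed",
    torch_dtype="auto",
    device_map="auto",
    quantization_config=CompressedTensorsConfig(run_compressed=False),
)

# Dense skeleton: same architecture, no quantization parameters populated yet
skeleton = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
    torch_dtype=decompressed.dtype,
)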
tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml (4 additions, 0 deletions)
@@ -0,0 +1,4 @@
+cadence: "commit"
+test_type: "regression"
+compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed"
+skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml (4 additions, 0 deletions)
@@ -0,0 +1,4 @@
+cadence: "commit"
+test_type: "regression"
+compressed_model_stub: "nm-testing/tinyllama-w8a16-dense"
+skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml (4 additions, 0 deletions)
@@ -0,0 +1,4 @@
+cadence: "commit"
+test_type: "regression"
+compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed"
+skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml (2 additions, 2 deletions)
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-uncompressed
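
The key renames (model_stub to compressed_model_stub, empty_model to uncompressed_model_stub) make the pairing explicit: both stubs now point at the same optimized TinyLlama chat model, saved once in compressed form and once uncompressed. The test that consumes these configs is not shown in this commit, so the equivalence check below is a hedged sketch of what such a pair supports, not the repository's actual test:

from transformers import AutoModelForCausalLM, AutoTokenizer

stub = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic"
compressed = AutoModelForCausalLM.from_pretrained(
    f"{stub}-compressed", torch_dtype="auto", device_map="auto"
)
uncompressed = AutoModelForCausalLM.from_pretrained(
    f"{stub}-uncompressed",
    torch_dtype=compressed.dtype,
    device_map=compressed.device,
)
tokenizer = AutoTokenizer.from_pretrained(f"{stub}-compressed")

inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(compressed.device)
# Both storage formats of the same optimized weights should decode identically
assert tokenizer.batch_decode(compressed.generate(**inputs, max_length=50)) == (
    tokenizer.batch_decode(uncompressed.generate(**inputs, max_length=50))
)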
tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml (2 additions, 2 deletions)
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "nm-testing/tinyllama-w4a16-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-uncompressed
tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml (4 additions, 0 deletions)
@@ -0,0 +1,4 @@
+cadence: "commit"
+test_type: "regression"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-uncompressed
tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml (0 additions, 4 deletions)
This file was deleted.
tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml (2 additions, 2 deletions)
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "nm-testing/tinyllama-w8a8-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed
tests/llmcompressor/transformers/compression/test_run_compressed.py (85 additions, 139 deletions)
@@ -1,187 +1,133 @@
+import copy
 import shutil
 import tempfile
 import unittest

-import torch
-from compressed_tensors.linear.compressed_linear import CompressedLinear
-from compressed_tensors.quantization.utils import iter_named_leaf_modules
+from compressed_tensors import QUANTIZATION_CONFIG_NAME
+from compressed_tensors.compressors import ModelCompressor
+from compressed_tensors.quantization import QuantizationStatus
 from parameterized import parameterized_class
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 from transformers.utils.quantization_config import CompressedTensorsConfig

 from tests.testing_utils import parse_params, requires_gpu

-COMPRESSED_LINEAR_CONFIG_DIR = (
-    "tests/llmcompressor/transformers/compression/run_compressed_configs"
-)
+CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"


 @requires_gpu
-@parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR))
-class Test_Decompressed_Linear_Uncompressed_Linear(unittest.TestCase):
+@parameterized_class(parse_params(CONFIG_DIR))
+class TestDecompression(unittest.TestCase):
     """
-    Uncompressed-Linear-forward decompressed-Linear-foward check
+    Check that HFQuantizer decompression is working as expected.
+    Manually decompress a compressed model and compare the generations

-    Uncompressed: Optimized model saved as run_compressed=False, no need to decompress
-    Decompressed: Optimized model saved as run_compressed=True, and decompressed using
-        AutoModelForCausalLM decompression
-
-    AutoModelForCausalLM decompression diagram flow https://tinyurl.com/2ynb6wbu
+    Decompression:
+    Given a skeleton model and path to the optimized model,
+    write the optimized model's safetensors to the skeleton model and decompress
+    Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16
     """

     compressed_model_stub = None
-    uncompressed_model_stub = None
+    skeleton_model_stub = None

-    @classmethod
-    def setUpClass(cls):
-        cls.test_dir = tempfile.mkdtemp()
+    SAMPLE_INPUTS = [
+        "I love 4-bit quantization because",
+        "What is the capital of France?",
+        "def fibonacci(n):",
+    ]

-        quantization_config = CompressedTensorsConfig(run_compressed=False)
+    @classmethod
+    def setUpClass(self):
+        self.test_dir = tempfile.mkdtemp()
+        self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub)

-        # Decompressed using HFQuantizer
-        # Linear foward
-        cls.decompressed_model = AutoModelForCausalLM.from_pretrained(
-            cls.compressed_model_stub,
+        # Decompress using HFQuantizer from AutoModelForCausalLM
+        self.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained(
+            self.compressed_model_stub,
             torch_dtype="auto",
             device_map="auto",
-            quantization_config=quantization_config,
+            quantization_config=CompressedTensorsConfig(run_compressed=False),
         )

-        # Load model as is at the uncompressed state
-        # Linear forward
-        cls.uncompressed_model = AutoModelForCausalLM.from_pretrained(
-            cls.uncompressed_model_stub,
-            torch_dtype=cls.decompressed_model.dtype,
-            device_map=cls.decompressed_model.device,
+        # Manually decompress this model
+        self.dense_model = AutoModelForCausalLM.from_pretrained(
+            self.skeleton_model_stub,
+            torch_dtype=self.decompressed_model_hf_quantizer.dtype,
+            device_map=self.decompressed_model_hf_quantizer.device,
         )

-        cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub)
-
-    def test_compressed_matches_decompressed(self):
-        SAMPLE_INPUT = [
-            "I love 4-bit quantization because",
-            "What is the capital of France?",
-            "def fibonacci(n):",
-        ]
-
-        decompressed_device = self.decompressed_model.device
-        uncompressed_device = self.uncompressed_model.device
-
-        # overwrite weights in cpu to cuda
-        self.decompressed_model = self.decompressed_model.to(decompressed_device)
-        self.uncompressed_model = self.uncompressed_model.to(uncompressed_device)
-
-        inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
-            decompressed_device
+        # decompression from HFQuantizer should populate weight_scale
+        assert hasattr(
+            self.decompressed_model_hf_quantizer.model.layers[0].self_attn.q_proj,
+            "weight_scale",
         )

-        decompressed_output = self.tokenizer.batch_decode(
-            self.decompressed_model.generate(**inputs, max_length=50)
+        # dense model should not have weight_scale populated
+        assert not hasattr(
+            self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
         )

-        inputs = inputs.to(uncompressed_device)
+        config = AutoConfig.from_pretrained(self.compressed_model_stub)

-        uncompressed_output = self.tokenizer.batch_decode(
-            self.uncompressed_model.generate(**inputs, max_length=50)
+        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
+        self.compressor = ModelCompressor.from_compression_config(compression_config)
+        self.compressor.quantization_config.quantization_status = (
+            QuantizationStatus.FROZEN
         )

-        for idx in range(len(SAMPLE_INPUT)):
-            assert decompressed_output[idx] == uncompressed_output[idx]
-
-    @classmethod
-    def tearDownClass(cls):
-        shutil.rmtree(cls.test_dir)
-        del cls.decompressed_model
-        del cls.uncompressed_model
-        torch.cuda.empty_cache()
-
-
-@requires_gpu
-@parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR))
-class Test_Compressed_CompressedLinear_Decompressed_Linear(unittest.TestCase):
-    """
-    Compressed-CompresesdLinear, Decompressed-Linear check
-
-    Compressed: Optimized model saved as run_compressed=True, no decompression
-    Decompressed: Optimized model saved as run_compressed=True, and decompressed using
-        AutoModelForCausalLM decompression
+        # use the model_path to load the decompressed weights into dense_model
+        dense_model = copy.deepcopy(self.dense_model)

-    All compressed model should have CompressedLinear, which has its custom forward call
-    """
-
-    compressed_model_stub = None
-
-    @classmethod
-    def setUpClass(cls):
-        cls.test_dir = tempfile.mkdtemp()
-
-        # Should have CompressedLinear modules
-        # Compressed Linear forward
-        cls.compressed_model = AutoModelForCausalLM.from_pretrained(
-            cls.compressed_model_stub,
-            torch_dtype="auto",
-            device_map="auto",
+        # overwrite the weights of the dense model
+        self.compressor.decompress(
+            model_path=self.compressed_model_stub,
+            model=self.dense_model,
         )

-        # Should just be linear modules
-        # Linear forward
-        quantization_config = CompressedTensorsConfig(run_compressed=False)
-        cls.decompressed_model = AutoModelForCausalLM.from_pretrained(
-            cls.compressed_model_stub,
-            torch_dtype=cls.compressed_model.dtype,
-            device_map=cls.compressed_model.device,
-            quantization_config=quantization_config,
-        )
+        # self.dense_model should be decompressed
+        assert dense_model is not self.dense_model

-        cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub)
+        self.decompressed_model_manual = self.dense_model

-    def test_compressed_linear_modules_exist(self):
-        compressed_linear_counts = 0
-        for _, submodule in iter_named_leaf_modules(
-            self.compressed_model,
-        ):
-            if isinstance(submodule, CompressedLinear):
-                compressed_linear_counts += 1
-
-        # some linear models are not compressed - ex. lm_head
-        assert compressed_linear_counts > 0
-
-    def test_compressed_matches_decompressed__hf_quantizer(self):
-        SAMPLE_INPUT = [
-            "I love 4-bit quantization because",
-            "What is the capital of France?",
-            "def fibonacci(n):",
-        ]
-
-        decompressed_device = self.decompressed_model.device
-        compressed_device = self.compressed_model.device
+        assert hasattr(
+            self.decompressed_model_manual.model.layers[0].self_attn.q_proj,
+            "weight_scale",
+        )

-        # overwrite weights in cpu to cuda
-        self.decompressed_model = self.decompressed_model.to(decompressed_device)
-        self.compressed_model = self.compressed_model.to(compressed_device)
+    def test_hf_quantizer_decompress_match_manual_decompress(self):
+        manual_device = self.decompressed_model_manual.device
+        decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.device

-        inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
-            decompressed_device
+        self.decompressed_model_manual = self.decompressed_model_manual.to(
+            manual_device
         )

-        decompressed_model_out = self.tokenizer.batch_decode(
-            self.decompressed_model.generate(**inputs, max_length=50)
+        self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to(
+            decompressed_model_hf_quantizer
         )

-        inputs = inputs.to(compressed_device)
+        for input in self.SAMPLE_INPUTS:
+            inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(
+                self.decompressed_model_manual.device
+            )
+            inputs = inputs.to(self.decompressed_model_manual.device)

-        compressed_model_out = self.tokenizer.batch_decode(
-            self.compressed_model.generate(**inputs, max_length=50)
-        )
+            decompressed_model_manual_output = self.tokenizer.batch_decode(
+                self.decompressed_model_manual.generate(**inputs, max_length=50)
+            )

-        # Compare outputs for each input
-        for idx in range(len(SAMPLE_INPUT)):
-            assert compressed_model_out[idx] == decompressed_model_out[idx]
+            decompressed_model_hf_quantizer_out = self.tokenizer.batch_decode(
+                self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50)
+            )
+
+            assert (
+                decompressed_model_hf_quantizer_out == decompressed_model_manual_output
+            )

     @classmethod
-    def tearDownClass(cls):
-        shutil.rmtree(cls.test_dir)
-        del cls.decompressed_model
-        del cls.compressed_model
+    def tearDownClass(self):
+        shutil.rmtree(self.test_dir)
+        del self.dense_model
+        del self.decompressed_model_hf_quantizer
+        del self.decompressed_model_manual
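
For reference, the manual decompression path that the new TestDecompression.setUpClass exercises condenses into a standalone script. Every API call below appears verbatim in the diff; the model stubs are the ones from the fp8_dynamic decompression config, and this is a sketch rather than part of the commit:

from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from transformers import AutoConfig, AutoModelForCausalLM

stub = "nm-testing/tinyllama-fp8-dynamic-compressed"  # from fp8_dynamic.yaml above
dense = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", torch_dtype="auto"
)

# Rebuild the compressor from the checkpoint's quantization config ...
config = AutoConfig.from_pretrained(stub)
compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
compressor = ModelCompressor.from_compression_config(compression_config)
compressor.quantization_config.quantization_status = QuantizationStatus.FROZEN

# ... then write the decompressed safetensors into the dense skeleton in place
compressor.decompress(model_path=stub, model=dense)

# After decompression the skeleton carries quantization parameters
assert hasattr(dense.model.layers[0].self_attn.q_proj, "weight_scale")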