From 09691d16cce911a6d1ef0399f263ec4c918014f8 Mon Sep 17 00:00:00 2001 From: George Date: Mon, 22 Apr 2024 12:38:29 -0400 Subject: [PATCH 01/10] test forward (#16) --- .../quantization/lifecycle/forward.py | 8 +- .../quantization/lifecycle/test_forward.py | 80 +++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 tests/sparsetensors/quantization/lifecycle/test_forward.py diff --git a/src/sparsetensors/quantization/lifecycle/forward.py b/src/sparsetensors/quantization/lifecycle/forward.py index 6416a10b..5e6036ea 100644 --- a/src/sparsetensors/quantization/lifecycle/forward.py +++ b/src/sparsetensors/quantization/lifecycle/forward.py @@ -21,7 +21,13 @@ from torch.nn import Module -__all__ = ["wrap_module_forward_quantized"] +__all__ = [ + "wrap_module_forward_quantized", + "quantize", + "dequantize", + "fake_quantize", + "maybe_calibrate_or_quantize", +] @torch.no_grad() diff --git a/tests/sparsetensors/quantization/lifecycle/test_forward.py b/tests/sparsetensors/quantization/lifecycle/test_forward.py new file mode 100644 index 00000000..c2d27bd1 --- /dev/null +++ b/tests/sparsetensors/quantization/lifecycle/test_forward.py @@ -0,0 +1,80 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+import pytest
+import torch
+from sparsetensors.quantization.lifecycle.forward import (
+    maybe_calibrate_or_quantize,
+    wrap_module_forward_quantized,
+)
+from sparsetensors.quantization.lifecycle.initialize import (
+    initialize_module_for_quantization,
+)
+from sparsetensors.quantization.lifecycle.status import QuantizationStatus
+from sparsetensors.quantization.quant_args import QuantizationArgs
+from torch.nn import Linear
+
+
+def test_wrap_module_forward_quantized(create_quantization_scheme):
+    num_bits = 8
+    quantization_scheme = create_quantization_scheme(
+        targets=["*"],
+        weights=QuantizationArgs(num_bits=num_bits, symmetric=True),
+        input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False),
+    )
+    layer = Linear(4, 4)
+
+    func_forward = layer.forward.__func__
+
+    # check that the forward call is overwritten
+    wrap_module_forward_quantized(layer, quantization_scheme)
+
+    assert func_forward is not layer.forward.__func__
+
+
+@pytest.mark.parametrize(
+    "quantization_status", ["INITIALIZED", "CALIBRATION", "FROZEN"]
+)
+def test_maybe_calibrate_or_quantize(create_quantization_scheme, quantization_status):
+    num_bits = 8
+    quantization_scheme = create_quantization_scheme(
+        targets=["*"],
+        weights=QuantizationArgs(num_bits=num_bits, symmetric=True),
+        input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False),
+    )
+    quantization_args = QuantizationArgs(num_bits=num_bits, symmetric=False)
+    layer = Linear(4, 4)
+    layer.weight.data *= 100
+
+    initialize_module_for_quantization(layer, quantization_scheme)
+    layer.quantization_status = QuantizationStatus(quantization_status)
+
+    if layer.quantization_status == QuantizationStatus.INITIALIZED:
+        out = maybe_calibrate_or_quantize(
+            layer, layer.weight.data, "input", quantization_args
+        )
+        assert torch.allclose(out, layer.weight.data)
+    elif layer.quantization_status == QuantizationStatus.CALIBRATION:
+        out = maybe_calibrate_or_quantize(
+            layer, layer.weight.data, "input", quantization_args
+        )
+        assert not torch.allclose(out, layer.weight.data)
+
+    elif layer.quantization_status == QuantizationStatus.FROZEN:
+        # scale and zero points are empty -- cannot quantize
+        with pytest.raises(ValueError):
+            out = maybe_calibrate_or_quantize(
+                layer, layer.weight.data, "input", quantization_args
+            )

From 10aee1f6d44553b466529acf79838a4baf91f207 Mon Sep 17 00:00:00 2001
From: George
Date: Mon, 22 Apr 2024 12:39:36 -0400
Subject: [PATCH 02/10] test frozen (#17)

* test frozen

* rename
---
 .../quantization/lifecycle/frozen.py          |  9 ++--
 .../quantization/lifecycle/test_frozen.py     | 47 +++++++++++++++++++
 2 files changed, 52 insertions(+), 4 deletions(-)
 create mode 100644 tests/sparsetensors/quantization/lifecycle/test_frozen.py

diff --git a/src/sparsetensors/quantization/lifecycle/frozen.py b/src/sparsetensors/quantization/lifecycle/frozen.py
index 63949cf5..9715a4b2 100644
--- a/src/sparsetensors/quantization/lifecycle/frozen.py
+++ b/src/sparsetensors/quantization/lifecycle/frozen.py
@@ -35,12 +35,13 @@ def freeze_module_quantization(module: Module):
         return
 
     # delete observers from module
-    observer_names = []
+    submodule_names_to_delete = set()
     for submodule_name, _ in module.named_modules():
         if "." not in submodule_name and submodule_name.endswith("_observer"):
             # delete any observers that belong directly to this module
-            observer_names.append(submodule_name)
-    for observer_name in observer_names:
-        delattr(module, observer_name)
+            submodule_names_to_delete.add(submodule_name)
+
+    for submodule_name in submodule_names_to_delete:
+        delattr(module, submodule_name)
 
     module.quantization_status = QuantizationStatus.FROZEN

diff --git a/tests/sparsetensors/quantization/lifecycle/test_frozen.py b/tests/sparsetensors/quantization/lifecycle/test_frozen.py
new file mode 100644
index 00000000..0b5a18e8
--- /dev/null
+++ b/tests/sparsetensors/quantization/lifecycle/test_frozen.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from sparsetensors.quantization.lifecycle.frozen import freeze_module_quantization
+from sparsetensors.quantization.lifecycle.initialize import (
+    initialize_module_for_quantization,
+)
+from sparsetensors.quantization.lifecycle.status import QuantizationStatus
+from sparsetensors.quantization.quant_args import QuantizationArgs
+from torch.nn import Linear
+
+
+def test_freeze_module_quantization(create_quantization_scheme):
+    num_bits = 8
+    quantization_scheme = create_quantization_scheme(
+        targets=["*"],
+        weights=QuantizationArgs(num_bits=num_bits, symmetric=True),
+        input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False),
+    )
+
+    layer = Linear(4, 4)
+
+    initialize_module_for_quantization(layer, quantization_scheme)
+    layer.quantization_status = QuantizationStatus("CALIBRATION")
+
+    # should have both input and weight observer after initializing
+    assert hasattr(layer, "input_observer")
+    assert hasattr(layer, "weight_observer")
+
+    # observers should get deleted after freezing
+    freeze_module_quantization(layer)
+    assert not hasattr(layer, "input_observer")
+    assert not hasattr(layer, "weight_observer")
+
+    assert layer.quantization_status == QuantizationStatus("FROZEN")

From 321bf845b134553f80d2017491391d7ce1ea512e Mon Sep 17 00:00:00 2001
From: George
Date: Mon, 22 Apr 2024 14:05:25 -0400
Subject: [PATCH 03/10] lifecycle conftest (#21)
---
 .../quantization/lifecycle/conftest.py        | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 tests/sparsetensors/quantization/lifecycle/conftest.py

diff --git a/tests/sparsetensors/quantization/lifecycle/conftest.py b/tests/sparsetensors/quantization/lifecycle/conftest.py
new file mode 100644
index 00000000..a8ad01b2
--- /dev/null
+++ b/tests/sparsetensors/quantization/lifecycle/conftest.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional + +import pytest +from sparsetensors.quantization.quant_args import QuantizationArgs +from sparsetensors.quantization.quant_scheme import QuantizationScheme + + +@pytest.fixture +def create_quantization_scheme(): + def quantization_scheme( + targets: List[str], + weights: Optional[QuantizationArgs] = None, + input_activations: Optional[QuantizationArgs] = None, + output_activations: Optional[QuantizationArgs] = None, + ): + return QuantizationScheme( + targets=targets, + weights=weights, + input_activations=input_activations, + output_activations=output_activations, + ) + + return quantization_scheme From 57578cc3543723d35ac78ee085e2d0e892b5e23e Mon Sep 17 00:00:00 2001 From: George Date: Wed, 24 Apr 2024 10:55:17 -0400 Subject: [PATCH 04/10] test initalize (#18) * test initalize * newline * parametrize weights and inp_act * remove dup --- .../quantization/lifecycle/test_initialize.py | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 tests/sparsetensors/quantization/lifecycle/test_initialize.py diff --git a/tests/sparsetensors/quantization/lifecycle/test_initialize.py b/tests/sparsetensors/quantization/lifecycle/test_initialize.py new file mode 100644 index 00000000..b2f01c0f --- /dev/null +++ b/tests/sparsetensors/quantization/lifecycle/test_initialize.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pytest +from sparsetensors.quantization.lifecycle.initialize import ( + initialize_module_for_quantization, +) +from sparsetensors.quantization.quant_args import QuantizationArgs +from sparsetensors.quantization.quant_config import QuantizationStatus +from torch.nn import Linear + + +NUM_BITS = 8 + + +@pytest.mark.parametrize( + "weights,input_activations", + [ + ( + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + None, + ), + ( + None, + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + ), + ( + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + ), + ], +) +def test_initialize_module_for_quantization( + create_quantization_scheme, weights, input_activations +): + quantization_scheme = create_quantization_scheme( + targets=["*"], + weights=weights, + input_activations=input_activations, + ) + layer = Linear(4, 4) + + assert not hasattr(layer, "quantization_scheme") + assert not hasattr(layer, "quantization_status") + + # add attributes, zero_points and scale + initialize_module_for_quantization(layer, quantization_scheme) + + registered_params = {"weight", "bias"} + if weights is not None: + registered_params.add("weight_scale") + registered_params.add("weight_zero_point") + + if input_activations is not None: + registered_params.add("input_scale") + registered_params.add("input_zero_point") + + for key in layer.state_dict().keys(): + assert key in registered_params + registered_params.remove(key) + + assert len(registered_params) == 0 + + assert hasattr(layer, "quantization_scheme") + assert hasattr(layer, "quantization_status") + + assert layer.quantization_status == QuantizationStatus.INITIALIZED From ecadd5245f51467f5765e34da85e273f4b82354b Mon Sep 17 00:00:00 2001 From: George Date: Thu, 25 Apr 2024 12:50:53 -0400 Subject: [PATCH 05/10] test lifecycle (#19) * test lifecycle * comments * comments * add quantization test --- .../quantization/lifecycle/test_lifecycle.py | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 tests/sparsetensors/quantization/lifecycle/test_lifecycle.py diff --git a/tests/sparsetensors/quantization/lifecycle/test_lifecycle.py b/tests/sparsetensors/quantization/lifecycle/test_lifecycle.py new file mode 100644 index 00000000..2884bde4 --- /dev/null +++ b/tests/sparsetensors/quantization/lifecycle/test_lifecycle.py @@ -0,0 +1,117 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from copy import deepcopy
+
+import torch
+from sparsetensors.quantization.lifecycle.calibration import set_module_for_calibration
+from sparsetensors.quantization.lifecycle.frozen import freeze_module_quantization
+from sparsetensors.quantization.lifecycle.initialize import (
+    initialize_module_for_quantization,
+)
+from sparsetensors.quantization.quant_args import QuantizationArgs
+from sparsetensors.quantization.quant_config import QuantizationStatus
+from torch.nn import Linear
+
+
+def test_lifecycle(create_quantization_scheme):
+    num_bits = 8
+
+    quantization_scheme = create_quantization_scheme(
+        input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False),
+        weights=QuantizationArgs(num_bits=num_bits, symmetric=True),
+        targets=["*"],
+    )
+
+    layer = Linear(4, 4)
+    layer.weight.data *= 100
+
+    # updated layer keys check
+    expected_layer_keys = {"weight", "bias"}
+    for key in layer.state_dict().keys():
+        expected_layer_keys.remove(key)
+    assert len(expected_layer_keys) == 0
+
+    # overwrite forward pass and register zero_point and scale
+    initialize_module_for_quantization(layer, quantization_scheme)
+    expected_layer_keys = {
+        "input_scale",
+        "input_zero_point",
+        "weight_scale",
+        "weight_zero_point",
+        "weight",
+        "bias",
+    }
+    for key in layer.state_dict().keys():
+        expected_layer_keys.remove(key)
+    assert len(expected_layer_keys) == 0
+
+    # should have both input and weight observer after initializing
+    assert hasattr(layer, "input_observer")
+    assert hasattr(layer, "weight_observer")
+
+    assert hasattr(layer, "quantization_scheme")
+    assert hasattr(layer, "quantization_status")
+    assert layer.quantization_status == QuantizationStatus.INITIALIZED
+
+    set_module_for_calibration(layer)
+    assert layer.quantization_status == QuantizationStatus.CALIBRATION
+
+    # do a calibration step
+    assert torch.numel(layer.input_zero_point.data) == 0
+    assert torch.numel(layer.input_scale) == 0
+    assert torch.numel(layer.weight_scale) == 0
+    assert torch.numel(layer.weight_zero_point) == 0
+
+    layer(torch.randn(4, 4))
+
+    # zero-points and scale should be updated after forward pass
+    assert torch.numel(layer.input_zero_point.data) > 0
+    assert torch.numel(layer.input_scale) > 0
+    assert torch.numel(layer.weight_scale) > 0
+    assert torch.numel(layer.weight_zero_point) > 0
+
+    # symmetric zero points should center at 0
+    assert layer.weight_zero_point.data == 0
+
+    # check high and low bound of the weights
+    assert torch.all(layer.weight.data >= -128) and torch.all(layer.weight.data <= 127)
+
+    initialized_layer = deepcopy(layer)
+
+    # calibrate the layers with each iteration
+    for _ in range(10):
+        layer(torch.randn(4, 4))
+
+    assert initialized_layer.input_zero_point != layer.input_zero_point
+    assert initialized_layer.input_scale != layer.input_scale
+    assert initialized_layer.weight_scale != layer.weight_scale
+
+    # check that fake quantization f_q(x) stays fixed after freezing, with no updates
+    input_check_for_quant = torch.randn(4, 4)
+    out_calibration = layer(input_check_for_quant)
+
+    layer_before_freeze = deepcopy(layer)
+
+    # Freeze, no update after any forward pass
+    freeze_module_quantization(layer)
+
+    for _ in range(10):
+        layer(torch.randn(4, 4))
+    assert layer_before_freeze.input_zero_point == layer.input_zero_point
+    assert layer_before_freeze.input_scale == layer.input_scale
+    assert layer_before_freeze.weight_scale == layer.weight_scale
+
+    # check that the frozen layer applies the same quantization as during calibration
+    assert torch.all(out_calibration == layer(input_check_for_quant))

From 7ad03ab5003dcfa58ef65ffa53846411a149ae76 Mon Sep 17 00:00:00 2001
From: George
Date: Thu, 25 Apr 2024 12:51:37 -0400
Subject: [PATCH 06/10] Lifecycle/min max obs (#20)

* min max test

* add minmax obs

* test scale range and min_max update
---
 .../quantization/observers/test_min_max.py    | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 tests/sparsetensors/quantization/observers/test_min_max.py

diff --git a/tests/sparsetensors/quantization/observers/test_min_max.py b/tests/sparsetensors/quantization/observers/test_min_max.py
new file mode 100644
index 00000000..a5273d02
--- /dev/null
+++ b/tests/sparsetensors/quantization/observers/test_min_max.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pytest
+import torch
+from sparsetensors.quantization.quant_args import QuantizationArgs
+
+
+@pytest.mark.parametrize(
+    "symmetric,expected_scale,expected_zero_point",
+    [
+        (True, 0.0078, 0),
+        (False, 0.0039, 0),
+    ],
+)
+def test_min_max_observer(symmetric, expected_scale, expected_zero_point):
+    tensor = torch.tensor([1, 1, 1, 1, 1])
+    num_bits = 8
+    weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric)
+
+    observer = weights.get_observer()
+    scale, zero_point = observer(tensor)
+
+    assert round(scale.item(), 4) == expected_scale
+    assert round(zero_point.item(), 4) == expected_zero_point
+
+
+def test_min_max_observer_symmetric_scale_range():
+    tensor = torch.rand(4, 4)
+    tensor *= 127
+
+    num_bits = 8
+    weights = QuantizationArgs(num_bits=num_bits, symmetric=True)
+
+    observer = weights.get_observer()
+    scale, zero_point = observer(tensor)
+
+    # if symmetric, max symmetric_range = abs(-128) / 255
+    assert round(scale.item(), 4) <= 1.0039
+    assert round(zero_point.item(), 4) == 0
+
+
+def test_min_max_observer_value_update():
+    inp = torch.tensor([1, 1, 1, 1, 1])
+    inp_update_max = torch.tensor([127, 1, 1, 1, 1])
+    inp_update_min = torch.tensor([-128, 1, 1, 1, 1])
+
+    # update the min and max twice in total
+    tensors = [
+        inp,
+        inp,
+        inp_update_max,  # update max
+        inp,
+        inp_update_min,  # update min
+    ]
+
+    tensor = inp
+    num_bits = 8
+    weights = QuantizationArgs(num_bits=num_bits, symmetric=True)
+
+    observer = weights.get_observer()
+    curr_max = 1
+    curr_min = 1
+    for i, tensor in enumerate(tensors):
+        observer(tensor)
+        curr_max = max(observer.max_val, curr_max)
+        curr_min = min(observer.min_val, curr_min)
+
+        if i < 2:
+            assert curr_max == 1
+            assert curr_min == 1
+        elif i < 4:
+            assert curr_max == 43  # (127 + 2) / 3
+            assert curr_min == 1
+        else:
+            assert curr_max == 43
+            assert curr_min == -24.8  # (-128 + 4) / 5

From 89ca72cef1ff0139e436aed38b9452ca8a481e0d Mon Sep 17 00:00:00 2001
From: George Ohashi
Date: Fri, 26 Apr 2024 20:31:25 +0000
Subject: [PATCH 07/10] rebase
---
 README.md                                     |   2 +-
 setup.py                                      |   4 +-
 src/sparsetensors/README.md                   | 162 --------
 src/sparsetensors/__init__.py                 |  21 -
 src/sparsetensors/base.py                     |  15 -
 src/sparsetensors/compressors/__init__.py     | 
19 - src/sparsetensors/compressors/base.py | 73 ---- src/sparsetensors/compressors/dense.py | 31 -- .../compressors/sparse_bitmask.py | 233 ------------ src/sparsetensors/config/__init__.py | 18 - src/sparsetensors/config/base.py | 36 -- src/sparsetensors/config/dense.py | 36 -- src/sparsetensors/config/sparse_bitmask.py | 36 -- src/sparsetensors/quantization/__init__.py | 21 - .../quantization/lifecycle/__init__.py | 22 -- .../quantization/lifecycle/apply.py | 103 ----- .../quantization/lifecycle/calibration.py | 51 --- .../quantization/lifecycle/forward.py | 135 ------- .../quantization/lifecycle/frozen.py | 47 --- .../quantization/lifecycle/initialize.py | 94 ----- .../quantization/observers/__init__.py | 19 - .../quantization/observers/base.py | 69 ---- .../quantization/observers/memoryless.py | 61 --- .../quantization/observers/min_max.py | 79 ---- src/sparsetensors/quantization/quant_args.py | 85 ----- .../quantization/quant_config.py | 154 -------- .../quantization/quant_scheme.py | 39 -- .../quantization/utils/__init__.py | 16 - .../quantization/utils/helpers.py | 115 ------ src/sparsetensors/registry/__init__.py | 17 - src/sparsetensors/registry/registry.py | 360 ------------------ src/sparsetensors/utils/__init__.py | 17 - src/sparsetensors/utils/helpers.py | 45 --- src/sparsetensors/utils/safetensors_load.py | 196 ---------- tests/quantization/__init__.py | 13 - tests/quantization/lifecycle/__init__.py | 13 - tests/quantization/lifecycle/test_apply.py | 140 ------- tests/quantization/test_quant_args.py | 55 --- tests/quantization/test_quant_config.py | 60 --- tests/quantization/test_quant_scheme.py | 51 --- .../quantization/lifecycle/conftest.py | 37 -- .../quantization/lifecycle/test_forward.py | 80 ---- .../quantization/lifecycle/test_frozen.py | 47 --- .../quantization/lifecycle/test_initialize.py | 79 ---- .../quantization/lifecycle/test_lifecycle.py | 117 ------ .../quantization/observers/test_min_max.py | 89 ----- tests/test_bitmask.py | 2 +- tests/test_registry.py | 2 +- 48 files changed, 5 insertions(+), 3211 deletions(-) delete mode 100644 src/sparsetensors/README.md delete mode 100644 src/sparsetensors/__init__.py delete mode 100644 src/sparsetensors/base.py delete mode 100644 src/sparsetensors/compressors/__init__.py delete mode 100644 src/sparsetensors/compressors/base.py delete mode 100644 src/sparsetensors/compressors/dense.py delete mode 100644 src/sparsetensors/compressors/sparse_bitmask.py delete mode 100644 src/sparsetensors/config/__init__.py delete mode 100644 src/sparsetensors/config/base.py delete mode 100644 src/sparsetensors/config/dense.py delete mode 100644 src/sparsetensors/config/sparse_bitmask.py delete mode 100644 src/sparsetensors/quantization/__init__.py delete mode 100644 src/sparsetensors/quantization/lifecycle/__init__.py delete mode 100644 src/sparsetensors/quantization/lifecycle/apply.py delete mode 100644 src/sparsetensors/quantization/lifecycle/calibration.py delete mode 100644 src/sparsetensors/quantization/lifecycle/forward.py delete mode 100644 src/sparsetensors/quantization/lifecycle/frozen.py delete mode 100644 src/sparsetensors/quantization/lifecycle/initialize.py delete mode 100644 src/sparsetensors/quantization/observers/__init__.py delete mode 100644 src/sparsetensors/quantization/observers/base.py delete mode 100644 src/sparsetensors/quantization/observers/memoryless.py delete mode 100644 src/sparsetensors/quantization/observers/min_max.py delete mode 100644 src/sparsetensors/quantization/quant_args.py delete mode 100644 
src/sparsetensors/quantization/quant_config.py delete mode 100644 src/sparsetensors/quantization/quant_scheme.py delete mode 100644 src/sparsetensors/quantization/utils/__init__.py delete mode 100644 src/sparsetensors/quantization/utils/helpers.py delete mode 100644 src/sparsetensors/registry/__init__.py delete mode 100644 src/sparsetensors/registry/registry.py delete mode 100644 src/sparsetensors/utils/__init__.py delete mode 100644 src/sparsetensors/utils/helpers.py delete mode 100644 src/sparsetensors/utils/safetensors_load.py delete mode 100644 tests/quantization/__init__.py delete mode 100644 tests/quantization/lifecycle/__init__.py delete mode 100644 tests/quantization/lifecycle/test_apply.py delete mode 100644 tests/quantization/test_quant_args.py delete mode 100644 tests/quantization/test_quant_config.py delete mode 100644 tests/quantization/test_quant_scheme.py delete mode 100644 tests/sparsetensors/quantization/lifecycle/conftest.py delete mode 100644 tests/sparsetensors/quantization/lifecycle/test_forward.py delete mode 100644 tests/sparsetensors/quantization/lifecycle/test_frozen.py delete mode 100644 tests/sparsetensors/quantization/lifecycle/test_initialize.py delete mode 100644 tests/sparsetensors/quantization/lifecycle/test_lifecycle.py delete mode 100644 tests/sparsetensors/quantization/observers/test_min_max.py diff --git a/README.md b/README.md index 53d7e9bd..fe71acb8 100644 --- a/README.md +++ b/README.md @@ -1 +1 @@ -# sparsetensors +# compressed_tensors diff --git a/setup.py b/setup.py index 303f14ad..959180b1 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ def _setup_packages() -> List: return find_packages( - "src", include=["sparsetensors", "sparsetensors.*"], exclude=["*.__pycache__.*"] + "src", include=["compressed_tensors", "compressed_tensors.*"], exclude=["*.__pycache__.*"] ) def _setup_install_requires() -> List: @@ -28,7 +28,7 @@ def _setup_extras() -> Dict: return {"dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0",]} setup( - name="sparsetensors", + name="compressed_tensors", version="0.3.0", author="Neuralmagic, Inc.", author_email="support@neuralmagic.com", diff --git a/src/sparsetensors/README.md b/src/sparsetensors/README.md deleted file mode 100644 index 833d1ec2..00000000 --- a/src/sparsetensors/README.md +++ /dev/null @@ -1,162 +0,0 @@ -# Save/Load Compressed SafeTensors - -## Motivation - -* Reduce disk space by saving in a compressed format for sparse models. Models in this compressed format will be loaded by vLLM for more efficient inference -* Set up the save/load architecture such that we can easily expand to additional compression formats in the future. 
The config should be human readable so users can understand the compression format at a quick glance
-
-## SafeTensors File Format
-
-For each parameter in the uncompressed state_dict, we store the following attributes
-needed for decompression in the compressed state_dict:
-
-* compressed tensor
-* bitmask
-* uncompressed shape
-* row offsets
-
-```python
-# dense
-{
-    PARAM_NAME: uncompressed_tensor
-}
-
-# compressed
-{
-    PARAM_NAME.compressed: compressed_tensor  # 1d tensor
-    PARAM_NAME.bitmask: value  # 2d bitmask tensor (nrows x (ncols / 8))
-    PARAM_NAME.shape: value  # uncompressed shape tensor
-    PARAM_NAME.row_offsets: value  # 1d offsets tensor
-}
-```
-
-Config information gets stored in the HF config file
-```json
-// config.json
-{
-    "sparsity_config": {
-        "format": "sparse_bitmask", // "dense_sparsity" for original tensor format
-
-        // informational
-        "sparsity_structure": "unstructured", // or 2:4, 8:16 etc...
-        "global_sparsity": "0.5"
-    }
-}
-```
-
-## Saving/Loading Interface
-
-Loading in a compressed model requires no interface changes
-
-```python
-from sparseml.transformers.utils import SparseAutoModelForCausalLM
-
-# should contain model.safetensors or model.safetensors.index.json
-model_path = "/PATH/TO/COMPRESSED_MODEL"
-
-model = SparseAutoModelForCausalLM.from_pretrained(
-    model_name_or_path=model_path,
-    **model_kwargs,
-)
-```
-
-Saving a compressed model with an explicitly provided compression config. The config
-is saved to the model's `config.json` file. **Note:** the model must have been
-initialized with SparseAutoModelForCausalLM.from_pretrained()
-
-```python
-from sparsetensors import BitmaskConfig
-
-output_dir = "/PATH/TO/SAVE/COMPRESSED_MODEL"
-sparsity_config = BitmaskConfig()
-
-model.save_pretrained(
-    save_directory=output_dir,
-    sparsity_config=sparsity_config,
-)
-```
-
-Saving a compressed model, inferring the config from the model attributes
-
-```python
-model.save_pretrained(
-    save_directory=output_dir,
-    save_compressed=True
-)
-```
-
-Saving a model in the dense format. If the model has at least 5% global sparsity a
-sparsity config will still be included in `config.json` with format `dense_sparsity`
-
-```python
-model.save_pretrained(
-    save_directory=output_dir
-)
-```
-
-Saving a model in the dense format, bypassing the sparsity config calculation. When the
-`skip_compression_stats` flag is set, no sparsity config will be written to
-`config.json`
-
-```python
-model.save_pretrained(
-    save_directory=output_dir,
-    skip_compression_stats=True
-)
-```
-
-## Enable Compression During One-Shot and Sparse Finetuning
-Models that are saved in a supported compressed format on disk will automatically be
-decompressed when loaded as input to `sparseml.transformers.oneshot` or
-`sparseml.transformers.train`
-
-To enable compression on save after oneshot or finetuning simply add the
-`save_compressed=True` argument to `sparseml.transformers.oneshot` or
-`sparseml.transformers.train`
-
-```python
-from sparseml.transformers import train
-
-train(
-    save_compressed=True,
-    model="neuralmagic/TinyLlama-1.1B-Chat-v1.0-pruned2.4",
-    recipe=RECIPE,
-    dataset=DATASET
-)
-```
-
-
-## Example Code
-
-Loads a 60% sparse model, compresses it using the inferred bitmask compression, then
-reloads the compressed model.
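Before the end-to-end example, here is a minimal hand-decompression sketch of the file format described above. It is not part of the library API; the file and parameter names (`model.safetensors`, `decoder.weight`) are placeholders for illustration.

```python
import numpy
import torch
from safetensors import safe_open

# read the stored attributes for one bitmask-compressed parameter
with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    values = f.get_tensor("decoder.weight.compressed")  # 1d non-zero values
    bitmask = f.get_tensor("decoder.weight.bitmask")    # packed bits, one byte per 8 cols
    shape = f.get_tensor("decoder.weight.shape").tolist()
    # row_offsets is only needed for random row access; full
    # decompression can ignore it

# unpack each bit back into a boolean mask of the original shape,
# then scatter the stored values into a zero tensor (row-major order)
bits = numpy.unpackbits(bitmask.numpy(), axis=-1, count=shape[-1], bitorder="little")
dense = torch.zeros(shape, dtype=values.dtype)
dense[torch.from_numpy(bits.astype(bool))] = values
```

The full end-to-end example: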
- -```python -from sparseml.transformers import SparseAutoModelForCausalLM -from sparseml.utils.pytorch.utils import measure_cuda_memory -import torch - -MODEL_PATH = "zoo:llama2-7b-open_platypus_orca_llama2_pretrain-pruned60" -OUTPUT_PATH = "./test_compress_output" -RECIPE = "zoo:llama2-7b-open_platypus_orca_llama2_pretrain-pruned60" - -torch.cuda.set_device(0) -with measure_cuda_memory() as m: - model = SparseAutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="cuda:0") -print(f"Load dense model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") - -sparsity_config = getattr(model,"sparsity_config", None) -print(f"Sparsity config before compression: {sparsity_config}") -with measure_cuda_memory() as m: - model.save_pretrained(OUTPUT_PATH, save_compressed=True) -print(f"Save compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") - -torch.cuda.set_device(1) -with measure_cuda_memory() as m: - model_again = SparseAutoModelForCausalLM.from_pretrained( - OUTPUT_PATH, device_map="cuda:1" - ) -print(f"Load compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") -sparsity_config = getattr(model_again,"sparsity_config", None) -print(f"Sparsity config after compression: {sparsity_config}") -``` diff --git a/src/sparsetensors/__init__.py b/src/sparsetensors/__init__.py deleted file mode 100644 index 0833dd42..00000000 --- a/src/sparsetensors/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .base import * - -# flake8: noqa -from .compressors import * -from .config import * -from .quantization import QuantizationConfig, QuantizationStatus -from .utils import * diff --git a/src/sparsetensors/base.py b/src/sparsetensors/base.py deleted file mode 100644 index f01a055f..00000000 --- a/src/sparsetensors/base.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -SPARSITY_CONFIG_NAME = "sparsity_config" diff --git a/src/sparsetensors/compressors/__init__.py b/src/sparsetensors/compressors/__init__.py deleted file mode 100644 index 1c7362eb..00000000 --- a/src/sparsetensors/compressors/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa - -from .base import ModelCompressor -from .dense import DenseCompressor -from .sparse_bitmask import BitmaskCompressor, BitmaskTensor diff --git a/src/sparsetensors/compressors/base.py b/src/sparsetensors/compressors/base.py deleted file mode 100644 index a82ce048..00000000 --- a/src/sparsetensors/compressors/base.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import operator -from typing import Dict, Generator, Tuple - -from sparsetensors.base import SPARSITY_CONFIG_NAME -from sparsetensors.config import CompressionConfig -from sparsetensors.registry import RegistryMixin -from torch import Tensor -from torch.nn import Module, Parameter -from tqdm import tqdm - - -__all__ = ["ModelCompressor"] - - -class ModelCompressor(RegistryMixin): - """ - Base class representing a model compression algorithm. - - :param config: config specifying compression parameters - """ - - def __init__(self, config: CompressionConfig): - self.config = config - - def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: - """ - Compresses a dense state dict - - :param model_state: state dict of uncompressed model - :return: compressed state dict - """ - raise NotImplementedError() - - def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]: - """ - Reads a compressed state dict located at model_path and returns a - generator for sequentially decompressing back to a dense state dict - - :param model_path: path to compressed safetensors model - :return: compressed state dict - """ - raise NotImplementedError() - - def overwrite_weights(self, model_path: str, model: Module): - """ - Overwrites the weights in model with weights decompressed from model_path - - :param model_path: path to compressed weights - :param model: pytorch model to load decompressed weights into - """ - dense_gen = self.decompress(model_path) - for name, data in tqdm(dense_gen, desc="Decompressing model"): - # loading the decompressed weights into the model - model_device = operator.attrgetter(name)(model).device - data_new = Parameter(data.to(model_device)) - data_old = operator.attrgetter(name)(model) - data_old.data = data_new.data - - setattr(model, SPARSITY_CONFIG_NAME, self.config) diff --git a/src/sparsetensors/compressors/dense.py b/src/sparsetensors/compressors/dense.py deleted file mode 100644 index 87f112ac..00000000 --- a/src/sparsetensors/compressors/dense.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. 
All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Dict, Generator, Tuple
-
-from sparsetensors.compressors import ModelCompressor
-from torch import Tensor
-
-
-@ModelCompressor.register(name="dense_sparsity")
-class DenseCompressor(ModelCompressor):
-    """
-    Identity compressor for dense models, returns the original state_dict
-    """
-
-    def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]:
-        return model_state
-
-    def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]:
-        return iter([])
diff --git a/src/sparsetensors/compressors/sparse_bitmask.py b/src/sparsetensors/compressors/sparse_bitmask.py
deleted file mode 100644
index 3043e43b..00000000
--- a/src/sparsetensors/compressors/sparse_bitmask.py
+++ /dev/null
@@ -1,233 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from typing import Dict, Generator, List, Tuple, Union
-
-import numpy
-import torch
-from safetensors import safe_open
-from sparsetensors.compressors import ModelCompressor
-from sparsetensors.utils import get_nested_weight_mappings, merge_names
-from torch import Tensor
-from tqdm import tqdm
-
-
-__all__ = [
-    "BitmaskCompressor",
-    "BitmaskTensor",
-    "bitmask_compress",
-    "bitmask_decompress",
-    "pack_bitmasks",
-    "unpack_bitmasks",
-]
-
-_LOGGER: logging.Logger = logging.getLogger(__name__)
-
-
-@ModelCompressor.register(name="sparse_bitmask")
-class BitmaskCompressor(ModelCompressor):
-    """
-    Compression for sparse models using bitmasks. Non-zero weights are stored in a 1d
-    values tensor, with their locations stored in a 2d bitmask
-    """
-
-    COMPRESSION_PARAM_NAMES = ["shape", "compressed", "bitmask", "row_offsets"]
-
-    def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]:
-        """
-        Compresses a dense state dict using bitmask compression
-
-        :param model_state: state dict of uncompressed model
-        :return: compressed state dict
-        """
-        compressed_dict = {}
-        _LOGGER.debug(
-            f"Compressing model with {len(model_state)} parameterized layers..."
-        )
-        for name, value in tqdm(model_state.items(), desc="Compressing model"):
-            bitmask_tensor = BitmaskTensor.from_dense(value)
-            bitmask_dict = bitmask_tensor.dict(name_prefix=name, device="cpu")
-            for key in bitmask_dict.keys():
-                if key in compressed_dict:
-                    _LOGGER.warning(
-                        f"Expected all compressed state_dict keys to be unique, but "
-                        f"found an existing entry for {key}.
The existing entry will " - "be replaced." - ) - compressed_dict |= bitmask_dict - - return compressed_dict - - def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]: - """ - Reads a bitmask compressed state dict located at model_path and returns a - generator for sequentially decompressing back to a dense state dict - - :param model_path: path to compressed safetensors model - :return: iterator for generating decompressed weights - """ - weight_mappings = get_nested_weight_mappings( - model_path, self.COMPRESSION_PARAM_NAMES - ) - for weight_name in weight_mappings.keys(): - weight_data = {} - for param_name, safe_path in weight_mappings[weight_name].items(): - full_name = merge_names(weight_name, param_name) - with safe_open(safe_path, framework="pt", device="cpu") as f: - weight_data[param_name] = f.get_tensor(full_name) - data = BitmaskTensor(**weight_data) - decompressed = data.decompress() - yield weight_name, decompressed - - -class BitmaskTensor: - """ - Owns compressions and decompression for a single bitmask compressed tensor. - Adapted from: https://github.com/mgoin/torch_bitmask/tree/main - - :param shape: shape of dense tensor - :compressed: flat tensor of non-zero values - :bitmask: 2d bitmask of non-zero values - :row_offsets: flat tensor indicating what index in values each dense row starts at - """ - - def __init__( - self, - shape: Union[torch.Size, List], - compressed: Tensor, - bitmask: Tensor, - row_offsets: Tensor, - ): - self.shape = list(shape) - self.compressed = compressed - self.bitmask = bitmask - self.row_offsets = row_offsets - - @staticmethod - def from_dense(tensor: Tensor) -> "BitmaskTensor": - """ - :param tensor: dense tensor to compress - :return: instantiated compressed tensor - """ - shape = tensor.shape - compressed, bitmask, row_offsets = bitmask_compress(tensor.cpu()) - return BitmaskTensor( - shape=shape, compressed=compressed, bitmask=bitmask, row_offsets=row_offsets - ) - - def decompress(self) -> Tensor: - """ - :return: reconstructed dense tensor - """ - return bitmask_decompress(self.compressed, self.bitmask, self.shape) - - def curr_memory_size_bytes(self): - """ - :return: size in bytes required to store compressed tensor on disk - """ - - def sizeof_tensor(a): - return a.element_size() * a.nelement() - - return ( - sizeof_tensor(self.compressed) - + sizeof_tensor(self.bitmask) - + sizeof_tensor(self.row_offsets) - ) - - def dict(self, name_prefix: str, device: str = "cpu") -> Dict[str, Tensor]: - """ - :name_prefix: name of original tensor to store compressed weight as - :return: dict of compressed data for the stored weight - """ - return { - merge_names(name_prefix, "shape"): torch.tensor(self.shape, device=device), - merge_names(name_prefix, "compressed"): self.compressed.to(device), - merge_names(name_prefix, "bitmask"): self.bitmask.to(device), - merge_names(name_prefix, "row_offsets"): self.row_offsets.to(device), - } - - def __repr__(self): - return f"BitmaskTensor(shape={self.shape}, compressed=True)" - - -def bitmask_compress(tensor: Tensor) -> Tuple[Tensor, Tensor, Tensor]: - """ - Compresses a dense tensor using bitmask compression - - :param tensor: dense tensor to compress - :return: tuple of compressed data representing tensor - """ - bytemasks = tensor != 0 - row_counts = bytemasks.sum(dim=-1) - row_offsets = torch.cumsum(row_counts, 0) - row_counts - values = tensor[bytemasks] - bitmasks_packed = pack_bitmasks(bytemasks) - - return values, bitmasks_packed, row_offsets - - -def bitmask_decompress( 
-    values: Tensor, bitmasks: Tensor, original_shape: torch.Size
-) -> Tensor:
-    """
-    Reconstructs a dense tensor from a compressed one
-
-    :param values: 1d tensor of non-zero values
-    :param bitmasks: 2d int8 tensor flagging locations of non-zero values in the
-        tensor's original shape
-    :param original_shape: shape of the dense tensor
-    :return: decompressed dense tensor
-    """
-    bytemasks_unpacked = unpack_bitmasks(bitmasks, original_shape)
-
-    decompressed_tensor = torch.zeros(original_shape, dtype=values.dtype)
-    decompressed_tensor[bytemasks_unpacked] = values
-
-    return decompressed_tensor
-
-
-def pack_bitmasks(bytemasks: Tensor) -> Tensor:
-    """
-    Converts a bytemask tensor to a bitmask tensor to reduce memory. Shape RxC will be
-    compressed to R x ceil(C/8)
-    :param bytemasks: mask tensor where each byte corresponds to a weight
-    :return: mask tensor where each bit corresponds to a weight
-    """
-    packed_bits_numpy = numpy.packbits(bytemasks.numpy(), axis=-1, bitorder="little")
-    packed_bits_torch = torch.from_numpy(packed_bits_numpy)
-
-    return packed_bits_torch
-
-
-def unpack_bitmasks(packed_bitmasks: Tensor, original_shape: torch.Size) -> Tensor:
-    """
-    Converts a bitmask tensor back to a bytemask tensor for use during decompression
-
-    :param packed_bitmasks: mask tensor where each bit corresponds to a weight
-    :param original_shape: dense shape to decompress to
-    :return: boolean mask of weights in the original dense shape
-    """
-    # Unpack the bits
-    unpacked_bits = numpy.unpackbits(
-        packed_bitmasks.numpy(), axis=-1, count=original_shape[-1], bitorder="little"
-    )
-
-    # Reshape to match the original shape
-    unpacked_bitmasks_torch = torch.from_numpy(
-        unpacked_bits.reshape(original_shape).astype(bool)
-    )
-
-    return unpacked_bitmasks_torch
diff --git a/src/sparsetensors/config/__init__.py b/src/sparsetensors/config/__init__.py
deleted file mode 100644
index ff83f5af..00000000
--- a/src/sparsetensors/config/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# flake8: noqa
-from .base import *
-from .dense import *
-from .sparse_bitmask import *
diff --git a/src/sparsetensors/config/base.py b/src/sparsetensors/config/base.py
deleted file mode 100644
index 8af48bd9..00000000
--- a/src/sparsetensors/config/base.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
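An aside on the `pack_bitmasks`/`unpack_bitmasks` helpers in `sparse_bitmask.py` above: a tiny round-trip sketch, runnable against the module as it stood before this rename (the annotated values are worked out by hand from the little-endian bit packing):

```python
import torch
from sparsetensors.compressors.sparse_bitmask import bitmask_compress, bitmask_decompress

t = torch.tensor([[0.0, 1.5, 0.0, -2.0],
                  [0.0, 0.0, 0.0, 3.0]])
values, bitmask, row_offsets = bitmask_compress(t)
# values      -> tensor([ 1.5000, -2.0000,  3.0000])   row-major non-zeros
# bitmask     -> one packed byte per row: 0b00001010 (cols 1, 3) and 0b00001000 (col 3)
# row_offsets -> tensor([0, 2]); row i's values start at values[row_offsets[i]]
assert torch.equal(bitmask_decompress(values, bitmask, t.shape), t)
```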
- -from typing import Optional - -from pydantic import BaseModel -from sparsetensors.registry import RegistryMixin - - -__all__ = ["CompressionConfig"] - - -class CompressionConfig(RegistryMixin, BaseModel): - """ - Base data class for storing compression parameters - - :param format: name of compression format - :param global_sparsity: average sparsity of the entire model - :param sparsity_structure: structure of the sparsity, such as - "unstructured", "2:4", "8:16" etc - """ - - format: str - global_sparsity: Optional[float] = 0.0 - sparsity_structure: Optional[str] = "unstructured" diff --git a/src/sparsetensors/config/dense.py b/src/sparsetensors/config/dense.py deleted file mode 100644 index a37be308..00000000 --- a/src/sparsetensors/config/dense.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional - -from sparsetensors.config import CompressionConfig - - -__all__ = ["DenseSparsityConfig"] - - -@CompressionConfig.register(name="dense_sparsity") -class DenseSparsityConfig(CompressionConfig): - """ - Identity configuration for storing a sparse model in - an uncompressed dense format - - :param global_sparsity: average sparsity of the entire model - :param sparsity_structure: structure of the sparsity, such as - "unstructured", "2:4", "8:16" etc - """ - - format: str = "dense_sparsity" - global_sparsity: Optional[float] = 0.0 - sparsity_structure: Optional[str] = "unstructured" diff --git a/src/sparsetensors/config/sparse_bitmask.py b/src/sparsetensors/config/sparse_bitmask.py deleted file mode 100644 index d17c6a1a..00000000 --- a/src/sparsetensors/config/sparse_bitmask.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Optional - -from sparsetensors.config.base import CompressionConfig - - -__all__ = ["BitmaskConfig"] - - -@CompressionConfig.register(name="sparse_bitmask") -class BitmaskConfig(CompressionConfig): - """ - Configuration for storing a sparse model using - bitmask compression - - :param global_sparsity: average sparsity of the entire model - :param sparsity_structure: structure of the sparsity, such as - "unstructured", "2:4", "8:16" etc - """ - - format: str = "sparse_bitmask" - global_sparsity: Optional[float] = 0.0 - sparsity_structure: Optional[str] = "unstructured" diff --git a/src/sparsetensors/quantization/__init__.py b/src/sparsetensors/quantization/__init__.py deleted file mode 100644 index 9fde69a3..00000000 --- a/src/sparsetensors/quantization/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -# isort: skip_file - -from .quant_args import * -from .quant_config import * -from .quant_scheme import * -from .lifecycle import * diff --git a/src/sparsetensors/quantization/lifecycle/__init__.py b/src/sparsetensors/quantization/lifecycle/__init__.py deleted file mode 100644 index 9504597b..00000000 --- a/src/sparsetensors/quantization/lifecycle/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -# isort: skip_file - -from .calibration import * -from .forward import * -from .frozen import * -from .initialize import * -from .apply import * diff --git a/src/sparsetensors/quantization/lifecycle/apply.py b/src/sparsetensors/quantization/lifecycle/apply.py deleted file mode 100644 index ac238564..00000000 --- a/src/sparsetensors/quantization/lifecycle/apply.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
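A quick sketch of how the registered config classes above serialize into the `sparsity_config` block shown in the README, via pydantic's `.dict()` (pre-rename import path, `format` filled in by the class default):

```python
from sparsetensors.config import BitmaskConfig

config = BitmaskConfig(global_sparsity=0.5, sparsity_structure="unstructured")
print(config.dict())
# {'format': 'sparse_bitmask', 'global_sparsity': 0.5,
#  'sparsity_structure': 'unstructured'}
```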
- -import re -from collections import OrderedDict -from typing import Iterable, Optional - -from sparsetensors.quantization.lifecycle.calibration import set_module_for_calibration -from sparsetensors.quantization.lifecycle.frozen import freeze_module_quantization -from sparsetensors.quantization.lifecycle.initialize import ( - initialize_module_for_quantization, -) -from sparsetensors.quantization.quant_config import ( - QuantizationConfig, - QuantizationStatus, -) -from sparsetensors.quantization.utils import iter_named_leaf_modules -from torch.nn import Module - - -__all__ = [ - "apply_quantization_config", - "apply_quantization_status", -] - - -def apply_quantization_config(model: Module, config: QuantizationConfig): - """ - Initializes the model for quantization in-place based on the given config - - :param model: model to apply quantization config to - :param config: quantization config - """ - # build mapping of targets to schemes for easier matching - # use ordered dict to preserve target ordering in config - target_to_scheme = OrderedDict() - for scheme in config.config_groups.values(): - for target in scheme.targets: - target_to_scheme[target] = scheme - - # mark appropriate layers for quantization by setting their quantization schemes - for name, submodule in iter_named_leaf_modules(model): - if _find_first_name_or_class_match(name, submodule, config.ignore): - continue # layer matches ignore list, continue - target = _find_first_name_or_class_match(name, submodule, target_to_scheme) - if target is not None: - # target matched - add layer and scheme to target list - submodule.quantization_scheme = target_to_scheme[target] - - # apply current quantization status across all targeted layers - apply_quantization_status(model, config.quantization_status) - - -def apply_quantization_status(model: Module, status: QuantizationStatus): - """ - Applies in place the quantization lifecycle up to the given status - - :param model: model to apply quantization to - :param status: status to update the module to - """ - if status >= QuantizationStatus.INITIALIZED: - model.apply(initialize_module_for_quantization) - if status >= QuantizationStatus.CALIBRATION: - model.apply(set_module_for_calibration) - if status >= QuantizationStatus.FROZEN: - model.apply(freeze_module_quantization) - - -def _find_first_name_or_class_match( - name: str, - module: Module, - targets: Iterable[str], -) -> Optional[str]: - # first element of targets that matches the given name - # if no name matches returns first target that matches the class name - # returns None otherwise - return _find_first_match(name, targets) or _find_first_match( - module.__class__.__name__, targets - ) - - -def _find_first_match(value: str, targets: Iterable[str]) -> Optional[str]: - # returns first element of target that matches value either - # exactly or as a regex after 're:' - for target in targets: - if target.startswith("re:"): - pattern = target[3:] - if re.match(pattern, value): - return target - elif target == value: - return target - return None diff --git a/src/sparsetensors/quantization/lifecycle/calibration.py b/src/sparsetensors/quantization/lifecycle/calibration.py deleted file mode 100644 index 51c594fb..00000000 --- a/src/sparsetensors/quantization/lifecycle/calibration.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import logging
-
-from sparsetensors.quantization.quant_config import QuantizationStatus
-from torch.nn import Module
-
-
-__all__ = [
-    "set_module_for_calibration",
-]
-
-
-_LOGGER = logging.getLogger(__name__)
-
-
-def set_module_for_calibration(module: Module):
-    """
-    Marks a layer as ready for calibration, which activates observers
-    to update scales and zero points on each forward pass
-
-    apply to full model with `model.apply(set_module_for_calibration)`
-
-    :param module: module to set for calibration
-    """
-    if not getattr(module, "quantization_scheme", None):
-        # no quantization scheme nothing to do
-        return
-    status = getattr(module, "quantization_status", None)
-    if not status or status != QuantizationStatus.INITIALIZED:
-        # warn rather than raise so the caller can still attempt calibration
-        _LOGGER.warning(
-            f"Attempting to set module with status {status} to calibration mode, "
-            f"but status is not {QuantizationStatus.INITIALIZED} - you may "
-            "be calibrating an uninitialized module, which may fail, or attempting "
-            "to re-calibrate a frozen module"
-        )
-
-    module.quantization_status = QuantizationStatus.CALIBRATION
diff --git a/src/sparsetensors/quantization/lifecycle/forward.py b/src/sparsetensors/quantization/lifecycle/forward.py
deleted file mode 100644
index 5e6036ea..00000000
--- a/src/sparsetensors/quantization/lifecycle/forward.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
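The calibration helper above is one stage of the quantization lifecycle. A minimal sketch of walking a single layer through the stages in order, using only APIs removed in this patch (layer size and input are illustrative):

```python
import torch
from sparsetensors.quantization.lifecycle.calibration import set_module_for_calibration
from sparsetensors.quantization.lifecycle.frozen import freeze_module_quantization
from sparsetensors.quantization.lifecycle.initialize import (
    initialize_module_for_quantization,
)
from sparsetensors.quantization.quant_args import QuantizationArgs
from sparsetensors.quantization.quant_scheme import QuantizationScheme
from torch.nn import Linear

scheme = QuantizationScheme(
    targets=["Linear"],
    weights=QuantizationArgs(num_bits=8, symmetric=True),
    input_activations=QuantizationArgs(num_bits=8, symmetric=False),
)
layer = Linear(4, 4)

initialize_module_for_quantization(layer, scheme)  # attach scales/zero points/observers
set_module_for_calibration(layer)  # observers now update on every forward pass
layer(torch.randn(1, 4))  # calibration forward pass
freeze_module_quantization(layer)  # delete observers, lock quantization params
```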
-
-from functools import wraps
-
-import torch
-from sparsetensors.quantization.quant_args import QuantizationArgs
-from sparsetensors.quantization.quant_config import QuantizationStatus
-from sparsetensors.quantization.quant_scheme import QuantizationScheme
-from torch.nn import Module
-
-
-__all__ = [
-    "wrap_module_forward_quantized",
-    "quantize",
-    "dequantize",
-    "fake_quantize",
-    "maybe_calibrate_or_quantize",
-]
-
-
-@torch.no_grad()
-def quantize(
-    x: torch.Tensor,
-    scale: torch.Tensor,
-    zero_point: torch.Tensor,
-    q_max: torch.Tensor,
-) -> torch.Tensor:
-    # affine quantization: q = clamp(round(x / scale + zero_point), 0, q_max)
-    return torch.clamp(
-        torch.round(
-            x / scale + zero_point,
-        ),
-        0,
-        q_max,
-    )
-
-
-@torch.no_grad()
-def dequantize(
-    x_q: torch.Tensor,
-    scale: torch.Tensor,
-    zero_point: torch.Tensor,
-) -> torch.Tensor:
-    # inverse of quantize: x = (q - zero_point) * scale
-    return (x_q - zero_point) * scale
-
-
-@torch.no_grad()
-def fake_quantize(
-    x: torch.Tensor,
-    scale: torch.Tensor,
-    zero_point: torch.Tensor,
-    args: QuantizationArgs,
-) -> torch.Tensor:
-    # quantize then dequantize to simulate quantization error in full precision
-    max_q = torch.tensor(2**args.num_bits - 1, device=x.device)
-    Q = quantize(x, scale, zero_point, max_q)
-    return dequantize(Q, scale, zero_point)
-
-
-def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
-    # expects a module already initialized and injected with the parameters in
-    # initialize_module_for_quantization
-    forward_func_orig = module.forward.__func__
-
-    @wraps(forward_func_orig)  # ensures docstring, names, etc are propagated
-    def wrapped_forward(self, *args, **kwargs):
-        input_ = args[0]
-
-        if scheme.input_activations is not None:
-            # calibrate and (fake) quantize input activations when applicable
-            input_ = maybe_calibrate_or_quantize(
-                module, input_, "input", scheme.input_activations
-            )
-
-        if scheme.weights is not None:
-            # calibrate and (fake) quantize weights when applicable
-            self.weight.data = maybe_calibrate_or_quantize(
-                module, self.weight, "weight", scheme.weights
-            )
-
-        # perform wrapped forward call
-        output = forward_func_orig.__get__(module, module.__class__)(
-            input_, *args[1:], **kwargs
-        )
-
-        if scheme.output_activations is not None:
-            # calibrate and (fake) quantize output activations when applicable
-            output = maybe_calibrate_or_quantize(
-                module, output, "output", scheme.output_activations
-            )
-
-        return output
-
-    # bind wrapped forward to module class so reference to `self` is correct
-    bound_wrapped_forward = wrapped_forward.__get__(module, module.__class__)
-    # set forward to wrapped forward
-    setattr(module, "forward", bound_wrapped_forward)
-
-
-def maybe_calibrate_or_quantize(
-    module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs"
-) -> torch.Tensor:
-    # only run quantized for the included stages
-    if module.quantization_status not in {
-        QuantizationStatus.CALIBRATION,
-        QuantizationStatus.FROZEN,
-    }:
-        return value
-
-    device = next(module.parameters()).device
-    scale = getattr(module, f"{base_name}_scale")
-    zero_point = getattr(module, f"{base_name}_zero_point")
-
-    if module.quantization_status == QuantizationStatus.CALIBRATION:
-        # get observer and get new quant params from observation
-        observer = getattr(module, f"{base_name}_observer")
-        updated_scale, updated_zero_point = observer(value)
-
-        # update scale and zero point
-        scale.data = updated_scale.to(device)
-        zero_point.data = updated_zero_point.to(device)
-
-    return fake_quantize(value, scale, zero_point, args)
diff --git 
a/src/sparsetensors/quantization/lifecycle/frozen.py b/src/sparsetensors/quantization/lifecycle/frozen.py
deleted file mode 100644
index 9715a4b2..00000000
--- a/src/sparsetensors/quantization/lifecycle/frozen.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from sparsetensors.quantization.quant_config import QuantizationStatus
-from torch.nn import Module
-
-
-__all__ = [
-    "freeze_module_quantization",
-]
-
-
-def freeze_module_quantization(module: Module):
-    """
-    Deletes observers so that static quantization is completed.
-
-    apply to full model with `model.apply(freeze_module_quantization)`
-
-    :param module: module to freeze quantization for
-    """
-    if not getattr(module, "quantization_scheme", None):
-        # no quantization scheme nothing to do
-        return
-
-    # delete observers from module
-    submodule_names_to_delete = set()
-    for submodule_name, _ in module.named_modules():
-        if "." not in submodule_name and submodule_name.endswith("_observer"):
-            # delete any observers that belong directly to this module
-            submodule_names_to_delete.add(submodule_name)
-
-    for submodule_name in submodule_names_to_delete:
-        delattr(module, submodule_name)
-
-    module.quantization_status = QuantizationStatus.FROZEN
diff --git a/src/sparsetensors/quantization/lifecycle/initialize.py b/src/sparsetensors/quantization/lifecycle/initialize.py
deleted file mode 100644
index aa6e3994..00000000
--- a/src/sparsetensors/quantization/lifecycle/initialize.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
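For reference, the affine round trip implemented by `quantize()`/`dequantize()` in forward.py above, worked with illustrative numbers (the scale and zero point are assumed, not produced by an observer):

```python
import torch

# q = clamp(round(x / scale + zero_point), 0, 2**num_bits - 1)
# x_hat = (q - zero_point) * scale
x = torch.tensor([-1.0, -0.25, 0.0, 0.5, 1.0])
scale = torch.tensor(2.0 / 255)  # assumed dynamic range [-1, 1] at 8 bits
zero_point = torch.tensor(128)
q_max = torch.tensor(255)

q = torch.clamp(torch.round(x / scale + zero_point), 0, q_max)
x_hat = (q - zero_point) * scale
assert torch.allclose(x, x_hat, atol=1e-2)  # error stays within one step
```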
-
-
-import logging
-from typing import Optional
-
-import torch
-from sparsetensors.quantization.lifecycle.forward import wrap_module_forward_quantized
-from sparsetensors.quantization.quant_args import QuantizationArgs
-from sparsetensors.quantization.quant_config import QuantizationStatus
-from sparsetensors.quantization.quant_scheme import QuantizationScheme
-from torch.nn import Module, Parameter
-
-
-__all__ = [
-    "initialize_module_for_quantization",
-]
-
-
-_LOGGER = logging.getLogger(__name__)
-
-
-def initialize_module_for_quantization(
-    module: Module,
-    scheme: Optional[QuantizationScheme] = None,
-):
-    """
-    Attaches appropriate scales, zero points, and observers to a layer
-    given its target quantization scheme
-
-    apply to full model with `model.apply(initialize_module_for_quantization)`
-
-    :param module: module to initialize for quantization
-    :param scheme: scheme to use for quantization. If None is provided,
-        will attempt to use the scheme stored in the module under
-        `quantization_scheme`; if no scheme is found there either, the
-        layer will be skipped
-    """
-    scheme = scheme or getattr(module, "quantization_scheme", None)
-    if scheme is None:
-        # no scheme passed and layer not targeted for quantization - skip
-        return
-
-    if scheme.input_activations is not None:
-        _initialize_scale_zero_point_observer(module, "input", scheme.input_activations)
-    if scheme.weights is not None:
-        if hasattr(module, "weight"):
-            _initialize_scale_zero_point_observer(module, "weight", scheme.weights)
-        else:
-            _LOGGER.warning(
-                f"module type {type(module)} targeted for weight quantization but "
-                "has no attribute weight, skipping weight quantization "
-                f"for {type(module)}"
-            )
-    if scheme.output_activations is not None:
-        _initialize_scale_zero_point_observer(
-            module, "output", scheme.output_activations
-        )
-
-    module.quantization_scheme = scheme
-    module.quantization_status = QuantizationStatus.INITIALIZED
-
-    # wrap forward call of module to perform quantized actions based on calltime status
-    wrap_module_forward_quantized(module, scheme)
-
-
-def _initialize_scale_zero_point_observer(
-    module: Module, base_name: str, quantization_args: QuantizationArgs
-):
-    device = next(module.parameters()).device
-
-    # initializes empty scale and zero point parameters for the module
-    init_scale = Parameter(torch.empty(0, device=device), requires_grad=False)
-    module.register_parameter(f"{base_name}_scale", init_scale)
-
-    init_zero_point = Parameter(
-        torch.empty(0, device=device, dtype=int), requires_grad=False
-    )
-    module.register_parameter(f"{base_name}_zero_point", init_zero_point)
-
-    # initialize observer module and attach as submodule
-    observer = quantization_args.get_observer()
-    module.register_module(f"{base_name}_observer", observer)
diff --git a/src/sparsetensors/quantization/observers/__init__.py b/src/sparsetensors/quantization/observers/__init__.py
deleted file mode 100644
index d0362b8f..00000000
--- a/src/sparsetensors/quantization/observers/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# flake8: noqa
-
-from .base import *
-from .memoryless import *
-from .min_max import *
diff --git a/src/sparsetensors/quantization/observers/base.py b/src/sparsetensors/quantization/observers/base.py
deleted file mode 100644
index 52a464b9..00000000
--- a/src/sparsetensors/quantization/observers/base.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Tuple
-
-from sparsetensors.quantization.quant_args import QuantizationArgs
-from sparsetensors.registry.registry import RegistryMixin
-from torch import FloatTensor, IntTensor, Tensor
-from torch.nn import Module
-
-
-__all__ = ["Observer"]
-
-
-class Observer(Module, RegistryMixin):
-    """
-    Base Observer class to be subclassed for specific implementation.
-    Subclasses should override `calculate_qparams` to return a scale, zero_point
-    pair
-    """
-
-    def __init__(self, quantization_args: QuantizationArgs):
-        self.quantization_args: QuantizationArgs = quantization_args
-        super().__init__()
-        self._scale = None
-        self._zero_point = None
-
-    def forward(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
-        """
-        maps directly to get_qparams
-        :param observed: optional observed tensor to calculate quantization parameters
-            from
-        :return: tuple of scale and zero point based on last observed value
-        """
-        return self.get_qparams(observed=observed)
-
-    def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
-        """
-        :param observed: observed tensor to calculate quantization parameters for
-        :return: tuple of scale and zero point derived from the observed tensor
-        """
-        raise NotImplementedError(f"{self.__class__} must implement calculate_qparams")
-
-    def get_qparams(
-        self, observed: Optional[Tensor] = None
-    ) -> Tuple[FloatTensor, IntTensor]:
-        """
-        Convenience function to wrap overwritten calculate_qparams
-        adds support to make observed tensor optional and support for tracking latest
-        calculated scale and zero point
-        :param observed: optional observed tensor to calculate quantization parameters
-            from
-        :return: tuple of scale and zero point based on last observed value
-        """
-        if observed is not None:
-            # re-calculate scale and zero point, update the stored value
-            self._scale, self._zero_point = self.calculate_qparams(observed)
-        return self._scale, self._zero_point
diff --git a/src/sparsetensors/quantization/observers/memoryless.py b/src/sparsetensors/quantization/observers/memoryless.py
deleted file mode 100644
index 5fd92a6e..00000000
--- a/src/sparsetensors/quantization/observers/memoryless.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Tuple - -import torch -from sparsetensors.quantization.observers.base import Observer -from torch import FloatTensor, IntTensor, Tensor - - -__all__ = ["MemorylessObserver"] - - -@Observer.register("memoryless") -class MemorylessObserver(Observer): - """ - Implements a dynamic quantization observer that sets the scale and - zero point based on the latest observed value - """ - - def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]: - """ - :param observed: observed tensor to calculate quantization parameters for - :return: tuple of scale and zero point derived from the observed tensor - """ - # TODO: Add support for full range of quantization Args, only supports 8bit - # per tensor - bit_range = 255 - min_val = observed.min() - max_val = observed.max() - - # ensure zero is in the range - min_val = torch.min(min_val, torch.zeros_like(min_val)) - max_val = torch.max(max_val, torch.zeros_like(max_val)) - - if self.quantization_args.symmetric: - symmetric_range = 2 * max(min_val.abs(), max_val.abs()) - scale = symmetric_range / bit_range - zero_point = torch.tensor(0).to(torch.int8) - else: - # non-symmetric - observed_range = max_val - min_val - scale = observed_range / bit_range - - # scales from a 0 range should be set to 1 - scale[observed_range == 0] = 1 - - zero_point = ((0 - min_val) / scale).to(torch.int8) - - return scale, zero_point diff --git a/src/sparsetensors/quantization/observers/min_max.py b/src/sparsetensors/quantization/observers/min_max.py deleted file mode 100644 index e73805b4..00000000 --- a/src/sparsetensors/quantization/observers/min_max.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
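A quick usage sketch of the memoryless observer above; the observed tensor is illustrative, and every call recomputes the quantization params from only that tensor:

```python
import torch
from sparsetensors.quantization.observers.memoryless import MemorylessObserver
from sparsetensors.quantization.quant_args import QuantizationArgs

observer = MemorylessObserver(
    quantization_args=QuantizationArgs(num_bits=8, symmetric=False)
)
scale, zero_point = observer(torch.randn(64, 64))  # no state kept between calls
```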
-
-from typing import Tuple
-
-import torch
-from sparsetensors.quantization.observers.base import Observer
-from sparsetensors.quantization.quant_args import QuantizationArgs
-from torch import FloatTensor, IntTensor, Tensor
-
-
-__all__ = ["MinMaxObserver"]
-
-
-@Observer.register("minmax")
-class MinMaxObserver(Observer):
-    """
-    Implements a quantization observer that calculates scale and zero point based
-    on a running average of the min and max values observed across forward passes
-    """
-
-    def __init__(self, quantization_args: QuantizationArgs):
-        super().__init__(quantization_args=quantization_args)
-
-        self.min_val = float("inf")
-        self.max_val = -float("inf")
-        self.counter = 0
-
-    def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
-        """
-        :param observed: observed tensor to calculate quantization parameters for
-        :return: tuple of scale and zero point derived from the observed tensor
-        """
-        # TODO: Add support for full range of quantization Args, only supports 8bit
-        #       per tensor
-        bit_range = 255
-        min_val = torch.tensor([observed.min()])
-        max_val = torch.tensor([observed.max()])
-
-        # update running average
-        if self.counter > 0:
-            self.min_val = (self.min_val * self.counter + min_val) / (self.counter + 1)
-            self.max_val = (self.max_val * self.counter + max_val) / (self.counter + 1)
-        else:
-            self.min_val = min_val
-            self.max_val = max_val
-
-        # ensure that the zeros are in the range
-        min_val = torch.min(self.min_val, torch.zeros_like(self.min_val))
-        max_val = torch.max(self.max_val, torch.zeros_like(self.max_val))
-
-        self.counter += 1
-
-        if self.quantization_args.symmetric:
-            symmetric_range = 2 * max(min_val.abs(), max_val.abs())
-            scale = symmetric_range / bit_range
-            zero_point = torch.tensor(0).to(torch.int8)
-        else:
-            # non-symmetric
-            observed_range = max_val - min_val
-            scale = observed_range / bit_range
-
-            # scales from a 0 range should be set to 1
-            scale[observed_range == 0] = 1
-
-            zero_point = ((0 - min_val) / scale).to(torch.int8)
-
-        return scale, zero_point
diff --git a/src/sparsetensors/quantization/quant_args.py b/src/sparsetensors/quantization/quant_args.py
deleted file mode 100644
index 76bd61f0..00000000
--- a/src/sparsetensors/quantization/quant_args.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
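Unlike the memoryless observer, the min-max observer above averages the observed min and max across calls; a short sketch with illustrative inputs:

```python
import torch
from sparsetensors.quantization.observers.min_max import MinMaxObserver
from sparsetensors.quantization.quant_args import QuantizationArgs

observer = MinMaxObserver(
    quantization_args=QuantizationArgs(num_bits=8, symmetric=True)
)
for _ in range(4):
    # min_val/max_val track a running average across observed batches
    scale, zero_point = observer(torch.randn(16, 16))
```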
-
-from enum import Enum
-from typing import Any, Dict, Optional
-
-from pydantic import BaseModel, Field
-
-
-__all__ = ["QuantizationType", "QuantizationStrategy", "QuantizationArgs"]
-
-
-class QuantizationType(str, Enum):
-    """
-    Enum storing quantization type options
-    """
-
-    INT = "int"
-    FLOAT = "float"
-
-
-class QuantizationStrategy(str, Enum):
-    """
-    Enum storing quantization strategy options
-    """
-
-    TENSOR = "tensor"
-    CHANNEL = "channel"
-    GROUP = "group"
-    BLOCK = "block"
-
-
-class QuantizationArgs(BaseModel):
-    """
-    User facing arguments used to define a quantization config for weights or
-    activations
-
-    :param num_bits: quantization bit depth
-    :param type: dtype to quantize to, either int or float
-    :param symmetric: whether or not quantization scale is symmetric about zero-point
-    :param strategy: string id determining the scope of scale/zero-point to apply
-    :param group_size: group length to use for the group strategy
-    :param block_structure: 2d block structure to use for the block strategy, must be
-        of the format "2x4", "8x16", etc.
-    """
-
-    num_bits: int = 8
-    type: QuantizationType = QuantizationType.INT
-    symmetric: bool = True
-    strategy: QuantizationStrategy = QuantizationStrategy.TENSOR
-    group_size: Optional[int] = None
-    block_structure: Optional[str] = None
-    observer: str = Field(
-        default="minmax",
-        description=(
-            "The class to use to compute the quantization params - "
-            "scale and zero-point"
-        ),
-    )
-    observer_kwargs: Dict[str, Any] = Field(
-        default_factory=dict,
-        description=(
-            "optional dict of kwargs to be passed directly to the Observer "
-            "constructor excluding quantization range or symmetry"
-        ),
-    )
-
-    def get_observer(self):
-        """
-        :return: Observer built based on these QuantizationArgs
-        """
-        from sparsetensors.quantization.observers.base import Observer
-
-        return Observer.load_from_registry(self.observer, quantization_args=self)
diff --git a/src/sparsetensors/quantization/quant_config.py b/src/sparsetensors/quantization/quant_config.py
deleted file mode 100644
index 2a2b345f..00000000
--- a/src/sparsetensors/quantization/quant_config.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
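A sketch of building `QuantizationArgs` and resolving its observer through the registry; the values shown are the documented defaults:

```python
from sparsetensors.quantization.quant_args import (
    QuantizationArgs,
    QuantizationStrategy,
    QuantizationType,
)

args = QuantizationArgs(
    num_bits=8,
    type=QuantizationType.INT,
    symmetric=True,
    strategy=QuantizationStrategy.TENSOR,
)
observer = args.get_observer()  # resolves the default "minmax" registry entry
```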
-
-from enum import Enum
-from typing import Dict, List, Optional
-
-from pydantic import BaseModel, Field
-from sparsetensors.quantization.quant_scheme import QuantizationScheme
-from sparsetensors.quantization.utils import (
-    calculate_compression_ratio,
-    is_module_quantized,
-    iter_named_leaf_modules,
-    module_type,
-)
-from torch.nn import Module
-
-
-__all__ = [
-    "QuantizationStatus",
-    "QuantizationConfig",
-    "LIFECYCLE_ORDER",
-]
-
-
-class QuantizationStatus(str, Enum):
-    """
-    Enum storing the different states a quantized layer can be in
-
-    Initialized: scale, zero points and observers have been attached to the layer but
-        are set to dummy values (not yet calibrated)
-    Calibration: scale and zero points have been calibrated through OBCQ or similar
-        algorithm, observers are still attached
-    Frozen: scale and zero points are finalized, observers have been deleted, weights
-        are still in their original precision
-    Compressed: weights have been converted to their target type or compressed to
-        their closest approximation
-    """
-
-    INITIALIZED = "initialized"
-    CALIBRATION = "calibration"
-    FROZEN = "frozen"
-    COMPRESSED = "compressed"
-
-    @classmethod
-    def lifecycle_order(cls) -> List["QuantizationStatus"]:
-        """
-        :return: list of correct quantization lifecycle order
-        """
-        return LIFECYCLE_ORDER
-
-    def __ge__(self, other):
-        if not isinstance(other, self.__class__):
-            raise NotImplementedError
-        return LIFECYCLE_ORDER.index(self) >= LIFECYCLE_ORDER.index(other)
-
-
-LIFECYCLE_ORDER = [
-    QuantizationStatus.INITIALIZED,
-    QuantizationStatus.CALIBRATION,
-    QuantizationStatus.FROZEN,
-    QuantizationStatus.COMPRESSED,
-]
-
-
-class QuantizationConfig(BaseModel):
-    """
-    Full configuration specifying how a model is quantized. Each quantized layer is
-    mapped to a QuantizationScheme in config_groups.
-
-    :param config_groups: dict of QuantizationSchemes specifying the quantization
-        settings for each quantized layer
-    :param quant_method: a constant used to differentiate sparseML quantization from
-        other quantization configs
-    :param format: specifies how the quantized model is stored on disk
-    :param quantization_status: specifies the current status of all quantized layers.
-        It is assumed all layers are in the same state.
-    :param global_compression_ratio: optional informational config to report the model
-        compression ratio achieved by the quantization config
-    :param ignore: optional list of layers to ignore from config_groups. Layers in
-        this list are not quantized even if they match up with a target in
-        config_groups
-    """
-
-    config_groups: Dict[str, QuantizationScheme]
-    quant_method: str = "sparseml"
-    format: str = "fakequant"
-    quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
-    global_compression_ratio: Optional[float] = None
-    ignore: Optional[List[str]] = Field(default_factory=list)
-
-    @staticmethod
-    def from_pretrained(model: Module) -> "QuantizationConfig":
-        """
-        Converts a model into its associated QuantizationConfig based on the
-        QuantizationScheme attached to each quantized module
-
-        :param model: model to calculate quantization scheme of
-        :return: filled out QuantizationScheme for the input model
-        """
-        quant_scheme_to_layers = []
-        quantization_status = None
-        ignore = {}
-        quantization_type_names = set()
-        for name, submodule in iter_named_leaf_modules(model):
-            layer_type = module_type(submodule)
-            if not is_module_quantized(submodule):
-                if layer_type not in ignore:
-                    ignore[layer_type] = []
-                ignore[layer_type].append(name)
-            else:
-                quantization_status = submodule.quantization_status
-                scheme = submodule.quantization_scheme
-                quantization_type_names.add(layer_type)
-
-                match_found = False
-                for existing_scheme in quant_scheme_to_layers:
-                    if scheme == existing_scheme:
-                        match_found = True
-                        break
-                if not match_found:
-                    quant_scheme_to_layers.append(scheme)
-
-        # clean up ignore list, we can leave out layer types if none of the
-        # instances are quantized
-        consolidated_ignore = []
-        for layer_type, ignore_names in ignore.items():
-            if layer_type in quantization_type_names:
-                # specific layers of a quantized type are ignored
-                consolidated_ignore += ignore_names
-            # else we leave it off the ignore list, doesn't fall under any of the
-            # existing quantization schemes so it won't be quantized
-
-        config_groups = {}
-        for idx, scheme in enumerate(quant_scheme_to_layers):
-            group_name = "group_" + str(idx)
-            config_groups[group_name] = scheme
-
-        compression_ratio = calculate_compression_ratio(model)
-        return QuantizationConfig(
-            config_groups=config_groups,
-            quantization_status=quantization_status,
-            global_compression_ratio=compression_ratio,
-            ignore=consolidated_ignore,
-        )
diff --git a/src/sparsetensors/quantization/quant_scheme.py b/src/sparsetensors/quantization/quant_scheme.py
deleted file mode 100644
index 7077c24e..00000000
--- a/src/sparsetensors/quantization/quant_scheme.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
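A sketch of the config API above: statuses compare by their position in LIFECYCLE_ORDER, and a config can be built directly from schemes (the ignored layer name is illustrative):

```python
from sparsetensors.quantization.quant_args import QuantizationArgs
from sparsetensors.quantization.quant_config import (
    QuantizationConfig,
    QuantizationStatus,
)
from sparsetensors.quantization.quant_scheme import QuantizationScheme

assert QuantizationStatus.FROZEN >= QuantizationStatus.CALIBRATION

config = QuantizationConfig(
    config_groups={
        "group_0": QuantizationScheme(
            targets=["Linear"],
            weights=QuantizationArgs(num_bits=8, symmetric=True),
        )
    },
    ignore=["lm_head"],  # illustrative layer name
)
```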
- -from typing import List, Optional - -from pydantic import BaseModel -from sparsetensors.quantization.quant_args import QuantizationArgs - - -__all__ = ["QuantizationScheme"] - - -class QuantizationScheme(BaseModel): - """ - Set of QuantizationArgs defining how the weights, inputs and outputs of target list - of modules should be quantized - - :param targets: list of modules to apply the QuantizationArgs to, can be layer - names, layer types or a regular expression - :param weights: quantization config for layer weights - :param input_activations: quantization config for layer inputs - :param output_activations: quantization config for layer outputs - """ - - targets: List[str] - weights: Optional[QuantizationArgs] = None - input_activations: Optional[QuantizationArgs] = None - output_activations: Optional[QuantizationArgs] = None diff --git a/src/sparsetensors/quantization/utils/__init__.py b/src/sparsetensors/quantization/utils/__init__.py deleted file mode 100644 index a91f9e5d..00000000 --- a/src/sparsetensors/quantization/utils/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -from .helpers import * diff --git a/src/sparsetensors/quantization/utils/helpers.py b/src/sparsetensors/quantization/utils/helpers.py deleted file mode 100644 index 3c00cdbe..00000000 --- a/src/sparsetensors/quantization/utils/helpers.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-from typing import Generator, Tuple
-
-import torch
-from torch.nn import Module
-from tqdm import tqdm
-
-
-__all__ = [
-    "is_module_quantized",
-    "is_model_quantized",
-    "iter_named_leaf_modules",
-    "module_type",
-    "calculate_compression_ratio",
-]
-
-
-def is_module_quantized(module: Module) -> bool:
-    """
-    Check if a module is quantized, based on the existence of a non-empty quantization
-    scheme
-
-    :param module: pytorch module to check
-    :return: True if module is quantized, False otherwise
-    """
-    if not hasattr(module, "quantization_scheme"):
-        return False
-
-    if module.quantization_scheme.weights is not None:
-        return True
-
-    if module.quantization_scheme.input_activations is not None:
-        return True
-
-    if module.quantization_scheme.output_activations is not None:
-        return True
-
-    return False
-
-
-def is_model_quantized(model: Module) -> bool:
-    """
-    Check if any modules in a model are quantized, based on the existence of a
-    non-empty quantization scheme in at least one module
-
-    :param model: pytorch model
-    :return: True if model is quantized, False otherwise
-    """
-
-    for _, submodule in iter_named_leaf_modules(model):
-        if is_module_quantized(submodule):
-            return True
-
-    return False
-
-
-def module_type(module: Module) -> str:
-    """
-    Gets a string representation of a module type
-
-    :param module: pytorch module to get type of
-    :return: module type as a string
-    """
-    return type(module).__name__
-
-
-def iter_named_leaf_modules(
-    model: Module,
-) -> Generator[Tuple[str, Module], None, None]:
-    # yields modules that do not have any submodules
-    # TODO: potentially expand to add list of allowed submodules such as observers
-    for name, submodule in model.named_modules():
-        if len(list(submodule.children())) == 0:
-            yield name, submodule
-
-
-def calculate_compression_ratio(model: Module) -> float:
-    """
-    Calculates the quantization compression ratio of a pytorch model, based on the
-    number of bits needed to represent the total weights in compressed form. Does not
-    take into account activation quantizations.
-
-    :param model: pytorch module to calculate compression ratio for
-    :return: compression ratio of the whole model
-    """
-    total_compressed = 0.0
-    total_uncompressed = 0.0
-    for name, submodule in tqdm(
-        iter_named_leaf_modules(model),
-        desc="Calculating quantization compression ratio",
-    ):
-        # count each leaf module's own parameters, not the full model's
-        for parameter in submodule.parameters():
-            try:
-                uncompressed_bits = torch.finfo(parameter.dtype).bits
-            except TypeError:
-                uncompressed_bits = torch.iinfo(parameter.dtype).bits
-            compressed_bits = uncompressed_bits
-            if is_module_quantized(submodule):
-                if submodule.quantization_scheme.weights is not None:
-                    compressed_bits = submodule.quantization_scheme.weights.num_bits
-            num_weights = parameter.numel()
-            total_compressed += compressed_bits * num_weights
-            total_uncompressed += uncompressed_bits * num_weights
-
-    return total_uncompressed / total_compressed
diff --git a/src/sparsetensors/registry/__init__.py b/src/sparsetensors/registry/__init__.py
deleted file mode 100644
index 241d9d55..00000000
--- a/src/sparsetensors/registry/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# flake8: noqa
-
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .registry import *
diff --git a/src/sparsetensors/registry/registry.py b/src/sparsetensors/registry/registry.py
deleted file mode 100644
index d8d8bc6d..00000000
--- a/src/sparsetensors/registry/registry.py
+++ /dev/null
@@ -1,360 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Universal registry to support registration and loading of child classes and plugins
-of neuralmagic utilities
-"""
-
-import importlib.util
-from collections import defaultdict
-from typing import Any, Dict, List, Optional, Type, Union
-
-
-__all__ = [
-    "RegistryMixin",
-    "register",
-    "get_from_registry",
-    "registered_names",
-    "registered_aliases",
-    "standardize_lookup_name",
-]
-
-
-_ALIAS_REGISTRY: Dict[Type, Dict[str, str]] = defaultdict(dict)
-_REGISTRY: Dict[Type, Dict[str, Any]] = defaultdict(dict)
-
-
-def standardize_lookup_name(name: str) -> str:
-    """
-    Standardize the given name for lookup in the registry.
-    This will replace all underscores and spaces with hyphens and
-    convert the name to lowercase.
-
-    example:
-    ```
-    standardize_lookup_name("Foo_bar baz") == "foo-bar-baz"
-    ```
-
-    :param name: name to standardize
-    :return: standardized name
-    """
-    return name.replace("_", "-").replace(" ", "-").lower()
-
-
-def standardize_alias_name(
-    name: Union[None, str, List[str]]
-) -> Union[None, str, List[str]]:
-    if name is None:
-        return None
-    elif isinstance(name, str):
-        return standardize_lookup_name(name)
-    else:  # isinstance(name, list)
-        return [standardize_lookup_name(n) for n in name]
-
-
-class RegistryMixin:
-    """
-    Universal registry to support registration and loading of child classes and plugins
-    of neuralmagic utilities.
-
-    Classes that require a registry or plugins may add the `RegistryMixin` and use
-    `register` and `load_from_registry` as the main entrypoints for adding new
-    implementations and loading requested values from its registry.
-
-    If a class should only have its child classes in its registry, the class should
-    set the static attribute `registry_requires_subclass` to True
-
-    example
-    ```python
-    class Dataset(RegistryMixin):
-        pass
-
-
-    # register with default name
-    @Dataset.register()
-    class ImageNetDataset(Dataset):
-        pass
-
-    # load as "ImageNetDataset"
-    imagenet = Dataset.load_from_registry("ImageNetDataset")
-
-    # register with custom name
-    @Dataset.register(name="cifar-dataset")
-    class Cifar(Dataset):
-        pass
-
-    Note: the name will be standardized for lookup in the registry. 
-    For example, if a class is registered as "cifar_dataset" or
-    "cifar dataset", it will be stored as "cifar-dataset". The user
-    will be able to load the class with any of the three name variants.
-
-    # register with multiple aliases
-    @Dataset.register(alias=["cifar-10-dataset", "cifar_100_dataset"])
-    class Cifar(Dataset):
-        pass
-
-    # load as "cifar-dataset"
-    cifar = Dataset.load_from_registry("cifar-dataset")
-
-    # load from custom file that implements a dataset
-    mnist = Dataset.load_from_registry("/path/to/mnist_dataset.py:MnistDataset")
-    ```
-    """
-
-    # set to True in child class to add check that registered/retrieved values
-    # implement the class it is registered to
-    registry_requires_subclass: bool = False
-
-    @classmethod
-    def register(
-        cls, name: Optional[str] = None, alias: Union[List[str], str, None] = None
-    ):
-        """
-        Decorator for registering a value (ie class or function) wrapped by this
-        decorator to the base class (class that .register is called from)
-
-        :param name: name or list of names to register the wrapped value as,
-            defaults to value.__name__
-        :param alias: alias or list of aliases to register the wrapped value as,
-            defaults to None
-        :return: register decorator
-        """
-
-        def decorator(value: Any):
-            cls.register_value(value, name=name, alias=alias)
-            return value
-
-        return decorator
-
-    @classmethod
-    def register_value(
-        cls, value: Any, name: Optional[str], alias: Union[str, List[str], None] = None
-    ):
-        """
-        Registers the given value to the class `.register_value` is called from
-
-        :param value: value to register
-        :param name: name to register the wrapped value as,
-            defaults to value.__name__
-        :param alias: alias or list of aliases to register the wrapped value as,
-            defaults to None
-        """
-        register(
-            parent_class=cls,
-            value=value,
-            name=name,
-            alias=alias,
-            require_subclass=cls.registry_requires_subclass,
-        )
-
-    @classmethod
-    def load_from_registry(cls, name: str, **constructor_kwargs) -> object:
-        """
-        :param name: name of registered class to load
-        :param constructor_kwargs: arguments to pass to the constructor retrieved
-            from the registry
-        :return: loaded object registered to this class under the given name,
-            constructed with the given kwargs. 
Raises error if the name is
-            not found in the registry
-        """
-        constructor = cls.get_value_from_registry(name=name)
-        return constructor(**constructor_kwargs)
-
-    @classmethod
-    def get_value_from_registry(cls, name: str):
-        """
-        :param name: name to retrieve from the registry
-        :return: value retrieved from the registry for the given name, raises
-            error if not found
-        """
-        return get_from_registry(
-            parent_class=cls,
-            name=name,
-            require_subclass=cls.registry_requires_subclass,
-        )
-
-    @classmethod
-    def registered_names(cls) -> List[str]:
-        """
-        :return: list of all names registered to this class
-        """
-        return registered_names(cls)
-
-    @classmethod
-    def registered_aliases(cls) -> List[str]:
-        """
-        :return: list of all aliases registered to this class
-        """
-        return registered_aliases(cls)
-
-
-def register(
-    parent_class: Type,
-    value: Any,
-    name: Optional[str] = None,
-    alias: Union[List[str], str, None] = None,
-    require_subclass: bool = False,
-):
-    """
-    :param parent_class: class to register the name under
-    :param value: the value to register
-    :param name: name to register the wrapped value as, defaults to value.__name__
-    :param alias: alias or list of aliases to register the wrapped value as,
-        defaults to None
-    :param require_subclass: require that value is a subclass of the class this
-        method is called from
-    """
-    if name is None:
-        # default name
-        name = value.__name__
-
-    name = standardize_lookup_name(name)
-    alias = standardize_alias_name(alias)
-    register_alias(name=name, alias=alias, parent_class=parent_class)
-
-    if require_subclass:
-        _validate_subclass(parent_class, value)
-
-    if name in _REGISTRY[parent_class]:
-        # name already exists - raise error if two different values are attempting
-        # to share the same name
-        registered_value = _REGISTRY[parent_class][name]
-        if registered_value is not value:
-            raise RuntimeError(
-                f"Attempting to register name {name} as {value} "
-                f"however {name} has already been registered as {registered_value}"
-            )
-    else:
-        _REGISTRY[parent_class][name] = value
-
-
-def get_from_registry(
-    parent_class: Type, name: str, require_subclass: bool = False
-) -> Any:
-    """
-    :param parent_class: class that the name is registered under
-    :param name: name to retrieve from the registry of the class
-    :param require_subclass: require that value is a subclass of the class this
-        method is called from
-    :return: value retrieved from the registry for the given name, raises
-        error if not found
-    """
-    if ":" in name:
-        # user specifying specific module to load and value to import; do not
-        # standardize the path or the attribute name
-        module_path, value_name = name.split(":")
-        retrieved_value = _import_and_get_value_from_module(module_path, value_name)
-    else:
-        # standardize the name, then look it up in the alias registry, falling
-        # back to the name itself so errors report what the caller asked for
-        name = standardize_lookup_name(name)
-        name = _ALIAS_REGISTRY[parent_class].get(name, name)
-        # look up name in registry
-        retrieved_value = _REGISTRY[parent_class].get(name)
-        if retrieved_value is None:
-            raise KeyError(
-                f"Unable to find {name} registered under type {parent_class}.\n"
-                f"Registered values for {parent_class}: "
-                f"{registered_names(parent_class)}\n"
-                f"Registered aliases for {parent_class}: "
-                f"{registered_aliases(parent_class)}"
-            )
-
-    if require_subclass:
-        _validate_subclass(parent_class, retrieved_value)
-
-    return retrieved_value
-
-
-def registered_names(parent_class: Type) -> List[str]:
-    """
-    :param parent_class: class to look up the registry of
-    :return: all names registered to the given class
-    """
-    return list(_REGISTRY[parent_class].keys())
-
-
-def 
registered_aliases(parent_class: Type) -> List[str]:
-    """
-    :param parent_class: class to look up the registry of
-    :return: all aliases registered to the given class
-    """
-    registered_aliases_plus_names = list(_ALIAS_REGISTRY[parent_class].keys())
-    registered_aliases = list(
-        set(registered_aliases_plus_names) - set(registered_names(parent_class))
-    )
-    return registered_aliases
-
-
-def register_alias(
-    name: str, parent_class: Type, alias: Union[str, List[str], None] = None
-):
-    """
-    Updates the mapping from the alias(es) to the given name.
-    If the alias is None, the name is used as the alias.
-
-    :param name: name that the alias refers to
-    :param parent_class: class that the name is registered under
-    :param alias: single alias or list of aliases that
-        refer to the name, defaults to None
-    """
-    if alias is not None:
-        alias = alias if isinstance(alias, list) else [alias]
-    else:
-        alias = []
-
-    if name in alias:
-        raise KeyError(
-            f"Attempting to register alias {name}, "
-            f"that is identical to the standardized name: {name}."
-        )
-    alias.append(name)
-
-    for alias_name in alias:
-        if alias_name in _ALIAS_REGISTRY[parent_class]:
-            raise KeyError(
-                f"Attempting to register alias {alias_name} as {name} "
-                f"however {alias_name} has already been registered as "
-                f"{_ALIAS_REGISTRY[parent_class][alias_name]}"
-            )
-        _ALIAS_REGISTRY[parent_class][alias_name] = name
-
-
-def _import_and_get_value_from_module(module_path: str, value_name: str) -> Any:
-    # import the given module path and try to get the value_name if it is included
-    # in the module
-
-    # load module
-    spec = importlib.util.spec_from_file_location(
-        f"plugin_module_for_{value_name}", module_path
-    )
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
-
-    # get value from module
-    value = getattr(module, value_name, None)
-
-    if not value:
-        raise RuntimeError(
-            f"Unable to find attribute {value_name} in module {module_path}"
-        )
-    return value
-
-
-def _validate_subclass(parent_class: Type, child_class: Type):
-    if not issubclass(child_class, parent_class):
-        raise ValueError(
-            f"class {child_class} is not a subclass of the class it is "
-            f"registered for: {parent_class}."
-        )
diff --git a/src/sparsetensors/utils/__init__.py b/src/sparsetensors/utils/__init__.py
deleted file mode 100644
index e9e78d44..00000000
--- a/src/sparsetensors/utils/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# flake8: noqa
-
-from .helpers import *
-from .safetensors_load import *
diff --git a/src/sparsetensors/utils/helpers.py b/src/sparsetensors/utils/helpers.py
deleted file mode 100644
index c584c2ee..00000000
--- a/src/sparsetensors/utils/helpers.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Optional - -from sparsetensors.base import SPARSITY_CONFIG_NAME -from sparsetensors.compressors import ModelCompressor -from sparsetensors.config import CompressionConfig -from transformers import AutoConfig - - -__all__ = ["infer_compressor_from_model_config"] - - -def infer_compressor_from_model_config( - pretrained_model_name_or_path: str, -) -> Optional[ModelCompressor]: - """ - Given a path to a model config, extract a sparsity config if it exists and return - the associated ModelCompressor - - :param pretrained_model_name_or_path: path to model config on disk or HF hub - :return: matching compressor if config contains a sparsity config - """ - config = AutoConfig.from_pretrained(pretrained_model_name_or_path) - sparsity_config = getattr(config, SPARSITY_CONFIG_NAME, None) - if sparsity_config is None: - return None - - format = sparsity_config.get("format") - sparsity_config = CompressionConfig.load_from_registry(format, **sparsity_config) - compressor = ModelCompressor.load_from_registry(format, config=sparsity_config) - return compressor diff --git a/src/sparsetensors/utils/safetensors_load.py b/src/sparsetensors/utils/safetensors_load.py deleted file mode 100644 index 4d71482a..00000000 --- a/src/sparsetensors/utils/safetensors_load.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
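A usage sketch of `infer_compressor_from_model_config` above; the model path is illustrative:

```python
from sparsetensors.utils.helpers import infer_compressor_from_model_config

# "./compressed-model" is an illustrative local path or HF stub
compressor = infer_compressor_from_model_config("./compressed-model")
if compressor is None:
    print("config carries no sparsity_config; model is stored dense")
```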
-
-import json
-import os
-import re
-import struct
-from typing import Dict, List, Optional
-
-from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, cached_file
-
-
-__all__ = [
-    "get_safetensors_folder",
-    "get_safetensors_header",
-    "match_param_name",
-    "merge_names",
-    "get_weight_mappings",
-    "get_nested_weight_mappings",
-]
-
-
-def get_safetensors_folder(
-    pretrained_model_name_or_path: str, cache_dir: Optional[str] = None
-) -> str:
-    """
-    Given a Hugging Face stub or a local path, return the folder containing the
-    safetensors weight files
-
-    :param pretrained_model_name_or_path: local path to model or HF stub
-    :param cache_dir: optional cache dir to search through, if none is specified the
-        model will be searched for in the default TRANSFORMERS_CACHE
-    :return: local folder containing model data
-    """
-    if os.path.exists(pretrained_model_name_or_path):
-        # argument is a path to a local folder
-        return pretrained_model_name_or_path
-
-    safetensors_path = cached_file(
-        pretrained_model_name_or_path,
-        SAFE_WEIGHTS_NAME,
-        cache_dir=cache_dir,
-        _raise_exceptions_for_missing_entries=False,
-    )
-    index_path = cached_file(
-        pretrained_model_name_or_path,
-        SAFE_WEIGHTS_INDEX_NAME,
-        cache_dir=cache_dir,
-        _raise_exceptions_for_missing_entries=False,
-    )
-    if safetensors_path is not None:
-        # found a single cached safetensors file
-        return os.path.split(safetensors_path)[0]
-    if index_path is not None:
-        # found a cached safetensors weight index file
-        return os.path.split(index_path)[0]
-
-    # model weights could not be found locally or cached from HF Hub
-    raise ValueError(
-        "Could not locate safetensors weight or index file from "
-        f"{pretrained_model_name_or_path}."
-    )
-
-
-def get_safetensors_header(safetensors_path: str) -> Dict[str, str]:
-    """
-    Extracts the metadata from a safetensors file as JSON
-
-    :param safetensors_path: path to a safetensors file
-    :return: dictionary of metadata extracted from the safetensors file
-    """
-    # the first 8 bytes of a safetensors file hold the little-endian length of
-    # the JSON header, followed by the header itself
-    with open(safetensors_path, "rb") as f:
-        length_of_header = struct.unpack("<Q", f.read(8))[0]
-        header_data = f.read(length_of_header)
-        header = json.loads(header_data)
-    return header
-
-
-def match_param_name(full_name: str, param_name: str) -> Optional[str]:
-    """
-    Helper function extracting the uncompressed parameterized layer name from a
-    compressed name. Assumes the compressed name was merged using merge_names.
-
-    :param full_name: full name of parameter in compressed model
-    :param param_name: compression parameter name
-    :return: uncompressed name of the parameterized layer, or None if no match
-    """
-    pattern = r"^(.*)\." + param_name + r"$"
-    regex = re.findall(pattern, full_name)
-    if len(regex) == 0:
-        return None
-    return regex[0]
-
-
-def merge_names(parent_name: str, child_name: str) -> str:
-    """
-    Helper function for merging an uncompressed parameterized layer name with a
-    compression parameter. Names merged with this function can then be parsed by
-    match_param_name.
-
-    :param parent_name: uncompressed parameterized layer name
-    :param child_name: compression parameter name
-    :return: merged compressed name
-    """
-    return parent_name + "." + child_name
-
-
-def get_weight_mappings(model_path: str) -> Dict[str, str]:
-    """
-    Takes a path to a state dict saved in safetensors format and returns a mapping
-    from parameterized layer name to file location. 
-
-    {
-        layer.weight.bitmask: file_location,
-        layer.weight.row_offsets: file_location,
-        layer.weight.shape: file_location,
-        layer.weight.compressed: file_location
-    }
-
-    This generalizes to cases where the model is split into multiple safetensors files
-
-    :param model_path: path to safetensors state dict, must contain either a single
-        safetensors file or multiple files with an index
-    :return: mapping of parameterized layer name to file location
-    """
-    safetensors_path = os.path.join(model_path, SAFE_WEIGHTS_NAME)
-    index_path = os.path.join(model_path, SAFE_WEIGHTS_INDEX_NAME)
-    if os.path.exists(safetensors_path):
-        # we have a single safetensors file to read
-        header = get_safetensors_header(safetensors_path)
-        for key in header.keys():
-            header[key] = SAFE_WEIGHTS_NAME
-        header.pop("__metadata__", None)
-    elif os.path.exists(index_path):
-        # we have multiple safetensors files, read from index
-        with open(index_path, "r", encoding="utf-8") as f:
-            index = json.load(f)
-        header = index["weight_map"]
-    else:
-        raise ValueError(
-            f"Could not find a safetensors weight or index file at {model_path}"
-        )
-
-    # convert weight locations to full paths
-    for key, value in header.items():
-        header[key] = os.path.join(model_path, value)
-
-    return header
-
-
-def get_nested_weight_mappings(
-    model_path: str, params_to_nest: List[str]
-) -> Dict[str, Dict[str, str]]:
-    """
-    Takes a path to a state dict saved in safetensors format and returns a nested
-    mapping from uncompressed parameterized layer names to the file locations of
-    each of the layer's compression parameters.
-
-    layer.weight: {
-        bitmask: file_location,
-        row_offsets: file_location,
-        shape: file_location,
-        compressed: file_location
-    }
-
-    This generalizes to cases where the model is split into multiple safetensors files
-
-    :param model_path: path to safetensors state dict, must contain either a single
-        safetensors file or multiple files with an index
-    :param params_to_nest: list of compression parameter names to nest under each
-        uncompressed layer name
-    :return: nested mapping of parameterized layer name to file location
-    """
-    weight_mappings = get_weight_mappings(model_path)
-
-    nested_weight_mappings = {}
-    for key in weight_mappings.keys():
-        for param_name in params_to_nest:
-            maybe_match = match_param_name(key, param_name)
-            if maybe_match is not None:
-                dense_param = maybe_match
-                if dense_param not in nested_weight_mappings:
-                    nested_weight_mappings[dense_param] = {}
-                nested_weight_mappings[dense_param][param_name] = weight_mappings[key]
-
-    return nested_weight_mappings
diff --git a/tests/quantization/__init__.py b/tests/quantization/__init__.py
deleted file mode 100644
index 0c44f887..00000000
--- a/tests/quantization/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
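A sketch of the safetensors helpers above: `merge_names` and `match_param_name` invert each other, and the mapping functions resolve where each compression parameter is stored (the layer name and local path are illustrative):

```python
from sparsetensors.utils.safetensors_load import (
    get_nested_weight_mappings,
    get_weight_mappings,
    match_param_name,
    merge_names,
)

full_name = merge_names("model.layers.0.self_attn.q_proj.weight", "bitmask")
assert match_param_name(full_name, "bitmask") == (
    "model.layers.0.self_attn.q_proj.weight"
)

flat = get_weight_mappings("./model")  # {param name: safetensors file path}
nested = get_nested_weight_mappings(  # {layer name: {compression param: path}}
    "./model", params_to_nest=["bitmask", "row_offsets", "shape", "compressed"]
)
```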
diff --git a/tests/quantization/lifecycle/__init__.py b/tests/quantization/lifecycle/__init__.py deleted file mode 100644 index 0c44f887..00000000 --- a/tests/quantization/lifecycle/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/quantization/lifecycle/test_apply.py b/tests/quantization/lifecycle/test_apply.py deleted file mode 100644 index eeb29a41..00000000 --- a/tests/quantization/lifecycle/test_apply.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from sparsetensors.quantization.lifecycle import apply_quantization_config -from sparsetensors.quantization.quant_config import ( - QuantizationConfig, - QuantizationStatus, -) -from transformers import AutoModelForCausalLM - - -def test_apply_quantization_config_tinyllama(): - quant_config = get_sample_tinyllama_quant_config() - model = get_tinyllama_model() - - # check that model is not already quantized - for module in model.modules(): - _test_layer_quantization_status(module, inputs=False, weights=False) - - # apply quant config to model - apply_quantization_config(model, quant_config) - - # check for correct application of quant config - num_linears = 0 - num_embeddings = 0 - num_rotary_embeddings = 0 - for name, module in model.named_modules(): - if name in quant_config.ignore: - continue - module_type = module.__class__.__name__ - if module_type == "Linear": - num_linears += 1 - _test_layer_quantization_status(module, inputs=True, weights=True) - elif module_type == "Embedding": - num_embeddings += 1 - _test_layer_quantization_status(module, inputs=False, weights=True) - elif module_type == "LlamaRotaryEmbedding": - num_rotary_embeddings += 1 - _test_layer_quantization_status(module, inputs=False, weights=False) - - # sanity check correct number of layers targeted - assert num_linears == 154 # 155 Linear layers - 1 that gets ignored - assert num_embeddings == 1 - assert num_rotary_embeddings == 22 - - -def test_serialize_config_tinyllama(): - quant_config = get_sample_tinyllama_quant_config() - model = get_tinyllama_model() - - # check that model is not already quantized - for module in model.modules(): - _test_layer_quantization_status(module, inputs=False, weights=False) - - # apply quant config to model - apply_quantization_config(model, quant_config) - - serialized_config = 
QuantizationConfig.from_pretrained(model) - assert len(serialized_config.config_groups) == 2 - assert serialized_config.config_groups["group_0"].targets == ["Embedding"] - assert serialized_config.config_groups["group_0"].input_activations is None - assert serialized_config.config_groups["group_1"].targets == ["Linear"] - assert serialized_config.config_groups["group_1"].input_activations is not None - assert serialized_config.quantization_status == QuantizationStatus.FROZEN - assert serialized_config.format == "fakequant" - assert serialized_config.quant_method == "sparseml" - assert serialized_config.ignore == ["model.layers.1.mlp.down_proj"] - assert serialized_config.global_compression_ratio > 1.0 - assert serialized_config.global_compression_ratio < 8.0 - - -def _test_layer_quantization_status(module, inputs: bool, weights: bool): - # check if quantization is applied at all (true if inputs or weights targeted) - quantized = inputs or weights - assert hasattr(module, "quantization_scheme") == quantized - assert hasattr(module, "quantization_status") == quantized - - # check inputs matches expected - assert hasattr(module, "input_scale") == inputs - assert hasattr(module, "input_zero_point") == inputs - - # check weights matches expected - assert hasattr(module, "weight_scale") == weights - assert hasattr(module, "weight_zero_point") == weights - - -def get_tinyllama_model(): - return AutoModelForCausalLM.from_pretrained( - "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" - ) - - -def get_sample_tinyllama_quant_config(): - config_dict = { - "quant_method": "sparseml", - "format": "fakequant", - "quantization_status": "frozen", - "global_compression_ratio": None, - "config_groups": { - "group_1": { - "weights": { - "num_bits": 8, - "type": "int", - "symmetric": True, - "strategy": "tensor", - }, - "input_activations": { - "num_bits": 8, - "type": "int", - "symmetric": True, - "strategy": "tensor", - }, - "targets": ["Linear"], - }, - "group_2": { - "weights": { - "num_bits": 8, - "type": "int", - "symmetric": False, - "strategy": "tensor", - }, - "input_activations": None, - "targets": ["Embedding"], - }, - }, - "ignore": ["LlamaRotaryEmbedding", "model.layers.1.mlp.down_proj"], - } - return QuantizationConfig.parse_obj(config_dict) diff --git a/tests/quantization/test_quant_args.py b/tests/quantization/test_quant_args.py deleted file mode 100644 index c407eae5..00000000 --- a/tests/quantization/test_quant_args.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -from pydantic import ValidationError -from sparsetensors.quantization import ( - QuantizationArgs, - QuantizationStrategy, - QuantizationType, -) - - -def test_defaults(): - default = QuantizationArgs() - - assert default.num_bits == 8 - assert default.type == QuantizationType.INT - assert default.symmetric - assert default.strategy == QuantizationStrategy.TENSOR - assert default.group_size is None - assert default.block_structure is None - - -def test_group(): - kwargs = {"strategy": "group", "group_size": 128} - - group = QuantizationArgs(**kwargs) - assert group.strategy == QuantizationStrategy.GROUP - assert group.group_size == kwargs["group_size"] - - -def test_block(): - kwargs = {"strategy": "block", "block_structure": "2x4"} - - block = QuantizationArgs(**kwargs) - assert block.strategy == QuantizationStrategy.BLOCK - assert block.block_structure == kwargs["block_structure"] - - -def test_invalid(): - with pytest.raises(ValidationError): - _ = QuantizationArgs(type="invalid") - with pytest.raises(ValidationError): - _ = QuantizationArgs(strategy="invalid") diff --git a/tests/quantization/test_quant_config.py b/tests/quantization/test_quant_config.py deleted file mode 100644 index 92b68ab7..00000000 --- a/tests/quantization/test_quant_config.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import pytest -from pydantic import ValidationError -from sparsetensors.quantization import ( - QuantizationConfig, - QuantizationScheme, - QuantizationStatus, -) - - -def test_basic_config(): - config_groups = {"group_1": QuantizationScheme(targets=[])} - config = QuantizationConfig(config_groups=config_groups) - - assert config.config_groups == config_groups - assert config.quant_method == "sparseml" - assert config.format == "fakequant" - assert config.quantization_status == QuantizationStatus.INITIALIZED - assert config.global_compression_ratio is None - assert isinstance(config.ignore, list) and len(config.ignore) == 0 - - -def test_full_config(): - config_groups = { - "group_1": QuantizationScheme(targets=[]), - "group_2": QuantizationScheme(targets=[]), - } - global_compression_ratio = 3.5 - ignore = ["model.layers.0"] - quantization_status = "compressed" - - config = QuantizationConfig( - config_groups=config_groups, - global_compression_ratio=global_compression_ratio, - ignore=ignore, - quantization_status=quantization_status, - ) - assert config.config_groups == config_groups - assert config.global_compression_ratio == global_compression_ratio - assert config.ignore == ignore - assert config.quantization_status == QuantizationStatus.COMPRESSED - - -def test_need_config_groups(): - with pytest.raises(ValidationError): - _ = QuantizationScheme() diff --git a/tests/quantization/test_quant_scheme.py b/tests/quantization/test_quant_scheme.py deleted file mode 100644 index 63b135b5..00000000 --- a/tests/quantization/test_quant_scheme.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -from pydantic import ValidationError -from sparsetensors.quantization import QuantizationArgs, QuantizationScheme - - -def test_basic_scheme(): - targets = ["model.layer.0", "model.layer.3"] - weights = QuantizationArgs() - - scheme = QuantizationScheme(targets=targets, weights=weights) - assert scheme.targets == targets - assert scheme.weights == weights - assert scheme.input_activations is None - assert scheme.output_activations is None - - -def test_full_scheme(): - targets = ["Linear"] - weights = QuantizationArgs() - input_activations = QuantizationArgs(num_bits=4) - output_activations = QuantizationArgs(num_bits=8, type="float", symmetric=False) - - scheme = QuantizationScheme( - targets=targets, - weights=weights, - input_activations=input_activations, - output_activations=output_activations, - ) - assert scheme.targets == targets - assert scheme.weights == weights - assert scheme.input_activations == input_activations - assert scheme.output_activations == output_activations - - -def test_needs_targets(): - with pytest.raises(ValidationError): - _ = QuantizationScheme() diff --git a/tests/sparsetensors/quantization/lifecycle/conftest.py b/tests/sparsetensors/quantization/lifecycle/conftest.py deleted file mode 100644 index a8ad01b2..00000000 --- a/tests/sparsetensors/quantization/lifecycle/conftest.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional - -import pytest -from sparsetensors.quantization.quant_args import QuantizationArgs -from sparsetensors.quantization.quant_scheme import QuantizationScheme - - -@pytest.fixture -def create_quantization_scheme(): - def quantization_scheme( - targets: List[str], - weights: Optional[QuantizationArgs] = None, - input_activations: Optional[QuantizationArgs] = None, - output_activations: Optional[QuantizationArgs] = None, - ): - return QuantizationScheme( - targets=targets, - weights=weights, - input_activations=input_activations, - output_activations=output_activations, - ) - - return quantization_scheme diff --git a/tests/sparsetensors/quantization/lifecycle/test_forward.py b/tests/sparsetensors/quantization/lifecycle/test_forward.py deleted file mode 100644 index c2d27bd1..00000000 --- a/tests/sparsetensors/quantization/lifecycle/test_forward.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - - -import pytest -import torch -from sparsetensors.quantization.lifecycle.forward import ( - maybe_calibrate_or_quantize, - wrap_module_forward_quantized, -) -from sparsetensors.quantization.lifecycle.initialize import ( - initialize_module_for_quantization, -) -from sparsetensors.quantization.lifecycle.status import QuantizationStatus -from sparsetensors.quantization.quant_args import QuantizationArgs -from torch.nn import Linear - - -def test_wrap_module_forward_quantized(create_quantization_scheme): - num_bits = 8 - quantization_scheme = create_quantization_scheme( - targets=["*"], - weights=QuantizationArgs(num_bits=num_bits, symmetric=True), - input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), - ) - layer = Linear(4, 4) - - func_forward = layer.forward.__func__ - - # check that the forward call is overwritten - wrap_module_forward_quantized(layer, quantization_scheme) - - assert not func_forward == layer.forward.__func__ - - -@pytest.mark.parametrize( - "quantization_status", ["INITIALIZED", "CALIBRATION", "FROZEN"] -) -def test_maybe_calibrate_or_quantize(create_quantization_scheme, quantization_status): - num_bits = 8 - quantization_scheme = create_quantization_scheme( - targets=["*"], - weights=QuantizationArgs(num_bits=num_bits, symmetric=True), - input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), - ) - quantization_args = QuantizationArgs(num_bits=num_bits, symmetric=False) - layer = Linear(4, 4) - layer.weight.data *= 100 - - initialize_module_for_quantization(layer, quantization_scheme) - layer.quantization_status = QuantizationStatus(quantization_status) - - if layer.quantization_status == QuantizationStatus.INITIALIZED: - out = maybe_calibrate_or_quantize( - layer, layer.weight.data, "input", quantization_args - ) - assert torch.allclose(out, layer.weight.data) - elif layer.quantization_status == QuantizationStatus.CALIBRATION: - out = maybe_calibrate_or_quantize( - layer, layer.weight.data, "input", quantization_args - ) - assert not torch.allclose(out, layer.weight.data) - - elif layer.quantization_status == QuantizationStatus.FROZEN: - # scale and zero points are empty -- cannot quantize - with pytest.raises(ValueError): - out = maybe_calibrate_or_quantize( - layer, layer.weight.data, "input", quantization_args - ) diff --git a/tests/sparsetensors/quantization/lifecycle/test_frozen.py b/tests/sparsetensors/quantization/lifecycle/test_frozen.py deleted file mode 100644 index 0b5a18e8..00000000 --- a/tests/sparsetensors/quantization/lifecycle/test_frozen.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from sparsetensors.quantization.lifecycle.frozen import freeze_module_quantization -from sparsetensors.quantization.lifecycle.initialize import ( - initialize_module_for_quantization, -) -from sparsetensors.quantization.lifecycle.status import QuantizationStatus -from sparsetensors.quantization.quant_args import QuantizationArgs -from torch.nn import Linear - - -def test_set_module_for_calibration(create_quantization_scheme): - num_bits = 8 - quantization_scheme = create_quantization_scheme( - targets=["*"], - weights=QuantizationArgs(num_bits=num_bits, symmetric=True), - input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), - ) - - layer = Linear(4, 4) - - initialize_module_for_quantization(layer, quantization_scheme) - layer.quantization_status = QuantizationStatus("CALIBRATION") - - # should have both input and weight observer after initializing - assert hasattr(layer, "input_observer") - assert hasattr(layer, "weight_observer") - - # observers should get deleted after freezing - freeze_module_quantization(layer) - assert not hasattr(layer, "input_observer") - assert not hasattr(layer, "weight_observer") - - assert layer.quantization_status == QuantizationStatus("FROZEN") diff --git a/tests/sparsetensors/quantization/lifecycle/test_initialize.py b/tests/sparsetensors/quantization/lifecycle/test_initialize.py deleted file mode 100644 index b2f01c0f..00000000 --- a/tests/sparsetensors/quantization/lifecycle/test_initialize.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
- - -import pytest -from sparsetensors.quantization.lifecycle.initialize import ( - initialize_module_for_quantization, -) -from sparsetensors.quantization.quant_args import QuantizationArgs -from sparsetensors.quantization.quant_config import QuantizationStatus -from torch.nn import Linear - - -NUM_BITS = 8 - - -@pytest.mark.parametrize( - "weights,input_activations", - [ - ( - QuantizationArgs(num_bits=NUM_BITS, symmetric=True), - None, - ), - ( - None, - QuantizationArgs(num_bits=NUM_BITS, symmetric=True), - ), - ( - QuantizationArgs(num_bits=NUM_BITS, symmetric=True), - QuantizationArgs(num_bits=NUM_BITS, symmetric=True), - ), - ], -) -def test_initialize_module_for_quantization( - create_quantization_scheme, weights, input_activations -): - quantization_scheme = create_quantization_scheme( - targets=["*"], - weights=weights, - input_activations=input_activations, - ) - layer = Linear(4, 4) - - assert not hasattr(layer, "quantization_scheme") - assert not hasattr(layer, "quantization_status") - - # add attributes, zero_points and scale - initialize_module_for_quantization(layer, quantization_scheme) - - registered_params = {"weight", "bias"} - if weights is not None: - registered_params.add("weight_scale") - registered_params.add("weight_zero_point") - - if input_activations is not None: - registered_params.add("input_scale") - registered_params.add("input_zero_point") - - for key in layer.state_dict().keys(): - assert key in registered_params - registered_params.remove(key) - - assert len(registered_params) == 0 - - assert hasattr(layer, "quantization_scheme") - assert hasattr(layer, "quantization_status") - - assert layer.quantization_status == QuantizationStatus.INITIALIZED diff --git a/tests/sparsetensors/quantization/lifecycle/test_lifecycle.py b/tests/sparsetensors/quantization/lifecycle/test_lifecycle.py deleted file mode 100644 index 2884bde4..00000000 --- a/tests/sparsetensors/quantization/lifecycle/test_lifecycle.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from copy import deepcopy - -import torch -from sparsetensors.quantization.lifecycle.calibration import set_module_for_calibration -from sparsetensors.quantization.lifecycle.frozen import freeze_module_quantization -from sparsetensors.quantization.lifecycle.initialize import ( - initialize_module_for_quantization, -) -from sparsetensors.quantization.quant_args import QuantizationArgs -from sparsetensors.quantization.quant_config import QuantizationStatus -from torch.nn import Linear - - -def test_lifecycle(create_quantization_scheme): - num_bits = 8 - - quantization_scheme = create_quantization_scheme( - input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), - weights=QuantizationArgs(num_bits=num_bits, symmetric=True), - targets=["*"], - ) - - layer = Linear(4, 4) - layer.weight.data *= 100 - - # updated layer keys check - expected_layer_keys = {"weight", "bias"} - for key in layer.state_dict().keys(): - expected_layer_keys.remove(key) - assert len(expected_layer_keys) == 0 - - # overwrite forward pass and register zero_point and scale - initialize_module_for_quantization(layer, quantization_scheme) - expected_layer_keys = { - "input_scale", - "input_zero_point", - "weight_scale", - "weight_zero_point", - "weight", - "bias", - } - for key in layer.state_dict().keys(): - expected_layer_keys.remove(key) - assert len(expected_layer_keys) == 0 - - # should have both input and weight observer after initializing - assert hasattr(layer, "input_observer") - assert hasattr(layer, "weight_observer") - - assert hasattr(layer, "quantization_scheme") - assert hasattr(layer, "quantization_status") - assert layer.quantization_status == QuantizationStatus.INITIALIZED - - set_module_for_calibration(layer) - assert layer.quantization_status == QuantizationStatus.CALIBRATION - - # do a calibration step - assert torch.numel(layer.input_zero_point.data) == 0 - assert torch.numel(layer.input_scale) == 0 - assert torch.numel(layer.weight_scale) == 0 - assert torch.numel(layer.weight_zero_point) == 0 - - layer(torch.randn(4, 4)) - - # zero-points and scale should be updated after forward pass - assert torch.numel(layer.input_zero_point.data) > 0 - assert torch.numel(layer.input_scale) > 0 - assert torch.numel(layer.weight_scale) > 0 - assert torch.numel(layer.weight_zero_point) > 0 - - # symmetric zero points should center at 0 - assert layer.weight_zero_point.data == 0 - - # check high and low bound of the weights - assert torch.all(layer.weight.data >= -128) and torch.all(layer.weight.data <= 127) - - initialized_layer = deepcopy(layer) - - # calibrate the layers with each iteration - for _ in range(10): - layer(torch.randn(4, 4)) - - assert initialized_layer.input_zero_point != layer.input_zero_point - assert initialized_layer.input_scale != layer.input_scale - assert initialized_layer.weight_scale != layer.weight_scale - - # check quantization f_q(x) is applied after frozen without update - input_check_for_quant = torch.randn(4, 4) - out_calibration = layer(input_check_for_quant) - - layer_before_freeze = deepcopy(layer) - - # Freeze, no update after any forward pass - freeze_module_quantization(layer) - - for _ in range(10): - layer(torch.randn(4, 4)) - assert layer_before_freeze.input_zero_point == layer.input_zero_point - assert layer_before_freeze.input_scale == layer.input_scale - assert layer_before_freeze.weight_scale == layer.weight_scale - - # check that the same quantization is applied as calibration to frozen - assert torch.all(out_calibration == layer(input_check_for_quant)) diff 
--git a/tests/sparsetensors/quantization/observers/test_min_max.py b/tests/sparsetensors/quantization/observers/test_min_max.py deleted file mode 100644 index a5273d02..00000000 --- a/tests/sparsetensors/quantization/observers/test_min_max.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import pytest -import torch -from sparsetensors.quantization.quant_args import QuantizationArgs - - -@pytest.mark.parametrize( - "symmetric,expected_scale,expected_zero_point", - [ - (True, 0.0078, 0), - (False, 0.0039, 0), - ], -) -def test_min_max_observer(symmetric, expected_scale, expected_zero_point): - tensor = torch.tensor([1, 1, 1, 1, 1]) - num_bits = 8 - weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric) - - observer = weights.get_observer() - scale, zero_point = observer(tensor) - - assert round(scale.item(), 4) == expected_scale - assert round(zero_point.item(), 4) == expected_zero_point - - -def test_min_max_observer_symmetric_scale_range(): - tensor = torch.rand(4, 4) - tensor *= 127 - - num_bits = 8 - weights = QuantizationArgs(num_bits=num_bits, symmetric=True) - - observer = weights.get_observer() - scale, zero_point = observer(tensor) - - # if symmetric, max symmetric_range = abs(-128) / 255 - assert round(scale.item(), 4) <= 1.0039 - assert round(zero_point.item(), 4) == 0 - - -def test_min_max_observer_value_update(): - inp = torch.tensor([1, 1, 1, 1, 1]) - inp_update_max = torch.tensor([127, 1, 1, 1, 1]) - inp_update_min = torch.tensor([-128, 1, 1, 1, 1]) - - # update the min, max twice total - tensors = [ - inp, - inp, - inp_update_max, # update max - inp, - inp_update_min, # update min - ] - - tensor = inp - num_bits = 8 - weights = QuantizationArgs(num_bits=num_bits, symmetric=True) - - observer = weights.get_observer() - curr_max = 1 - curr_min = 1 - for i, tensor in enumerate(tensors): - observer(tensor) - curr_max = max(observer.max_val, curr_max) - curr_min = min(observer.min_val, curr_min) - - if i < 2: - assert curr_max == 1 - assert curr_min == 1 - elif i < 4: - assert curr_max == 43 # (127 + 2) / 3 - assert curr_min == 1 - else: - assert curr_max == 43 - assert curr_min == -24.8 # (-128 + 4) / 5 diff --git a/tests/test_bitmask.py b/tests/test_bitmask.py index b5bca142..248580bc 100644 --- a/tests/test_bitmask.py +++ b/tests/test_bitmask.py @@ -17,8 +17,8 @@ import pytest import torch +from compressed_tensors import BitmaskCompressor, BitmaskConfig, BitmaskTensor from safetensors.torch import save_file -from sparsetensors import BitmaskCompressor, BitmaskConfig, BitmaskTensor @pytest.mark.parametrize( diff --git a/tests/test_registry.py b/tests/test_registry.py index b73d357f..a183d77d 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -13,7 +13,7 @@ # limitations under the License. 
import pytest -from sparsetensors import ( +from compressed_tensors import ( BitmaskCompressor, BitmaskConfig, CompressionConfig, From 3e037a537f788bc1af53fba336be8e8fda721b82 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Fri, 26 Apr 2024 20:32:06 +0000 Subject: [PATCH 08/10] rebase --- src/compressed_tensors/README.md | 162 ++++++++ src/compressed_tensors/__init__.py | 21 + src/compressed_tensors/base.py | 15 + .../compressors/__init__.py | 19 + src/compressed_tensors/compressors/base.py | 73 ++++ src/compressed_tensors/compressors/dense.py | 31 ++ .../compressors/sparse_bitmask.py | 233 ++++++++++++ src/compressed_tensors/config/__init__.py | 18 + src/compressed_tensors/config/base.py | 36 ++ src/compressed_tensors/config/dense.py | 36 ++ .../config/sparse_bitmask.py | 36 ++ .../quantization/__init__.py | 21 + .../quantization/lifecycle/__init__.py | 22 ++ .../quantization/lifecycle/apply.py | 105 +++++ .../quantization/lifecycle/calibration.py | 51 +++ .../quantization/lifecycle/forward.py | 137 +++++++ .../quantization/lifecycle/frozen.py | 47 +++ .../quantization/lifecycle/initialize.py | 96 +++++ .../quantization/observers/__init__.py | 19 + .../quantization/observers/base.py | 69 ++++ .../quantization/observers/memoryless.py | 61 +++ .../quantization/observers/min_max.py | 79 ++++ .../quantization/quant_args.py | 85 +++++ .../quantization/quant_config.py | 154 ++++++++ .../quantization/quant_scheme.py | 39 ++ .../quantization/utils/__init__.py | 16 + .../quantization/utils/helpers.py | 115 ++++++ src/compressed_tensors/registry/__init__.py | 17 + src/compressed_tensors/registry/registry.py | 360 ++++++++++++++++++ src/compressed_tensors/utils/__init__.py | 17 + src/compressed_tensors/utils/helpers.py | 45 +++ .../utils/safetensors_load.py | 196 ++++++++++ .../observers/quantization/__init__.py | 13 + .../quantization/lifecycle/__init__.py | 13 + .../quantization/lifecycle/conftest.py | 37 ++ .../quantization/lifecycle/test_apply.py | 140 +++++++ .../quantization/lifecycle/test_forward.py | 82 ++++ .../quantization/lifecycle/test_frozen.py | 47 +++ .../quantization/lifecycle/test_initialize.py | 79 ++++ .../quantization/lifecycle/test_lifecycle.py | 119 ++++++ .../observers/quantization/test_quant_args.py | 55 +++ .../quantization/test_quant_config.py | 60 +++ .../quantization/test_quant_scheme.py | 51 +++ .../quantization/observers/test_min_max.py | 89 +++++ 44 files changed, 3216 insertions(+) create mode 100644 src/compressed_tensors/README.md create mode 100644 src/compressed_tensors/__init__.py create mode 100644 src/compressed_tensors/base.py create mode 100644 src/compressed_tensors/compressors/__init__.py create mode 100644 src/compressed_tensors/compressors/base.py create mode 100644 src/compressed_tensors/compressors/dense.py create mode 100644 src/compressed_tensors/compressors/sparse_bitmask.py create mode 100644 src/compressed_tensors/config/__init__.py create mode 100644 src/compressed_tensors/config/base.py create mode 100644 src/compressed_tensors/config/dense.py create mode 100644 src/compressed_tensors/config/sparse_bitmask.py create mode 100644 src/compressed_tensors/quantization/__init__.py create mode 100644 src/compressed_tensors/quantization/lifecycle/__init__.py create mode 100644 src/compressed_tensors/quantization/lifecycle/apply.py create mode 100644 src/compressed_tensors/quantization/lifecycle/calibration.py create mode 100644 src/compressed_tensors/quantization/lifecycle/forward.py create mode 100644 
src/compressed_tensors/quantization/lifecycle/frozen.py create mode 100644 src/compressed_tensors/quantization/lifecycle/initialize.py create mode 100644 src/compressed_tensors/quantization/observers/__init__.py create mode 100644 src/compressed_tensors/quantization/observers/base.py create mode 100644 src/compressed_tensors/quantization/observers/memoryless.py create mode 100644 src/compressed_tensors/quantization/observers/min_max.py create mode 100644 src/compressed_tensors/quantization/quant_args.py create mode 100644 src/compressed_tensors/quantization/quant_config.py create mode 100644 src/compressed_tensors/quantization/quant_scheme.py create mode 100644 src/compressed_tensors/quantization/utils/__init__.py create mode 100644 src/compressed_tensors/quantization/utils/helpers.py create mode 100644 src/compressed_tensors/registry/__init__.py create mode 100644 src/compressed_tensors/registry/registry.py create mode 100644 src/compressed_tensors/utils/__init__.py create mode 100644 src/compressed_tensors/utils/helpers.py create mode 100644 src/compressed_tensors/utils/safetensors_load.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/__init__.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/lifecycle/__init__.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/lifecycle/conftest.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_forward.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_frozen.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_initialize.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_lifecycle.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/test_quant_args.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/test_quant_config.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/test_quant_scheme.py create mode 100644 tests/compressed_tensors/quantization/observers/test_min_max.py diff --git a/src/compressed_tensors/README.md b/src/compressed_tensors/README.md new file mode 100644 index 00000000..5b1c8ece --- /dev/null +++ b/src/compressed_tensors/README.md @@ -0,0 +1,162 @@ +# Save/Load Compressed SafeTensors + +## Motivation + +* Reduce disk space by saving in a compressed format for sparse models. Models in this compressed format will be loaded by vLLM for more efficient inference +* Set up the save/load architecture such that we can easily expand to additional compression formats in the future. 
The config should be human readable so users can understand the compression format at a quick glance + +## SafeTensors File Format + +For each parameter in the uncompressed state_dict, we store the following attributes +needed for decompression in the compressed state_dict: + +* compressed tensor +* bitmask +* uncompressed shape +* row offsets + +```python +# dense +{ + PARAM_NAME: uncompressed_tensor +} + +# compressed +{ + PARAM_NAME.compressed: compressed_tensor # 1d tensor + PARAM_NAME.bitmask: value # 2d bitmask tensor (nrows x (ncols / 8)) + PARAM_NAME.shape: value # uncompressed shape tensor + PARAM_NAME.row_offsets: value # 1d offsets tensor +} +``` + +Config information gets stored in the HF config file +```json +// config.json +{ + "sparsity_config": { + "format": "sparse_bitmask", // "dense_sparsity" for original tensor format + + // informational + "sparsity_structure": "unstructured", // or 2:4, 8:16 etc... + "global_sparsity": "0.5" + } +} +``` + +## Saving/Loading Interface + +Loading a compressed model requires no interface changes + +```python +from sparseml.transformers.utils import SparseAutoModelForCausalLM + +# should contain model.safetensors or model.safetensors.index.json +model_path = "/PATH/TO/COMPRESSED_MODEL" + +model = SparseAutoModelForCausalLM.from_pretrained( + model_name_or_path=model_path, + **model_kwargs, +) +``` + +Saving a compressed model with an explicitly provided compression config. The config +is saved to the model's `config.json` file. **Note:** the model must have been +initialized with SparseAutoModelForCausalLM.from_pretrained() + +```python +from compressed_tensors import BitmaskConfig + +output_dir = "/PATH/TO/SAVE/COMPRESSED_MODEL" +sparsity_config = BitmaskConfig() + +model.save_pretrained( + save_directory=output_dir, + sparsity_config=sparsity_config, +) +``` + +Saving a compressed model, inferring the config from the model attributes + +```python +model.save_pretrained( + save_directory=output_dir, + save_compressed=True +) +``` + +Saving a model in the dense format. If the model has at least 5% global sparsity, a +sparsity config will still be included in `config.json` with format `dense_sparsity` + +```python +model.save_pretrained( + save_directory=output_dir +) +``` + +Saving a model in the dense format, bypassing the sparsity config calculation. When the +`skip_compression_stats` flag is set, no sparsity config will be written to +`config.json` + +```python +model.save_pretrained( + save_directory=output_dir, + skip_compression_stats=True +) +``` + +## Enable Compression During One-Shot and Sparse Finetuning +Models that are saved in a supported compressed format on disk will automatically be +decompressed when loaded as input to `sparseml.transformers.oneshot` or +`sparseml.transformers.train` + +To enable compression on save after oneshot or finetuning, simply add the +`save_compressed=True` argument to `sparseml.transformers.oneshot` or +`sparseml.transformers.train` + +```python +from sparseml.transformers import train + +train( + save_compressed=True, + model="neuralmagic/TinyLlama-1.1B-Chat-v1.0-pruned2.4", + recipe=RECIPE, + dataset=DATASET +) +``` + + +## Example Code + +Loads a 60% sparse model, compresses it using the inferred bitmask compression, then +reloads the compressed model. 
+ +```python +from sparseml.transformers import SparseAutoModelForCausalLM +from sparseml.utils.pytorch.utils import measure_cuda_memory +import torch + +MODEL_PATH = "zoo:llama2-7b-open_platypus_orca_llama2_pretrain-pruned60" +OUTPUT_PATH = "./test_compress_output" +RECIPE = "zoo:llama2-7b-open_platypus_orca_llama2_pretrain-pruned60" + +torch.cuda.set_device(0) +with measure_cuda_memory() as m: + model = SparseAutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="cuda:0") +print(f"Load dense model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") + +sparsity_config = getattr(model,"sparsity_config", None) +print(f"Sparsity config before compression: {sparsity_config}") +with measure_cuda_memory() as m: + model.save_pretrained(OUTPUT_PATH, save_compressed=True) +print(f"Save compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") + +torch.cuda.set_device(1) +with measure_cuda_memory() as m: + model_again = SparseAutoModelForCausalLM.from_pretrained( + OUTPUT_PATH, device_map="cuda:1" + ) +print(f"Load compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") +sparsity_config = getattr(model_again,"sparsity_config", None) +print(f"Sparsity config after compression: {sparsity_config}") +``` diff --git a/src/compressed_tensors/__init__.py b/src/compressed_tensors/__init__.py new file mode 100644 index 00000000..0833dd42 --- /dev/null +++ b/src/compressed_tensors/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import * + +# flake8: noqa +from .compressors import * +from .config import * +from .quantization import QuantizationConfig, QuantizationStatus +from .utils import * diff --git a/src/compressed_tensors/base.py b/src/compressed_tensors/base.py new file mode 100644 index 00000000..f01a055f --- /dev/null +++ b/src/compressed_tensors/base.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SPARSITY_CONFIG_NAME = "sparsity_config" diff --git a/src/compressed_tensors/compressors/__init__.py b/src/compressed_tensors/compressors/__init__.py new file mode 100644 index 00000000..1c7362eb --- /dev/null +++ b/src/compressed_tensors/compressors/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa + +from .base import ModelCompressor +from .dense import DenseCompressor +from .sparse_bitmask import BitmaskCompressor, BitmaskTensor diff --git a/src/compressed_tensors/compressors/base.py b/src/compressed_tensors/compressors/base.py new file mode 100644 index 00000000..9c205f93 --- /dev/null +++ b/src/compressed_tensors/compressors/base.py @@ -0,0 +1,73 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +from typing import Dict, Generator, Tuple + +from compressed_tensors.base import SPARSITY_CONFIG_NAME +from compressed_tensors.config import CompressionConfig +from compressed_tensors.registry import RegistryMixin +from torch import Tensor +from torch.nn import Module, Parameter +from tqdm import tqdm + + +__all__ = ["ModelCompressor"] + + +class ModelCompressor(RegistryMixin): + """ + Base class representing a model compression algorithm. 
+ + :param config: config specifying compression parameters + """ + + def __init__(self, config: CompressionConfig): + self.config = config + + def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: + """ + Compresses a dense state dict + + :param model_state: state dict of uncompressed model + :return: compressed state dict + """ + raise NotImplementedError() + + def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]: + """ + Reads a compressed state dict located at model_path and returns a + generator for sequentially decompressing back to a dense state dict + + :param model_path: path to compressed safetensors model + :return: generator of (parameter name, dense tensor) pairs + """ + raise NotImplementedError() + + def overwrite_weights(self, model_path: str, model: Module): + """ + Overwrites the weights in model with weights decompressed from model_path + + :param model_path: path to compressed weights + :param model: pytorch model to load decompressed weights into + """ + dense_gen = self.decompress(model_path) + for name, data in tqdm(dense_gen, desc="Decompressing model"): + # loading the decompressed weights into the model + model_device = operator.attrgetter(name)(model).device + data_new = Parameter(data.to(model_device)) + data_old = operator.attrgetter(name)(model) + data_old.data = data_new.data + + setattr(model, SPARSITY_CONFIG_NAME, self.config) diff --git a/src/compressed_tensors/compressors/dense.py b/src/compressed_tensors/compressors/dense.py new file mode 100644 index 00000000..6e8785bc --- /dev/null +++ b/src/compressed_tensors/compressors/dense.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, Generator, Tuple + +from compressed_tensors.compressors import ModelCompressor +from torch import Tensor + + +@ModelCompressor.register(name="dense_sparsity") +class DenseCompressor(ModelCompressor): + """ + Identity compressor for dense models, returns the original state_dict + """ + + def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: + return model_state + + def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]: + return iter([]) diff --git a/src/compressed_tensors/compressors/sparse_bitmask.py b/src/compressed_tensors/compressors/sparse_bitmask.py new file mode 100644 index 00000000..f6f03f0b --- /dev/null +++ b/src/compressed_tensors/compressors/sparse_bitmask.py @@ -0,0 +1,233 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Dict, Generator, List, Tuple, Union + +import numpy +import torch +from compressed_tensors.compressors import ModelCompressor +from compressed_tensors.utils import get_nested_weight_mappings, merge_names +from safetensors import safe_open +from torch import Tensor +from tqdm import tqdm + + +__all__ = [ + "BitmaskCompressor", + "BitmaskTensor", + "bitmask_compress", + "bitmask_decompress", + "pack_bitmasks", + "unpack_bitmasks", +] + +_LOGGER: logging.Logger = logging.getLogger(__name__) + + +@ModelCompressor.register(name="sparse_bitmask") +class BitmaskCompressor(ModelCompressor): + """ + Compression for sparse models using bitmasks. Non-zero weights are stored in a 1d + values tensor, with their locations stored in a 2d bitmask + """ + + COMPRESSION_PARAM_NAMES = ["shape", "compressed", "bitmask", "row_offsets"] + + def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: + """ + Compresses a dense state dict using bitmask compression + + :param model_state: state dict of uncompressed model + :return: compressed state dict + """ + compressed_dict = {} + _LOGGER.debug( + f"Compressing model with {len(model_state)} parameterized layers..." + ) + for name, value in tqdm(model_state.items(), desc="Compressing model"): + bitmask_tensor = BitmaskTensor.from_dense(value) + bitmask_dict = bitmask_tensor.dict(name_prefix=name, device="cpu") + for key in bitmask_dict.keys(): + if key in compressed_dict: + _LOGGER.warn( + f"Expected all compressed state_dict keys to be unique, but " + f"found an existing entry for {key}. The existing entry will " + "be replaced." + ) + compressed_dict |= bitmask_dict + + return compressed_dict + + def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]: + """ + Reads a bitmask compressed state dict located at model_path and returns a + generator for sequentially decompressing back to a dense state dict + + :param model_path: path to compressed safetensors model + :return: iterator for generating decompressed weights + """ + weight_mappings = get_nested_weight_mappings( + model_path, self.COMPRESSION_PARAM_NAMES + ) + for weight_name in weight_mappings.keys(): + weight_data = {} + for param_name, safe_path in weight_mappings[weight_name].items(): + full_name = merge_names(weight_name, param_name) + with safe_open(safe_path, framework="pt", device="cpu") as f: + weight_data[param_name] = f.get_tensor(full_name) + data = BitmaskTensor(**weight_data) + decompressed = data.decompress() + yield weight_name, decompressed + + +class BitmaskTensor: + """ + Owns compressions and decompression for a single bitmask compressed tensor. 
Adapted from: https://github.com/mgoin/torch_bitmask/tree/main + + :param shape: shape of dense tensor + :param compressed: flat tensor of non-zero values + :param bitmask: 2d bitmask of non-zero values + :param row_offsets: flat tensor indicating what index in values each dense row starts at + """ + + def __init__( + self, + shape: Union[torch.Size, List], + compressed: Tensor, + bitmask: Tensor, + row_offsets: Tensor, + ): + self.shape = list(shape) + self.compressed = compressed + self.bitmask = bitmask + self.row_offsets = row_offsets + + @staticmethod + def from_dense(tensor: Tensor) -> "BitmaskTensor": + """ + :param tensor: dense tensor to compress + :return: instantiated compressed tensor + """ + shape = tensor.shape + compressed, bitmask, row_offsets = bitmask_compress(tensor.cpu()) + return BitmaskTensor( + shape=shape, compressed=compressed, bitmask=bitmask, row_offsets=row_offsets + ) + + def decompress(self) -> Tensor: + """ + :return: reconstructed dense tensor + """ + return bitmask_decompress(self.compressed, self.bitmask, self.shape) + + def curr_memory_size_bytes(self): + """ + :return: size in bytes required to store compressed tensor on disk + """ + + def sizeof_tensor(a): + return a.element_size() * a.nelement() + + return ( + sizeof_tensor(self.compressed) + + sizeof_tensor(self.bitmask) + + sizeof_tensor(self.row_offsets) + ) + + def dict(self, name_prefix: str, device: str = "cpu") -> Dict[str, Tensor]: + """ + :param name_prefix: name of original tensor to store compressed weight as + :param device: device to move the compressed tensors to + :return: dict of compressed data for the stored weight + """ + return { + merge_names(name_prefix, "shape"): torch.tensor(self.shape, device=device), + merge_names(name_prefix, "compressed"): self.compressed.to(device), + merge_names(name_prefix, "bitmask"): self.bitmask.to(device), + merge_names(name_prefix, "row_offsets"): self.row_offsets.to(device), + } + + def __repr__(self): + return f"BitmaskTensor(shape={self.shape}, compressed=True)" + + +def bitmask_compress(tensor: Tensor) -> Tuple[Tensor, Tensor, Tensor]: + """ + Compresses a dense tensor using bitmask compression + + :param tensor: dense tensor to compress + :return: tuple of compressed data representing tensor + """ + bytemasks = tensor != 0 + row_counts = bytemasks.sum(dim=-1) + row_offsets = torch.cumsum(row_counts, 0) - row_counts + values = tensor[bytemasks] + bitmasks_packed = pack_bitmasks(bytemasks) + + return values, bitmasks_packed, row_offsets + + +def bitmask_decompress( + values: Tensor, bitmasks: Tensor, original_shape: torch.Size +) -> Tensor: + """ + Reconstructs a dense tensor from a compressed one + + :param values: 1d tensor of non-zero values + :param bitmasks: 2d int8 tensor flagging locations of non-zero values in the + tensor's original shape + :param original_shape: shape of the dense tensor + :return: decompressed dense tensor + """ + bytemasks_unpacked = unpack_bitmasks(bitmasks, original_shape) + + decompressed_tensor = torch.zeros(original_shape, dtype=values.dtype) + decompressed_tensor[bytemasks_unpacked] = values + + return decompressed_tensor + + +def pack_bitmasks(bytemasks: Tensor) -> Tensor: + """ + Converts a bytemask tensor to a bitmask tensor to reduce memory. 
Shape RxC will be + compressed to R x ceil(C/8) + :param bytemasks: mask tensor where each byte corresponds to a weight + :return: mask tensor where each bit corresponds to a weight + """ + packed_bits_numpy = numpy.packbits(bytemasks.numpy(), axis=-1, bitorder="little") + packed_bits_torch = torch.from_numpy(packed_bits_numpy) + + return packed_bits_torch + + +def unpack_bitmasks(packed_bitmasks: Tensor, original_shape: torch.Size) -> Tensor: + """ + Converts a bitmask tensor back to a bytemask tensor for use during decompression + + :param packed_bitmasks: mask tensor where each bit corresponds to a weight + :param original_shape: dense shape to decompress to + :return: boolean mask of weights in the original dense shape + """ + # Unpack the bits + unpacked_bits = numpy.unpackbits( + packed_bitmasks.numpy(), axis=-1, count=original_shape[-1], bitorder="little" + ) + + # Reshape to match the original shape + unpacked_bitmasks_torch = torch.from_numpy( + unpacked_bits.reshape(original_shape).astype(bool) + ) + + return unpacked_bitmasks_torch diff --git a/src/compressed_tensors/config/__init__.py b/src/compressed_tensors/config/__init__.py new file mode 100644 index 00000000..ff83f5af --- /dev/null +++ b/src/compressed_tensors/config/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa +from .base import * +from .dense import * +from .sparse_bitmask import * diff --git a/src/compressed_tensors/config/base.py b/src/compressed_tensors/config/base.py new file mode 100644 index 00000000..f58b11f8 --- /dev/null +++ b/src/compressed_tensors/config/base.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional + +from compressed_tensors.registry import RegistryMixin +from pydantic import BaseModel + + +__all__ = ["CompressionConfig"] + + +class CompressionConfig(RegistryMixin, BaseModel): + """ + Base data class for storing compression parameters + + :param format: name of compression format + :param global_sparsity: average sparsity of the entire model + :param sparsity_structure: structure of the sparsity, such as + "unstructured", "2:4", "8:16" etc + """ + + format: str + global_sparsity: Optional[float] = 0.0 + sparsity_structure: Optional[str] = "unstructured" diff --git a/src/compressed_tensors/config/dense.py b/src/compressed_tensors/config/dense.py new file mode 100644 index 00000000..aa23220c --- /dev/null +++ b/src/compressed_tensors/config/dense.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +from compressed_tensors.config import CompressionConfig + + +__all__ = ["DenseSparsityConfig"] + + +@CompressionConfig.register(name="dense_sparsity") +class DenseSparsityConfig(CompressionConfig): + """ + Identity configuration for storing a sparse model in + an uncompressed dense format + + :param global_sparsity: average sparsity of the entire model + :param sparsity_structure: structure of the sparsity, such as + "unstructured", "2:4", "8:16" etc + """ + + format: str = "dense_sparsity" + global_sparsity: Optional[float] = 0.0 + sparsity_structure: Optional[str] = "unstructured" diff --git a/src/compressed_tensors/config/sparse_bitmask.py b/src/compressed_tensors/config/sparse_bitmask.py new file mode 100644 index 00000000..9b9cf211 --- /dev/null +++ b/src/compressed_tensors/config/sparse_bitmask.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional + +from compressed_tensors.config.base import CompressionConfig + + +__all__ = ["BitmaskConfig"] + + +@CompressionConfig.register(name="sparse_bitmask") +class BitmaskConfig(CompressionConfig): + """ + Configuration for storing a sparse model using + bitmask compression + + :param global_sparsity: average sparsity of the entire model + :param sparsity_structure: structure of the sparsity, such as + "unstructured", "2:4", "8:16" etc + """ + + format: str = "sparse_bitmask" + global_sparsity: Optional[float] = 0.0 + sparsity_structure: Optional[str] = "unstructured" diff --git a/src/compressed_tensors/quantization/__init__.py b/src/compressed_tensors/quantization/__init__.py new file mode 100644 index 00000000..9fde69a3 --- /dev/null +++ b/src/compressed_tensors/quantization/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa +# isort: skip_file + +from .quant_args import * +from .quant_config import * +from .quant_scheme import * +from .lifecycle import * diff --git a/src/compressed_tensors/quantization/lifecycle/__init__.py b/src/compressed_tensors/quantization/lifecycle/__init__.py new file mode 100644 index 00000000..9504597b --- /dev/null +++ b/src/compressed_tensors/quantization/lifecycle/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa +# isort: skip_file + +from .calibration import * +from .forward import * +from .frozen import * +from .initialize import * +from .apply import * diff --git a/src/compressed_tensors/quantization/lifecycle/apply.py b/src/compressed_tensors/quantization/lifecycle/apply.py new file mode 100644 index 00000000..08cb42f9 --- /dev/null +++ b/src/compressed_tensors/quantization/lifecycle/apply.py @@ -0,0 +1,105 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from collections import OrderedDict +from typing import Iterable, Optional + +from compressed_tensors.quantization.lifecycle.calibration import ( + set_module_for_calibration, +) +from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quantization +from compressed_tensors.quantization.lifecycle.initialize import ( + initialize_module_for_quantization, +) +from compressed_tensors.quantization.quant_config import ( + QuantizationConfig, + QuantizationStatus, +) +from compressed_tensors.quantization.utils import iter_named_leaf_modules +from torch.nn import Module + + +__all__ = [ + "apply_quantization_config", + "apply_quantization_status", +] + + +def apply_quantization_config(model: Module, config: QuantizationConfig): + """ + Initializes the model for quantization in-place based on the given config + + :param model: model to apply quantization config to + :param config: quantization config + """ + # build mapping of targets to schemes for easier matching + # use ordered dict to preserve target ordering in config + target_to_scheme = OrderedDict() + for scheme in config.config_groups.values(): + for target in scheme.targets: + target_to_scheme[target] = scheme + + # mark appropriate layers for quantization by setting their quantization schemes + for name, submodule in iter_named_leaf_modules(model): + if _find_first_name_or_class_match(name, submodule, config.ignore): + continue # layer matches ignore list, continue + target = _find_first_name_or_class_match(name, submodule, target_to_scheme) + if target is not None: + # target matched - add layer and scheme to target list + submodule.quantization_scheme = target_to_scheme[target] + + # apply current quantization status across all targeted layers + apply_quantization_status(model, config.quantization_status) + + +def apply_quantization_status(model: Module, status: QuantizationStatus): + """ + Applies in place the quantization lifecycle up to the given status + + :param model: model to apply quantization to + :param status: status to update the module to + """ + if status >= QuantizationStatus.INITIALIZED: + model.apply(initialize_module_for_quantization) + if status >= QuantizationStatus.CALIBRATION: + model.apply(set_module_for_calibration) + if status >= QuantizationStatus.FROZEN: + model.apply(freeze_module_quantization) + + +def _find_first_name_or_class_match( + name: str, + module: Module, + targets: Iterable[str], +) -> Optional[str]: + # first element of targets that matches the given name + # if no name matches returns first target that matches the class name + # returns None otherwise + return _find_first_match(name, targets) or _find_first_match( + module.__class__.__name__, targets + ) + + +def _find_first_match(value: str, targets: Iterable[str]) -> Optional[str]: + # returns first element of target that matches value either + # exactly or as a regex after 're:' + for target in targets: + if target.startswith("re:"): + pattern = target[3:] + if re.match(pattern, value): + return target + elif target == value: + return target + return None diff --git a/src/compressed_tensors/quantization/lifecycle/calibration.py b/src/compressed_tensors/quantization/lifecycle/calibration.py new file mode 100644 index 00000000..7ab1d896 --- /dev/null +++ b/src/compressed_tensors/quantization/lifecycle/calibration.py @@ -0,0 +1,51 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from torch.nn import Module
+
+
+__all__ = [
+    "set_module_for_calibration",
+]
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def set_module_for_calibration(module: Module):
+    """
+    marks a layer as ready for calibration which activates observers
+    to update scales and zero points on each forward pass
+
+    apply to full model with `model.apply(set_module_for_calibration)`
+
+    :param module: module to set for calibration
+    """
+    if not getattr(module, "quantization_scheme", None):
+        # no quantization scheme nothing to do
+        return
+    status = getattr(module, "quantization_status", None)
+    if not status or status != QuantizationStatus.INITIALIZED:
+        # warn and continue; logger.warning returns None and cannot be raised
+        _LOGGER.warning(
+            f"Attempting to set module with status {status} to calibration mode, "
+            f"but status is not {QuantizationStatus.INITIALIZED} - you may "
+            "be calibrating an uninitialized module, which may fail, or attempting "
+            "to re-calibrate a frozen module"
+        )
+
+    module.quantization_status = QuantizationStatus.CALIBRATION
diff --git a/src/compressed_tensors/quantization/lifecycle/forward.py b/src/compressed_tensors/quantization/lifecycle/forward.py
new file mode 100644
index 00000000..2118cf74
--- /dev/null
+++ b/src/compressed_tensors/quantization/lifecycle/forward.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
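A runnable sketch tying the lifecycle entrypoints together on a toy model; the `QuantizationConfig`/`QuantizationScheme`/`QuantizationArgs` classes appear later in this patch:

```python
import torch
from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationConfig,
    QuantizationScheme,
    apply_quantization_config,
)

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 2))
config = QuantizationConfig(
    config_groups={
        "group_0": QuantizationScheme(
            targets=["Linear"],  # match every Linear layer by class name
            weights=QuantizationArgs(num_bits=8, symmetric=True),
        )
    }
)
apply_quantization_config(model, config)  # attach schemes, status -> INITIALIZED
assert hasattr(model[0], "weight_scale") and hasattr(model[0], "weight_observer")
```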
+
+from functools import wraps
+
+import torch
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from torch.nn import Module
+
+
+__all__ = [
+    "wrap_module_forward_quantized",
+    "quantize",
+    "dequantize",
+    "fake_quantize",
+    "maybe_calibrate_or_quantize",
+]
+
+
+@torch.no_grad()
+def quantize(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    q_min: torch.Tensor,
+    q_max: torch.Tensor,
+) -> torch.Tensor:
+    return torch.clamp(
+        torch.round(
+            x / scale + zero_point,
+        ),
+        q_min,
+        q_max,
+    )
+
+
+@torch.no_grad()
+def dequantize(
+    x_q: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+) -> torch.Tensor:
+    return (x_q - zero_point) * scale
+
+
+@torch.no_grad()
+def fake_quantize(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    args: QuantizationArgs,
+) -> torch.Tensor:
+    bit_range = 2**args.num_bits
+    max_q = torch.tensor(bit_range / 2 - 1, device=x.device)
+    min_q = torch.tensor(-bit_range / 2, device=x.device)
+    Q = quantize(x, scale, zero_point, min_q, max_q)
+    return dequantize(Q, scale, zero_point)
+
+
+def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
+    # expects a module already initialized and injected with the parameters in
+    # initialize_module_for_quantization
+    forward_func_orig = module.forward.__func__
+
+    @wraps(forward_func_orig)  # ensures docstring, names, etc are propagated
+    def wrapped_forward(self, *args, **kwargs):
+        input_ = args[0]
+
+        if scheme.input_activations is not None:
+            # calibrate and (fake) quantize input activations when applicable
+            input_ = maybe_calibrate_or_quantize(
+                module, input_, "input", scheme.input_activations
+            )
+
+        if scheme.weights is not None:
+            # calibrate and (fake) quantize weights when applicable
+            self.weight.data = maybe_calibrate_or_quantize(
+                module, self.weight, "weight", scheme.weights
+            )
+
+        # perform wrapped forward call
+        output = forward_func_orig.__get__(module, module.__class__)(
+            input_, *args[1:], **kwargs
+        )
+
+        if scheme.output_activations is not None:
+            # calibrate and (fake) quantize output activations when applicable
+            output = maybe_calibrate_or_quantize(
+                module, output, "output", scheme.output_activations
+            )
+
+        return output
+
+    # bind wrapped forward to module class so reference to `self` is correct
+    bound_wrapped_forward = wrapped_forward.__get__(module, module.__class__)
+    # set forward to wrapped forward
+    setattr(module, "forward", bound_wrapped_forward)
+
+
+def maybe_calibrate_or_quantize(
+    module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs"
+) -> torch.Tensor:
+    # only run quantization for the included stages
+    if module.quantization_status not in {
+        QuantizationStatus.CALIBRATION,
+        QuantizationStatus.FROZEN,
+    }:
+        return value
+
+    device = next(module.parameters()).device
+    scale = getattr(module, f"{base_name}_scale")
+    zero_point = getattr(module, f"{base_name}_zero_point")
+
+    if module.quantization_status == QuantizationStatus.CALIBRATION:
+        # get observer and get new quant params from observation
+        observer = getattr(module, f"{base_name}_observer")
+        updated_scale, updated_zero_point = observer(value)
+
+        # update scale and zero point
+        scale.data = updated_scale.to(device)
+        zero_point.data = updated_zero_point.to(device)
+
+    return fake_quantize(value, scale, zero_point, args)
diff --git a/src/compressed_tensors/quantization/lifecycle/frozen.py b/src/compressed_tensors/quantization/lifecycle/frozen.py
new file mode 100644
index 00000000..3fa91fa9
--- /dev/null
+++ b/src/compressed_tensors/quantization/lifecycle/frozen.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from torch.nn import Module
+
+
+__all__ = [
+    "freeze_module_quantization",
+]
+
+
+def freeze_module_quantization(module: Module):
+    """
+    deletes observers so static quantization is completed.
+
+    apply to full model with `model.apply(freeze_module_quantization)`
+
+    :param module: module to freeze quantization for
+    """
+    if not getattr(module, "quantization_scheme", None):
+        # no quantization scheme nothing to do
+        return
+
+    # delete observers from module
+    submodule_names_to_delete = set()
+    for submodule_name, _ in module.named_modules():
+        if "." not in submodule_name and submodule_name.endswith("_observer"):
+            # delete any observers that belong directly to this module
+            submodule_names_to_delete.add(submodule_name)
+
+    for submodule_name in submodule_names_to_delete:
+        delattr(module, submodule_name)
+
+    module.quantization_status = QuantizationStatus.FROZEN
diff --git a/src/compressed_tensors/quantization/lifecycle/initialize.py b/src/compressed_tensors/quantization/lifecycle/initialize.py
new file mode 100644
index 00000000..4ef6379b
--- /dev/null
+++ b/src/compressed_tensors/quantization/lifecycle/initialize.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
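Worked numbers for the `quantize`/`dequantize` pair defined in forward.py above, assuming an 8-bit signed range:

```python
import torch
from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize

x = torch.tensor([-1.5, 0.0, 2.5])
scale = torch.tensor(0.1)
zero_point = torch.tensor(0)
q_min, q_max = torch.tensor(-128.0), torch.tensor(127.0)

x_q = quantize(x, scale, zero_point, q_min, q_max)  # tensor([-15., 0., 25.])
x_dq = dequantize(x_q, scale, zero_point)           # exact round trip here
assert torch.allclose(x_dq, x)
```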
+
+
+import logging
+from typing import Optional
+
+import torch
+from compressed_tensors.quantization.lifecycle.forward import (
+    wrap_module_forward_quantized,
+)
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from torch.nn import Module, Parameter
+
+
+__all__ = [
+    "initialize_module_for_quantization",
+]
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def initialize_module_for_quantization(
+    module: Module,
+    scheme: Optional[QuantizationScheme] = None,
+):
+    """
+    attaches appropriate scales, zero points, and observers to a layer
+    given its target quantization scheme
+
+    apply to full model with `model.apply(initialize_module_for_quantization)`
+
+    :param module: module to initialize for quantization
+    :param scheme: scheme to use for quantization. If None is provided, the scheme
+        stored in the module under `quantization_scheme` is used; if that is also
+        missing, the layer is skipped
+    """
+    scheme = scheme or getattr(module, "quantization_scheme", None)
+    if scheme is None:
+        # no scheme passed and layer not targeted for quantization - skip
+        return
+
+    if scheme.input_activations is not None:
+        _initialize_scale_zero_point_observer(module, "input", scheme.input_activations)
+    if scheme.weights is not None:
+        if hasattr(module, "weight"):
+            _initialize_scale_zero_point_observer(module, "weight", scheme.weights)
+        else:
+            _LOGGER.warning(
+                f"module type {type(module)} targeted for weight quantization but "
+                "has no attribute weight, skipping weight quantization"
+            )
+    if scheme.output_activations is not None:
+        _initialize_scale_zero_point_observer(
+            module, "output", scheme.output_activations
+        )
+
+    module.quantization_scheme = scheme
+    module.quantization_status = QuantizationStatus.INITIALIZED
+
+    # wrap forward call of module to perform quantized actions based on calltime status
+    wrap_module_forward_quantized(module, scheme)
+
+
+def _initialize_scale_zero_point_observer(
+    module: Module, base_name: str, quantization_args: QuantizationArgs
+):
+    device = next(module.parameters()).device
+
+    # initializes empty scale and zero point parameters for the module
+    init_scale = Parameter(torch.empty(0, device=device), requires_grad=False)
+    module.register_parameter(f"{base_name}_scale", init_scale)
+
+    init_zero_point = Parameter(
+        torch.empty(0, device=device, dtype=int), requires_grad=False
+    )
+    module.register_parameter(f"{base_name}_zero_point", init_zero_point)
+
+    # initialize observer module and attach as submodule
+    observer = quantization_args.get_observer()
+    module.register_module(f"{base_name}_observer", observer)
diff --git a/src/compressed_tensors/quantization/observers/__init__.py b/src/compressed_tensors/quantization/observers/__init__.py
new file mode 100644
index 00000000..d0362b8f
--- /dev/null
+++ b/src/compressed_tensors/quantization/observers/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa + +from .base import * +from .memoryless import * +from .min_max import * diff --git a/src/compressed_tensors/quantization/observers/base.py b/src/compressed_tensors/quantization/observers/base.py new file mode 100644 index 00000000..96fe1049 --- /dev/null +++ b/src/compressed_tensors/quantization/observers/base.py @@ -0,0 +1,69 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple + +from compressed_tensors.quantization.quant_args import QuantizationArgs +from compressed_tensors.registry.registry import RegistryMixin +from torch import FloatTensor, IntTensor, Tensor +from torch.nn import Module + + +__all__ = ["Observer"] + + +class Observer(Module, RegistryMixin): + """ + Base Observer class to be subclassed for specific implementation. 
+    Subclasses should override `calculate_qparams` to return a scale, zero_point
+    pair
+    """
+
+    def __init__(self, quantization_args: QuantizationArgs):
+        # initialize Module state before assigning attributes
+        super().__init__()
+        self.quantization_args: QuantizationArgs = quantization_args
+        self._scale = None
+        self._zero_point = None
+
+    def forward(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
+        """
+        maps directly to get_qparams
+
+        :param observed: optional observed tensor to calculate quantization parameters
+            from
+        :return: tuple of scale and zero point based on last observed value
+        """
+        return self.get_qparams(observed=observed)
+
+    def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
+        """
+        :param observed: observed tensor to calculate quantization parameters for
+        :return: tuple of scale and zero point derived from the observed tensor
+        """
+        raise NotImplementedError(f"{self.__class__} must implement calculate_qparams")
+
+    def get_qparams(
+        self, observed: Optional[Tensor] = None
+    ) -> Tuple[FloatTensor, IntTensor]:
+        """
+        Convenience function to wrap overwritten calculate_qparams
+        adds support to make observed tensor optional and support for tracking latest
+        calculated scale and zero point
+
+        :param observed: optional observed tensor to calculate quantization parameters
+            from
+        :return: tuple of scale and zero point based on last observed value
+        """
+        if observed is not None:
+            # re-calculate scale and zero point, update the stored value
+            self._scale, self._zero_point = self.calculate_qparams(observed)
+        return self._scale, self._zero_point
diff --git a/src/compressed_tensors/quantization/observers/memoryless.py b/src/compressed_tensors/quantization/observers/memoryless.py
new file mode 100644
index 00000000..0ba4d9f6
--- /dev/null
+++ b/src/compressed_tensors/quantization/observers/memoryless.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
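A toy `Observer` subclass sketch; the `toy` name and the fixed 8-bit math are hypothetical, for illustration only:

```python
from typing import Tuple

import torch
from compressed_tensors.quantization.observers.base import Observer
from torch import FloatTensor, IntTensor, Tensor


@Observer.register("toy")  # hypothetical registry name
class ToyObserver(Observer):
    def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
        # fixed 8-bit asymmetric range; ignores self.quantization_args
        scale = (observed.max() - observed.min()) / 255
        zero_point = torch.tensor(0, dtype=torch.int8)
        return scale, zero_point
```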
+ +from typing import Tuple + +import torch +from compressed_tensors.quantization.observers.base import Observer +from torch import FloatTensor, IntTensor, Tensor + + +__all__ = ["MemorylessObserver"] + + +@Observer.register("memoryless") +class MemorylessObserver(Observer): + """ + Implements a dynamic quantization observer that sets the scale and + zero point based on the latest observed value + """ + + def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]: + """ + :param observed: observed tensor to calculate quantization parameters for + :return: tuple of scale and zero point derived from the observed tensor + """ + # TODO: Add support for full range of quantization Args, only supports 8bit + # per tensor + bit_range = 255 + min_val = observed.min() + max_val = observed.max() + + # ensure zero is in the range + min_val = torch.min(min_val, torch.zeros_like(min_val)) + max_val = torch.max(max_val, torch.zeros_like(max_val)) + + if self.quantization_args.symmetric: + symmetric_range = 2 * max(min_val.abs(), max_val.abs()) + scale = symmetric_range / bit_range + zero_point = torch.tensor(0).to(torch.int8) + else: + # non-symmetric + observed_range = max_val - min_val + scale = observed_range / bit_range + + # scales from a 0 range should be set to 1 + scale[observed_range == 0] = 1 + + zero_point = ((0 - min_val) / scale).to(torch.int8) + + return scale, zero_point diff --git a/src/compressed_tensors/quantization/observers/min_max.py b/src/compressed_tensors/quantization/observers/min_max.py new file mode 100644 index 00000000..eb575df1 --- /dev/null +++ b/src/compressed_tensors/quantization/observers/min_max.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
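Worked numbers for the asymmetric branch of `MemorylessObserver` above:

```python
import torch
from compressed_tensors.quantization.observers.memoryless import MemorylessObserver
from compressed_tensors.quantization.quant_args import QuantizationArgs

observer = MemorylessObserver(QuantizationArgs(num_bits=8, symmetric=False))
scale, zero_point = observer(torch.tensor([[-1.0, 0.0, 3.0]]))
# observed_range = 3 - (-1) = 4, so scale = 4 / 255 ~= 0.0157
# zero_point = (0 - (-1)) / scale = 63.75, truncated to 63 by the int8 cast
```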
+
+from typing import Tuple
+
+import torch
+from compressed_tensors.quantization.observers.base import Observer
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from torch import FloatTensor, IntTensor, Tensor
+
+
+__all__ = ["MinMaxObserver"]
+
+
+@Observer.register("minmax")
+class MinMaxObserver(Observer):
+    """
+    Implements a quantization observer that calculates scale and zero point based on
+    a running average of the observed min and max values
+    """
+
+    def __init__(self, quantization_args: QuantizationArgs):
+        super().__init__(quantization_args=quantization_args)
+
+        self.min_val = float("inf")
+        self.max_val = -float("inf")
+        self.counter = 0
+
+    def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
+        """
+        :param observed: observed tensor to calculate quantization parameters for
+        :return: tuple of scale and zero point derived from the observed tensor
+        """
+        # TODO: Add support for full range of quantization args, only supports 8bit
+        # per tensor
+        bit_range = 255
+        min_val = torch.tensor([observed.min()])
+        max_val = torch.tensor([observed.max()])
+
+        # update running average
+        if self.counter > 0:
+            self.min_val = (self.min_val * self.counter + min_val) / (self.counter + 1)
+            self.max_val = (self.max_val * self.counter + max_val) / (self.counter + 1)
+        else:
+            self.min_val = min_val
+            self.max_val = max_val
+
+        # ensure that the zeros are in the range
+        min_val = torch.min(self.min_val, torch.zeros_like(self.min_val))
+        max_val = torch.max(self.max_val, torch.zeros_like(self.max_val))
+
+        self.counter += 1
+
+        if self.quantization_args.symmetric:
+            symmetric_range = 2 * max(min_val.abs(), max_val.abs())
+            scale = symmetric_range / bit_range
+            zero_point = torch.tensor(0).to(torch.int8)
+        else:
+            # non-symmetric
+            observed_range = max_val - min_val
+            scale = observed_range / bit_range
+
+            # scales from a 0 range should be set to 1
+            scale[observed_range == 0] = 1
+
+            zero_point = ((0 - min_val) / scale).to(torch.int8)
+
+        return scale, zero_point
diff --git a/src/compressed_tensors/quantization/quant_args.py b/src/compressed_tensors/quantization/quant_args.py
new file mode 100644
index 00000000..64b5005f
--- /dev/null
+++ b/src/compressed_tensors/quantization/quant_args.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
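The running-average behavior of `MinMaxObserver` over two batches, with symmetric scales:

```python
import torch
from compressed_tensors.quantization.observers.min_max import MinMaxObserver
from compressed_tensors.quantization.quant_args import QuantizationArgs

observer = MinMaxObserver(QuantizationArgs(num_bits=8, symmetric=True))
scale1, _ = observer(torch.tensor([[-1.0, 1.0]]))  # running min/max start at -1/1
scale2, _ = observer(torch.tensor([[-3.0, 3.0]]))  # averaged to -2/2
# symmetric scale grows from 2 * 1 / 255 to 2 * 2 / 255
assert float(scale2) > float(scale1)
```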
+
+from enum import Enum
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field
+
+
+__all__ = ["QuantizationType", "QuantizationStrategy", "QuantizationArgs"]
+
+
+class QuantizationType(str, Enum):
+    """
+    Enum storing quantization type options
+    """
+
+    INT = "int"
+    FLOAT = "float"
+
+
+class QuantizationStrategy(str, Enum):
+    """
+    Enum storing quantization strategy options
+    """
+
+    TENSOR = "tensor"
+    CHANNEL = "channel"
+    GROUP = "group"
+    BLOCK = "block"
+
+
+class QuantizationArgs(BaseModel):
+    """
+    User facing arguments used to define a quantization config for weights or
+    activations
+
+    :param num_bits: quantization bit depth
+    :param type: dtype to quantize to, either int or float
+    :param symmetric: whether or not quantization scale is symmetric about zero-point
+    :param strategy: string id determining the scope of scale/zero-point to apply
+    :param group_size: group length to use for the group strategy
+    :param block_structure: 2d block structure to use for the block strategy, must be
+        of the format "2x4", "8x16", etc.
+    """
+
+    num_bits: int = 8
+    type: QuantizationType = QuantizationType.INT
+    symmetric: bool = True
+    strategy: QuantizationStrategy = QuantizationStrategy.TENSOR
+    group_size: Optional[int] = None
+    block_structure: Optional[str] = None
+    observer: str = Field(
+        default="minmax",
+        description=(
+            "The class to use to compute the quantization params - "
+            "scale and zero-point"
+        ),
+    )
+    observer_kwargs: Dict[str, Any] = Field(
+        default_factory=dict,
+        description=(
+            "optional dict of kwargs to be passed directly to torch quantization "
+            "Observers constructor excluding quantization range or symmetry"
+        ),
+    )
+
+    def get_observer(self):
+        """
+        :return: Observer built based on these QuantizationArgs, loaded from the
+            Observer registry
+        """
+        from compressed_tensors.quantization.observers.base import Observer
+
+        return Observer.load_from_registry(self.observer, quantization_args=self)
diff --git a/src/compressed_tensors/quantization/quant_config.py b/src/compressed_tensors/quantization/quant_config.py
new file mode 100644
index 00000000..a62a79bd
--- /dev/null
+++ b/src/compressed_tensors/quantization/quant_config.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
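Constructing args and resolving their observer through the registry; a short sketch:

```python
from compressed_tensors.quantization.quant_args import (
    QuantizationArgs,
    QuantizationStrategy,
)

args = QuantizationArgs(
    num_bits=8,
    symmetric=True,
    strategy=QuantizationStrategy.TENSOR,
)
observer = args.get_observer()  # resolves the default "minmax" observer by name
```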
+
+from enum import Enum
+from typing import Dict, List, Optional
+
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from compressed_tensors.quantization.utils import (
+    calculate_compression_ratio,
+    is_module_quantized,
+    iter_named_leaf_modules,
+    module_type,
+)
+from pydantic import BaseModel, Field
+from torch.nn import Module
+
+
+__all__ = [
+    "QuantizationStatus",
+    "QuantizationConfig",
+    "LIFECYCLE_ORDER",
+]
+
+
+class QuantizationStatus(str, Enum):
+    """
+    Enum storing the different states a quantized layer can be in
+
+    Initialized: scale, zero points and observers have been attached to the layer but
+    are set to dummy values (not yet calibrated)
+    Calibration: scale and zero points have been calibrated through OBCQ or similar
+    algorithm, observers are still attached
+    Frozen: scale and zero points are finalized, observers have been deleted, weights
+    are still in their original precision
+    Compressed: weights have been converted to their target type or compressed to
+    their closest approximation
+    """
+
+    INITIALIZED = "initialized"
+    CALIBRATION = "calibration"
+    FROZEN = "frozen"
+    COMPRESSED = "compressed"
+
+    @classmethod
+    def lifecycle_order(cls) -> List["QuantizationStatus"]:
+        """
+        :return: list of correct quantization lifecycle order
+        """
+        return LIFECYCLE_ORDER
+
+    def __ge__(self, other):
+        if not isinstance(other, self.__class__):
+            raise NotImplementedError
+        return LIFECYCLE_ORDER.index(self) >= LIFECYCLE_ORDER.index(other)
+
+
+LIFECYCLE_ORDER = [
+    QuantizationStatus.INITIALIZED,
+    QuantizationStatus.CALIBRATION,
+    QuantizationStatus.FROZEN,
+    QuantizationStatus.COMPRESSED,
+]
+
+
+class QuantizationConfig(BaseModel):
+    """
+    Full configuration specifying how a model is quantized. Each quantized layer is
+    mapped to a QuantizationScheme in config_groups.
+
+    :param config_groups: dict of QuantizationSchemes specifying the quantization
+        settings for each quantized layer
+    :param quant_method: a constant used to differentiate sparseML quantization from
+        other quantization configs
+    :param format: specifies how the quantized model is stored on disk
+    :param quantization_status: specifies the current status of all quantized layers.
+        It is assumed all layers are in the same state.
+    :param global_compression_ratio: optional informational config to report the model
+        compression ratio achieved by the quantization config
+    :param ignore: optional list of layers to ignore from config_groups.
+        Layers in this list are not quantized even if they match up with a target in
+        config_groups
+    """
+
+    config_groups: Dict[str, QuantizationScheme]
+    quant_method: str = "sparseml"
+    format: str = "fakequant"
+    quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
+    global_compression_ratio: Optional[float] = None
+    ignore: Optional[List[str]] = Field(default_factory=list)
+
+    @staticmethod
+    def from_pretrained(model: Module) -> "QuantizationConfig":
+        """
+        Converts a model into its associated QuantizationConfig based on the
+        QuantizationScheme attached to each quantized module
+
+        :param model: model to calculate quantization scheme of
+        :return: filled out QuantizationConfig for the input model
+        """
+        quant_scheme_to_layers = []
+        quantization_status = None
+        ignore = {}
+        quantization_type_names = set()
+        for name, submodule in iter_named_leaf_modules(model):
+            layer_type = module_type(submodule)
+            if not is_module_quantized(submodule):
+                if layer_type not in ignore:
+                    ignore[layer_type] = []
+                ignore[layer_type].append(name)
+            else:
+                quantization_status = submodule.quantization_status
+                scheme = submodule.quantization_scheme
+                quantization_type_names.add(layer_type)
+
+                match_found = False
+                for existing_scheme in quant_scheme_to_layers:
+                    if scheme == existing_scheme:
+                        match_found = True
+                        break
+                if not match_found:
+                    quant_scheme_to_layers.append(scheme)
+
+        # clean up ignore list, we can leave out layer types if none of the
+        # instances are quantized
+        consolidated_ignore = []
+        for layer_type, ignore_names in ignore.items():
+            if layer_type in quantization_type_names:
+                # specific layers of a quantized type are ignored
+                consolidated_ignore += ignore_names
+            # else we leave it off the ignore list, doesn't fall under any of the
+            # existing quantization schemes so it won't be quantized
+
+        config_groups = {}
+        for idx, scheme in enumerate(quant_scheme_to_layers):
+            group_name = "group_" + str(idx)
+            config_groups[group_name] = scheme
+
+        compression_ratio = calculate_compression_ratio(model)
+        return QuantizationConfig(
+            config_groups=config_groups,
+            quantization_status=quantization_status,
+            global_compression_ratio=compression_ratio,
+            ignore=consolidated_ignore,
+        )
diff --git a/src/compressed_tensors/quantization/quant_scheme.py b/src/compressed_tensors/quantization/quant_scheme.py
new file mode 100644
index 00000000..ed0f8245
--- /dev/null
+++ b/src/compressed_tensors/quantization/quant_scheme.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
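The `__ge__` defined on `QuantizationStatus` above is what drives `apply_quantization_status`; a quick illustration of the lifecycle ordering:

```python
from compressed_tensors.quantization import QuantizationStatus

# ordering follows LIFECYCLE_ORDER, not string comparison
assert QuantizationStatus.FROZEN >= QuantizationStatus.CALIBRATION
assert not (QuantizationStatus.INITIALIZED >= QuantizationStatus.FROZEN)
```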
+ +from typing import List, Optional + +from compressed_tensors.quantization.quant_args import QuantizationArgs +from pydantic import BaseModel + + +__all__ = ["QuantizationScheme"] + + +class QuantizationScheme(BaseModel): + """ + Set of QuantizationArgs defining how the weights, inputs and outputs of target list + of modules should be quantized + + :param targets: list of modules to apply the QuantizationArgs to, can be layer + names, layer types or a regular expression + :param weights: quantization config for layer weights + :param input_activations: quantization config for layer inputs + :param output_activations: quantization config for layer outputs + """ + + targets: List[str] + weights: Optional[QuantizationArgs] = None + input_activations: Optional[QuantizationArgs] = None + output_activations: Optional[QuantizationArgs] = None diff --git a/src/compressed_tensors/quantization/utils/__init__.py b/src/compressed_tensors/quantization/utils/__init__.py new file mode 100644 index 00000000..a91f9e5d --- /dev/null +++ b/src/compressed_tensors/quantization/utils/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa +from .helpers import * diff --git a/src/compressed_tensors/quantization/utils/helpers.py b/src/compressed_tensors/quantization/utils/helpers.py new file mode 100644 index 00000000..3c00cdbe --- /dev/null +++ b/src/compressed_tensors/quantization/utils/helpers.py @@ -0,0 +1,115 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
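A scheme construction sketch; the `re:` prefix is matched as a regex by `_find_first_name_or_class_match` in apply.py earlier in this patch, and the projection layer names here are hypothetical:

```python
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme

scheme = QuantizationScheme(
    targets=["re:.*q_proj$", "re:.*k_proj$"],  # regex targets on layer names
    weights=QuantizationArgs(num_bits=8, symmetric=True),
    input_activations=QuantizationArgs(num_bits=8, symmetric=False),
)
```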
+
+from typing import Generator, Tuple
+
+import torch
+from torch.nn import Module
+from tqdm import tqdm
+
+
+__all__ = [
+    "is_module_quantized",
+    "is_model_quantized",
+    "iter_named_leaf_modules",
+    "module_type",
+    "calculate_compression_ratio",
+]
+
+
+def is_module_quantized(module: Module) -> bool:
+    """
+    Check if a module is quantized, based on the existence of a non-empty quantization
+    scheme
+
+    :param module: pytorch module to check
+    :return: True if module is quantized, False otherwise
+    """
+    if not hasattr(module, "quantization_scheme"):
+        return False
+
+    if module.quantization_scheme.weights is not None:
+        return True
+
+    if module.quantization_scheme.input_activations is not None:
+        return True
+
+    if module.quantization_scheme.output_activations is not None:
+        return True
+
+    return False
+
+
+def is_model_quantized(model: Module) -> bool:
+    """
+    Check if any modules in a model are quantized, based on the existence of a
+    non-empty quantization scheme in at least one module
+
+    :param model: pytorch model
+    :return: True if model is quantized, False otherwise
+    """
+
+    for _, submodule in iter_named_leaf_modules(model):
+        if is_module_quantized(submodule):
+            return True
+
+    return False
+
+
+def module_type(module: Module) -> str:
+    """
+    Gets a string representation of a module type
+
+    :module: pytorch module to get type of
+    :return: module type as a string
+    """
+    return type(module).__name__
+
+
+def iter_named_leaf_modules(model: Module) -> Generator[Tuple[str, Module], None, None]:
+    # yields modules that do not have any submodules
+    # TODO: potentially expand to add list of allowed submodules such as observers
+    for name, submodule in model.named_modules():
+        if len(list(submodule.children())) == 0:
+            yield name, submodule
+
+
+def calculate_compression_ratio(model: Module) -> float:
+    """
+    Calculates the quantization compression ratio of a pytorch model, based on the
+    number of bits needed to represent the total weights in compressed form. Does not
+    take into account activation quantizations.
+
+    :param model: pytorch module to calculate compression ratio for
+    :return: compression ratio of the whole model
+    """
+    total_compressed = 0.0
+    total_uncompressed = 0.0
+    for _, submodule in tqdm(
+        iter_named_leaf_modules(model),
+        desc="Calculating quantization compression ratio",
+    ):
+        # count only this submodule's own parameters so layers are not double
+        # counted and quantized layers are weighted correctly
+        for parameter in submodule.parameters():
+            try:
+                uncompressed_bits = torch.finfo(parameter.dtype).bits
+            except TypeError:
+                uncompressed_bits = torch.iinfo(parameter.dtype).bits
+            compressed_bits = uncompressed_bits
+            if is_module_quantized(submodule):
+                compressed_bits = submodule.quantization_scheme.weights.num_bits
+            num_weights = parameter.numel()
+            total_compressed += compressed_bits * num_weights
+            total_uncompressed += uncompressed_bits * num_weights
+
+    return total_uncompressed / total_compressed
diff --git a/src/compressed_tensors/registry/__init__.py b/src/compressed_tensors/registry/__init__.py
new file mode 100644
index 00000000..241d9d55
--- /dev/null
+++ b/src/compressed_tensors/registry/__init__.py
@@ -0,0 +1,17 @@
+# flake8: noqa
+
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .registry import *
diff --git a/src/compressed_tensors/registry/registry.py b/src/compressed_tensors/registry/registry.py
new file mode 100644
index 00000000..d8d8bc6d
--- /dev/null
+++ b/src/compressed_tensors/registry/registry.py
@@ -0,0 +1,360 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Universal registry to support registration and loading of child classes and plugins
+of neuralmagic utilities
+"""
+
+import importlib.util  # explicit submodule import, needed for plugin loading below
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Type, Union
+
+
+__all__ = [
+    "RegistryMixin",
+    "register",
+    "get_from_registry",
+    "registered_names",
+    "registered_aliases",
+    "standardize_lookup_name",
+]
+
+
+_ALIAS_REGISTRY: Dict[Type, Dict[str, str]] = defaultdict(dict)
+_REGISTRY: Dict[Type, Dict[str, Any]] = defaultdict(dict)
+
+
+def standardize_lookup_name(name: str) -> str:
+    """
+    Standardize the given name for lookup in the registry.
+    This will replace all underscores and spaces with hyphens and
+    convert the name to lowercase.
+
+    example:
+    ```
+    standardize_lookup_name("Foo_bar baz") == "foo-bar-baz"
+    ```
+
+    :param name: name to standardize
+    :return: standardized name
+    """
+    return name.replace("_", "-").replace(" ", "-").lower()
+
+
+def standardize_alias_name(
+    name: Union[None, str, List[str]]
+) -> Union[None, str, List[str]]:
+    if name is None:
+        return None
+    elif isinstance(name, str):
+        return standardize_lookup_name(name)
+    else:  # isinstance(name, list)
+        return [standardize_lookup_name(n) for n in name]
+
+
+class RegistryMixin:
+    """
+    Universal registry to support registration and loading of child classes and
+    plugins of neuralmagic utilities.
+
+    Classes that require a registry or plugins may add the `RegistryMixin` and use
+    `register` and `load_from_registry` as the main entrypoints for adding new
+    implementations and loading requested values from the registry.
+
+    If a class should only have its child classes in its registry, the class should
+    set the static attribute `registry_requires_subclass` to True
+
+    example
+    ```python
+    class Dataset(RegistryMixin):
+        pass
+
+
+    # register with default name
+    @Dataset.register()
+    class ImageNetDataset(Dataset):
+        pass
+
+    # load as "ImageNetDataset"
+    imagenet = Dataset.load_from_registry("ImageNetDataset")
+
+    # register with custom name
+    @Dataset.register(name="cifar-dataset")
+    class Cifar(Dataset):
+        pass
+
+    Note: the name will be standardized for lookup in the registry.
+    For example, if a class is registered as "cifar_dataset" or
+    "cifar dataset", it will be stored as "cifar-dataset". The user
+    will be able to load the class with any of the three name variants.
+
+    # register with multiple aliases
+    @Dataset.register(alias=["cifar-10-dataset", "cifar_100_dataset"])
+    class Cifar(Dataset):
+        pass
+
+    # load as "cifar-dataset"
+    cifar = Dataset.load_from_registry("cifar-dataset")
+
+    # load from custom file that implements a dataset
+    mnist = Dataset.load_from_registry("/path/to/mnist_dataset.py:MnistDataset")
+    ```
+    """
+
+    # set to True in child class to add check that registered/retrieved values
+    # implement the class it is registered to
+    registry_requires_subclass: bool = False
+
+    @classmethod
+    def register(
+        cls, name: Optional[str] = None, alias: Union[List[str], str, None] = None
+    ):
+        """
+        Decorator for registering a value (i.e. class or function) wrapped by this
+        decorator to the base class (class that .register is called from)
+
+        :param name: name or list of names to register the wrapped value as,
+            defaults to value.__name__
+        :param alias: alias or list of aliases to register the wrapped value as,
+            defaults to None
+        :return: register decorator
+        """
+
+        def decorator(value: Any):
+            cls.register_value(value, name=name, alias=alias)
+            return value
+
+        return decorator
+
+    @classmethod
+    def register_value(
+        cls,
+        value: Any,
+        name: Optional[str] = None,
+        alias: Union[str, List[str], None] = None,
+    ):
+        """
+        Registers the given value to the class `.register_value` is called from
+
+        :param value: value to register
+        :param name: name to register the wrapped value as,
+            defaults to value.__name__
+        :param alias: alias or list of aliases to register the wrapped value as,
+            defaults to None
+        """
+        register(
+            parent_class=cls,
+            value=value,
+            name=name,
+            alias=alias,
+            require_subclass=cls.registry_requires_subclass,
+        )
+
+    @classmethod
+    def load_from_registry(cls, name: str, **constructor_kwargs) -> object:
+        """
+        :param name: name of registered class to load
+        :param constructor_kwargs: arguments to pass to the constructor retrieved
+            from the registry
+        :return: loaded object registered to this class under the given name,
+            constructed with the given kwargs. Raises an error if the name is
+            not found in the registry
+        """
+        constructor = cls.get_value_from_registry(name=name)
+        return constructor(**constructor_kwargs)
+
+    @classmethod
+    def get_value_from_registry(cls, name: str):
+        """
+        :param name: name to retrieve from the registry
+        :return: value retrieved from the registry for the given name, raises
+            error if not found
+        """
+        return get_from_registry(
+            parent_class=cls,
+            name=name,
+            require_subclass=cls.registry_requires_subclass,
+        )
+
+    @classmethod
+    def registered_names(cls) -> List[str]:
+        """
+        :return: list of all names registered to this class
+        """
+        return registered_names(cls)
+
+    @classmethod
+    def registered_aliases(cls) -> List[str]:
+        """
+        :return: list of all aliases registered to this class
+        """
+        return registered_aliases(cls)
+
+
+def register(
+    parent_class: Type,
+    value: Any,
+    name: Optional[str] = None,
+    alias: Union[List[str], str, None] = None,
+    require_subclass: bool = False,
+):
+    """
+    :param parent_class: class to register the name under
+    :param value: the value to register
+    :param name: name to register the wrapped value as, defaults to value.__name__
+    :param alias: alias or list of aliases to register the wrapped value as,
+        defaults to None
+    :param require_subclass: require that value is a subclass of the class this
+        method is called from
+    """
+    if name is None:
+        # default name
+        name = value.__name__
+
+    name = standardize_lookup_name(name)
+    alias = standardize_alias_name(alias)
+    register_alias(name=name, alias=alias, parent_class=parent_class)
+
+    if require_subclass:
+        _validate_subclass(parent_class, value)
+
+    if name in _REGISTRY[parent_class]:
+        # name already exists - raise error if two different values are attempting
+        # to share the same name
+        registered_value = _REGISTRY[parent_class][name]
+        if registered_value is not value:
+            raise RuntimeError(
+                f"Attempting to register name {name} as {value} "
+                f"however {name} has already been registered as {registered_value}"
+            )
+    else:
+        _REGISTRY[parent_class][name] = value
+
+
+def get_from_registry(
+    parent_class: Type, name: str, require_subclass: bool = False
+) -> Any:
+    """
+    :param parent_class: class that the name is registered under
+    :param name: name to retrieve from the registry of the class
+    :param require_subclass: require that value is a subclass of the class this
+        method is called from
+    :return: value retrieved from the registry for the given name, raises
+        error if not found
+    """
+    name = standardize_lookup_name(name)
+
+    if ":" in name:
+        # user specifying specific module to load and value to import
+        module_path, value_name = name.split(":")
+        retrieved_value = _import_and_get_value_from_module(module_path, value_name)
+    else:
+        # look up name in alias registry
+        name = _ALIAS_REGISTRY[parent_class].get(name)
+        # look up name in registry
+        retrieved_value = _REGISTRY[parent_class].get(name)
+        if retrieved_value is None:
+            raise KeyError(
+                f"Unable to find {name} registered under type {parent_class}.\n"
+                f"Registered values for {parent_class}: "
+                f"{registered_names(parent_class)}\n"
+                f"Registered aliases for {parent_class}: "
+                f"{registered_aliases(parent_class)}"
+            )
+
+    if require_subclass:
+        _validate_subclass(parent_class, retrieved_value)
+
+    return retrieved_value
+
+
+def registered_names(parent_class: Type) -> List[str]:
+    """
+    :param parent_class: class to look up the registry of
+    :return: all names registered to the given class
+    """
+    return list(_REGISTRY[parent_class].keys())
+
+
+def registered_aliases(parent_class: Type) -> List[str]:
+    """
+    :param parent_class: class to look up the registry of
+    :return: all aliases registered to the given class
+    """
+    registered_aliases_plus_names = list(_ALIAS_REGISTRY[parent_class].keys())
+    registered_aliases = list(
+        set(registered_aliases_plus_names) - set(registered_names(parent_class))
+    )
+    return registered_aliases
+
+
+def register_alias(
+    name: str, parent_class: Type, alias: Union[str, List[str], None] = None
+):
+    """
+    Updates the mapping from the alias(es) to the given name.
+    If the alias is None, the name is used as the alias.
+
+    :param name: name that the alias refers to
+    :param parent_class: class that the name is registered under
+    :param alias: single alias or list of aliases that
+        refer to the name, defaults to None
+    """
+    if alias is not None:
+        alias = alias if isinstance(alias, list) else [alias]
+    else:
+        alias = []
+
+    if name in alias:
+        raise KeyError(
+            f"Attempting to register alias {name} "
+            f"that is identical to its standardized name."
+        )
+    alias.append(name)
+
+    for alias_name in alias:
+        if alias_name in _ALIAS_REGISTRY[parent_class]:
+            raise KeyError(
+                f"Attempting to register alias {alias_name} as {name} "
+                f"however {alias_name} has already been registered as "
+                f"{_ALIAS_REGISTRY[parent_class][alias_name]}"
+            )
+        _ALIAS_REGISTRY[parent_class][alias_name] = name
+
+
+def _import_and_get_value_from_module(module_path: str, value_name: str) -> Any:
+    # import the given module path and try to get the value_name if it is included
+    # in the module
+
+    # load module
+    spec = importlib.util.spec_from_file_location(
+        f"plugin_module_for_{value_name}", module_path
+    )
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+
+    # get value from module
+    value = getattr(module, value_name, None)
+
+    if value is None:
+        raise RuntimeError(
+            f"Unable to find attribute {value_name} in module {module_path}"
+        )
+    return value
+
+
+def _validate_subclass(parent_class: Type, child_class: Type):
+    if not issubclass(child_class, parent_class):
+        raise ValueError(
+            f"class {child_class} is not a subclass of the class it is "
+            f"registered for: {parent_class}."
+        )
diff --git a/src/compressed_tensors/utils/__init__.py b/src/compressed_tensors/utils/__init__.py
new file mode 100644
index 00000000..e9e78d44
--- /dev/null
+++ b/src/compressed_tensors/utils/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# flake8: noqa
+
+from .helpers import *
+from .safetensors_load import *
diff --git a/src/compressed_tensors/utils/helpers.py b/src/compressed_tensors/utils/helpers.py
new file mode 100644
index 00000000..ac9ed229
--- /dev/null
+++ b/src/compressed_tensors/utils/helpers.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +from compressed_tensors.base import SPARSITY_CONFIG_NAME +from compressed_tensors.compressors import ModelCompressor +from compressed_tensors.config import CompressionConfig +from transformers import AutoConfig + + +__all__ = ["infer_compressor_from_model_config"] + + +def infer_compressor_from_model_config( + pretrained_model_name_or_path: str, +) -> Optional[ModelCompressor]: + """ + Given a path to a model config, extract a sparsity config if it exists and return + the associated ModelCompressor + + :param pretrained_model_name_or_path: path to model config on disk or HF hub + :return: matching compressor if config contains a sparsity config + """ + config = AutoConfig.from_pretrained(pretrained_model_name_or_path) + sparsity_config = getattr(config, SPARSITY_CONFIG_NAME, None) + if sparsity_config is None: + return None + + format = sparsity_config.get("format") + sparsity_config = CompressionConfig.load_from_registry(format, **sparsity_config) + compressor = ModelCompressor.load_from_registry(format, config=sparsity_config) + return compressor diff --git a/src/compressed_tensors/utils/safetensors_load.py b/src/compressed_tensors/utils/safetensors_load.py new file mode 100644 index 00000000..4d71482a --- /dev/null +++ b/src/compressed_tensors/utils/safetensors_load.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
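Usage sketch for `infer_compressor_from_model_config` above; the model stub is hypothetical, and dense checkpoints without a sparsity config simply return None:

```python
from compressed_tensors.utils import infer_compressor_from_model_config

compressor = infer_compressor_from_model_config("some-org/some-sparse-model")
if compressor is None:
    print("config carries no sparsity_config; nothing to decompress")
```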
+
+import json
+import os
+import re
+import struct
+from typing import Dict, List, Optional
+
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, cached_file
+
+
+__all__ = [
+    "get_safetensors_folder",
+    "get_safetensors_header",
+    "match_param_name",
+    "merge_names",
+    "get_weight_mappings",
+    "get_nested_weight_mappings",
+]
+
+
+def get_safetensors_folder(
+    pretrained_model_name_or_path: str, cache_dir: Optional[str] = None
+) -> str:
+    """
+    Given a Hugging Face stub or a local path, return the folder containing the
+    safetensors weight files
+
+    :param pretrained_model_name_or_path: local path to model or HF stub
+    :param cache_dir: optional cache dir to search through, if none is specified the
+    model will be searched for in the default TRANSFORMERS_CACHE
+    :return: local folder containing model data
+    """
+    if os.path.exists(pretrained_model_name_or_path):
+        # argument is a path to a local folder
+        return pretrained_model_name_or_path
+
+    safetensors_path = cached_file(
+        pretrained_model_name_or_path,
+        SAFE_WEIGHTS_NAME,
+        cache_dir=cache_dir,
+        _raise_exceptions_for_missing_entries=False,
+    )
+    index_path = cached_file(
+        pretrained_model_name_or_path,
+        SAFE_WEIGHTS_INDEX_NAME,
+        cache_dir=cache_dir,
+        _raise_exceptions_for_missing_entries=False,
+    )
+    if safetensors_path is not None:
+        # found a single cached safetensors file
+        return os.path.split(safetensors_path)[0]
+    if index_path is not None:
+        # found a cached safetensors weight index file
+        return os.path.split(index_path)[0]
+
+    # model weights could not be found locally or cached from HF Hub
+    raise ValueError(
+        "Could not locate safetensors weight or index file from "
+        f"{pretrained_model_name_or_path}."
+    )
+
+
+def get_safetensors_header(safetensors_path: str) -> Dict[str, str]:
+    """
+    Extracts the metadata from a safetensors file as JSON
+
+    :param safetensors_path: path to a safetensors file
+    :return: dictionary of metadata extracted from the safetensors file
+    """
+    with open(safetensors_path, "rb") as f:
+        # the first 8 bytes encode the length of the JSON header that follows
+        length_of_header = struct.unpack("<Q", f.read(8))[0]
+        header_data = f.read(length_of_header)
+        header = json.loads(header_data)
+
+    return header
+
+
+def match_param_name(full_name: str, param_name: str) -> Optional[str]:
+    """
+    Helper function extracting the uncompressed parameterized layer name from a
+    compressed name. Assumes the compressed name was merged using merge_names.
+
+    :param full_name: full name of parameter in compressed model
+    :param param_name: compression parameter name
+    :return: uncompressed name of the uncompressed parameterized layer, or None if
+    the parameter name does not match
+    """
+    pattern = r"^(.*)\." + param_name + r"$"
+    regex = re.findall(pattern, full_name)
+    if len(regex) == 0:
+        return None
+    return regex[0]
+
+
+def merge_names(parent_name: str, child_name: str) -> str:
+    """
+    Helper function for merging an uncompressed parameterized layer name with a
+    compression parameter. Names merged with this function can then be parsed by
+    match_param_name.
+
+    :param parent_name: uncompressed parameterized layer name
+    :param child_name: compression parameter name
+    :return: merged compressed name
+    """
+    return parent_name + "." + child_name
+
+
+def get_weight_mappings(model_path: str) -> Dict[str, str]:
+    """
+    Takes a path to a state dict saved in safetensors format and returns a mapping
+    from parameterized layer name to file location.
+
+    {
+        layer.weight.bitmask: file_location,
+        layer.weight.row_offsets: file_location,
+        layer.weight.shape: file_location,
+        layer.weight.compressed: file_location
+    }
+
+    This generalizes to cases where the model is split into multiple safetensors files
+
+    :param model_path: path to safetensors state dict, must contain either a single
+    safetensors file or multiple files with an index
+    :return: mapping of parameterized layer name to file location
+    """
+    safetensors_path = os.path.join(model_path, SAFE_WEIGHTS_NAME)
+    index_path = os.path.join(model_path, SAFE_WEIGHTS_INDEX_NAME)
+    if os.path.exists(safetensors_path):
+        # we have a single safetensors file to read
+        header = get_safetensors_header(safetensors_path)
+        for key in header.keys():
+            header[key] = SAFE_WEIGHTS_NAME
+        header.pop("__metadata__", None)
+    elif os.path.exists(index_path):
+        # we have multiple safetensors files, read from the index
+        with open(index_path, "r", encoding="utf-8") as f:
+            index = json.load(f)
+        header = index["weight_map"]
+    else:
+        raise ValueError(
+            f"Could not find a safetensors weight or index file at {model_path}"
+        )
+
+    # convert weight locations to full paths
+    for key, value in header.items():
+        header[key] = os.path.join(model_path, value)
+
+    return header
+
+
+def get_nested_weight_mappings(
+    model_path: str, params_to_nest: List[str]
+) -> Dict[str, Dict[str, str]]:
+    """
+    Takes a path to a state dict saved in safetensors format and returns a nested
+    mapping from uncompressed parameterized layer names to the file locations of each
+    of the layer's compression parameters.
+
+    layer.weight: {
+        bitmask: file_location,
+        row_offsets: file_location,
+        shape: file_location,
+        compressed: file_location
+    }
+
+    This generalizes to cases where the model is split into multiple safetensors files
+
+    :param model_path: path to safetensors state dict, must contain either a single
+    safetensors file or multiple files with an index
+    :param params_to_nest: compression parameter names to nest under each layer
+    :return: nested mapping of parameterized layer name to file location
+    """
+    weight_mappings = get_weight_mappings(model_path)
+
+    nested_weight_mappings = {}
+    for key in weight_mappings.keys():
+        for param_name in params_to_nest:
+            maybe_match = match_param_name(key, param_name)
+            if maybe_match is not None:
+                dense_param = maybe_match
+                if dense_param not in nested_weight_mappings:
+                    nested_weight_mappings[dense_param] = {}
+                nested_weight_mappings[dense_param][param_name] = weight_mappings[key]
+
+    return nested_weight_mappings
diff --git a/tests/compressed_tensors/quantization/observers/quantization/__init__.py b/tests/compressed_tensors/quantization/observers/quantization/__init__.py
new file mode 100644
index 00000000..0c44f887
--- /dev/null
+++ b/tests/compressed_tensors/quantization/observers/quantization/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
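
A minimal usage sketch of how the two mapping helpers above compose, assuming a hypothetical local checkpoint compressed with the bitmask parameters shown in the docstrings:

from compressed_tensors.utils.safetensors_load import (
    get_nested_weight_mappings,
    get_weight_mappings,
)

model_path = "/path/to/compressed-model"  # hypothetical local checkpoint folder

# flat view: "layer.weight.bitmask" -> full path of the safetensors file
weight_mappings = get_weight_mappings(model_path)

# nested view: "layer.weight" -> {"bitmask": path, "row_offsets": path, ...}
nested = get_nested_weight_mappings(
    model_path, params_to_nest=["bitmask", "row_offsets", "shape", "compressed"]
)
for dense_name, param_files in nested.items():
    print(dense_name, sorted(param_files))
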
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/__init__.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/__init__.py new file mode 100644 index 00000000..0c44f887 --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/conftest.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/conftest.py new file mode 100644 index 00000000..97bf8b0c --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/conftest.py @@ -0,0 +1,37 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional + +import pytest +from compressed_tensors.quantization.quant_args import QuantizationArgs +from compressed_tensors.quantization.quant_scheme import QuantizationScheme + + +@pytest.fixture +def create_quantization_scheme(): + def quantization_scheme( + targets: List[str], + weights: Optional[QuantizationArgs] = None, + input_activations: Optional[QuantizationArgs] = None, + output_activations: Optional[QuantizationArgs] = None, + ): + return QuantizationScheme( + targets=targets, + weights=weights, + input_activations=input_activations, + output_activations=output_activations, + ) + + return quantization_scheme diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py new file mode 100644 index 00000000..6a3d17af --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from compressed_tensors.quantization.lifecycle import apply_quantization_config +from compressed_tensors.quantization.quant_config import ( + QuantizationConfig, + QuantizationStatus, +) +from transformers import AutoModelForCausalLM + + +def test_apply_quantization_config_tinyllama(): + quant_config = get_sample_tinyllama_quant_config() + model = get_tinyllama_model() + + # check that model is not already quantized + for module in model.modules(): + _test_layer_quantization_status(module, inputs=False, weights=False) + + # apply quant config to model + apply_quantization_config(model, quant_config) + + # check for correct application of quant config + num_linears = 0 + num_embeddings = 0 + num_rotary_embeddings = 0 + for name, module in model.named_modules(): + if name in quant_config.ignore: + continue + module_type = module.__class__.__name__ + if module_type == "Linear": + num_linears += 1 + _test_layer_quantization_status(module, inputs=True, weights=True) + elif module_type == "Embedding": + num_embeddings += 1 + _test_layer_quantization_status(module, inputs=False, weights=True) + elif module_type == "LlamaRotaryEmbedding": + num_rotary_embeddings += 1 + _test_layer_quantization_status(module, inputs=False, weights=False) + + # sanity check correct number of layers targeted + assert num_linears == 154 # 155 Linear layers - 1 that gets ignored + assert num_embeddings == 1 + assert num_rotary_embeddings == 22 + + +def test_serialize_config_tinyllama(): + quant_config = get_sample_tinyllama_quant_config() + model = get_tinyllama_model() + + # check that model is not already quantized + for module in model.modules(): + _test_layer_quantization_status(module, inputs=False, weights=False) + + # apply quant config to model + apply_quantization_config(model, quant_config) + + serialized_config = QuantizationConfig.from_pretrained(model) + assert len(serialized_config.config_groups) == 2 + assert serialized_config.config_groups["group_0"].targets == ["Embedding"] + assert serialized_config.config_groups["group_0"].input_activations is None + assert serialized_config.config_groups["group_1"].targets == ["Linear"] + assert serialized_config.config_groups["group_1"].input_activations is not None + assert serialized_config.quantization_status == QuantizationStatus.FROZEN + assert serialized_config.format == "fakequant" + assert serialized_config.quant_method == "sparseml" + assert serialized_config.ignore == ["model.layers.1.mlp.down_proj"] + assert serialized_config.global_compression_ratio > 1.0 + assert serialized_config.global_compression_ratio < 8.0 + + +def _test_layer_quantization_status(module, inputs: bool, weights: bool): + # check if quantization is applied at all (true if inputs or weights targeted) + quantized = inputs or weights + assert hasattr(module, "quantization_scheme") == quantized + assert hasattr(module, "quantization_status") == quantized + + # check inputs matches expected + assert hasattr(module, "input_scale") == inputs + assert hasattr(module, "input_zero_point") == inputs + + # check weights matches expected + assert hasattr(module, "weight_scale") == weights + assert hasattr(module, "weight_zero_point") == weights + + +def get_tinyllama_model(): + return AutoModelForCausalLM.from_pretrained( + "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" + ) + + +def get_sample_tinyllama_quant_config(): + config_dict = { + "quant_method": "sparseml", + "format": 
"fakequant", + "quantization_status": "frozen", + "global_compression_ratio": None, + "config_groups": { + "group_1": { + "weights": { + "num_bits": 8, + "type": "int", + "symmetric": True, + "strategy": "tensor", + }, + "input_activations": { + "num_bits": 8, + "type": "int", + "symmetric": True, + "strategy": "tensor", + }, + "targets": ["Linear"], + }, + "group_2": { + "weights": { + "num_bits": 8, + "type": "int", + "symmetric": False, + "strategy": "tensor", + }, + "input_activations": None, + "targets": ["Embedding"], + }, + }, + "ignore": ["LlamaRotaryEmbedding", "model.layers.1.mlp.down_proj"], + } + return QuantizationConfig.parse_obj(config_dict) diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_forward.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_forward.py new file mode 100644 index 00000000..00c95d16 --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_forward.py @@ -0,0 +1,82 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest +import torch +from compressed_tensors.quantization.lifecycle.forward import ( + maybe_calibrate_or_quantize, + wrap_module_forward_quantized, +) +from compressed_tensors.quantization.lifecycle.initialize import ( + initialize_module_for_quantization, +) +from compressed_tensors.quantization.quant_args import QuantizationArgs +from compressed_tensors.quantization.quant_config import QuantizationStatus +from torch.nn import Linear + + +def test_wrap_module_forward_quantized(create_quantization_scheme): + num_bits = 8 + quantization_scheme = create_quantization_scheme( + targets=["*"], + weights=QuantizationArgs(num_bits=num_bits, symmetric=True), + input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), + ) + layer = Linear(4, 4) + + func_forward = layer.forward.__func__ + + # check that the forward call is overwritten + wrap_module_forward_quantized(layer, quantization_scheme) + + assert not func_forward == layer.forward.__func__ + + +@pytest.mark.parametrize( + "quantization_status", ["initialized", "calibration", "frozen"] +) +def test_maybe_calibrate_or_quantize(create_quantization_scheme, quantization_status): + num_bits = 8 + quantization_scheme = create_quantization_scheme( + targets=["*"], + weights=QuantizationArgs(num_bits=num_bits, symmetric=True), + input_activations=QuantizationArgs(num_bits=num_bits, symmetric=True), + ) + quantization_args = QuantizationArgs(num_bits=num_bits, symmetric=True) + layer = Linear(4, 4) + layer.weight.data *= 100 + + initialize_module_for_quantization(layer, quantization_scheme) + layer.quantization_status = QuantizationStatus(quantization_status) + + # only calibration updates the scale and zero-point + if layer.quantization_status == QuantizationStatus.INITIALIZED: + out = maybe_calibrate_or_quantize( + layer, layer.weight.data, "input", quantization_args + ) + assert torch.allclose(out, 
layer.weight.data)
+    elif layer.quantization_status == QuantizationStatus.CALIBRATION:
+        out = maybe_calibrate_or_quantize(
+            layer, layer.weight.data, "input", quantization_args
+        )
+        assert torch.allclose(out, layer.weight.data, atol=0.2)
+
+    elif layer.quantization_status == QuantizationStatus.FROZEN:
+        # scale and zero points are empty -- cannot quantize
+        with pytest.raises(Exception):
+            out = maybe_calibrate_or_quantize(
+                layer, layer.weight.data, "input", quantization_args
+            )
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_frozen.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_frozen.py
new file mode 100644
index 00000000..056c6089
--- /dev/null
+++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_frozen.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quantization
+from compressed_tensors.quantization.lifecycle.initialize import (
+    initialize_module_for_quantization,
+)
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from torch.nn import Linear
+
+
+def test_freeze_module_quantization(create_quantization_scheme):
+    num_bits = 8
+    quantization_scheme = create_quantization_scheme(
+        targets=["*"],
+        weights=QuantizationArgs(num_bits=num_bits, symmetric=True),
+        input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False),
+    )
+
+    layer = Linear(4, 4)
+
+    initialize_module_for_quantization(layer, quantization_scheme)
+    layer.quantization_status = QuantizationStatus("calibration")
+
+    # should have both input and weight observer after initializing
+    assert hasattr(layer, "input_observer")
+    assert hasattr(layer, "weight_observer")
+
+    # observers should get deleted after freezing
+    freeze_module_quantization(layer)
+    assert not hasattr(layer, "input_observer")
+    assert not hasattr(layer, "weight_observer")
+
+    assert layer.quantization_status == QuantizationStatus("frozen")
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_initialize.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_initialize.py
new file mode 100644
index 00000000..987b2ae2
--- /dev/null
+++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_initialize.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest +from compressed_tensors.quantization.lifecycle.initialize import ( + initialize_module_for_quantization, +) +from compressed_tensors.quantization.quant_args import QuantizationArgs +from compressed_tensors.quantization.quant_config import QuantizationStatus +from torch.nn import Linear + + +NUM_BITS = 8 + + +@pytest.mark.parametrize( + "weights,input_activations", + [ + ( + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + None, + ), + ( + None, + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + ), + ( + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + ), + ], +) +def test_initialize_module_for_quantization( + create_quantization_scheme, weights, input_activations +): + quantization_scheme = create_quantization_scheme( + targets=["*"], + weights=weights, + input_activations=input_activations, + ) + layer = Linear(4, 4) + + assert not hasattr(layer, "quantization_scheme") + assert not hasattr(layer, "quantization_status") + + # add attributes, zero_points and scale + initialize_module_for_quantization(layer, quantization_scheme) + + registered_params = {"weight", "bias"} + if weights is not None: + registered_params.add("weight_scale") + registered_params.add("weight_zero_point") + + if input_activations is not None: + registered_params.add("input_scale") + registered_params.add("input_zero_point") + + for key in layer.state_dict().keys(): + assert key in registered_params + registered_params.remove(key) + + assert len(registered_params) == 0 + + assert hasattr(layer, "quantization_scheme") + assert hasattr(layer, "quantization_status") + + assert layer.quantization_status == QuantizationStatus.INITIALIZED diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_lifecycle.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_lifecycle.py new file mode 100644 index 00000000..44932778 --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_lifecycle.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
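+
+# This test drives a Linear module through the whole lifecycle: initialize
+# (scale/zero-point parameters registered, forward pass wrapped), calibration
+# (observers update scale and zero point on every forward pass), and frozen
+# (observers removed, quantization parameters no longer change).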
+
+from copy import deepcopy
+
+import torch
+from compressed_tensors.quantization.lifecycle.calibration import (
+    set_module_for_calibration,
+)
+from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quantization
+from compressed_tensors.quantization.lifecycle.initialize import (
+    initialize_module_for_quantization,
+)
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from torch.nn import Linear
+
+
+def test_lifecycle(create_quantization_scheme):
+    num_bits = 8
+
+    quantization_scheme = create_quantization_scheme(
+        input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False),
+        weights=QuantizationArgs(num_bits=num_bits, symmetric=True),
+        targets=["*"],
+    )
+
+    layer = Linear(4, 4)
+    layer.weight.data *= 100
+
+    # updated layer keys check
+    expected_layer_keys = {"weight", "bias"}
+    for key in layer.state_dict().keys():
+        expected_layer_keys.remove(key)
+    assert len(expected_layer_keys) == 0
+
+    # overwrite forward pass and register zero_point and scale
+    initialize_module_for_quantization(layer, quantization_scheme)
+    expected_layer_keys = {
+        "input_scale",
+        "input_zero_point",
+        "weight_scale",
+        "weight_zero_point",
+        "weight",
+        "bias",
+    }
+    for key in layer.state_dict().keys():
+        expected_layer_keys.remove(key)
+    assert len(expected_layer_keys) == 0
+
+    # should have both input and weight observer after initializing
+    assert hasattr(layer, "input_observer")
+    assert hasattr(layer, "weight_observer")
+
+    assert hasattr(layer, "quantization_scheme")
+    assert hasattr(layer, "quantization_status")
+    assert layer.quantization_status == QuantizationStatus.INITIALIZED
+
+    set_module_for_calibration(layer)
+    assert layer.quantization_status == QuantizationStatus.CALIBRATION
+
+    # do a calibration step
+    assert torch.numel(layer.input_zero_point.data) == 0
+    assert torch.numel(layer.input_scale) == 0
+    assert torch.numel(layer.weight_scale) == 0
+    assert torch.numel(layer.weight_zero_point) == 0
+
+    layer(torch.randn(4, 4))
+
+    # zero-points and scale should be updated after forward pass
+    assert torch.numel(layer.input_zero_point.data) > 0
+    assert torch.numel(layer.input_scale) > 0
+    assert torch.numel(layer.weight_scale) > 0
+    assert torch.numel(layer.weight_zero_point) > 0
+
+    # symmetric zero points should center at 0
+    assert layer.weight_zero_point.data == 0
+
+    # check high and low bound of the weights
+    assert torch.all(layer.weight.data >= -128) and torch.all(layer.weight.data <= 127)
+
+    initialized_layer = deepcopy(layer)
+
+    # calibrate the layers with each iteration
+    for _ in range(10):
+        layer(torch.randn(4, 4))
+
+    assert initialized_layer.input_zero_point != layer.input_zero_point
+    assert initialized_layer.input_scale != layer.input_scale
+    assert initialized_layer.weight_scale != layer.weight_scale
+
+    # check quantization f_q(x) is applied after frozen without update
+    input_check_for_quant = torch.randn(4, 4)
+    out_calibration = layer(input_check_for_quant)
+
+    layer_before_freeze = deepcopy(layer)
+
+    # Freeze, no update after any forward pass
+    freeze_module_quantization(layer)
+
+    for _ in range(10):
+        layer(torch.randn(4, 4))
+    assert layer_before_freeze.input_zero_point == layer.input_zero_point
+    assert layer_before_freeze.input_scale == layer.input_scale
+    assert layer_before_freeze.weight_scale == layer.weight_scale
+
+    # check that the same quantization is applied as calibration to frozen
+    assert torch.all(out_calibration == 
layer(input_check_for_quant)) diff --git a/tests/compressed_tensors/quantization/observers/quantization/test_quant_args.py b/tests/compressed_tensors/quantization/observers/quantization/test_quant_args.py new file mode 100644 index 00000000..c1c84be6 --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/quantization/test_quant_args.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from compressed_tensors.quantization import ( + QuantizationArgs, + QuantizationStrategy, + QuantizationType, +) +from pydantic import ValidationError + + +def test_defaults(): + default = QuantizationArgs() + + assert default.num_bits == 8 + assert default.type == QuantizationType.INT + assert default.symmetric + assert default.strategy == QuantizationStrategy.TENSOR + assert default.group_size is None + assert default.block_structure is None + + +def test_group(): + kwargs = {"strategy": "group", "group_size": 128} + + group = QuantizationArgs(**kwargs) + assert group.strategy == QuantizationStrategy.GROUP + assert group.group_size == kwargs["group_size"] + + +def test_block(): + kwargs = {"strategy": "block", "block_structure": "2x4"} + + block = QuantizationArgs(**kwargs) + assert block.strategy == QuantizationStrategy.BLOCK + assert block.block_structure == kwargs["block_structure"] + + +def test_invalid(): + with pytest.raises(ValidationError): + _ = QuantizationArgs(type="invalid") + with pytest.raises(ValidationError): + _ = QuantizationArgs(strategy="invalid") diff --git a/tests/compressed_tensors/quantization/observers/quantization/test_quant_config.py b/tests/compressed_tensors/quantization/observers/quantization/test_quant_config.py new file mode 100644 index 00000000..091be723 --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/quantization/test_quant_config.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+import pytest
+from compressed_tensors.quantization import (
+    QuantizationConfig,
+    QuantizationScheme,
+    QuantizationStatus,
+)
+from pydantic import ValidationError
+
+
+def test_basic_config():
+    config_groups = {"group_1": QuantizationScheme(targets=[])}
+    config = QuantizationConfig(config_groups=config_groups)
+
+    assert config.config_groups == config_groups
+    assert config.quant_method == "sparseml"
+    assert config.format == "fakequant"
+    assert config.quantization_status == QuantizationStatus.INITIALIZED
+    assert config.global_compression_ratio is None
+    assert isinstance(config.ignore, list) and len(config.ignore) == 0
+
+
+def test_full_config():
+    config_groups = {
+        "group_1": QuantizationScheme(targets=[]),
+        "group_2": QuantizationScheme(targets=[]),
+    }
+    global_compression_ratio = 3.5
+    ignore = ["model.layers.0"]
+    quantization_status = "compressed"
+
+    config = QuantizationConfig(
+        config_groups=config_groups,
+        global_compression_ratio=global_compression_ratio,
+        ignore=ignore,
+        quantization_status=quantization_status,
+    )
+    assert config.config_groups == config_groups
+    assert config.global_compression_ratio == global_compression_ratio
+    assert config.ignore == ignore
+    assert config.quantization_status == QuantizationStatus.COMPRESSED
+
+
+def test_need_config_groups():
+    with pytest.raises(ValidationError):
+        _ = QuantizationConfig()
diff --git a/tests/compressed_tensors/quantization/observers/quantization/test_quant_scheme.py b/tests/compressed_tensors/quantization/observers/quantization/test_quant_scheme.py
new file mode 100644
index 00000000..14ba9f7e
--- /dev/null
+++ b/tests/compressed_tensors/quantization/observers/quantization/test_quant_scheme.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
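+
+# A QuantizationScheme pairs a list of target module patterns with optional
+# weight, input-activation, and output-activation args; only `targets` is
+# required, as test_needs_targets checks below.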
+ +import pytest +from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme +from pydantic import ValidationError + + +def test_basic_scheme(): + targets = ["model.layer.0", "model.layer.3"] + weights = QuantizationArgs() + + scheme = QuantizationScheme(targets=targets, weights=weights) + assert scheme.targets == targets + assert scheme.weights == weights + assert scheme.input_activations is None + assert scheme.output_activations is None + + +def test_full_scheme(): + targets = ["Linear"] + weights = QuantizationArgs() + input_activations = QuantizationArgs(num_bits=4) + output_activations = QuantizationArgs(num_bits=8, type="float", symmetric=False) + + scheme = QuantizationScheme( + targets=targets, + weights=weights, + input_activations=input_activations, + output_activations=output_activations, + ) + assert scheme.targets == targets + assert scheme.weights == weights + assert scheme.input_activations == input_activations + assert scheme.output_activations == output_activations + + +def test_needs_targets(): + with pytest.raises(ValidationError): + _ = QuantizationScheme() diff --git a/tests/compressed_tensors/quantization/observers/test_min_max.py b/tests/compressed_tensors/quantization/observers/test_min_max.py new file mode 100644 index 00000000..a14866ef --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/test_min_max.py @@ -0,0 +1,89 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
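+
+# The expected values below assume the observer returned by
+# QuantizationArgs.get_observer() keeps a running average of the observed min
+# and max, e.g. maxes of [1, 1, 127] average to (127 + 2) / 3 = 43, matching
+# the inline comments in test_min_max_observer_value_update.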
+
+
+import pytest
+import torch
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+
+
+@pytest.mark.parametrize(
+    "symmetric,expected_scale,expected_zero_point",
+    [
+        (True, 0.0078, 0),
+        (False, 0.0039, 0),
+    ],
+)
+def test_min_max_observer(symmetric, expected_scale, expected_zero_point):
+    tensor = torch.tensor([1, 1, 1, 1, 1])
+    num_bits = 8
+    weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric)
+
+    observer = weights.get_observer()
+    scale, zero_point = observer(tensor)
+
+    assert round(scale.item(), 4) == expected_scale
+    assert round(zero_point.item(), 4) == expected_zero_point
+
+
+def test_min_max_observer_symmetric_scale_range():
+    tensor = torch.rand(4, 4)
+    tensor *= 127
+
+    num_bits = 8
+    weights = QuantizationArgs(num_bits=num_bits, symmetric=True)
+
+    observer = weights.get_observer()
+    scale, zero_point = observer(tensor)
+
+    # if symmetric, max symmetric_range = abs(-128) / 255
+    assert round(scale.item(), 4) <= 1.0039
+    assert round(zero_point.item(), 4) == 0
+
+
+def test_min_max_observer_value_update():
+    inp = torch.tensor([1, 1, 1, 1, 1])
+    inp_update_max = torch.tensor([127, 1, 1, 1, 1])
+    inp_update_min = torch.tensor([-128, 1, 1, 1, 1])
+
+    # update the min and max twice total
+    tensors = [
+        inp,
+        inp,
+        inp_update_max, # update max
+        inp,
+        inp_update_min, # update min
+    ]
+
+    tensor = inp
+    num_bits = 8
+    weights = QuantizationArgs(num_bits=num_bits, symmetric=True)
+
+    observer = weights.get_observer()
+    curr_max = 1
+    curr_min = 1
+    for i, tensor in enumerate(tensors):
+        observer(tensor)
+        curr_max = max(observer.max_val, curr_max)
+        curr_min = min(observer.min_val, curr_min)
+
+        if i < 2:
+            assert curr_max == 1
+            assert curr_min == 1
+        elif i < 4:
+            assert curr_max == 43 # (127 + 2) / 3
+            assert curr_min == 1
+        else:
+            assert curr_max == 43
+            assert curr_min == -24.8 # (-128 + 4) / 5
From 2125b15a9d5b49d8c22515ef99ca5d9bc04aa20d Mon Sep 17 00:00:00 2001
From: George Ohashi
Date: Mon, 29 Apr 2024 13:50:00 +0000
Subject: [PATCH 09/10] fix

---
 setup.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/setup.py b/setup.py
index d111af24..896609ed 100644
--- a/setup.py
+++ b/setup.py
@@ -31,13 +31,8 @@ def _setup_extras() -> Dict:
     return {"dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3"]}
 
 setup(
-<<<<<<< HEAD
-    name="compressed_tensors",
-    version="0.3.0",
-=======
     name="compressed-tensors",
     version="0.3.1",
->>>>>>> main
     author="Neuralmagic, Inc.",
     author_email="support@neuralmagic.com",
     license="Apache 2.0",
From 8ea5b0d62f176c8816e96a68fd450a3799382bb9 Mon Sep 17 00:00:00 2001
From: George Ohashi
Date: Mon, 29 Apr 2024 13:57:04 +0000
Subject: [PATCH 10/10] fix

---
 README.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/README.md b/README.md
index 3c60a838..09851d18 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,4 @@
-<<<<<<< HEAD
 # compressed_tensors
-=======
-# compressed-tensors
 
 This repository extends a [safetensors](https://github.com/huggingface/safetensors) format to efficiently store sparse and/or quantized tensors on disk. `compressed-tensors` format supports multiple compression types to minimize the disk space and facilitate the tensor manipulation.
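
To make the config-inference helper added in src/compressed_tensors/utils/helpers.py concrete, a minimal usage sketch; the model stub is hypothetical, and a sparsity config entry is assumed to be present in its config.json (the helper returns None when there is none):

from compressed_tensors.utils import infer_compressor_from_model_config

# hypothetical stub; any local path or HF model id with a config.json works
compressor = infer_compressor_from_model_config("org/sparse-model-stub")
if compressor is None:
    print("no sparsity config found; nothing to decompress")
else:
    print(f"inferred compressor: {type(compressor).__name__}")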