draft

neuralmagic · Apr 12, 2024 · d203669 · d203669
1 parent 318ad21
commit d203669
Show file tree

Hide file tree

Showing 12 changed files with 978 additions and 0 deletions.
diff --git a/bin/quant.py b/bin/quant.py
@@ -0,0 +1,55 @@
+import torch
+from torch.nn import Linear
+# from sparseml.modifiers.quantization.utils.quantization_scheme import QuantizationScheme, QuantizationArgs
+from sparsetensors.quantization.quant_args import QuantizationArgs
+from sparsetensors.quantization.quant_scheme import QuantizationScheme
+from sparseml.modifiers.quantization.lifecycle.initialize import initialize_module_for_quantization
+from sparseml.modifiers.quantization.lifecycle.calibration import set_module_for_calibration
+from sparseml.modifiers.quantization.lifecycle.frozen import freeze_module_quantization
+# Demo script: walks a single Linear layer through the quantization lifecycle
+# (initialize -> calibrate -> freeze) and prints its parameters at each stage.
+num_bits = 8
+
+# quantize input activations and weights; leave output activations untouched
+scheme = QuantizationScheme(
+    input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False),
+    weights=QuantizationArgs(num_bits=num_bits, symmetric=True),
+    output_activations=None,
+)
+
+layer = Linear(4, 4)
+print(layer)
+print(dict(layer.named_parameters()))
+
+
+initialize_module_for_quantization(layer, scheme)
+print(layer)  # should see observer under layer now
+print(0)
+print(dict(layer.named_parameters()))  # should see empty tensors for scale and zero point now
+print(1)
+
+
+set_module_for_calibration(layer)
+# do a calibration step
+layer(torch.randn(4, 4))
+print(dict(layer.named_parameters()))  # scale and zero point should have updated values
+print(2)
+for _ in range(10):
+    layer(torch.randn(4, 4))
+print(dict(layer.named_parameters()))  # scale and zero point should have updated values again since we did another pass
+
+print(3)
+
+
+freeze_module_quantization(layer)
+for _ in range(10):
+    # do more forward passes but show args are frozen
+    layer(torch.randn(4, 4))
+print(dict(layer.named_parameters()))  # scale and zero point should not be updated now
+
+
+# TODO: still missing from this demo:
+#   - correctness checks
+#   - quantizing an entire model
+
+
+
diff --git a/src/sparsetensors/quantization/lifecycle/__init__.py b/src/sparsetensors/quantization/lifecycle/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# flake8: noqa
+
+from .calibration import *
+from .forward import *
+from .frozen import *
+from .initialize import *
+from .status import *
diff --git a/src/sparsetensors/quantization/lifecycle/calibration.py b/src/sparsetensors/quantization/lifecycle/calibration.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+
+from torch.nn import Module
+
+from sparseml.modifiers.quantization.lifecycle.status import QuantizationStatus
+
+
+__all__ = [
+    "set_module_for_calibration",
+]
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def set_module_for_calibration(module: Module):
+    """
+    Switch a module's quantization status to calibration mode.
+
+    Modules without a truthy ``quantization_scheme`` attribute are left
+    untouched. If the current ``quantization_status`` is anything other than
+    ``QuantizationStatus.INITIALIZED`` a warning is logged, and the status is
+    still set to ``QuantizationStatus.CALIBRATION``.
+
+    :param module: module whose quantization status should be updated
+    """
+    if not getattr(module, "quantization_scheme", None):
+        # no quantization scheme nothing to do
+        return
+    status = getattr(module, "quantization_status", None)
+    if status != QuantizationStatus.INITIALIZED:
+        # log only: Logger.warning returns None, so it must not be used as
+        # the operand of a raise statement
+        _LOGGER.warning(
+            f"Attempting set module with status {status} to calibration mode. "
+            f"but status is not {QuantizationStatus.INITIALIZED} - you may "
+            "be calibrating an uninitialized module which may fail or attempting "
+            "to re-calibrate a frozen module"
+        )
+
+    module.quantization_status = QuantizationStatus.CALIBRATION
diff --git a/src/sparsetensors/quantization/lifecycle/forward.py b/src/sparsetensors/quantization/lifecycle/forward.py
@@ -0,0 +1,155 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import wraps
+
+import torch
+from torch.nn import Module
+
+from sparseml.modifiers.quantization.lifecycle.status import QuantizationStatus
+
+from sparseml.modifiers.quantization.utils.quantization_scheme import QuantizationScheme, QuantizationArgs
+
+__all__ = ["wrap_module_forward_quantized"]
+
+
+def quantize(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    q_max: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Map ``x`` onto the quantized grid defined by ``scale`` and ``zero_point``.
+
+    :param x: values to quantize
+    :param scale: quantization scale that ``x`` is divided by
+    :param zero_point: offset added after scaling
+    :param q_max: upper clamp bound for the quantized values
+    :return: rounded values clamped to the range ``[0, q_max]``
+    """
+    shifted = x / scale + zero_point
+    rounded = torch.round(shifted)
+    return torch.clamp(rounded, 0, q_max)
+
+
+def dequantize(
+    x_q: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Map quantized values back to the original value range.
+
+    :param x_q: quantized values
+    :param scale: quantization scale to multiply by
+    :param zero_point: offset subtracted before scaling
+    :return: ``(x_q - zero_point) * scale``
+    """
+    centered = x_q - zero_point
+    return centered * scale
+
+
+def fake_quantize(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    args: QuantizationArgs,
+) -> torch.Tensor:
+    """
+    Quantize ``x`` and immediately dequantize it again.
+
+    The whole tensor is quantized with the single ``scale`` / ``zero_point``
+    pair that is passed in.
+
+    :param x: values to fake quantize
+    :param scale: quantization scale
+    :param zero_point: quantization zero point
+    :param args: quantization args; only ``num_bits`` is read here
+    :return: result of ``dequantize(quantize(x, ...), ...)``
+    """
+    # TODO: per-block / per-group quantization is not implemented yet
+    # upper clamp bound: largest value of an unsigned ``num_bits`` integer
+    max_q = torch.tensor(2**args.num_bits - 1)
+    quantized = quantize(x, scale, zero_point, max_q)
+    return dequantize(quantized, scale, zero_point)
+
+
+def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
+    """
+    Replace ``module.forward`` with a wrapper that calibrates and/or fake
+    quantizes inputs, weights and outputs according to ``scheme``.
+
+    Whether anything is quantized on a given call is decided at call time by
+    ``_maybe_calibrate_or_quantize`` based on ``module.quantization_status``.
+    The fake quantized weight is only used for the duration of the wrapped
+    forward call; the module's own weight data is put back afterwards.
+
+    :param module: module whose forward should be wrapped; expected to already
+        be initialized and injected with the parameters in
+        initialize_module_for_quantization
+    :param scheme: scheme whose ``input_activations``, ``weights`` and
+        ``output_activations`` entries select what gets quantized
+    """
+    forward_func_orig = module.forward.__func__
+
+    @wraps(forward_func_orig)  # ensures docstring, names, etc are propagated
+    def wrapped_forward(self, *args, **kwargs):
+        input_ = args[0]
+
+        if scheme.input_activations is not None:
+            # calibrate and (fake) quantize input activations when applicable
+            input_ = _maybe_calibrate_or_quantize(
+                module, input_, "input", scheme.input_activations
+            )
+
+        # modules without a weight get no weight scale / zero point registered
+        # in initialize_module_for_quantization, so they are skipped here too
+        quantize_weight = scheme.weights is not None and hasattr(self, "weight")
+        if quantize_weight:
+            # keep a handle on the unquantized weight so it can be restored
+            unquantized_weight = self.weight.data
+            # calibrate and (fake) quantize weights when applicable
+            self.weight.data = _maybe_calibrate_or_quantize(
+                module, self.weight, "weight", scheme.weights
+            )
+
+        try:
+            # perform wrapped forward call
+            output = forward_func_orig.__get__(module, module.__class__)(
+                input_, *args[1:], **kwargs
+            )
+        finally:
+            if quantize_weight:
+                # restore the unquantized weight so quantization error does
+                # not accumulate across forward passes
+                self.weight.data = unquantized_weight
+
+        if scheme.output_activations is not None:
+            # calibrate and (fake) quantize output activations when applicable
+            output = _maybe_calibrate_or_quantize(
+                module, output, "output", scheme.output_activations
+            )
+
+        return output
+
+    # bind wrapped forward to module class so reference to `self` is correct
+    bound_wrapped_forward = wrapped_forward.__get__(module, module.__class__)
+    # set forward to wrapped forward
+    setattr(module, "forward", bound_wrapped_forward)
+
+
+def _maybe_calibrate_or_quantize(
+    module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs"
+) -> torch.Tensor:
+    """
+    Fake quantize ``value`` when the module is calibrating or frozen, updating
+    the stored quantization parameters first when calibrating.
+
+    :param module: module holding the ``{base_name}_scale`` and
+        ``{base_name}_zero_point`` attributes and, during calibration, the
+        ``{base_name}_observer`` attribute
+    :param value: tensor to calibrate on and/or fake quantize
+    :param base_name: attribute prefix, e.g. "input", "weight" or "output"
+    :param args: quantization args forwarded to ``fake_quantize``
+    :return: ``value`` unchanged when the module status is neither CALIBRATION
+        nor FROZEN, otherwise the fake quantized value
+    """
+    # only run quantized for the included stages
+    if module.quantization_status not in {
+        QuantizationStatus.CALIBRATION,
+        QuantizationStatus.FROZEN,
+    }:
+        return value
+
+    scale = getattr(module, f"{base_name}_scale")
+    zero_point = getattr(module, f"{base_name}_zero_point")
+
+    if module.quantization_status == QuantizationStatus.CALIBRATION:
+        # get observer and get new quant params from observation
+        observer = getattr(module, f"{base_name}_observer")
+        updated_scale, updated_zero_point = observer(value)
+
+        # update scale and zero point
+        scale.data = updated_scale
+        zero_point.data = updated_zero_point
+
+    return fake_quantize(value, scale, zero_point, args)
diff --git a/src/sparsetensors/quantization/lifecycle/frozen.py b/src/sparsetensors/quantization/lifecycle/frozen.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from torch.nn import Module
+
+from sparseml.modifiers.quantization.lifecycle.status import QuantizationStatus
+
+
+__all__ = [
+    "freeze_module_quantization",
+]
+
+
+def freeze_module_quantization(module: Module):
+    """
+    Remove the observers attached directly to ``module`` and mark its
+    quantization status as frozen.
+
+    Modules without a truthy ``quantization_scheme`` attribute are left
+    untouched.
+
+    :param module: module to freeze
+    """
+    if not getattr(module, "quantization_scheme", None):
+        # no quantization scheme nothing to do
+        return
+
+    # collect the names first: deleting a submodule while the module iterator
+    # is still running mutates the underlying dict and raises a RuntimeError
+    observer_names = [
+        child_name
+        for child_name, _ in module.named_children()
+        if child_name.endswith("_observer")
+    ]
+    for observer_name in observer_names:
+        # delete any observers that belong directly to this module
+        delattr(module, observer_name)
+
+    module.quantization_status = QuantizationStatus.FROZEN
diff --git a/src/sparsetensors/quantization/lifecycle/initialize.py b/src/sparsetensors/quantization/lifecycle/initialize.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+
+import torch
+from torch.nn import Module, Parameter
+
+from sparseml.modifiers.quantization.lifecycle.forward import (
+    wrap_module_forward_quantized,
+)
+from sparseml.modifiers.quantization.lifecycle.status import QuantizationStatus
+from sparseml.modifiers.quantization.utils.quantization_scheme import (
+    QuantizationArgs,
+    QuantizationScheme,
+)
+
+
+__all__ = [
+    "initialize_module_for_quantization",
+]
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def initialize_module_for_quantization(module: Module, scheme: QuantizationScheme):
+    """
+    Attach quantization parameters and observers to ``module`` for every
+    target enabled in ``scheme``, record the scheme and status on the module,
+    and wrap its forward call.
+
+    :param module: module to prepare for quantization
+    :param scheme: scheme whose ``input_activations``, ``weights`` and
+        ``output_activations`` entries select what gets initialized
+    """
+    input_args = scheme.input_activations
+    if input_args is not None:
+        _initialize_scale_zero_point_observer(module, "input", input_args)
+
+    weight_args = scheme.weights
+    if weight_args is not None and hasattr(module, "weight"):
+        _initialize_scale_zero_point_observer(module, "weight", weight_args)
+    elif weight_args is not None:
+        # weights were requested but this module has none to quantize
+        _LOGGER.warning(
+            f"module type {type(module)} targeted for weight quantization but "
+            "has no attribute weight, skipping weight quantization "
+            f"for {type(module)}"
+        )
+
+    output_args = scheme.output_activations
+    if output_args is not None:
+        _initialize_scale_zero_point_observer(module, "output", output_args)
+
+    module.quantization_scheme = scheme
+    module.quantization_status = QuantizationStatus.INITIALIZED
+
+    # wrap forward call of module to perform quantized actions based on calltime status
+    wrap_module_forward_quantized(module, scheme)
+
+
+
+def _initialize_scale_zero_point_observer(
+    module: Module, base_name: str, quantization_args: QuantizationArgs
+):
+    """
+    Register empty scale and zero point parameters plus an observer submodule
+    on ``module``, all named with the ``base_name`` prefix.
+
+    :param module: module to register the parameters and observer on
+    :param base_name: prefix for the registered names, giving
+        ``{base_name}_scale``, ``{base_name}_zero_point`` and
+        ``{base_name}_observer``
+    :param quantization_args: args object providing ``get_observer()``
+    """
+    # empty, non-trainable placeholder for the scale
+    module.register_parameter(
+        f"{base_name}_scale",
+        Parameter(torch.empty(0), requires_grad=False),
+    )
+
+    # empty, non-trainable integer placeholder for the zero point
+    module.register_parameter(
+        f"{base_name}_zero_point",
+        Parameter(torch.empty(0, dtype=int), requires_grad=False),
+    )
+
+    # build the observer module and attach it as a submodule
+    module.register_module(
+        f"{base_name}_observer", quantization_args.get_observer()
+    )