From fa6a48f1da6b47106912bcd25eba7171ba7cfec7 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Thu, 11 Apr 2024 18:05:45 +0000 Subject: [PATCH] defining quantization config models --- src/sparsetensors/__init__.py | 1 + src/sparsetensors/quantization/__init__.py | 18 +++++ src/sparsetensors/quantization/quant_args.py | 63 +++++++++++++++++ .../quantization/quant_config.py | 68 +++++++++++++++++++ .../quantization/quant_scheme.py | 39 +++++++++++ 5 files changed, 189 insertions(+) create mode 100644 src/sparsetensors/quantization/__init__.py create mode 100644 src/sparsetensors/quantization/quant_args.py create mode 100644 src/sparsetensors/quantization/quant_config.py create mode 100644 src/sparsetensors/quantization/quant_scheme.py diff --git a/src/sparsetensors/__init__.py b/src/sparsetensors/__init__.py index 3eefa2c8..0833dd42 100644 --- a/src/sparsetensors/__init__.py +++ b/src/sparsetensors/__init__.py @@ -17,4 +17,5 @@ # flake8: noqa from .compressors import * from .config import * +from .quantization import QuantizationConfig, QuantizationStatus from .utils import * diff --git a/src/sparsetensors/quantization/__init__.py b/src/sparsetensors/quantization/__init__.py new file mode 100644 index 00000000..b53a328a --- /dev/null +++ b/src/sparsetensors/quantization/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from enum import Enum
from typing import Optional

from pydantic import BaseModel


__all__ = ["QuantizationType", "QuantizationStrategy", "QuantizationArgs"]


class QuantizationType(Enum):
    """
    Enum storing quantization type options (the numeric family the
    quantized values belong to)
    """

    INT = "int"
    FLOAT = "float"


class QuantizationStrategy(Enum):
    """
    Enum storing quantization strategy options (the granularity at which
    a scale/zero-point pair is applied)
    """

    TENSOR = "tensor"
    CHANNEL = "channel"
    GROUP = "group"
    BLOCK = "block"


class QuantizationArgs(BaseModel):
    """
    User facing arguments used to define a quantization config for weights or
    activations

    :param num_bits: quantization bit depth
    :param type: dtype to quantize to, either int or float
    :param symmetric: whether or not quantization scale is symmetric about zero-point
    :param strategy: string id determining the scope of scale/zero-point to apply
    :param group_size: group length to use for the group strategy; only
        meaningful when strategy is GROUP
    :param block_structure: 2d block structure to use for the block strategy, must be
        of the format "2x4", "8x16", etc.; only meaningful when strategy is BLOCK
    """

    num_bits: int = 8
    type: QuantizationType = QuantizationType.INT
    symmetric: bool = True
    strategy: QuantizationStrategy = QuantizationStrategy.TENSOR
    # group_size / block_structure default to None since they only apply to
    # their respective strategies
    group_size: Optional[int] = None
    block_structure: Optional[str] = None
from enum import Enum
from typing import Dict, List, Optional

from pydantic import BaseModel
from sparsetensors.quantization.quant_scheme import QuantizationScheme


__all__ = ["QuantizationStatus", "QuantizationConfig"]


class QuantizationStatus(Enum):
    """
    Enum storing the different states a quantized layer can be in

    Initialized: scale, zero points and observers have been attached to the layer but
    are set to dummy values (not yet calibrated)
    Calibration: scale and zero points have been calibrated through OBCQ or similar
    algorithm, observers are still attached
    Frozen: scale and zero points are finalized, observers have been deleted, weights
    are still in their original precision
    Compressed: weights have been converted to their target type or compressed to
    their closest approximation
    """

    INITIALIZED = "initialized"
    CALIBRATION = "calibration"
    FROZEN = "frozen"
    COMPRESSED = "compressed"


class QuantizationConfig(BaseModel):
    """
    Full configuration specifying how a model is quantized. Each quantized layer is
    mapped to a QuantizationScheme in config_groups.

    :param config_groups: dict of QuantizationSchemes specifying the quantization
        settings for each quantized layer
    :param quant_method: a constant used to differentiate sparseML quantization from
        other quantization configs
    :param format: specifies how the quantized model is stored on disk
    :param quantization_status: specifies the current status of all quantized layers.
        It is assumed all layers are in the same state.
    :param global_compression_ratio: optional informational config to report the model
        compression ratio achieved by the quantization config
    :param ignore: optional list of layers to ignore from config_groups. Layers in
        this list are not quantized even if they match up with a target in
        config_groups
    """

    config_groups: Dict[str, QuantizationScheme]
    quant_method: str = "sparseml"
    format: str = "fakequant"
    quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
    global_compression_ratio: Optional[float] = None
    ignore: Optional[List[str]] = None
from typing import List, Optional

from pydantic import BaseModel
from sparsetensors.quantization.quant_args import QuantizationArgs


__all__ = ["QuantizationScheme"]


class QuantizationScheme(BaseModel):
    """
    Groups the QuantizationArgs that describe how a set of target modules
    should have their weights, inputs and outputs quantized

    :param targets: list of modules the scheme applies to; entries may be layer
        names, layer types or a regular expression
    :param weights: quantization config for layer weights
    :param input_activations: quantization config for layer inputs
    :param output_activations: quantization config for layer outputs
    """

    # a module must match one of these targets for the scheme to apply to it
    targets: List[str]
    # each of the three quantization slots is optional; a None entry means that
    # tensor group is left unquantized
    weights: Optional[QuantizationArgs] = None
    input_activations: Optional[QuantizationArgs] = None
    output_activations: Optional[QuantizationArgs] = None