From 09691d16cce911a6d1ef0399f263ec4c918014f8 Mon Sep 17 00:00:00 2001 From: George Date: Mon, 22 Apr 2024 12:38:29 -0400 Subject: [PATCH 01/10] test forward (#16) --- .../quantization/lifecycle/forward.py | 8 +- .../quantization/lifecycle/test_forward.py | 80 +++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 tests/sparsetensors/quantization/lifecycle/test_forward.py diff --git a/src/sparsetensors/quantization/lifecycle/forward.py b/src/sparsetensors/quantization/lifecycle/forward.py index 6416a10b..5e6036ea 100644 --- a/src/sparsetensors/quantization/lifecycle/forward.py +++ b/src/sparsetensors/quantization/lifecycle/forward.py @@ -21,7 +21,13 @@ from torch.nn import Module -__all__ = ["wrap_module_forward_quantized"] +__all__ = [ + "wrap_module_forward_quantized", + "quantize", + "dequantize", + "fake_quantize", + "maybe_calibrate_or_quantize", +] @torch.no_grad() diff --git a/tests/sparsetensors/quantization/lifecycle/test_forward.py b/tests/sparsetensors/quantization/lifecycle/test_forward.py new file mode 100644 index 00000000..c2d27bd1 --- /dev/null +++ b/tests/sparsetensors/quantization/lifecycle/test_forward.py @@ -0,0 +1,80 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+import pytest
+import torch
+from sparsetensors.quantization.lifecycle.forward import (
+    maybe_calibrate_or_quantize,
+    wrap_module_forward_quantized,
+)
+from sparsetensors.quantization.lifecycle.initialize import (
+    initialize_module_for_quantization,
+)
+from sparsetensors.quantization.lifecycle.status import QuantizationStatus
+from sparsetensors.quantization.quant_args import QuantizationArgs
+from torch.nn import Linear
+
+
+def test_wrap_module_forward_quantized(create_quantization_scheme):
+    num_bits = 8
+    quantization_scheme = create_quantization_scheme(
+        targets=["*"],
+        weights=QuantizationArgs(num_bits=num_bits, symmetric=True),
+        input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False),
+    )
+    layer = Linear(4, 4)
+
+    func_forward = layer.forward.__func__
+
+    # check that the forward call is overwritten
+    wrap_module_forward_quantized(layer, quantization_scheme)
+
+    assert func_forward is not layer.forward.__func__
+
+
+@pytest.mark.parametrize(
+    "quantization_status", ["INITIALIZED", "CALIBRATION", "FROZEN"]
+)
+def test_maybe_calibrate_or_quantize(create_quantization_scheme, quantization_status):
+    num_bits = 8
+    quantization_scheme = create_quantization_scheme(
+        targets=["*"],
+        weights=QuantizationArgs(num_bits=num_bits, symmetric=True),
+        input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False),
+    )
+    quantization_args = QuantizationArgs(num_bits=num_bits, symmetric=False)
+    layer = Linear(4, 4)
+    layer.weight.data *= 100
+
+    initialize_module_for_quantization(layer, quantization_scheme)
+    layer.quantization_status = QuantizationStatus(quantization_status)
+
+    if layer.quantization_status == QuantizationStatus.INITIALIZED:
+        out = maybe_calibrate_or_quantize(
+            layer, layer.weight.data, "input", quantization_args
+        )
+        assert torch.allclose(out, layer.weight.data)
+    elif layer.quantization_status == QuantizationStatus.CALIBRATION:
+        out = maybe_calibrate_or_quantize(
+            layer, layer.weight.data, "input", quantization_args
+        )
+        assert not torch.allclose(out, layer.weight.data)
+
+    elif layer.quantization_status == QuantizationStatus.FROZEN:
+        # scale and zero points are empty -- cannot quantize
+        with pytest.raises(ValueError):
+            out = maybe_calibrate_or_quantize(
+                layer, layer.weight.data, "input", quantization_args
+            )

From 10aee1f6d44553b466529acf79838a4baf91f207 Mon Sep 17 00:00:00 2001
From: George
Date: Mon, 22 Apr 2024 12:39:36 -0400
Subject: [PATCH 02/10] test frozen (#17)

* test frozen

* rename
---
 .../quantization/lifecycle/frozen.py          |  9 ++--
 .../quantization/lifecycle/test_frozen.py     | 47 +++++++++++++++++++
 2 files changed, 52 insertions(+), 4 deletions(-)
 create mode 100644 tests/sparsetensors/quantization/lifecycle/test_frozen.py

diff --git a/src/sparsetensors/quantization/lifecycle/frozen.py b/src/sparsetensors/quantization/lifecycle/frozen.py
index 63949cf5..9715a4b2 100644
--- a/src/sparsetensors/quantization/lifecycle/frozen.py
+++ b/src/sparsetensors/quantization/lifecycle/frozen.py
@@ -35,12 +35,13 @@ def freeze_module_quantization(module: Module):
         return
 
     # delete observers from module
-    observer_names = []
+    submodule_names_to_delete = set()
     for submodule_name, _ in module.named_modules():
         if "." not in submodule_name and submodule_name.endswith("_observer"):
             # delete any observers that belong directly to this module
-            observer_names.append(submodule_name)
-    for observer_name in observer_names:
-        delattr(module, observer_name)
+            submodule_names_to_delete.add(submodule_name)
+
+    for submodule_name in submodule_names_to_delete:
+        delattr(module, submodule_name)
 
     module.quantization_status = QuantizationStatus.FROZEN

diff --git a/tests/sparsetensors/quantization/lifecycle/test_frozen.py b/tests/sparsetensors/quantization/lifecycle/test_frozen.py
new file mode 100644
index 00000000..0b5a18e8
--- /dev/null
+++ b/tests/sparsetensors/quantization/lifecycle/test_frozen.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from sparsetensors.quantization.lifecycle.frozen import freeze_module_quantization
+from sparsetensors.quantization.lifecycle.initialize import (
+    initialize_module_for_quantization,
+)
+from sparsetensors.quantization.lifecycle.status import QuantizationStatus
+from sparsetensors.quantization.quant_args import QuantizationArgs
+from torch.nn import Linear
+
+
+def test_freeze_module_quantization(create_quantization_scheme):
+    num_bits = 8
+    quantization_scheme = create_quantization_scheme(
+        targets=["*"],
+        weights=QuantizationArgs(num_bits=num_bits, symmetric=True),
+        input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False),
+    )
+
+    layer = Linear(4, 4)
+
+    initialize_module_for_quantization(layer, quantization_scheme)
+    layer.quantization_status = QuantizationStatus("CALIBRATION")
+
+    # should have both input and weight observer after initializing
+    assert hasattr(layer, "input_observer")
+    assert hasattr(layer, "weight_observer")
+
+    # observers should get deleted after freezing
+    freeze_module_quantization(layer)
+    assert not hasattr(layer, "input_observer")
+    assert not hasattr(layer, "weight_observer")
+
+    assert layer.quantization_status == QuantizationStatus("FROZEN")

From 321bf845b134553f80d2017491391d7ce1ea512e Mon Sep 17 00:00:00 2001
From: George
Date: Mon, 22 Apr 2024 14:05:25 -0400
Subject: [PATCH 03/10] lifecycle conftest (#21)
---
 .../quantization/lifecycle/conftest.py        | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 tests/sparsetensors/quantization/lifecycle/conftest.py

diff --git a/tests/sparsetensors/quantization/lifecycle/conftest.py b/tests/sparsetensors/quantization/lifecycle/conftest.py
new file mode 100644
index 00000000..a8ad01b2
--- /dev/null
+++ b/tests/sparsetensors/quantization/lifecycle/conftest.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional + +import pytest +from sparsetensors.quantization.quant_args import QuantizationArgs +from sparsetensors.quantization.quant_scheme import QuantizationScheme + + +@pytest.fixture +def create_quantization_scheme(): + def quantization_scheme( + targets: List[str], + weights: Optional[QuantizationArgs] = None, + input_activations: Optional[QuantizationArgs] = None, + output_activations: Optional[QuantizationArgs] = None, + ): + return QuantizationScheme( + targets=targets, + weights=weights, + input_activations=input_activations, + output_activations=output_activations, + ) + + return quantization_scheme From 57578cc3543723d35ac78ee085e2d0e892b5e23e Mon Sep 17 00:00:00 2001 From: George Date: Wed, 24 Apr 2024 10:55:17 -0400 Subject: [PATCH 04/10] test initalize (#18) * test initalize * newline * parametrize weights and inp_act * remove dup --- .../quantization/lifecycle/test_initialize.py | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 tests/sparsetensors/quantization/lifecycle/test_initialize.py diff --git a/tests/sparsetensors/quantization/lifecycle/test_initialize.py b/tests/sparsetensors/quantization/lifecycle/test_initialize.py new file mode 100644 index 00000000..b2f01c0f --- /dev/null +++ b/tests/sparsetensors/quantization/lifecycle/test_initialize.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pytest +from sparsetensors.quantization.lifecycle.initialize import ( + initialize_module_for_quantization, +) +from sparsetensors.quantization.quant_args import QuantizationArgs +from sparsetensors.quantization.quant_config import QuantizationStatus +from torch.nn import Linear + + +NUM_BITS = 8 + + +@pytest.mark.parametrize( + "weights,input_activations", + [ + ( + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + None, + ), + ( + None, + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + ), + ( + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + ), + ], +) +def test_initialize_module_for_quantization( + create_quantization_scheme, weights, input_activations +): + quantization_scheme = create_quantization_scheme( + targets=["*"], + weights=weights, + input_activations=input_activations, + ) + layer = Linear(4, 4) + + assert not hasattr(layer, "quantization_scheme") + assert not hasattr(layer, "quantization_status") + + # add attributes, zero_points and scale + initialize_module_for_quantization(layer, quantization_scheme) + + registered_params = {"weight", "bias"} + if weights is not None: + registered_params.add("weight_scale") + registered_params.add("weight_zero_point") + + if input_activations is not None: + registered_params.add("input_scale") + registered_params.add("input_zero_point") + + for key in layer.state_dict().keys(): + assert key in registered_params + registered_params.remove(key) + + assert len(registered_params) == 0 + + assert hasattr(layer, "quantization_scheme") + assert hasattr(layer, "quantization_status") + + assert layer.quantization_status == QuantizationStatus.INITIALIZED From ecadd5245f51467f5765e34da85e273f4b82354b Mon Sep 17 00:00:00 2001 From: George Date: Thu, 25 Apr 2024 12:50:53 -0400 Subject: [PATCH 05/10] test lifecycle (#19) * test lifecycle * comments * comments * add quantization test --- .../quantization/lifecycle/test_lifecycle.py | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 tests/sparsetensors/quantization/lifecycle/test_lifecycle.py diff --git a/tests/sparsetensors/quantization/lifecycle/test_lifecycle.py b/tests/sparsetensors/quantization/lifecycle/test_lifecycle.py new file mode 100644 index 00000000..2884bde4 --- /dev/null +++ b/tests/sparsetensors/quantization/lifecycle/test_lifecycle.py @@ -0,0 +1,117 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from copy import deepcopy
+
+import torch
+from sparsetensors.quantization.lifecycle.calibration import set_module_for_calibration
+from sparsetensors.quantization.lifecycle.frozen import freeze_module_quantization
+from sparsetensors.quantization.lifecycle.initialize import (
+    initialize_module_for_quantization,
+)
+from sparsetensors.quantization.quant_args import QuantizationArgs
+from sparsetensors.quantization.quant_config import QuantizationStatus
+from torch.nn import Linear
+
+
+def test_lifecycle(create_quantization_scheme):
+    num_bits = 8
+
+    quantization_scheme = create_quantization_scheme(
+        input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False),
+        weights=QuantizationArgs(num_bits=num_bits, symmetric=True),
+        targets=["*"],
+    )
+
+    layer = Linear(4, 4)
+    layer.weight.data *= 100
+
+    # updated layer keys check
+    expected_layer_keys = {"weight", "bias"}
+    for key in layer.state_dict().keys():
+        expected_layer_keys.remove(key)
+    assert len(expected_layer_keys) == 0
+
+    # overwrite forward pass and register zero_point and scale
+    initialize_module_for_quantization(layer, quantization_scheme)
+    expected_layer_keys = {
+        "input_scale",
+        "input_zero_point",
+        "weight_scale",
+        "weight_zero_point",
+        "weight",
+        "bias",
+    }
+    for key in layer.state_dict().keys():
+        expected_layer_keys.remove(key)
+    assert len(expected_layer_keys) == 0
+
+    # should have both input and weight observer after initializing
+    assert hasattr(layer, "input_observer")
+    assert hasattr(layer, "weight_observer")
+
+    assert hasattr(layer, "quantization_scheme")
+    assert hasattr(layer, "quantization_status")
+    assert layer.quantization_status == QuantizationStatus.INITIALIZED
+
+    set_module_for_calibration(layer)
+    assert layer.quantization_status == QuantizationStatus.CALIBRATION
+
+    # do a calibration step
+    assert torch.numel(layer.input_zero_point.data) == 0
+    assert torch.numel(layer.input_scale) == 0
+    assert torch.numel(layer.weight_scale) == 0
+    assert torch.numel(layer.weight_zero_point) == 0
+
+    layer(torch.randn(4, 4))
+
+    # zero-points and scale should be updated after forward pass
+    assert torch.numel(layer.input_zero_point.data) > 0
+    assert torch.numel(layer.input_scale) > 0
+    assert torch.numel(layer.weight_scale) > 0
+    assert torch.numel(layer.weight_zero_point) > 0
+
+    # symmetric zero points should center at 0
+    assert layer.weight_zero_point.data == 0
+
+    # check high and low bound of the weights
+    assert torch.all(layer.weight.data >= -128) and torch.all(layer.weight.data <= 127)
+
+    initialized_layer = deepcopy(layer)
+
+    # calibrate the layers with each iteration
+    for _ in range(10):
+        layer(torch.randn(4, 4))
+
+    assert initialized_layer.input_zero_point != layer.input_zero_point
+    assert initialized_layer.input_scale != layer.input_scale
+    assert initialized_layer.weight_scale != layer.weight_scale
+
+    # check that fake quantization f_q(x) stays fixed after freezing, with no updates
+    input_check_for_quant = torch.randn(4, 4)
+    out_calibration = layer(input_check_for_quant)
+
+    layer_before_freeze = deepcopy(layer)
+
+    # Freeze, no update after any forward pass
+    freeze_module_quantization(layer)
+
+    for _ in range(10):
+        layer(torch.randn(4, 4))
+    assert layer_before_freeze.input_zero_point == layer.input_zero_point
+    assert layer_before_freeze.input_scale == layer.input_scale
+    assert layer_before_freeze.weight_scale == layer.weight_scale
+
+    # check that the frozen layer applies the same quantization as during calibration
+    assert torch.all(out_calibration == layer(input_check_for_quant))

From 7ad03ab5003dcfa58ef65ffa53846411a149ae76 Mon Sep 17 00:00:00 2001
From: George
Date: Thu, 25 Apr 2024 12:51:37 -0400
Subject: [PATCH 06/10] Lifecycle/min max obs (#20)

* min max test

* add minmax obs

* test scale range and min_max update
---
 .../quantization/observers/test_min_max.py    | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 tests/sparsetensors/quantization/observers/test_min_max.py

diff --git a/tests/sparsetensors/quantization/observers/test_min_max.py b/tests/sparsetensors/quantization/observers/test_min_max.py
new file mode 100644
index 00000000..a5273d02
--- /dev/null
+++ b/tests/sparsetensors/quantization/observers/test_min_max.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pytest
+import torch
+from sparsetensors.quantization.quant_args import QuantizationArgs
+
+
+@pytest.mark.parametrize(
+    "symmetric,expected_scale,expected_zero_point",
+    [
+        (True, 0.0078, 0),
+        (False, 0.0039, 0),
+    ],
+)
+def test_min_max_observer(symmetric, expected_scale, expected_zero_point):
+    tensor = torch.tensor([1, 1, 1, 1, 1])
+    num_bits = 8
+    weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric)
+
+    observer = weights.get_observer()
+    scale, zero_point = observer(tensor)
+
+    assert round(scale.item(), 4) == expected_scale
+    assert round(zero_point.item(), 4) == expected_zero_point
+
+
+def test_min_max_observer_symmetric_scale_range():
+    tensor = torch.rand(4, 4)
+    tensor *= 127
+
+    num_bits = 8
+    weights = QuantizationArgs(num_bits=num_bits, symmetric=True)
+
+    observer = weights.get_observer()
+    scale, zero_point = observer(tensor)
+
+    # if symmetric, max symmetric_range = abs(-128) / 255
+    assert round(scale.item(), 4) <= 1.0039
+    assert round(zero_point.item(), 4) == 0
+
+
+def test_min_max_observer_value_update():
+    inp = torch.tensor([1, 1, 1, 1, 1])
+    inp_update_max = torch.tensor([127, 1, 1, 1, 1])
+    inp_update_min = torch.tensor([-128, 1, 1, 1, 1])
+
+    # update the min and max twice in total
+    tensors = [
+        inp,
+        inp,
+        inp_update_max,  # update max
+        inp,
+        inp_update_min,  # update min
+    ]
+
+    tensor = inp
+    num_bits = 8
+    weights = QuantizationArgs(num_bits=num_bits, symmetric=True)
+
+    observer = weights.get_observer()
+    curr_max = 1
+    curr_min = 1
+    for i, tensor in enumerate(tensors):
+        observer(tensor)
+        curr_max = max(observer.max_val, curr_max)
+        curr_min = min(observer.min_val, curr_min)
+
+        if i < 2:
+            assert curr_max == 1
+            assert curr_min == 1
+        elif i < 4:
+            assert curr_max == 43  # (127 + 2) / 3
+            assert curr_min == 1
+        else:
+            assert curr_max == 43
+            assert curr_min == -24.8  # (-128 + 4) / 5

From 89ca72cef1ff0139e436aed38b9452ca8a481e0d Mon Sep 17 00:00:00 2001
From: George Ohashi
Date: Fri, 26 Apr 2024 20:31:25 +0000
Subject: [PATCH 07/10] rebase
---
 README.md                                     |   2 +-
 setup.py                                      |   4 +-
 src/sparsetensors/README.md                   | 162 --------
 src/sparsetensors/__init__.py                 |  21 -
 src/sparsetensors/base.py                     |  15 -
 src/sparsetensors/compressors/__init__.py     | 
19 - src/sparsetensors/compressors/base.py | 73 ---- src/sparsetensors/compressors/dense.py | 31 -- .../compressors/sparse_bitmask.py | 233 ------------ src/sparsetensors/config/__init__.py | 18 - src/sparsetensors/config/base.py | 36 -- src/sparsetensors/config/dense.py | 36 -- src/sparsetensors/config/sparse_bitmask.py | 36 -- src/sparsetensors/quantization/__init__.py | 21 - .../quantization/lifecycle/__init__.py | 22 -- .../quantization/lifecycle/apply.py | 103 ----- .../quantization/lifecycle/calibration.py | 51 --- .../quantization/lifecycle/forward.py | 135 ------- .../quantization/lifecycle/frozen.py | 47 --- .../quantization/lifecycle/initialize.py | 94 ----- .../quantization/observers/__init__.py | 19 - .../quantization/observers/base.py | 69 ---- .../quantization/observers/memoryless.py | 61 --- .../quantization/observers/min_max.py | 79 ---- src/sparsetensors/quantization/quant_args.py | 85 ----- .../quantization/quant_config.py | 154 -------- .../quantization/quant_scheme.py | 39 -- .../quantization/utils/__init__.py | 16 - .../quantization/utils/helpers.py | 115 ------ src/sparsetensors/registry/__init__.py | 17 - src/sparsetensors/registry/registry.py | 360 ------------------ src/sparsetensors/utils/__init__.py | 17 - src/sparsetensors/utils/helpers.py | 45 --- src/sparsetensors/utils/safetensors_load.py | 196 ---------- tests/quantization/__init__.py | 13 - tests/quantization/lifecycle/__init__.py | 13 - tests/quantization/lifecycle/test_apply.py | 140 ------- tests/quantization/test_quant_args.py | 55 --- tests/quantization/test_quant_config.py | 60 --- tests/quantization/test_quant_scheme.py | 51 --- .../quantization/lifecycle/conftest.py | 37 -- .../quantization/lifecycle/test_forward.py | 80 ---- .../quantization/lifecycle/test_frozen.py | 47 --- .../quantization/lifecycle/test_initialize.py | 79 ---- .../quantization/lifecycle/test_lifecycle.py | 117 ------ .../quantization/observers/test_min_max.py | 89 ----- tests/test_bitmask.py | 2 +- tests/test_registry.py | 2 +- 48 files changed, 5 insertions(+), 3211 deletions(-) delete mode 100644 src/sparsetensors/README.md delete mode 100644 src/sparsetensors/__init__.py delete mode 100644 src/sparsetensors/base.py delete mode 100644 src/sparsetensors/compressors/__init__.py delete mode 100644 src/sparsetensors/compressors/base.py delete mode 100644 src/sparsetensors/compressors/dense.py delete mode 100644 src/sparsetensors/compressors/sparse_bitmask.py delete mode 100644 src/sparsetensors/config/__init__.py delete mode 100644 src/sparsetensors/config/base.py delete mode 100644 src/sparsetensors/config/dense.py delete mode 100644 src/sparsetensors/config/sparse_bitmask.py delete mode 100644 src/sparsetensors/quantization/__init__.py delete mode 100644 src/sparsetensors/quantization/lifecycle/__init__.py delete mode 100644 src/sparsetensors/quantization/lifecycle/apply.py delete mode 100644 src/sparsetensors/quantization/lifecycle/calibration.py delete mode 100644 src/sparsetensors/quantization/lifecycle/forward.py delete mode 100644 src/sparsetensors/quantization/lifecycle/frozen.py delete mode 100644 src/sparsetensors/quantization/lifecycle/initialize.py delete mode 100644 src/sparsetensors/quantization/observers/__init__.py delete mode 100644 src/sparsetensors/quantization/observers/base.py delete mode 100644 src/sparsetensors/quantization/observers/memoryless.py delete mode 100644 src/sparsetensors/quantization/observers/min_max.py delete mode 100644 src/sparsetensors/quantization/quant_args.py delete mode 100644 
src/sparsetensors/quantization/quant_config.py delete mode 100644 src/sparsetensors/quantization/quant_scheme.py delete mode 100644 src/sparsetensors/quantization/utils/__init__.py delete mode 100644 src/sparsetensors/quantization/utils/helpers.py delete mode 100644 src/sparsetensors/registry/__init__.py delete mode 100644 src/sparsetensors/registry/registry.py delete mode 100644 src/sparsetensors/utils/__init__.py delete mode 100644 src/sparsetensors/utils/helpers.py delete mode 100644 src/sparsetensors/utils/safetensors_load.py delete mode 100644 tests/quantization/__init__.py delete mode 100644 tests/quantization/lifecycle/__init__.py delete mode 100644 tests/quantization/lifecycle/test_apply.py delete mode 100644 tests/quantization/test_quant_args.py delete mode 100644 tests/quantization/test_quant_config.py delete mode 100644 tests/quantization/test_quant_scheme.py delete mode 100644 tests/sparsetensors/quantization/lifecycle/conftest.py delete mode 100644 tests/sparsetensors/quantization/lifecycle/test_forward.py delete mode 100644 tests/sparsetensors/quantization/lifecycle/test_frozen.py delete mode 100644 tests/sparsetensors/quantization/lifecycle/test_initialize.py delete mode 100644 tests/sparsetensors/quantization/lifecycle/test_lifecycle.py delete mode 100644 tests/sparsetensors/quantization/observers/test_min_max.py diff --git a/README.md b/README.md index 53d7e9bd..fe71acb8 100644 --- a/README.md +++ b/README.md @@ -1 +1 @@ -# sparsetensors +# compressed_tensors diff --git a/setup.py b/setup.py index 303f14ad..959180b1 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ def _setup_packages() -> List: return find_packages( - "src", include=["sparsetensors", "sparsetensors.*"], exclude=["*.__pycache__.*"] + "src", include=["compressed_tensors", "compressed_tensors.*"], exclude=["*.__pycache__.*"] ) def _setup_install_requires() -> List: @@ -28,7 +28,7 @@ def _setup_extras() -> Dict: return {"dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0",]} setup( - name="sparsetensors", + name="compressed_tensors", version="0.3.0", author="Neuralmagic, Inc.", author_email="support@neuralmagic.com", diff --git a/src/sparsetensors/README.md b/src/sparsetensors/README.md deleted file mode 100644 index 833d1ec2..00000000 --- a/src/sparsetensors/README.md +++ /dev/null @@ -1,162 +0,0 @@ -# Save/Load Compressed SafeTensors - -## Motivation - -* Reduce disk space by saving in a compressed format for sparse models. Models in this compressed format will be loaded by vLLM for more efficient inference -* Set up the save/load architecture such that we can easily expand to additional compression formats in the future. 
The config should be human readable so users can understand the compression format at a quick glance
-
-## SafeTensors File Format
-
-For each parameter in the uncompressed state_dict, we store the following attributes
-needed for decompression in the compressed state_dict:
-
-* compressed tensor
-* bitmask
-* uncompressed shape
-* row offsets
-
-```python
-# dense
-{
-    PARAM_NAME: uncompressed_tensor
-}
-
-# compressed
-{
-    PARAM_NAME.compressed: compressed_tensor  # 1d tensor
-    PARAM_NAME.bitmask: value  # 2d bitmask tensor (nrows x (ncols / 8))
-    PARAM_NAME.shape: value  # uncompressed shape tensor
-    PARAM_NAME.row_offsets: value  # 1d offsets tensor
-}
-```
-
-Config information gets stored in the HF config file
-```json
-// config.json
-{
-    "sparsity_config": {
-        "format": "sparse_bitmask", // "dense_sparsity" for original tensor format
-
-        // informational
-        "sparsity_structure": "unstructured", // or 2:4, 8:16 etc...
-        "global_sparsity": "0.5"
-    }
-}
-```
-
-## Saving/Loading Interface
-
-Loading in a compressed model requires no interface changes
-
-```python
-from sparseml.transformers.utils import SparseAutoModelForCausalLM
-
-# should contain model.safetensors or model.safetensors.index.json
-model_path = "/PATH/TO/COMPRESSED_MODEL"
-
-model = SparseAutoModelForCausalLM.from_pretrained(
-    model_name_or_path=model_path,
-    **model_kwargs,
-)
-```
-
-Saving a compressed model with an explicitly provided compression config. The config
-is saved to the model's `config.json` file. **Note:** the model must have been
-initialized with SparseAutoModelForCausalLM.from_pretrained()
-
-```python
-from sparsetensors import BitmaskConfig
-
-output_dir = "/PATH/TO/SAVE/COMPRESSED_MODEL"
-sparsity_config = BitmaskConfig()
-
-model.save_pretrained(
-    save_directory=output_dir,
-    sparsity_config=sparsity_config,
-)
-```
-
-Saving a compressed model, inferring the config from the model attributes
-
-```python
-model.save_pretrained(
-    save_directory=output_dir,
-    save_compressed=True
-)
-```
-
-Saving a model in the dense format. If the model has at least 5% global sparsity a
-sparsity config will still be included in `config.json` with format `dense_sparsity`
-
-```python
-model.save_pretrained(
-    save_directory=output_dir
-)
-```
-
-Saving a model in the dense format, bypassing the sparsity config calculation. When the
-`skip_compression_stats` flag is set, no sparsity config will be written to
-`config.json`
-
-```python
-model.save_pretrained(
-    save_directory=output_dir,
-    skip_compression_stats=True
-)
-```
-
-## Enable Compression During One-Shot and Sparse Finetuning
-Models that are saved in a supported compressed format on disk will automatically be
-decompressed when loaded as input to `sparseml.transformers.oneshot` or
-`sparseml.transformers.train`
-
-To enable compression on save after oneshot or finetuning simply add the
-`save_compressed=True` argument to `sparseml.transformers.oneshot` or
-`sparseml.transformers.train`
-
-```python
-from sparseml.transformers import train
-
-train(
-    save_compressed=True,
-    model="neuralmagic/TinyLlama-1.1B-Chat-v1.0-pruned2.4",
-    recipe=RECIPE,
-    dataset=DATASET
-)
-```
-
-
-## Example Code
-
-Loads a 60% sparse model, compresses it using the inferred bitmask compression, then
-reloads the compressed model.
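Before the end-to-end example, here is a minimal hand-decompression sketch of the file format described above. It is not part of the library API; the file and parameter names (`model.safetensors`, `decoder.weight`) are placeholders for illustration.

```python
import numpy
import torch
from safetensors import safe_open

# read the stored attributes for one bitmask-compressed parameter
with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    values = f.get_tensor("decoder.weight.compressed")  # 1d non-zero values
    bitmask = f.get_tensor("decoder.weight.bitmask")    # packed bits, one byte per 8 cols
    shape = f.get_tensor("decoder.weight.shape").tolist()
    # row_offsets is only needed for random row access; full
    # decompression can ignore it

# unpack each bit back into a boolean mask of the original shape,
# then scatter the stored values into a zero tensor (row-major order)
bits = numpy.unpackbits(bitmask.numpy(), axis=-1, count=shape[-1], bitorder="little")
dense = torch.zeros(shape, dtype=values.dtype)
dense[torch.from_numpy(bits.astype(bool))] = values
```

The full end-to-end example: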
- -```python -from sparseml.transformers import SparseAutoModelForCausalLM -from sparseml.utils.pytorch.utils import measure_cuda_memory -import torch - -MODEL_PATH = "zoo:llama2-7b-open_platypus_orca_llama2_pretrain-pruned60" -OUTPUT_PATH = "./test_compress_output" -RECIPE = "zoo:llama2-7b-open_platypus_orca_llama2_pretrain-pruned60" - -torch.cuda.set_device(0) -with measure_cuda_memory() as m: - model = SparseAutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="cuda:0") -print(f"Load dense model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") - -sparsity_config = getattr(model,"sparsity_config", None) -print(f"Sparsity config before compression: {sparsity_config}") -with measure_cuda_memory() as m: - model.save_pretrained(OUTPUT_PATH, save_compressed=True) -print(f"Save compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") - -torch.cuda.set_device(1) -with measure_cuda_memory() as m: - model_again = SparseAutoModelForCausalLM.from_pretrained( - OUTPUT_PATH, device_map="cuda:1" - ) -print(f"Load compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") -sparsity_config = getattr(model_again,"sparsity_config", None) -print(f"Sparsity config after compression: {sparsity_config}") -``` diff --git a/src/sparsetensors/__init__.py b/src/sparsetensors/__init__.py deleted file mode 100644 index 0833dd42..00000000 --- a/src/sparsetensors/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .base import * - -# flake8: noqa -from .compressors import * -from .config import * -from .quantization import QuantizationConfig, QuantizationStatus -from .utils import * diff --git a/src/sparsetensors/base.py b/src/sparsetensors/base.py deleted file mode 100644 index f01a055f..00000000 --- a/src/sparsetensors/base.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -SPARSITY_CONFIG_NAME = "sparsity_config" diff --git a/src/sparsetensors/compressors/__init__.py b/src/sparsetensors/compressors/__init__.py deleted file mode 100644 index 1c7362eb..00000000 --- a/src/sparsetensors/compressors/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa - -from .base import ModelCompressor -from .dense import DenseCompressor -from .sparse_bitmask import BitmaskCompressor, BitmaskTensor diff --git a/src/sparsetensors/compressors/base.py b/src/sparsetensors/compressors/base.py deleted file mode 100644 index a82ce048..00000000 --- a/src/sparsetensors/compressors/base.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import operator -from typing import Dict, Generator, Tuple - -from sparsetensors.base import SPARSITY_CONFIG_NAME -from sparsetensors.config import CompressionConfig -from sparsetensors.registry import RegistryMixin -from torch import Tensor -from torch.nn import Module, Parameter -from tqdm import tqdm - - -__all__ = ["ModelCompressor"] - - -class ModelCompressor(RegistryMixin): - """ - Base class representing a model compression algorithm. - - :param config: config specifying compression parameters - """ - - def __init__(self, config: CompressionConfig): - self.config = config - - def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: - """ - Compresses a dense state dict - - :param model_state: state dict of uncompressed model - :return: compressed state dict - """ - raise NotImplementedError() - - def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]: - """ - Reads a compressed state dict located at model_path and returns a - generator for sequentially decompressing back to a dense state dict - - :param model_path: path to compressed safetensors model - :return: compressed state dict - """ - raise NotImplementedError() - - def overwrite_weights(self, model_path: str, model: Module): - """ - Overwrites the weights in model with weights decompressed from model_path - - :param model_path: path to compressed weights - :param model: pytorch model to load decompressed weights into - """ - dense_gen = self.decompress(model_path) - for name, data in tqdm(dense_gen, desc="Decompressing model"): - # loading the decompressed weights into the model - model_device = operator.attrgetter(name)(model).device - data_new = Parameter(data.to(model_device)) - data_old = operator.attrgetter(name)(model) - data_old.data = data_new.data - - setattr(model, SPARSITY_CONFIG_NAME, self.config) diff --git a/src/sparsetensors/compressors/dense.py b/src/sparsetensors/compressors/dense.py deleted file mode 100644 index 87f112ac..00000000 --- a/src/sparsetensors/compressors/dense.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. 
All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Dict, Generator, Tuple
-
-from sparsetensors.compressors import ModelCompressor
-from torch import Tensor
-
-
-@ModelCompressor.register(name="dense_sparsity")
-class DenseCompressor(ModelCompressor):
-    """
-    Identity compressor for dense models, returns the original state_dict
-    """
-
-    def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]:
-        return model_state
-
-    def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]:
-        return iter([])
diff --git a/src/sparsetensors/compressors/sparse_bitmask.py b/src/sparsetensors/compressors/sparse_bitmask.py
deleted file mode 100644
index 3043e43b..00000000
--- a/src/sparsetensors/compressors/sparse_bitmask.py
+++ /dev/null
@@ -1,233 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from typing import Dict, Generator, List, Tuple, Union
-
-import numpy
-import torch
-from safetensors import safe_open
-from sparsetensors.compressors import ModelCompressor
-from sparsetensors.utils import get_nested_weight_mappings, merge_names
-from torch import Tensor
-from tqdm import tqdm
-
-
-__all__ = [
-    "BitmaskCompressor",
-    "BitmaskTensor",
-    "bitmask_compress",
-    "bitmask_decompress",
-    "pack_bitmasks",
-    "unpack_bitmasks",
-]
-
-_LOGGER: logging.Logger = logging.getLogger(__name__)
-
-
-@ModelCompressor.register(name="sparse_bitmask")
-class BitmaskCompressor(ModelCompressor):
-    """
-    Compression for sparse models using bitmasks. Non-zero weights are stored in a 1d
-    values tensor, with their locations stored in a 2d bitmask
-    """
-
-    COMPRESSION_PARAM_NAMES = ["shape", "compressed", "bitmask", "row_offsets"]
-
-    def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]:
-        """
-        Compresses a dense state dict using bitmask compression
-
-        :param model_state: state dict of uncompressed model
-        :return: compressed state dict
-        """
-        compressed_dict = {}
-        _LOGGER.debug(
-            f"Compressing model with {len(model_state)} parameterized layers..."
-        )
-        for name, value in tqdm(model_state.items(), desc="Compressing model"):
-            bitmask_tensor = BitmaskTensor.from_dense(value)
-            bitmask_dict = bitmask_tensor.dict(name_prefix=name, device="cpu")
-            for key in bitmask_dict.keys():
-                if key in compressed_dict:
-                    _LOGGER.warning(
-                        f"Expected all compressed state_dict keys to be unique, but "
-                        f"found an existing entry for {key}.
The existing entry will " - "be replaced." - ) - compressed_dict |= bitmask_dict - - return compressed_dict - - def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]: - """ - Reads a bitmask compressed state dict located at model_path and returns a - generator for sequentially decompressing back to a dense state dict - - :param model_path: path to compressed safetensors model - :return: iterator for generating decompressed weights - """ - weight_mappings = get_nested_weight_mappings( - model_path, self.COMPRESSION_PARAM_NAMES - ) - for weight_name in weight_mappings.keys(): - weight_data = {} - for param_name, safe_path in weight_mappings[weight_name].items(): - full_name = merge_names(weight_name, param_name) - with safe_open(safe_path, framework="pt", device="cpu") as f: - weight_data[param_name] = f.get_tensor(full_name) - data = BitmaskTensor(**weight_data) - decompressed = data.decompress() - yield weight_name, decompressed - - -class BitmaskTensor: - """ - Owns compressions and decompression for a single bitmask compressed tensor. - Adapted from: https://github.com/mgoin/torch_bitmask/tree/main - - :param shape: shape of dense tensor - :compressed: flat tensor of non-zero values - :bitmask: 2d bitmask of non-zero values - :row_offsets: flat tensor indicating what index in values each dense row starts at - """ - - def __init__( - self, - shape: Union[torch.Size, List], - compressed: Tensor, - bitmask: Tensor, - row_offsets: Tensor, - ): - self.shape = list(shape) - self.compressed = compressed - self.bitmask = bitmask - self.row_offsets = row_offsets - - @staticmethod - def from_dense(tensor: Tensor) -> "BitmaskTensor": - """ - :param tensor: dense tensor to compress - :return: instantiated compressed tensor - """ - shape = tensor.shape - compressed, bitmask, row_offsets = bitmask_compress(tensor.cpu()) - return BitmaskTensor( - shape=shape, compressed=compressed, bitmask=bitmask, row_offsets=row_offsets - ) - - def decompress(self) -> Tensor: - """ - :return: reconstructed dense tensor - """ - return bitmask_decompress(self.compressed, self.bitmask, self.shape) - - def curr_memory_size_bytes(self): - """ - :return: size in bytes required to store compressed tensor on disk - """ - - def sizeof_tensor(a): - return a.element_size() * a.nelement() - - return ( - sizeof_tensor(self.compressed) - + sizeof_tensor(self.bitmask) - + sizeof_tensor(self.row_offsets) - ) - - def dict(self, name_prefix: str, device: str = "cpu") -> Dict[str, Tensor]: - """ - :name_prefix: name of original tensor to store compressed weight as - :return: dict of compressed data for the stored weight - """ - return { - merge_names(name_prefix, "shape"): torch.tensor(self.shape, device=device), - merge_names(name_prefix, "compressed"): self.compressed.to(device), - merge_names(name_prefix, "bitmask"): self.bitmask.to(device), - merge_names(name_prefix, "row_offsets"): self.row_offsets.to(device), - } - - def __repr__(self): - return f"BitmaskTensor(shape={self.shape}, compressed=True)" - - -def bitmask_compress(tensor: Tensor) -> Tuple[Tensor, Tensor, Tensor]: - """ - Compresses a dense tensor using bitmask compression - - :param tensor: dense tensor to compress - :return: tuple of compressed data representing tensor - """ - bytemasks = tensor != 0 - row_counts = bytemasks.sum(dim=-1) - row_offsets = torch.cumsum(row_counts, 0) - row_counts - values = tensor[bytemasks] - bitmasks_packed = pack_bitmasks(bytemasks) - - return values, bitmasks_packed, row_offsets - - -def bitmask_decompress( 
-    values: Tensor, bitmasks: Tensor, original_shape: torch.Size
-) -> Tensor:
-    """
-    Reconstructs a dense tensor from a compressed one
-
-    :param values: 1d tensor of non-zero values
-    :param bitmasks: 2d int8 tensor flagging locations of non-zero values in the
-        tensor's original shape
-    :param original_shape: shape of the dense tensor
-    :return: decompressed dense tensor
-    """
-    bytemasks_unpacked = unpack_bitmasks(bitmasks, original_shape)
-
-    decompressed_tensor = torch.zeros(original_shape, dtype=values.dtype)
-    decompressed_tensor[bytemasks_unpacked] = values
-
-    return decompressed_tensor
-
-
-def pack_bitmasks(bytemasks: Tensor) -> Tensor:
-    """
-    Converts a bytemask tensor to a bitmask tensor to reduce memory. Shape RxC will be
-    compressed to R x ceil(C/8)
-    :param bytemasks: mask tensor where each byte corresponds to a weight
-    :return: mask tensor where each bit corresponds to a weight
-    """
-    packed_bits_numpy = numpy.packbits(bytemasks.numpy(), axis=-1, bitorder="little")
-    packed_bits_torch = torch.from_numpy(packed_bits_numpy)
-
-    return packed_bits_torch
-
-
-def unpack_bitmasks(packed_bitmasks: Tensor, original_shape: torch.Size) -> Tensor:
-    """
-    Converts a bitmask tensor back to a bytemask tensor for use during decompression
-
-    :param packed_bitmasks: mask tensor where each bit corresponds to a weight
-    :param original_shape: dense shape to decompress to
-    :return: boolean mask of weights in the original dense shape
-    """
-    # Unpack the bits
-    unpacked_bits = numpy.unpackbits(
-        packed_bitmasks.numpy(), axis=-1, count=original_shape[-1], bitorder="little"
-    )
-
-    # Reshape to match the original shape
-    unpacked_bitmasks_torch = torch.from_numpy(
-        unpacked_bits.reshape(original_shape).astype(bool)
-    )
-
-    return unpacked_bitmasks_torch
diff --git a/src/sparsetensors/config/__init__.py b/src/sparsetensors/config/__init__.py
deleted file mode 100644
index ff83f5af..00000000
--- a/src/sparsetensors/config/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# flake8: noqa
-from .base import *
-from .dense import *
-from .sparse_bitmask import *
diff --git a/src/sparsetensors/config/base.py b/src/sparsetensors/config/base.py
deleted file mode 100644
index 8af48bd9..00000000
--- a/src/sparsetensors/config/base.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
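An aside on the `pack_bitmasks`/`unpack_bitmasks` helpers in `sparse_bitmask.py` above: a tiny round-trip sketch, runnable against the module as it stood before this rename (the annotated values are worked out by hand from the little-endian bit packing):

```python
import torch
from sparsetensors.compressors.sparse_bitmask import bitmask_compress, bitmask_decompress

t = torch.tensor([[0.0, 1.5, 0.0, -2.0],
                  [0.0, 0.0, 0.0, 3.0]])
values, bitmask, row_offsets = bitmask_compress(t)
# values      -> tensor([ 1.5000, -2.0000,  3.0000])   row-major non-zeros
# bitmask     -> one packed byte per row: 0b00001010 (cols 1, 3) and 0b00001000 (col 3)
# row_offsets -> tensor([0, 2]); row i's values start at values[row_offsets[i]]
assert torch.equal(bitmask_decompress(values, bitmask, t.shape), t)
```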
- -from typing import Optional - -from pydantic import BaseModel -from sparsetensors.registry import RegistryMixin - - -__all__ = ["CompressionConfig"] - - -class CompressionConfig(RegistryMixin, BaseModel): - """ - Base data class for storing compression parameters - - :param format: name of compression format - :param global_sparsity: average sparsity of the entire model - :param sparsity_structure: structure of the sparsity, such as - "unstructured", "2:4", "8:16" etc - """ - - format: str - global_sparsity: Optional[float] = 0.0 - sparsity_structure: Optional[str] = "unstructured" diff --git a/src/sparsetensors/config/dense.py b/src/sparsetensors/config/dense.py deleted file mode 100644 index a37be308..00000000 --- a/src/sparsetensors/config/dense.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional - -from sparsetensors.config import CompressionConfig - - -__all__ = ["DenseSparsityConfig"] - - -@CompressionConfig.register(name="dense_sparsity") -class DenseSparsityConfig(CompressionConfig): - """ - Identity configuration for storing a sparse model in - an uncompressed dense format - - :param global_sparsity: average sparsity of the entire model - :param sparsity_structure: structure of the sparsity, such as - "unstructured", "2:4", "8:16" etc - """ - - format: str = "dense_sparsity" - global_sparsity: Optional[float] = 0.0 - sparsity_structure: Optional[str] = "unstructured" diff --git a/src/sparsetensors/config/sparse_bitmask.py b/src/sparsetensors/config/sparse_bitmask.py deleted file mode 100644 index d17c6a1a..00000000 --- a/src/sparsetensors/config/sparse_bitmask.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Optional - -from sparsetensors.config.base import CompressionConfig - - -__all__ = ["BitmaskConfig"] - - -@CompressionConfig.register(name="sparse_bitmask") -class BitmaskConfig(CompressionConfig): - """ - Configuration for storing a sparse model using - bitmask compression - - :param global_sparsity: average sparsity of the entire model - :param sparsity_structure: structure of the sparsity, such as - "unstructured", "2:4", "8:16" etc - """ - - format: str = "sparse_bitmask" - global_sparsity: Optional[float] = 0.0 - sparsity_structure: Optional[str] = "unstructured" diff --git a/src/sparsetensors/quantization/__init__.py b/src/sparsetensors/quantization/__init__.py deleted file mode 100644 index 9fde69a3..00000000 --- a/src/sparsetensors/quantization/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -# isort: skip_file - -from .quant_args import * -from .quant_config import * -from .quant_scheme import * -from .lifecycle import * diff --git a/src/sparsetensors/quantization/lifecycle/__init__.py b/src/sparsetensors/quantization/lifecycle/__init__.py deleted file mode 100644 index 9504597b..00000000 --- a/src/sparsetensors/quantization/lifecycle/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -# isort: skip_file - -from .calibration import * -from .forward import * -from .frozen import * -from .initialize import * -from .apply import * diff --git a/src/sparsetensors/quantization/lifecycle/apply.py b/src/sparsetensors/quantization/lifecycle/apply.py deleted file mode 100644 index ac238564..00000000 --- a/src/sparsetensors/quantization/lifecycle/apply.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
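A quick sketch of how the registered config classes above serialize into the `sparsity_config` block shown in the README, via pydantic's `.dict()` (pre-rename import path, `format` filled in by the class default):

```python
from sparsetensors.config import BitmaskConfig

config = BitmaskConfig(global_sparsity=0.5, sparsity_structure="unstructured")
print(config.dict())
# {'format': 'sparse_bitmask', 'global_sparsity': 0.5,
#  'sparsity_structure': 'unstructured'}
```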
- -import re -from collections import OrderedDict -from typing import Iterable, Optional - -from sparsetensors.quantization.lifecycle.calibration import set_module_for_calibration -from sparsetensors.quantization.lifecycle.frozen import freeze_module_quantization -from sparsetensors.quantization.lifecycle.initialize import ( - initialize_module_for_quantization, -) -from sparsetensors.quantization.quant_config import ( - QuantizationConfig, - QuantizationStatus, -) -from sparsetensors.quantization.utils import iter_named_leaf_modules -from torch.nn import Module - - -__all__ = [ - "apply_quantization_config", - "apply_quantization_status", -] - - -def apply_quantization_config(model: Module, config: QuantizationConfig): - """ - Initializes the model for quantization in-place based on the given config - - :param model: model to apply quantization config to - :param config: quantization config - """ - # build mapping of targets to schemes for easier matching - # use ordered dict to preserve target ordering in config - target_to_scheme = OrderedDict() - for scheme in config.config_groups.values(): - for target in scheme.targets: - target_to_scheme[target] = scheme - - # mark appropriate layers for quantization by setting their quantization schemes - for name, submodule in iter_named_leaf_modules(model): - if _find_first_name_or_class_match(name, submodule, config.ignore): - continue # layer matches ignore list, continue - target = _find_first_name_or_class_match(name, submodule, target_to_scheme) - if target is not None: - # target matched - add layer and scheme to target list - submodule.quantization_scheme = target_to_scheme[target] - - # apply current quantization status across all targeted layers - apply_quantization_status(model, config.quantization_status) - - -def apply_quantization_status(model: Module, status: QuantizationStatus): - """ - Applies in place the quantization lifecycle up to the given status - - :param model: model to apply quantization to - :param status: status to update the module to - """ - if status >= QuantizationStatus.INITIALIZED: - model.apply(initialize_module_for_quantization) - if status >= QuantizationStatus.CALIBRATION: - model.apply(set_module_for_calibration) - if status >= QuantizationStatus.FROZEN: - model.apply(freeze_module_quantization) - - -def _find_first_name_or_class_match( - name: str, - module: Module, - targets: Iterable[str], -) -> Optional[str]: - # first element of targets that matches the given name - # if no name matches returns first target that matches the class name - # returns None otherwise - return _find_first_match(name, targets) or _find_first_match( - module.__class__.__name__, targets - ) - - -def _find_first_match(value: str, targets: Iterable[str]) -> Optional[str]: - # returns first element of target that matches value either - # exactly or as a regex after 're:' - for target in targets: - if target.startswith("re:"): - pattern = target[3:] - if re.match(pattern, value): - return target - elif target == value: - return target - return None diff --git a/src/sparsetensors/quantization/lifecycle/calibration.py b/src/sparsetensors/quantization/lifecycle/calibration.py deleted file mode 100644 index 51c594fb..00000000 --- a/src/sparsetensors/quantization/lifecycle/calibration.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import logging
-
-from sparsetensors.quantization.quant_config import QuantizationStatus
-from torch.nn import Module
-
-
-__all__ = [
-    "set_module_for_calibration",
-]
-
-
-_LOGGER = logging.getLogger(__name__)
-
-
-def set_module_for_calibration(module: Module):
-    """
-    Marks a layer as ready for calibration, which activates observers
-    to update scales and zero points on each forward pass
-
-    apply to full model with `model.apply(set_module_for_calibration)`
-
-    :param module: module to set for calibration
-    """
-    if not getattr(module, "quantization_scheme", None):
-        # no quantization scheme nothing to do
-        return
-    status = getattr(module, "quantization_status", None)
-    if not status or status != QuantizationStatus.INITIALIZED:
-        # warn rather than raise so the caller can still attempt calibration
-        _LOGGER.warning(
-            f"Attempting to set module with status {status} to calibration mode, "
-            f"but status is not {QuantizationStatus.INITIALIZED} - you may "
-            "be calibrating an uninitialized module, which may fail, or attempting "
-            "to re-calibrate a frozen module"
-        )
-
-    module.quantization_status = QuantizationStatus.CALIBRATION
diff --git a/src/sparsetensors/quantization/lifecycle/forward.py b/src/sparsetensors/quantization/lifecycle/forward.py
deleted file mode 100644
index 5e6036ea..00000000
--- a/src/sparsetensors/quantization/lifecycle/forward.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
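The calibration helper above is one stage of the quantization lifecycle. A minimal sketch of walking a single layer through the stages in order, using only APIs removed in this patch (layer size and input are illustrative):

```python
import torch
from sparsetensors.quantization.lifecycle.calibration import set_module_for_calibration
from sparsetensors.quantization.lifecycle.frozen import freeze_module_quantization
from sparsetensors.quantization.lifecycle.initialize import (
    initialize_module_for_quantization,
)
from sparsetensors.quantization.quant_args import QuantizationArgs
from sparsetensors.quantization.quant_scheme import QuantizationScheme
from torch.nn import Linear

scheme = QuantizationScheme(
    targets=["Linear"],
    weights=QuantizationArgs(num_bits=8, symmetric=True),
    input_activations=QuantizationArgs(num_bits=8, symmetric=False),
)
layer = Linear(4, 4)

initialize_module_for_quantization(layer, scheme)  # attach scales/zero points/observers
set_module_for_calibration(layer)  # observers now update on every forward pass
layer(torch.randn(1, 4))  # calibration forward pass
freeze_module_quantization(layer)  # delete observers, lock quantization params
```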
-
-from functools import wraps
-
-import torch
-from sparsetensors.quantization.quant_args import QuantizationArgs
-from sparsetensors.quantization.quant_config import QuantizationStatus
-from sparsetensors.quantization.quant_scheme import QuantizationScheme
-from torch.nn import Module
-
-
-__all__ = [
-    "wrap_module_forward_quantized",
-    "quantize",
-    "dequantize",
-    "fake_quantize",
-    "maybe_calibrate_or_quantize",
-]
-
-
-@torch.no_grad()
-def quantize(
-    x: torch.Tensor,
-    scale: torch.Tensor,
-    zero_point: torch.Tensor,
-    q_max: torch.Tensor,
-) -> torch.Tensor:
-    # affine quantization: q = clamp(round(x / scale + zero_point), 0, q_max)
-    return torch.clamp(
-        torch.round(
-            x / scale + zero_point,
-        ),
-        0,
-        q_max,
-    )
-
-
-@torch.no_grad()
-def dequantize(
-    x_q: torch.Tensor,
-    scale: torch.Tensor,
-    zero_point: torch.Tensor,
-) -> torch.Tensor:
-    # inverse of quantize: x = (q - zero_point) * scale
-    return (x_q - zero_point) * scale
-
-
-@torch.no_grad()
-def fake_quantize(
-    x: torch.Tensor,
-    scale: torch.Tensor,
-    zero_point: torch.Tensor,
-    args: QuantizationArgs,
-) -> torch.Tensor:
-    # quantize then dequantize to simulate quantization error in full precision
-    max_q = torch.tensor(2**args.num_bits - 1, device=x.device)
-    Q = quantize(x, scale, zero_point, max_q)
-    return dequantize(Q, scale, zero_point)
-
-
-def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
-    # expects a module already initialized and injected with the parameters in
-    # initialize_module_for_quantization
-    forward_func_orig = module.forward.__func__
-
-    @wraps(forward_func_orig)  # ensures docstring, names, etc are propagated
-    def wrapped_forward(self, *args, **kwargs):
-        input_ = args[0]
-
-        if scheme.input_activations is not None:
-            # calibrate and (fake) quantize input activations when applicable
-            input_ = maybe_calibrate_or_quantize(
-                module, input_, "input", scheme.input_activations
-            )
-
-        if scheme.weights is not None:
-            # calibrate and (fake) quantize weights when applicable
-            self.weight.data = maybe_calibrate_or_quantize(
-                module, self.weight, "weight", scheme.weights
-            )
-
-        # perform wrapped forward call
-        output = forward_func_orig.__get__(module, module.__class__)(
-            input_, *args[1:], **kwargs
-        )
-
-        if scheme.output_activations is not None:
-            # calibrate and (fake) quantize output activations when applicable
-            output = maybe_calibrate_or_quantize(
-                module, output, "output", scheme.output_activations
-            )
-
-        return output
-
-    # bind wrapped forward to module class so reference to `self` is correct
-    bound_wrapped_forward = wrapped_forward.__get__(module, module.__class__)
-    # set forward to wrapped forward
-    setattr(module, "forward", bound_wrapped_forward)
-
-
-def maybe_calibrate_or_quantize(
-    module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs"
-) -> torch.Tensor:
-    # only run quantized for the included stages
-    if module.quantization_status not in {
-        QuantizationStatus.CALIBRATION,
-        QuantizationStatus.FROZEN,
-    }:
-        return value
-
-    device = next(module.parameters()).device
-    scale = getattr(module, f"{base_name}_scale")
-    zero_point = getattr(module, f"{base_name}_zero_point")
-
-    if module.quantization_status == QuantizationStatus.CALIBRATION:
-        # get observer and get new quant params from observation
-        observer = getattr(module, f"{base_name}_observer")
-        updated_scale, updated_zero_point = observer(value)
-
-        # update scale and zero point
-        scale.data = updated_scale.to(device)
-        zero_point.data = updated_zero_point.to(device)
-
-    return fake_quantize(value, scale, zero_point, args)
diff --git 
a/src/sparsetensors/quantization/lifecycle/frozen.py b/src/sparsetensors/quantization/lifecycle/frozen.py
deleted file mode 100644
index 9715a4b2..00000000
--- a/src/sparsetensors/quantization/lifecycle/frozen.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from sparsetensors.quantization.quant_config import QuantizationStatus
-from torch.nn import Module
-
-
-__all__ = [
-    "freeze_module_quantization",
-]
-
-
-def freeze_module_quantization(module: Module):
-    """
-    Deletes observers so that static quantization is completed.
-
-    apply to full model with `model.apply(freeze_module_quantization)`
-
-    :param module: module to freeze quantization for
-    """
-    if not getattr(module, "quantization_scheme", None):
-        # no quantization scheme nothing to do
-        return
-
-    # delete observers from module
-    submodule_names_to_delete = set()
-    for submodule_name, _ in module.named_modules():
-        if "." not in submodule_name and submodule_name.endswith("_observer"):
-            # delete any observers that belong directly to this module
-            submodule_names_to_delete.add(submodule_name)
-
-    for submodule_name in submodule_names_to_delete:
-        delattr(module, submodule_name)
-
-    module.quantization_status = QuantizationStatus.FROZEN
diff --git a/src/sparsetensors/quantization/lifecycle/initialize.py b/src/sparsetensors/quantization/lifecycle/initialize.py
deleted file mode 100644
index aa6e3994..00000000
--- a/src/sparsetensors/quantization/lifecycle/initialize.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
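For reference, the affine round trip implemented by `quantize()`/`dequantize()` in forward.py above, worked with illustrative numbers (the scale and zero point are assumed, not produced by an observer):

```python
import torch

# q = clamp(round(x / scale + zero_point), 0, 2**num_bits - 1)
# x_hat = (q - zero_point) * scale
x = torch.tensor([-1.0, -0.25, 0.0, 0.5, 1.0])
scale = torch.tensor(2.0 / 255)  # assumed dynamic range [-1, 1] at 8 bits
zero_point = torch.tensor(128)
q_max = torch.tensor(255)

q = torch.clamp(torch.round(x / scale + zero_point), 0, q_max)
x_hat = (q - zero_point) * scale
assert torch.allclose(x, x_hat, atol=1e-2)  # error stays within one step
```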
-
-
-import logging
-from typing import Optional
-
-import torch
-from sparsetensors.quantization.lifecycle.forward import wrap_module_forward_quantized
-from sparsetensors.quantization.quant_args import QuantizationArgs
-from sparsetensors.quantization.quant_config import QuantizationStatus
-from sparsetensors.quantization.quant_scheme import QuantizationScheme
-from torch.nn import Module, Parameter
-
-
-__all__ = [
-    "initialize_module_for_quantization",
-]
-
-
-_LOGGER = logging.getLogger(__name__)
-
-
-def initialize_module_for_quantization(
-    module: Module,
-    scheme: Optional[QuantizationScheme] = None,
-):
-    """
-    Attaches appropriate scales, zero points, and observers to a layer
-    given its target quantization scheme
-
-    apply to full model with `model.apply(initialize_module_for_quantization)`
-
-    :param module: module to initialize for quantization
-    :param scheme: scheme to use for quantization. If None is provided,
-        will attempt to use the scheme stored in the module under
-        `quantization_scheme`; if no scheme is found there either, the
-        layer will be skipped
-    """
-    scheme = scheme or getattr(module, "quantization_scheme", None)
-    if scheme is None:
-        # no scheme passed and layer not targeted for quantization - skip
-        return
-
-    if scheme.input_activations is not None:
-        _initialize_scale_zero_point_observer(module, "input", scheme.input_activations)
-    if scheme.weights is not None:
-        if hasattr(module, "weight"):
-            _initialize_scale_zero_point_observer(module, "weight", scheme.weights)
-        else:
-            _LOGGER.warning(
-                f"module type {type(module)} targeted for weight quantization but "
-                "has no attribute weight, skipping weight quantization "
-                f"for {type(module)}"
-            )
-    if scheme.output_activations is not None:
-        _initialize_scale_zero_point_observer(
-            module, "output", scheme.output_activations
-        )
-
-    module.quantization_scheme = scheme
-    module.quantization_status = QuantizationStatus.INITIALIZED
-
-    # wrap forward call of module to perform quantized actions based on calltime status
-    wrap_module_forward_quantized(module, scheme)
-
-
-def _initialize_scale_zero_point_observer(
-    module: Module, base_name: str, quantization_args: QuantizationArgs
-):
-    device = next(module.parameters()).device
-
-    # initializes empty scale and zero point parameters for the module
-    init_scale = Parameter(torch.empty(0, device=device), requires_grad=False)
-    module.register_parameter(f"{base_name}_scale", init_scale)
-
-    init_zero_point = Parameter(
-        torch.empty(0, device=device, dtype=int), requires_grad=False
-    )
-    module.register_parameter(f"{base_name}_zero_point", init_zero_point)
-
-    # initialize observer module and attach as submodule
-    observer = quantization_args.get_observer()
-    module.register_module(f"{base_name}_observer", observer)
diff --git a/src/sparsetensors/quantization/observers/__init__.py b/src/sparsetensors/quantization/observers/__init__.py
deleted file mode 100644
index d0362b8f..00000000
--- a/src/sparsetensors/quantization/observers/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# flake8: noqa
-
-from .base import *
-from .memoryless import *
-from .min_max import *
diff --git a/src/sparsetensors/quantization/observers/base.py b/src/sparsetensors/quantization/observers/base.py
deleted file mode 100644
index 52a464b9..00000000
--- a/src/sparsetensors/quantization/observers/base.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Tuple
-
-from sparsetensors.quantization.quant_args import QuantizationArgs
-from sparsetensors.registry.registry import RegistryMixin
-from torch import FloatTensor, IntTensor, Tensor
-from torch.nn import Module
-
-
-__all__ = ["Observer"]
-
-
-class Observer(Module, RegistryMixin):
-    """
-    Base Observer class to be subclassed for specific implementation.
-    Subclasses should override `calculate_qparams` to return a scale, zero_point
-    pair
-    """
-
-    def __init__(self, quantization_args: QuantizationArgs):
-        self.quantization_args: QuantizationArgs = quantization_args
-        super().__init__()
-        self._scale = None
-        self._zero_point = None
-
-    def forward(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
-        """
-        maps directly to get_qparams
-        :param observed: optional observed tensor to calculate quantization parameters
-            from
-        :return: tuple of scale and zero point based on last observed value
-        """
-        return self.get_qparams(observed=observed)
-
-    def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
-        """
-        :param observed: observed tensor to calculate quantization parameters for
-        :return: tuple of scale and zero point derived from the observed tensor
-        """
-        raise NotImplementedError(f"{self.__class__} must implement calculate_qparams")
-
-    def get_qparams(
-        self, observed: Optional[Tensor] = None
-    ) -> Tuple[FloatTensor, IntTensor]:
-        """
-        Convenience function to wrap overwritten calculate_qparams
-        adds support to make observed tensor optional and support for tracking latest
-        calculated scale and zero point
-        :param observed: optional observed tensor to calculate quantization parameters
-            from
-        :return: tuple of scale and zero point based on last observed value
-        """
-        if observed is not None:
-            # re-calculate scale and zero point, update the stored value
-            self._scale, self._zero_point = self.calculate_qparams(observed)
-        return self._scale, self._zero_point
diff --git a/src/sparsetensors/quantization/observers/memoryless.py b/src/sparsetensors/quantization/observers/memoryless.py
deleted file mode 100644
index 5fd92a6e..00000000
--- a/src/sparsetensors/quantization/observers/memoryless.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Tuple - -import torch -from sparsetensors.quantization.observers.base import Observer -from torch import FloatTensor, IntTensor, Tensor - - -__all__ = ["MemorylessObserver"] - - -@Observer.register("memoryless") -class MemorylessObserver(Observer): - """ - Implements a dynamic quantization observer that sets the scale and - zero point based on the latest observed value - """ - - def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]: - """ - :param observed: observed tensor to calculate quantization parameters for - :return: tuple of scale and zero point derived from the observed tensor - """ - # TODO: Add support for full range of quantization Args, only supports 8bit - # per tensor - bit_range = 255 - min_val = observed.min() - max_val = observed.max() - - # ensure zero is in the range - min_val = torch.min(min_val, torch.zeros_like(min_val)) - max_val = torch.max(max_val, torch.zeros_like(max_val)) - - if self.quantization_args.symmetric: - symmetric_range = 2 * max(min_val.abs(), max_val.abs()) - scale = symmetric_range / bit_range - zero_point = torch.tensor(0).to(torch.int8) - else: - # non-symmetric - observed_range = max_val - min_val - scale = observed_range / bit_range - - # scales from a 0 range should be set to 1 - scale[observed_range == 0] = 1 - - zero_point = ((0 - min_val) / scale).to(torch.int8) - - return scale, zero_point diff --git a/src/sparsetensors/quantization/observers/min_max.py b/src/sparsetensors/quantization/observers/min_max.py deleted file mode 100644 index e73805b4..00000000 --- a/src/sparsetensors/quantization/observers/min_max.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
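A quick usage sketch of the memoryless observer above; the observed tensor is illustrative, and every call recomputes the quantization params from only that tensor:

```python
import torch
from sparsetensors.quantization.observers.memoryless import MemorylessObserver
from sparsetensors.quantization.quant_args import QuantizationArgs

observer = MemorylessObserver(
    quantization_args=QuantizationArgs(num_bits=8, symmetric=False)
)
scale, zero_point = observer(torch.randn(64, 64))  # no state kept between calls
```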
-
-from typing import Tuple
-
-import torch
-from sparsetensors.quantization.observers.base import Observer
-from sparsetensors.quantization.quant_args import QuantizationArgs
-from torch import FloatTensor, IntTensor, Tensor
-
-
-__all__ = ["MinMaxObserver"]
-
-
-@Observer.register("minmax")
-class MinMaxObserver(Observer):
-    """
-    Implements a quantization observer that calculates scale and zero point based
-    on a running average of the min and max values observed across forward passes
-    """
-
-    def __init__(self, quantization_args: QuantizationArgs):
-        super().__init__(quantization_args=quantization_args)
-
-        self.min_val = float("inf")
-        self.max_val = -float("inf")
-        self.counter = 0
-
-    def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
-        """
-        :param observed: observed tensor to calculate quantization parameters for
-        :return: tuple of scale and zero point derived from the observed tensor
-        """
-        # TODO: Add support for full range of quantization Args, only supports 8bit
-        #       per tensor
-        bit_range = 255
-        min_val = torch.tensor([observed.min()])
-        max_val = torch.tensor([observed.max()])
-
-        # update running average
-        if self.counter > 0:
-            self.min_val = (self.min_val * self.counter + min_val) / (self.counter + 1)
-            self.max_val = (self.max_val * self.counter + max_val) / (self.counter + 1)
-        else:
-            self.min_val = min_val
-            self.max_val = max_val
-
-        # ensure that the zeros are in the range
-        min_val = torch.min(self.min_val, torch.zeros_like(self.min_val))
-        max_val = torch.max(self.max_val, torch.zeros_like(self.max_val))
-
-        self.counter += 1
-
-        if self.quantization_args.symmetric:
-            symmetric_range = 2 * max(min_val.abs(), max_val.abs())
-            scale = symmetric_range / bit_range
-            zero_point = torch.tensor(0).to(torch.int8)
-        else:
-            # non-symmetric
-            observed_range = max_val - min_val
-            scale = observed_range / bit_range
-
-            # scales from a 0 range should be set to 1
-            scale[observed_range == 0] = 1
-
-            zero_point = ((0 - min_val) / scale).to(torch.int8)
-
-        return scale, zero_point
diff --git a/src/sparsetensors/quantization/quant_args.py b/src/sparsetensors/quantization/quant_args.py
deleted file mode 100644
index 76bd61f0..00000000
--- a/src/sparsetensors/quantization/quant_args.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
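Unlike the memoryless observer, the min-max observer above averages the observed min and max across calls; a short sketch with illustrative inputs:

```python
import torch
from sparsetensors.quantization.observers.min_max import MinMaxObserver
from sparsetensors.quantization.quant_args import QuantizationArgs

observer = MinMaxObserver(
    quantization_args=QuantizationArgs(num_bits=8, symmetric=True)
)
for _ in range(4):
    # min_val/max_val track a running average across observed batches
    scale, zero_point = observer(torch.randn(16, 16))
```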
-
-from enum import Enum
-from typing import Any, Dict, Optional
-
-from pydantic import BaseModel, Field
-
-
-__all__ = ["QuantizationType", "QuantizationStrategy", "QuantizationArgs"]
-
-
-class QuantizationType(str, Enum):
-    """
-    Enum storing quantization type options
-    """
-
-    INT = "int"
-    FLOAT = "float"
-
-
-class QuantizationStrategy(str, Enum):
-    """
-    Enum storing quantization strategy options
-    """
-
-    TENSOR = "tensor"
-    CHANNEL = "channel"
-    GROUP = "group"
-    BLOCK = "block"
-
-
-class QuantizationArgs(BaseModel):
-    """
-    User facing arguments used to define a quantization config for weights or
-    activations
-
-    :param num_bits: quantization bit depth
-    :param type: dtype to quantize to, either int or float
-    :param symmetric: whether or not quantization scale is symmetric about zero-point
-    :param strategy: string id determining the scope of scale/zero-point to apply
-    :param group_size: group length to use for the group strategy
-    :param block_structure: 2d block structure to use for the block strategy, must be
-        of the format "2x4", "8x16", etc.
-    """
-
-    num_bits: int = 8
-    type: QuantizationType = QuantizationType.INT
-    symmetric: bool = True
-    strategy: QuantizationStrategy = QuantizationStrategy.TENSOR
-    group_size: Optional[int] = None
-    block_structure: Optional[str] = None
-    observer: str = Field(
-        default="minmax",
-        description=(
-            "The class to use to compute the quantization params - "
-            "scale and zero-point"
-        ),
-    )
-    observer_kwargs: Dict[str, Any] = Field(
-        default_factory=dict,
-        description=(
-            "optional dict of kwargs to be passed directly to the Observer "
-            "constructor excluding quantization range or symmetry"
-        ),
-    )
-
-    def get_observer(self):
-        """
-        :return: Observer built based on these QuantizationArgs
-        """
-        from sparsetensors.quantization.observers.base import Observer
-
-        return Observer.load_from_registry(self.observer, quantization_args=self)
diff --git a/src/sparsetensors/quantization/quant_config.py b/src/sparsetensors/quantization/quant_config.py
deleted file mode 100644
index 2a2b345f..00000000
--- a/src/sparsetensors/quantization/quant_config.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
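A sketch of building `QuantizationArgs` and resolving its observer through the registry; the values shown are the documented defaults:

```python
from sparsetensors.quantization.quant_args import (
    QuantizationArgs,
    QuantizationStrategy,
    QuantizationType,
)

args = QuantizationArgs(
    num_bits=8,
    type=QuantizationType.INT,
    symmetric=True,
    strategy=QuantizationStrategy.TENSOR,
)
observer = args.get_observer()  # resolves the default "minmax" registry entry
```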
-
-from enum import Enum
-from typing import Dict, List, Optional
-
-from pydantic import BaseModel, Field
-from sparsetensors.quantization.quant_scheme import QuantizationScheme
-from sparsetensors.quantization.utils import (
-    calculate_compression_ratio,
-    is_module_quantized,
-    iter_named_leaf_modules,
-    module_type,
-)
-from torch.nn import Module
-
-
-__all__ = [
-    "QuantizationStatus",
-    "QuantizationConfig",
-    "LIFECYCLE_ORDER",
-]
-
-
-class QuantizationStatus(str, Enum):
-    """
-    Enum storing the different states a quantized layer can be in
-
-    Initialized: scale, zero points and observers have been attached to the layer but
-        are set to dummy values (not yet calibrated)
-    Calibration: scale and zero points have been calibrated through OBCQ or similar
-        algorithm, observers are still attached
-    Frozen: scale and zero points are finalized, observers have been deleted, weights
-        are still in their original precision
-    Compressed: weights have been converted to their target type or compressed to
-        their closest approximation
-    """
-
-    INITIALIZED = "initialized"
-    CALIBRATION = "calibration"
-    FROZEN = "frozen"
-    COMPRESSED = "compressed"
-
-    @classmethod
-    def lifecycle_order(cls) -> List["QuantizationStatus"]:
-        """
-        :return: list of correct quantization lifecycle order
-        """
-        return LIFECYCLE_ORDER
-
-    def __ge__(self, other):
-        if not isinstance(other, self.__class__):
-            raise NotImplementedError
-        return LIFECYCLE_ORDER.index(self) >= LIFECYCLE_ORDER.index(other)
-
-
-LIFECYCLE_ORDER = [
-    QuantizationStatus.INITIALIZED,
-    QuantizationStatus.CALIBRATION,
-    QuantizationStatus.FROZEN,
-    QuantizationStatus.COMPRESSED,
-]
-
-
-class QuantizationConfig(BaseModel):
-    """
-    Full configuration specifying how a model is quantized. Each quantized layer is
-    mapped to a QuantizationScheme in config_groups.
-
-    :param config_groups: dict of QuantizationSchemes specifying the quantization
-        settings for each quantized layer
-    :param quant_method: a constant used to differentiate sparseML quantization from
-        other quantization configs
-    :param format: specifies how the quantized model is stored on disk
-    :param quantization_status: specifies the current status of all quantized layers.
-        It is assumed all layers are in the same state.
-    :param global_compression_ratio: optional informational config to report the model
-        compression ratio achieved by the quantization config
-    :param ignore: optional list of layers to ignore from config_groups. Layers in
-        this list are not quantized even if they match up with a target in
-        config_groups
-    """
-
-    config_groups: Dict[str, QuantizationScheme]
-    quant_method: str = "sparseml"
-    format: str = "fakequant"
-    quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
-    global_compression_ratio: Optional[float] = None
-    ignore: Optional[List[str]] = Field(default_factory=list)
-
-    @staticmethod
-    def from_pretrained(model: Module) -> "QuantizationConfig":
-        """
-        Converts a model into its associated QuantizationConfig based on the
-        QuantizationScheme attached to each quantized module
-
-        :param model: model to calculate quantization scheme of
-        :return: filled out QuantizationScheme for the input model
-        """
-        quant_scheme_to_layers = []
-        quantization_status = None
-        ignore = {}
-        quantization_type_names = set()
-        for name, submodule in iter_named_leaf_modules(model):
-            layer_type = module_type(submodule)
-            if not is_module_quantized(submodule):
-                if layer_type not in ignore:
-                    ignore[layer_type] = []
-                ignore[layer_type].append(name)
-            else:
-                quantization_status = submodule.quantization_status
-                scheme = submodule.quantization_scheme
-                quantization_type_names.add(layer_type)
-
-                match_found = False
-                for existing_scheme in quant_scheme_to_layers:
-                    if scheme == existing_scheme:
-                        match_found = True
-                        break
-                if not match_found:
-                    quant_scheme_to_layers.append(scheme)
-
-        # clean up ignore list, we can leave out layer types if none of the
-        # instances are quantized
-        consolidated_ignore = []
-        for layer_type, ignore_names in ignore.items():
-            if layer_type in quantization_type_names:
-                # specific layers of a quantized type are ignored
-                consolidated_ignore += ignore_names
-            # else we leave it off the ignore list, doesn't fall under any of the
-            # existing quantization schemes so it won't be quantized
-
-        config_groups = {}
-        for idx, scheme in enumerate(quant_scheme_to_layers):
-            group_name = "group_" + str(idx)
-            config_groups[group_name] = scheme
-
-        compression_ratio = calculate_compression_ratio(model)
-        return QuantizationConfig(
-            config_groups=config_groups,
-            quantization_status=quantization_status,
-            global_compression_ratio=compression_ratio,
-            ignore=consolidated_ignore,
-        )
diff --git a/src/sparsetensors/quantization/quant_scheme.py b/src/sparsetensors/quantization/quant_scheme.py
deleted file mode 100644
index 7077c24e..00000000
--- a/src/sparsetensors/quantization/quant_scheme.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
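A sketch of the config API above: statuses compare by their position in LIFECYCLE_ORDER, and a config can be built directly from schemes (the ignored layer name is illustrative):

```python
from sparsetensors.quantization.quant_args import QuantizationArgs
from sparsetensors.quantization.quant_config import (
    QuantizationConfig,
    QuantizationStatus,
)
from sparsetensors.quantization.quant_scheme import QuantizationScheme

assert QuantizationStatus.FROZEN >= QuantizationStatus.CALIBRATION

config = QuantizationConfig(
    config_groups={
        "group_0": QuantizationScheme(
            targets=["Linear"],
            weights=QuantizationArgs(num_bits=8, symmetric=True),
        )
    },
    ignore=["lm_head"],  # illustrative layer name
)
```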
- -from typing import List, Optional - -from pydantic import BaseModel -from sparsetensors.quantization.quant_args import QuantizationArgs - - -__all__ = ["QuantizationScheme"] - - -class QuantizationScheme(BaseModel): - """ - Set of QuantizationArgs defining how the weights, inputs and outputs of target list - of modules should be quantized - - :param targets: list of modules to apply the QuantizationArgs to, can be layer - names, layer types or a regular expression - :param weights: quantization config for layer weights - :param input_activations: quantization config for layer inputs - :param output_activations: quantization config for layer outputs - """ - - targets: List[str] - weights: Optional[QuantizationArgs] = None - input_activations: Optional[QuantizationArgs] = None - output_activations: Optional[QuantizationArgs] = None diff --git a/src/sparsetensors/quantization/utils/__init__.py b/src/sparsetensors/quantization/utils/__init__.py deleted file mode 100644 index a91f9e5d..00000000 --- a/src/sparsetensors/quantization/utils/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -from .helpers import * diff --git a/src/sparsetensors/quantization/utils/helpers.py b/src/sparsetensors/quantization/utils/helpers.py deleted file mode 100644 index 3c00cdbe..00000000 --- a/src/sparsetensors/quantization/utils/helpers.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-from typing import Generator, Tuple
-
-import torch
-from torch.nn import Module
-from tqdm import tqdm
-
-
-__all__ = [
-    "is_module_quantized",
-    "is_model_quantized",
-    "iter_named_leaf_modules",
-    "module_type",
-    "calculate_compression_ratio",
-]
-
-
-def is_module_quantized(module: Module) -> bool:
-    """
-    Check if a module is quantized, based on the existence of a non-empty quantization
-    scheme
-
-    :param module: pytorch module to check
-    :return: True if module is quantized, False otherwise
-    """
-    if not hasattr(module, "quantization_scheme"):
-        return False
-
-    if module.quantization_scheme.weights is not None:
-        return True
-
-    if module.quantization_scheme.input_activations is not None:
-        return True
-
-    if module.quantization_scheme.output_activations is not None:
-        return True
-
-    return False
-
-
-def is_model_quantized(model: Module) -> bool:
-    """
-    Check if any modules in a model are quantized, based on the existence of a
-    non-empty quantization scheme in at least one module
-
-    :param model: pytorch model
-    :return: True if model is quantized, False otherwise
-    """
-
-    for _, submodule in iter_named_leaf_modules(model):
-        if is_module_quantized(submodule):
-            return True
-
-    return False
-
-
-def module_type(module: Module) -> str:
-    """
-    Gets a string representation of a module type
-
-    :param module: pytorch module to get type of
-    :return: module type as a string
-    """
-    return type(module).__name__
-
-
-def iter_named_leaf_modules(
-    model: Module,
-) -> Generator[Tuple[str, Module], None, None]:
-    # yields modules that do not have any submodules
-    # TODO: potentially expand to add list of allowed submodules such as observers
-    for name, submodule in model.named_modules():
-        if len(list(submodule.children())) == 0:
-            yield name, submodule
-
-
-def calculate_compression_ratio(model: Module) -> float:
-    """
-    Calculates the quantization compression ratio of a pytorch model, based on the
-    number of bits needed to represent the total weights in compressed form. Does not
-    take into account activation quantizations.
-
-    :param model: pytorch module to calculate compression ratio for
-    :return: compression ratio of the whole model
-    """
-    total_compressed = 0.0
-    total_uncompressed = 0.0
-    for name, submodule in tqdm(
-        iter_named_leaf_modules(model),
-        desc="Calculating quantization compression ratio",
-    ):
-        # count each leaf module's own parameters, not the full model's
-        for parameter in submodule.parameters():
-            try:
-                uncompressed_bits = torch.finfo(parameter.dtype).bits
-            except TypeError:
-                uncompressed_bits = torch.iinfo(parameter.dtype).bits
-            compressed_bits = uncompressed_bits
-            if is_module_quantized(submodule):
-                if submodule.quantization_scheme.weights is not None:
-                    compressed_bits = submodule.quantization_scheme.weights.num_bits
-            num_weights = parameter.numel()
-            total_compressed += compressed_bits * num_weights
-            total_uncompressed += uncompressed_bits * num_weights
-
-    return total_uncompressed / total_compressed
diff --git a/src/sparsetensors/registry/__init__.py b/src/sparsetensors/registry/__init__.py
deleted file mode 100644
index 241d9d55..00000000
--- a/src/sparsetensors/registry/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# flake8: noqa
-
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .registry import *
diff --git a/src/sparsetensors/registry/registry.py b/src/sparsetensors/registry/registry.py
deleted file mode 100644
index d8d8bc6d..00000000
--- a/src/sparsetensors/registry/registry.py
+++ /dev/null
@@ -1,360 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Universal registry to support registration and loading of child classes and plugins
-of neuralmagic utilities
-"""
-
-import importlib.util
-from collections import defaultdict
-from typing import Any, Dict, List, Optional, Type, Union
-
-
-__all__ = [
-    "RegistryMixin",
-    "register",
-    "get_from_registry",
-    "registered_names",
-    "registered_aliases",
-    "standardize_lookup_name",
-]
-
-
-_ALIAS_REGISTRY: Dict[Type, Dict[str, str]] = defaultdict(dict)
-_REGISTRY: Dict[Type, Dict[str, Any]] = defaultdict(dict)
-
-
-def standardize_lookup_name(name: str) -> str:
-    """
-    Standardize the given name for lookup in the registry.
-    This will replace all underscores and spaces with hyphens and
-    convert the name to lowercase.
-
-    example:
-    ```
-    standardize_lookup_name("Foo_bar baz") == "foo-bar-baz"
-    ```
-
-    :param name: name to standardize
-    :return: standardized name
-    """
-    return name.replace("_", "-").replace(" ", "-").lower()
-
-
-def standardize_alias_name(
-    name: Union[None, str, List[str]]
-) -> Union[None, str, List[str]]:
-    if name is None:
-        return None
-    elif isinstance(name, str):
-        return standardize_lookup_name(name)
-    else:  # isinstance(name, list)
-        return [standardize_lookup_name(n) for n in name]
-
-
-class RegistryMixin:
-    """
-    Universal registry to support registration and loading of child classes and plugins
-    of neuralmagic utilities.
-
-    Classes that require a registry or plugins may add the `RegistryMixin` and use
-    `register` and `load_from_registry` as the main entrypoints for adding new
-    implementations and loading requested values from its registry.
-
-    If a class should only have its child classes in its registry, the class should
-    set the static attribute `registry_requires_subclass` to True
-
-    example
-    ```python
-    class Dataset(RegistryMixin):
-        pass
-
-
-    # register with default name
-    @Dataset.register()
-    class ImageNetDataset(Dataset):
-        pass
-
-    # load as "ImageNetDataset"
-    imagenet = Dataset.load_from_registry("ImageNetDataset")
-
-    # register with custom name
-    @Dataset.register(name="cifar-dataset")
-    class Cifar(Dataset):
-        pass
-
-    Note: the name will be standardized for lookup in the registry. 
-    For example, if a class is registered as "cifar_dataset" or
-    "cifar dataset", it will be stored as "cifar-dataset". The user
-    will be able to load the class with any of the three name variants.
-
-    # register with multiple aliases
-    @Dataset.register(alias=["cifar-10-dataset", "cifar_100_dataset"])
-    class Cifar(Dataset):
-        pass
-
-    # load as "cifar-dataset"
-    cifar = Dataset.load_from_registry("cifar-dataset")
-
-    # load from custom file that implements a dataset
-    mnist = Dataset.load_from_registry("/path/to/mnist_dataset.py:MnistDataset")
-    ```
-    """
-
-    # set to True in child class to add check that registered/retrieved values
-    # implement the class it is registered to
-    registry_requires_subclass: bool = False
-
-    @classmethod
-    def register(
-        cls, name: Optional[str] = None, alias: Union[List[str], str, None] = None
-    ):
-        """
-        Decorator for registering a value (ie class or function) wrapped by this
-        decorator to the base class (class that .register is called from)
-
-        :param name: name or list of names to register the wrapped value as,
-            defaults to value.__name__
-        :param alias: alias or list of aliases to register the wrapped value as,
-            defaults to None
-        :return: register decorator
-        """
-
-        def decorator(value: Any):
-            cls.register_value(value, name=name, alias=alias)
-            return value
-
-        return decorator
-
-    @classmethod
-    def register_value(
-        cls, value: Any, name: Optional[str], alias: Union[str, List[str], None] = None
-    ):
-        """
-        Registers the given value to the class `.register_value` is called from
-
-        :param value: value to register
-        :param name: name to register the wrapped value as,
-            defaults to value.__name__
-        :param alias: alias or list of aliases to register the wrapped value as,
-            defaults to None
-        """
-        register(
-            parent_class=cls,
-            value=value,
-            name=name,
-            alias=alias,
-            require_subclass=cls.registry_requires_subclass,
-        )
-
-    @classmethod
-    def load_from_registry(cls, name: str, **constructor_kwargs) -> object:
-        """
-        :param name: name of registered class to load
-        :param constructor_kwargs: arguments to pass to the constructor retrieved
-            from the registry
-        :return: loaded object registered to this class under the given name,
-            constructed with the given kwargs. 
Raises error if the name is
-            not found in the registry
-        """
-        constructor = cls.get_value_from_registry(name=name)
-        return constructor(**constructor_kwargs)
-
-    @classmethod
-    def get_value_from_registry(cls, name: str):
-        """
-        :param name: name to retrieve from the registry
-        :return: value retrieved from the registry for the given name, raises
-            error if not found
-        """
-        return get_from_registry(
-            parent_class=cls,
-            name=name,
-            require_subclass=cls.registry_requires_subclass,
-        )
-
-    @classmethod
-    def registered_names(cls) -> List[str]:
-        """
-        :return: list of all names registered to this class
-        """
-        return registered_names(cls)
-
-    @classmethod
-    def registered_aliases(cls) -> List[str]:
-        """
-        :return: list of all aliases registered to this class
-        """
-        return registered_aliases(cls)
-
-
-def register(
-    parent_class: Type,
-    value: Any,
-    name: Optional[str] = None,
-    alias: Union[List[str], str, None] = None,
-    require_subclass: bool = False,
-):
-    """
-    :param parent_class: class to register the name under
-    :param value: the value to register
-    :param name: name to register the wrapped value as, defaults to value.__name__
-    :param alias: alias or list of aliases to register the wrapped value as,
-        defaults to None
-    :param require_subclass: require that value is a subclass of the class this
-        method is called from
-    """
-    if name is None:
-        # default name
-        name = value.__name__
-
-    name = standardize_lookup_name(name)
-    alias = standardize_alias_name(alias)
-    register_alias(name=name, alias=alias, parent_class=parent_class)
-
-    if require_subclass:
-        _validate_subclass(parent_class, value)
-
-    if name in _REGISTRY[parent_class]:
-        # name already exists - raise error if two different values are attempting
-        # to share the same name
-        registered_value = _REGISTRY[parent_class][name]
-        if registered_value is not value:
-            raise RuntimeError(
-                f"Attempting to register name {name} as {value} "
-                f"however {name} has already been registered as {registered_value}"
-            )
-    else:
-        _REGISTRY[parent_class][name] = value
-
-
-def get_from_registry(
-    parent_class: Type, name: str, require_subclass: bool = False
-) -> Any:
-    """
-    :param parent_class: class that the name is registered under
-    :param name: name to retrieve from the registry of the class
-    :param require_subclass: require that value is a subclass of the class this
-        method is called from
-    :return: value retrieved from the registry for the given name, raises
-        error if not found
-    """
-    if ":" in name:
-        # user specifying specific module to load and value to import; do not
-        # standardize the path or the attribute name
-        module_path, value_name = name.split(":")
-        retrieved_value = _import_and_get_value_from_module(module_path, value_name)
-    else:
-        # standardize the name, then look it up in the alias registry, falling
-        # back to the name itself so errors report what the caller asked for
-        name = standardize_lookup_name(name)
-        name = _ALIAS_REGISTRY[parent_class].get(name, name)
-        # look up name in registry
-        retrieved_value = _REGISTRY[parent_class].get(name)
-        if retrieved_value is None:
-            raise KeyError(
-                f"Unable to find {name} registered under type {parent_class}.\n"
-                f"Registered values for {parent_class}: "
-                f"{registered_names(parent_class)}\n"
-                f"Registered aliases for {parent_class}: "
-                f"{registered_aliases(parent_class)}"
-            )
-
-    if require_subclass:
-        _validate_subclass(parent_class, retrieved_value)
-
-    return retrieved_value
-
-
-def registered_names(parent_class: Type) -> List[str]:
-    """
-    :param parent_class: class to look up the registry of
-    :return: all names registered to the given class
-    """
-    return list(_REGISTRY[parent_class].keys())
-
-
-def 
registered_aliases(parent_class: Type) -> List[str]:
-    """
-    :param parent_class: class to look up the registry of
-    :return: all aliases registered to the given class
-    """
-    registered_aliases_plus_names = list(_ALIAS_REGISTRY[parent_class].keys())
-    registered_aliases = list(
-        set(registered_aliases_plus_names) - set(registered_names(parent_class))
-    )
-    return registered_aliases
-
-
-def register_alias(
-    name: str, parent_class: Type, alias: Union[str, List[str], None] = None
-):
-    """
-    Updates the mapping from the alias(es) to the given name.
-    If the alias is None, the name is used as the alias.
-
-    :param name: name that the alias refers to
-    :param parent_class: class that the name is registered under
-    :param alias: single alias or list of aliases that
-        refer to the name, defaults to None
-    """
-    if alias is not None:
-        alias = alias if isinstance(alias, list) else [alias]
-    else:
-        alias = []
-
-    if name in alias:
-        raise KeyError(
-            f"Attempting to register alias {name}, "
-            f"that is identical to the standardized name: {name}."
-        )
-    alias.append(name)
-
-    for alias_name in alias:
-        if alias_name in _ALIAS_REGISTRY[parent_class]:
-            raise KeyError(
-                f"Attempting to register alias {alias_name} as {name} "
-                f"however {alias_name} has already been registered as "
-                f"{_ALIAS_REGISTRY[parent_class][alias_name]}"
-            )
-        _ALIAS_REGISTRY[parent_class][alias_name] = name
-
-
-def _import_and_get_value_from_module(module_path: str, value_name: str) -> Any:
-    # import the given module path and try to get the value_name if it is included
-    # in the module
-
-    # load module
-    spec = importlib.util.spec_from_file_location(
-        f"plugin_module_for_{value_name}", module_path
-    )
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
-
-    # get value from module
-    value = getattr(module, value_name, None)
-
-    if not value:
-        raise RuntimeError(
-            f"Unable to find attribute {value_name} in module {module_path}"
-        )
-    return value
-
-
-def _validate_subclass(parent_class: Type, child_class: Type):
-    if not issubclass(child_class, parent_class):
-        raise ValueError(
-            f"class {child_class} is not a subclass of the class it is "
-            f"registered for: {parent_class}."
-        )
diff --git a/src/sparsetensors/utils/__init__.py b/src/sparsetensors/utils/__init__.py
deleted file mode 100644
index e9e78d44..00000000
--- a/src/sparsetensors/utils/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# flake8: noqa
-
-from .helpers import *
-from .safetensors_load import *
diff --git a/src/sparsetensors/utils/helpers.py b/src/sparsetensors/utils/helpers.py
deleted file mode 100644
index c584c2ee..00000000
--- a/src/sparsetensors/utils/helpers.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Optional - -from sparsetensors.base import SPARSITY_CONFIG_NAME -from sparsetensors.compressors import ModelCompressor -from sparsetensors.config import CompressionConfig -from transformers import AutoConfig - - -__all__ = ["infer_compressor_from_model_config"] - - -def infer_compressor_from_model_config( - pretrained_model_name_or_path: str, -) -> Optional[ModelCompressor]: - """ - Given a path to a model config, extract a sparsity config if it exists and return - the associated ModelCompressor - - :param pretrained_model_name_or_path: path to model config on disk or HF hub - :return: matching compressor if config contains a sparsity config - """ - config = AutoConfig.from_pretrained(pretrained_model_name_or_path) - sparsity_config = getattr(config, SPARSITY_CONFIG_NAME, None) - if sparsity_config is None: - return None - - format = sparsity_config.get("format") - sparsity_config = CompressionConfig.load_from_registry(format, **sparsity_config) - compressor = ModelCompressor.load_from_registry(format, config=sparsity_config) - return compressor diff --git a/src/sparsetensors/utils/safetensors_load.py b/src/sparsetensors/utils/safetensors_load.py deleted file mode 100644 index 4d71482a..00000000 --- a/src/sparsetensors/utils/safetensors_load.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
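A usage sketch of `infer_compressor_from_model_config` above; the model path is illustrative:

```python
from sparsetensors.utils.helpers import infer_compressor_from_model_config

# "./compressed-model" is an illustrative local path or HF stub
compressor = infer_compressor_from_model_config("./compressed-model")
if compressor is None:
    print("config carries no sparsity_config; model is stored dense")
```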
-
-import json
-import os
-import re
-import struct
-from typing import Dict, List, Optional
-
-from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, cached_file
-
-
-__all__ = [
-    "get_safetensors_folder",
-    "get_safetensors_header",
-    "match_param_name",
-    "merge_names",
-    "get_weight_mappings",
-    "get_nested_weight_mappings",
-]
-
-
-def get_safetensors_folder(
-    pretrained_model_name_or_path: str, cache_dir: Optional[str] = None
-) -> str:
-    """
-    Given a Hugging Face stub or a local path, return the folder containing the
-    safetensors weight files
-
-    :param pretrained_model_name_or_path: local path to model or HF stub
-    :param cache_dir: optional cache dir to search through, if none is specified the
-        model will be searched for in the default TRANSFORMERS_CACHE
-    :return: local folder containing model data
-    """
-    if os.path.exists(pretrained_model_name_or_path):
-        # argument is a path to a local folder
-        return pretrained_model_name_or_path
-
-    safetensors_path = cached_file(
-        pretrained_model_name_or_path,
-        SAFE_WEIGHTS_NAME,
-        cache_dir=cache_dir,
-        _raise_exceptions_for_missing_entries=False,
-    )
-    index_path = cached_file(
-        pretrained_model_name_or_path,
-        SAFE_WEIGHTS_INDEX_NAME,
-        cache_dir=cache_dir,
-        _raise_exceptions_for_missing_entries=False,
-    )
-    if safetensors_path is not None:
-        # found a single cached safetensors file
-        return os.path.split(safetensors_path)[0]
-    if index_path is not None:
-        # found a cached safetensors weight index file
-        return os.path.split(index_path)[0]
-
-    # model weights could not be found locally or cached from HF Hub
-    raise ValueError(
-        "Could not locate safetensors weight or index file from "
-        f"{pretrained_model_name_or_path}."
-    )
-
-
-def get_safetensors_header(safetensors_path: str) -> Dict[str, str]:
-    """
-    Extracts the metadata from a safetensors file as JSON
-
-    :param safetensors_path: path to a safetensors file
-    :return: dictionary of metadata extracted from the safetensors file
-    """
-    # the first 8 bytes of a safetensors file hold the little-endian length of
-    # the JSON header, followed by the header itself
-    with open(safetensors_path, "rb") as f:
-        length_of_header = struct.unpack("<Q", f.read(8))[0]
-        header_data = f.read(length_of_header)
-        header = json.loads(header_data)
-    return header
-
-
-def match_param_name(full_name: str, param_name: str) -> Optional[str]:
-    """
-    Helper function extracting the uncompressed parameterized layer name from a
-    compressed name. Assumes the compressed name was merged using merge_names.
-
-    :param full_name: full name of parameter in compressed model
-    :param param_name: compression parameter name
-    :return: uncompressed name of the parameterized layer, or None if no match
-    """
-    pattern = r"^(.*)\." + param_name + r"$"
-    regex = re.findall(pattern, full_name)
-    if len(regex) == 0:
-        return None
-    return regex[0]
-
-
-def merge_names(parent_name: str, child_name: str) -> str:
-    """
-    Helper function for merging an uncompressed parameterized layer name with a
-    compression parameter. Names merged with this function can then be parsed by
-    match_param_name.
-
-    :param parent_name: uncompressed parameterized layer name
-    :param child_name: compression parameter name
-    :return: merged compressed name
-    """
-    return parent_name + "." + child_name
-
-
-def get_weight_mappings(model_path: str) -> Dict[str, str]:
-    """
-    Takes a path to a state dict saved in safetensors format and returns a mapping
-    from parameterized layer name to file location. 
-
-    {
-        layer.weight.bitmask: file_location,
-        layer.weight.row_offsets: file_location,
-        layer.weight.shape: file_location,
-        layer.weight.compressed: file_location
-    }
-
-    This generalizes to cases where the model is split into multiple safetensors files
-
-    :param model_path: path to safetensors state dict, must contain either a single
-        safetensors file or multiple files with an index
-    :return: mapping of parameterized layer name to file location
-    """
-    safetensors_path = os.path.join(model_path, SAFE_WEIGHTS_NAME)
-    index_path = os.path.join(model_path, SAFE_WEIGHTS_INDEX_NAME)
-    if os.path.exists(safetensors_path):
-        # we have a single safetensors file to read
-        header = get_safetensors_header(safetensors_path)
-        for key in header.keys():
-            header[key] = SAFE_WEIGHTS_NAME
-        header.pop("__metadata__", None)
-    elif os.path.exists(index_path):
-        # we have multiple safetensors files, read from index
-        with open(index_path, "r", encoding="utf-8") as f:
-            index = json.load(f)
-        header = index["weight_map"]
-    else:
-        raise ValueError(
-            f"Could not find a safetensors weight or index file at {model_path}"
-        )
-
-    # convert weight locations to full paths
-    for key, value in header.items():
-        header[key] = os.path.join(model_path, value)
-
-    return header
-
-
-def get_nested_weight_mappings(
-    model_path: str, params_to_nest: List[str]
-) -> Dict[str, Dict[str, str]]:
-    """
-    Takes a path to a state dict saved in safetensors format and returns a nested
-    mapping from uncompressed parameterized layer names to the file locations of
-    each of the layer's compression parameters.
-
-    layer.weight: {
-        bitmask: file_location,
-        row_offsets: file_location,
-        shape: file_location,
-        compressed: file_location
-    }
-
-    This generalizes to cases where the model is split into multiple safetensors files
-
-    :param model_path: path to safetensors state dict, must contain either a single
-        safetensors file or multiple files with an index
-    :param params_to_nest: list of compression parameter names to nest under each
-        uncompressed layer name
-    :return: nested mapping of parameterized layer name to file location
-    """
-    weight_mappings = get_weight_mappings(model_path)
-
-    nested_weight_mappings = {}
-    for key in weight_mappings.keys():
-        for param_name in params_to_nest:
-            maybe_match = match_param_name(key, param_name)
-            if maybe_match is not None:
-                dense_param = maybe_match
-                if dense_param not in nested_weight_mappings:
-                    nested_weight_mappings[dense_param] = {}
-                nested_weight_mappings[dense_param][param_name] = weight_mappings[key]
-
-    return nested_weight_mappings
diff --git a/tests/quantization/__init__.py b/tests/quantization/__init__.py
deleted file mode 100644
index 0c44f887..00000000
--- a/tests/quantization/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
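A sketch of the safetensors helpers above: `merge_names` and `match_param_name` invert each other, and the mapping functions resolve where each compression parameter is stored (the layer name and local path are illustrative):

```python
from sparsetensors.utils.safetensors_load import (
    get_nested_weight_mappings,
    get_weight_mappings,
    match_param_name,
    merge_names,
)

full_name = merge_names("model.layers.0.self_attn.q_proj.weight", "bitmask")
assert match_param_name(full_name, "bitmask") == (
    "model.layers.0.self_attn.q_proj.weight"
)

flat = get_weight_mappings("./model")  # {param name: safetensors file path}
nested = get_nested_weight_mappings(  # {layer name: {compression param: path}}
    "./model", params_to_nest=["bitmask", "row_offsets", "shape", "compressed"]
)
```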
diff --git a/tests/quantization/lifecycle/__init__.py b/tests/quantization/lifecycle/__init__.py deleted file mode 100644 index 0c44f887..00000000 --- a/tests/quantization/lifecycle/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/quantization/lifecycle/test_apply.py b/tests/quantization/lifecycle/test_apply.py deleted file mode 100644 index eeb29a41..00000000 --- a/tests/quantization/lifecycle/test_apply.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from sparsetensors.quantization.lifecycle import apply_quantization_config -from sparsetensors.quantization.quant_config import ( - QuantizationConfig, - QuantizationStatus, -) -from transformers import AutoModelForCausalLM - - -def test_apply_quantization_config_tinyllama(): - quant_config = get_sample_tinyllama_quant_config() - model = get_tinyllama_model() - - # check that model is not already quantized - for module in model.modules(): - _test_layer_quantization_status(module, inputs=False, weights=False) - - # apply quant config to model - apply_quantization_config(model, quant_config) - - # check for correct application of quant config - num_linears = 0 - num_embeddings = 0 - num_rotary_embeddings = 0 - for name, module in model.named_modules(): - if name in quant_config.ignore: - continue - module_type = module.__class__.__name__ - if module_type == "Linear": - num_linears += 1 - _test_layer_quantization_status(module, inputs=True, weights=True) - elif module_type == "Embedding": - num_embeddings += 1 - _test_layer_quantization_status(module, inputs=False, weights=True) - elif module_type == "LlamaRotaryEmbedding": - num_rotary_embeddings += 1 - _test_layer_quantization_status(module, inputs=False, weights=False) - - # sanity check correct number of layers targeted - assert num_linears == 154 # 155 Linear layers - 1 that gets ignored - assert num_embeddings == 1 - assert num_rotary_embeddings == 22 - - -def test_serialize_config_tinyllama(): - quant_config = get_sample_tinyllama_quant_config() - model = get_tinyllama_model() - - # check that model is not already quantized - for module in model.modules(): - _test_layer_quantization_status(module, inputs=False, weights=False) - - # apply quant config to model - apply_quantization_config(model, quant_config) - - serialized_config = 
QuantizationConfig.from_pretrained(model) - assert len(serialized_config.config_groups) == 2 - assert serialized_config.config_groups["group_0"].targets == ["Embedding"] - assert serialized_config.config_groups["group_0"].input_activations is None - assert serialized_config.config_groups["group_1"].targets == ["Linear"] - assert serialized_config.config_groups["group_1"].input_activations is not None - assert serialized_config.quantization_status == QuantizationStatus.FROZEN - assert serialized_config.format == "fakequant" - assert serialized_config.quant_method == "sparseml" - assert serialized_config.ignore == ["model.layers.1.mlp.down_proj"] - assert serialized_config.global_compression_ratio > 1.0 - assert serialized_config.global_compression_ratio < 8.0 - - -def _test_layer_quantization_status(module, inputs: bool, weights: bool): - # check if quantization is applied at all (true if inputs or weights targeted) - quantized = inputs or weights - assert hasattr(module, "quantization_scheme") == quantized - assert hasattr(module, "quantization_status") == quantized - - # check inputs matches expected - assert hasattr(module, "input_scale") == inputs - assert hasattr(module, "input_zero_point") == inputs - - # check weights matches expected - assert hasattr(module, "weight_scale") == weights - assert hasattr(module, "weight_zero_point") == weights - - -def get_tinyllama_model(): - return AutoModelForCausalLM.from_pretrained( - "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" - ) - - -def get_sample_tinyllama_quant_config(): - config_dict = { - "quant_method": "sparseml", - "format": "fakequant", - "quantization_status": "frozen", - "global_compression_ratio": None, - "config_groups": { - "group_1": { - "weights": { - "num_bits": 8, - "type": "int", - "symmetric": True, - "strategy": "tensor", - }, - "input_activations": { - "num_bits": 8, - "type": "int", - "symmetric": True, - "strategy": "tensor", - }, - "targets": ["Linear"], - }, - "group_2": { - "weights": { - "num_bits": 8, - "type": "int", - "symmetric": False, - "strategy": "tensor", - }, - "input_activations": None, - "targets": ["Embedding"], - }, - }, - "ignore": ["LlamaRotaryEmbedding", "model.layers.1.mlp.down_proj"], - } - return QuantizationConfig.parse_obj(config_dict) diff --git a/tests/quantization/test_quant_args.py b/tests/quantization/test_quant_args.py deleted file mode 100644 index c407eae5..00000000 --- a/tests/quantization/test_quant_args.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -from pydantic import ValidationError -from sparsetensors.quantization import ( - QuantizationArgs, - QuantizationStrategy, - QuantizationType, -) - - -def test_defaults(): - default = QuantizationArgs() - - assert default.num_bits == 8 - assert default.type == QuantizationType.INT - assert default.symmetric - assert default.strategy == QuantizationStrategy.TENSOR - assert default.group_size is None - assert default.block_structure is None - - -def test_group(): - kwargs = {"strategy": "group", "group_size": 128} - - group = QuantizationArgs(**kwargs) - assert group.strategy == QuantizationStrategy.GROUP - assert group.group_size == kwargs["group_size"] - - -def test_block(): - kwargs = {"strategy": "block", "block_structure": "2x4"} - - block = QuantizationArgs(**kwargs) - assert block.strategy == QuantizationStrategy.BLOCK - assert block.block_structure == kwargs["block_structure"] - - -def test_invalid(): - with pytest.raises(ValidationError): - _ = QuantizationArgs(type="invalid") - with pytest.raises(ValidationError): - _ = QuantizationArgs(strategy="invalid") diff --git a/tests/quantization/test_quant_config.py b/tests/quantization/test_quant_config.py deleted file mode 100644 index 92b68ab7..00000000 --- a/tests/quantization/test_quant_config.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import pytest -from pydantic import ValidationError -from sparsetensors.quantization import ( - QuantizationConfig, - QuantizationScheme, - QuantizationStatus, -) - - -def test_basic_config(): - config_groups = {"group_1": QuantizationScheme(targets=[])} - config = QuantizationConfig(config_groups=config_groups) - - assert config.config_groups == config_groups - assert config.quant_method == "sparseml" - assert config.format == "fakequant" - assert config.quantization_status == QuantizationStatus.INITIALIZED - assert config.global_compression_ratio is None - assert isinstance(config.ignore, list) and len(config.ignore) == 0 - - -def test_full_config(): - config_groups = { - "group_1": QuantizationScheme(targets=[]), - "group_2": QuantizationScheme(targets=[]), - } - global_compression_ratio = 3.5 - ignore = ["model.layers.0"] - quantization_status = "compressed" - - config = QuantizationConfig( - config_groups=config_groups, - global_compression_ratio=global_compression_ratio, - ignore=ignore, - quantization_status=quantization_status, - ) - assert config.config_groups == config_groups - assert config.global_compression_ratio == global_compression_ratio - assert config.ignore == ignore - assert config.quantization_status == QuantizationStatus.COMPRESSED - - -def test_need_config_groups(): - with pytest.raises(ValidationError): - _ = QuantizationScheme() diff --git a/tests/quantization/test_quant_scheme.py b/tests/quantization/test_quant_scheme.py deleted file mode 100644 index 63b135b5..00000000 --- a/tests/quantization/test_quant_scheme.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -from pydantic import ValidationError -from sparsetensors.quantization import QuantizationArgs, QuantizationScheme - - -def test_basic_scheme(): - targets = ["model.layer.0", "model.layer.3"] - weights = QuantizationArgs() - - scheme = QuantizationScheme(targets=targets, weights=weights) - assert scheme.targets == targets - assert scheme.weights == weights - assert scheme.input_activations is None - assert scheme.output_activations is None - - -def test_full_scheme(): - targets = ["Linear"] - weights = QuantizationArgs() - input_activations = QuantizationArgs(num_bits=4) - output_activations = QuantizationArgs(num_bits=8, type="float", symmetric=False) - - scheme = QuantizationScheme( - targets=targets, - weights=weights, - input_activations=input_activations, - output_activations=output_activations, - ) - assert scheme.targets == targets - assert scheme.weights == weights - assert scheme.input_activations == input_activations - assert scheme.output_activations == output_activations - - -def test_needs_targets(): - with pytest.raises(ValidationError): - _ = QuantizationScheme() diff --git a/tests/sparsetensors/quantization/lifecycle/conftest.py b/tests/sparsetensors/quantization/lifecycle/conftest.py deleted file mode 100644 index a8ad01b2..00000000 --- a/tests/sparsetensors/quantization/lifecycle/conftest.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional - -import pytest -from sparsetensors.quantization.quant_args import QuantizationArgs -from sparsetensors.quantization.quant_scheme import QuantizationScheme - - -@pytest.fixture -def create_quantization_scheme(): - def quantization_scheme( - targets: List[str], - weights: Optional[QuantizationArgs] = None, - input_activations: Optional[QuantizationArgs] = None, - output_activations: Optional[QuantizationArgs] = None, - ): - return QuantizationScheme( - targets=targets, - weights=weights, - input_activations=input_activations, - output_activations=output_activations, - ) - - return quantization_scheme diff --git a/tests/sparsetensors/quantization/lifecycle/test_forward.py b/tests/sparsetensors/quantization/lifecycle/test_forward.py deleted file mode 100644 index c2d27bd1..00000000 --- a/tests/sparsetensors/quantization/lifecycle/test_forward.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - - -import pytest -import torch -from sparsetensors.quantization.lifecycle.forward import ( - maybe_calibrate_or_quantize, - wrap_module_forward_quantized, -) -from sparsetensors.quantization.lifecycle.initialize import ( - initialize_module_for_quantization, -) -from sparsetensors.quantization.lifecycle.status import QuantizationStatus -from sparsetensors.quantization.quant_args import QuantizationArgs -from torch.nn import Linear - - -def test_wrap_module_forward_quantized(create_quantization_scheme): - num_bits = 8 - quantization_scheme = create_quantization_scheme( - targets=["*"], - weights=QuantizationArgs(num_bits=num_bits, symmetric=True), - input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), - ) - layer = Linear(4, 4) - - func_forward = layer.forward.__func__ - - # check that the forward call is overwritten - wrap_module_forward_quantized(layer, quantization_scheme) - - assert not func_forward == layer.forward.__func__ - - -@pytest.mark.parametrize( - "quantization_status", ["INITIALIZED", "CALIBRATION", "FROZEN"] -) -def test_maybe_calibrate_or_quantize(create_quantization_scheme, quantization_status): - num_bits = 8 - quantization_scheme = create_quantization_scheme( - targets=["*"], - weights=QuantizationArgs(num_bits=num_bits, symmetric=True), - input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), - ) - quantization_args = QuantizationArgs(num_bits=num_bits, symmetric=False) - layer = Linear(4, 4) - layer.weight.data *= 100 - - initialize_module_for_quantization(layer, quantization_scheme) - layer.quantization_status = QuantizationStatus(quantization_status) - - if layer.quantization_status == QuantizationStatus.INITIALIZED: - out = maybe_calibrate_or_quantize( - layer, layer.weight.data, "input", quantization_args - ) - assert torch.allclose(out, layer.weight.data) - elif layer.quantization_status == QuantizationStatus.CALIBRATION: - out = maybe_calibrate_or_quantize( - layer, layer.weight.data, "input", quantization_args - ) - assert not torch.allclose(out, layer.weight.data) - - elif layer.quantization_status == QuantizationStatus.FROZEN: - # scale and zero points are empty -- cannot quantize - with pytest.raises(ValueError): - out = maybe_calibrate_or_quantize( - layer, layer.weight.data, "input", quantization_args - ) diff --git a/tests/sparsetensors/quantization/lifecycle/test_frozen.py b/tests/sparsetensors/quantization/lifecycle/test_frozen.py deleted file mode 100644 index 0b5a18e8..00000000 --- a/tests/sparsetensors/quantization/lifecycle/test_frozen.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from sparsetensors.quantization.lifecycle.frozen import freeze_module_quantization -from sparsetensors.quantization.lifecycle.initialize import ( - initialize_module_for_quantization, -) -from sparsetensors.quantization.lifecycle.status import QuantizationStatus -from sparsetensors.quantization.quant_args import QuantizationArgs -from torch.nn import Linear - - -def test_set_module_for_calibration(create_quantization_scheme): - num_bits = 8 - quantization_scheme = create_quantization_scheme( - targets=["*"], - weights=QuantizationArgs(num_bits=num_bits, symmetric=True), - input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), - ) - - layer = Linear(4, 4) - - initialize_module_for_quantization(layer, quantization_scheme) - layer.quantization_status = QuantizationStatus("CALIBRATION") - - # should have both input and weight observer after initializing - assert hasattr(layer, "input_observer") - assert hasattr(layer, "weight_observer") - - # observers should get deleted after freezing - freeze_module_quantization(layer) - assert not hasattr(layer, "input_observer") - assert not hasattr(layer, "weight_observer") - - assert layer.quantization_status == QuantizationStatus("FROZEN") diff --git a/tests/sparsetensors/quantization/lifecycle/test_initialize.py b/tests/sparsetensors/quantization/lifecycle/test_initialize.py deleted file mode 100644 index b2f01c0f..00000000 --- a/tests/sparsetensors/quantization/lifecycle/test_initialize.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
- - -import pytest -from sparsetensors.quantization.lifecycle.initialize import ( - initialize_module_for_quantization, -) -from sparsetensors.quantization.quant_args import QuantizationArgs -from sparsetensors.quantization.quant_config import QuantizationStatus -from torch.nn import Linear - - -NUM_BITS = 8 - - -@pytest.mark.parametrize( - "weights,input_activations", - [ - ( - QuantizationArgs(num_bits=NUM_BITS, symmetric=True), - None, - ), - ( - None, - QuantizationArgs(num_bits=NUM_BITS, symmetric=True), - ), - ( - QuantizationArgs(num_bits=NUM_BITS, symmetric=True), - QuantizationArgs(num_bits=NUM_BITS, symmetric=True), - ), - ], -) -def test_initialize_module_for_quantization( - create_quantization_scheme, weights, input_activations -): - quantization_scheme = create_quantization_scheme( - targets=["*"], - weights=weights, - input_activations=input_activations, - ) - layer = Linear(4, 4) - - assert not hasattr(layer, "quantization_scheme") - assert not hasattr(layer, "quantization_status") - - # add attributes, zero_points and scale - initialize_module_for_quantization(layer, quantization_scheme) - - registered_params = {"weight", "bias"} - if weights is not None: - registered_params.add("weight_scale") - registered_params.add("weight_zero_point") - - if input_activations is not None: - registered_params.add("input_scale") - registered_params.add("input_zero_point") - - for key in layer.state_dict().keys(): - assert key in registered_params - registered_params.remove(key) - - assert len(registered_params) == 0 - - assert hasattr(layer, "quantization_scheme") - assert hasattr(layer, "quantization_status") - - assert layer.quantization_status == QuantizationStatus.INITIALIZED diff --git a/tests/sparsetensors/quantization/lifecycle/test_lifecycle.py b/tests/sparsetensors/quantization/lifecycle/test_lifecycle.py deleted file mode 100644 index 2884bde4..00000000 --- a/tests/sparsetensors/quantization/lifecycle/test_lifecycle.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from copy import deepcopy - -import torch -from sparsetensors.quantization.lifecycle.calibration import set_module_for_calibration -from sparsetensors.quantization.lifecycle.frozen import freeze_module_quantization -from sparsetensors.quantization.lifecycle.initialize import ( - initialize_module_for_quantization, -) -from sparsetensors.quantization.quant_args import QuantizationArgs -from sparsetensors.quantization.quant_config import QuantizationStatus -from torch.nn import Linear - - -def test_lifecycle(create_quantization_scheme): - num_bits = 8 - - quantization_scheme = create_quantization_scheme( - input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), - weights=QuantizationArgs(num_bits=num_bits, symmetric=True), - targets=["*"], - ) - - layer = Linear(4, 4) - layer.weight.data *= 100 - - # updated layer keys check - expected_layer_keys = {"weight", "bias"} - for key in layer.state_dict().keys(): - expected_layer_keys.remove(key) - assert len(expected_layer_keys) == 0 - - # overwrite forward pass and register zero_point and scale - initialize_module_for_quantization(layer, quantization_scheme) - expected_layer_keys = { - "input_scale", - "input_zero_point", - "weight_scale", - "weight_zero_point", - "weight", - "bias", - } - for key in layer.state_dict().keys(): - expected_layer_keys.remove(key) - assert len(expected_layer_keys) == 0 - - # should have both input and weight observer after initializing - assert hasattr(layer, "input_observer") - assert hasattr(layer, "weight_observer") - - assert hasattr(layer, "quantization_scheme") - assert hasattr(layer, "quantization_status") - assert layer.quantization_status == QuantizationStatus.INITIALIZED - - set_module_for_calibration(layer) - assert layer.quantization_status == QuantizationStatus.CALIBRATION - - # do a calibration step - assert torch.numel(layer.input_zero_point.data) == 0 - assert torch.numel(layer.input_scale) == 0 - assert torch.numel(layer.weight_scale) == 0 - assert torch.numel(layer.weight_zero_point) == 0 - - layer(torch.randn(4, 4)) - - # zero-points and scale should be updated after forward pass - assert torch.numel(layer.input_zero_point.data) > 0 - assert torch.numel(layer.input_scale) > 0 - assert torch.numel(layer.weight_scale) > 0 - assert torch.numel(layer.weight_zero_point) > 0 - - # symmetric zero points should center at 0 - assert layer.weight_zero_point.data == 0 - - # check high and low bound of the weights - assert torch.all(layer.weight.data >= -128) and torch.all(layer.weight.data <= 127) - - initialized_layer = deepcopy(layer) - - # calibrate the layers with each iteration - for _ in range(10): - layer(torch.randn(4, 4)) - - assert initialized_layer.input_zero_point != layer.input_zero_point - assert initialized_layer.input_scale != layer.input_scale - assert initialized_layer.weight_scale != layer.weight_scale - - # check quantization f_q(x) is applied after frozen without update - input_check_for_quant = torch.randn(4, 4) - out_calibration = layer(input_check_for_quant) - - layer_before_freeze = deepcopy(layer) - - # Freeze, no update after any forward pass - freeze_module_quantization(layer) - - for _ in range(10): - layer(torch.randn(4, 4)) - assert layer_before_freeze.input_zero_point == layer.input_zero_point - assert layer_before_freeze.input_scale == layer.input_scale - assert layer_before_freeze.weight_scale == layer.weight_scale - - # check that the same quantization is applied as calibration to frozen - assert torch.all(out_calibration == layer(input_check_for_quant)) diff 
--git a/tests/sparsetensors/quantization/observers/test_min_max.py b/tests/sparsetensors/quantization/observers/test_min_max.py deleted file mode 100644 index a5273d02..00000000 --- a/tests/sparsetensors/quantization/observers/test_min_max.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import pytest -import torch -from sparsetensors.quantization.quant_args import QuantizationArgs - - -@pytest.mark.parametrize( - "symmetric,expected_scale,expected_zero_point", - [ - (True, 0.0078, 0), - (False, 0.0039, 0), - ], -) -def test_min_max_observer(symmetric, expected_scale, expected_zero_point): - tensor = torch.tensor([1, 1, 1, 1, 1]) - num_bits = 8 - weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric) - - observer = weights.get_observer() - scale, zero_point = observer(tensor) - - assert round(scale.item(), 4) == expected_scale - assert round(zero_point.item(), 4) == expected_zero_point - - -def test_min_max_observer_symmetric_scale_range(): - tensor = torch.rand(4, 4) - tensor *= 127 - - num_bits = 8 - weights = QuantizationArgs(num_bits=num_bits, symmetric=True) - - observer = weights.get_observer() - scale, zero_point = observer(tensor) - - # if symmetric, max symmetric_range = abs(-128) / 255 - assert round(scale.item(), 4) <= 1.0039 - assert round(zero_point.item(), 4) == 0 - - -def test_min_max_observer_value_update(): - inp = torch.tensor([1, 1, 1, 1, 1]) - inp_update_max = torch.tensor([127, 1, 1, 1, 1]) - inp_update_min = torch.tensor([-128, 1, 1, 1, 1]) - - # update the min, max twice total - tensors = [ - inp, - inp, - inp_update_max, # update max - inp, - inp_update_min, # update min - ] - - tensor = inp - num_bits = 8 - weights = QuantizationArgs(num_bits=num_bits, symmetric=True) - - observer = weights.get_observer() - curr_max = 1 - curr_min = 1 - for i, tensor in enumerate(tensors): - observer(tensor) - curr_max = max(observer.max_val, curr_max) - curr_min = min(observer.min_val, curr_min) - - if i < 2: - assert curr_max == 1 - assert curr_min == 1 - elif i < 4: - assert curr_max == 43 # (127 + 2) / 3 - assert curr_min == 1 - else: - assert curr_max == 43 - assert curr_min == -24.8 # (-128 + 4) / 5 diff --git a/tests/test_bitmask.py b/tests/test_bitmask.py index b5bca142..248580bc 100644 --- a/tests/test_bitmask.py +++ b/tests/test_bitmask.py @@ -17,8 +17,8 @@ import pytest import torch +from compressed_tensors import BitmaskCompressor, BitmaskConfig, BitmaskTensor from safetensors.torch import save_file -from sparsetensors import BitmaskCompressor, BitmaskConfig, BitmaskTensor @pytest.mark.parametrize( diff --git a/tests/test_registry.py b/tests/test_registry.py index b73d357f..a183d77d 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -13,7 +13,7 @@ # limitations under the License. 
import pytest -from sparsetensors import ( +from compressed_tensors import ( BitmaskCompressor, BitmaskConfig, CompressionConfig, From 3e037a537f788bc1af53fba336be8e8fda721b82 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Fri, 26 Apr 2024 20:32:06 +0000 Subject: [PATCH 08/10] rebase --- src/compressed_tensors/README.md | 162 ++++++++ src/compressed_tensors/__init__.py | 21 + src/compressed_tensors/base.py | 15 + .../compressors/__init__.py | 19 + src/compressed_tensors/compressors/base.py | 73 ++++ src/compressed_tensors/compressors/dense.py | 31 ++ .../compressors/sparse_bitmask.py | 233 ++++++++++++ src/compressed_tensors/config/__init__.py | 18 + src/compressed_tensors/config/base.py | 36 ++ src/compressed_tensors/config/dense.py | 36 ++ .../config/sparse_bitmask.py | 36 ++ .../quantization/__init__.py | 21 + .../quantization/lifecycle/__init__.py | 22 ++ .../quantization/lifecycle/apply.py | 105 +++++ .../quantization/lifecycle/calibration.py | 51 +++ .../quantization/lifecycle/forward.py | 137 +++++++ .../quantization/lifecycle/frozen.py | 47 +++ .../quantization/lifecycle/initialize.py | 96 +++++ .../quantization/observers/__init__.py | 19 + .../quantization/observers/base.py | 69 ++++ .../quantization/observers/memoryless.py | 61 +++ .../quantization/observers/min_max.py | 79 ++++ .../quantization/quant_args.py | 85 +++++ .../quantization/quant_config.py | 154 ++++++++ .../quantization/quant_scheme.py | 39 ++ .../quantization/utils/__init__.py | 16 + .../quantization/utils/helpers.py | 115 ++++++ src/compressed_tensors/registry/__init__.py | 17 + src/compressed_tensors/registry/registry.py | 360 ++++++++++++++++++ src/compressed_tensors/utils/__init__.py | 17 + src/compressed_tensors/utils/helpers.py | 45 +++ .../utils/safetensors_load.py | 196 ++++++++++ .../observers/quantization/__init__.py | 13 + .../quantization/lifecycle/__init__.py | 13 + .../quantization/lifecycle/conftest.py | 37 ++ .../quantization/lifecycle/test_apply.py | 140 +++++++ .../quantization/lifecycle/test_forward.py | 82 ++++ .../quantization/lifecycle/test_frozen.py | 47 +++ .../quantization/lifecycle/test_initialize.py | 79 ++++ .../quantization/lifecycle/test_lifecycle.py | 119 ++++++ .../observers/quantization/test_quant_args.py | 55 +++ .../quantization/test_quant_config.py | 60 +++ .../quantization/test_quant_scheme.py | 51 +++ .../quantization/observers/test_min_max.py | 89 +++++ 44 files changed, 3216 insertions(+) create mode 100644 src/compressed_tensors/README.md create mode 100644 src/compressed_tensors/__init__.py create mode 100644 src/compressed_tensors/base.py create mode 100644 src/compressed_tensors/compressors/__init__.py create mode 100644 src/compressed_tensors/compressors/base.py create mode 100644 src/compressed_tensors/compressors/dense.py create mode 100644 src/compressed_tensors/compressors/sparse_bitmask.py create mode 100644 src/compressed_tensors/config/__init__.py create mode 100644 src/compressed_tensors/config/base.py create mode 100644 src/compressed_tensors/config/dense.py create mode 100644 src/compressed_tensors/config/sparse_bitmask.py create mode 100644 src/compressed_tensors/quantization/__init__.py create mode 100644 src/compressed_tensors/quantization/lifecycle/__init__.py create mode 100644 src/compressed_tensors/quantization/lifecycle/apply.py create mode 100644 src/compressed_tensors/quantization/lifecycle/calibration.py create mode 100644 src/compressed_tensors/quantization/lifecycle/forward.py create mode 100644 
src/compressed_tensors/quantization/lifecycle/frozen.py create mode 100644 src/compressed_tensors/quantization/lifecycle/initialize.py create mode 100644 src/compressed_tensors/quantization/observers/__init__.py create mode 100644 src/compressed_tensors/quantization/observers/base.py create mode 100644 src/compressed_tensors/quantization/observers/memoryless.py create mode 100644 src/compressed_tensors/quantization/observers/min_max.py create mode 100644 src/compressed_tensors/quantization/quant_args.py create mode 100644 src/compressed_tensors/quantization/quant_config.py create mode 100644 src/compressed_tensors/quantization/quant_scheme.py create mode 100644 src/compressed_tensors/quantization/utils/__init__.py create mode 100644 src/compressed_tensors/quantization/utils/helpers.py create mode 100644 src/compressed_tensors/registry/__init__.py create mode 100644 src/compressed_tensors/registry/registry.py create mode 100644 src/compressed_tensors/utils/__init__.py create mode 100644 src/compressed_tensors/utils/helpers.py create mode 100644 src/compressed_tensors/utils/safetensors_load.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/__init__.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/lifecycle/__init__.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/lifecycle/conftest.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_forward.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_frozen.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_initialize.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_lifecycle.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/test_quant_args.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/test_quant_config.py create mode 100644 tests/compressed_tensors/quantization/observers/quantization/test_quant_scheme.py create mode 100644 tests/compressed_tensors/quantization/observers/test_min_max.py diff --git a/src/compressed_tensors/README.md b/src/compressed_tensors/README.md new file mode 100644 index 00000000..5b1c8ece --- /dev/null +++ b/src/compressed_tensors/README.md @@ -0,0 +1,162 @@ +# Save/Load Compressed SafeTensors + +## Motivation + +* Reduce disk space by saving in a compressed format for sparse models. Models in this compressed format will be loaded by vLLM for more efficient inference +* Set up the save/load architecture such that we can easily expand to additional compression formats in the future. 
The config should be human readable so users can understand the compression format at a quick glance + +## SafeTensors File Format + +For each parameter in the uncompressed state_dict, we store the following attributes +needed for decompression in the compressed state_dict: + +* compressed tensor +* bitmask +* uncompressed shape +* row offsets + +```python +# dense +{ + PARAM_NAME: uncompressed_tensor +} + +# compressed +{ + PARAM_NAME.compressed: compressed_tensor # 1d tensor + PARAM_NAME.bitmask: value # 2d bitmask tensor (nrows x (ncols / 8)) + PARAM_NAME.shape: value # uncompressed shape tensor + PARAM_NAME.row_offsets: value # 1d offsets tensor +} +``` + +Config information gets stored in the HF config file +```json +// config.json +{ + "sparsity_config": { + "format": "sparse_bitmask", // "dense_sparsity" for original tensor format + + // informational + "sparsity_structure": "unstructured", // or 2:4, 8:16 etc... + "global_sparsity": "0.5" + } +} +``` + +## Saving/Loading Interface + +Loading a compressed model requires no interface changes + +```python +from sparseml.transformers.utils import SparseAutoModelForCausalLM + +# should contain model.safetensors or model.safetensors.index.json +model_path = "/PATH/TO/COMPRESSED_MODEL" + +model = SparseAutoModelForCausalLM.from_pretrained( + model_name_or_path=model_path, + **model_kwargs, +) +``` + +Saving a compressed model with an explicitly provided compression config. The config +is saved to the model's `config.json` file. **Note:** the model must have been +initialized with SparseAutoModelForCausalLM.from_pretrained() + +```python +from compressed_tensors import BitmaskConfig + +output_dir = "/PATH/TO/SAVE/COMPRESSED_MODEL" +sparsity_config = BitmaskConfig() + +model.save_pretrained( + save_directory=output_dir, + sparsity_config=sparsity_config, +) +``` + +Saving a compressed model, inferring the config from the model attributes + +```python +model.save_pretrained( + save_directory=output_dir, + save_compressed=True +) +``` + +Saving a model in the dense format. If the model has at least 5% global sparsity, a +sparsity config will still be included in `config.json` with format `dense_sparsity` + +```python +model.save_pretrained( + save_directory=output_dir +) +``` + +Saving a model in the dense format, bypassing the sparsity config calculation. When the +`skip_compression_stats` flag is set, no sparsity config will be written to +`config.json` + +```python +model.save_pretrained( + save_directory=output_dir, + skip_compression_stats=True +) +``` + +## Enable Compression During One-Shot and Sparse Finetuning +Models that are saved in a supported compressed format on disk will automatically be +decompressed when loaded as input to `sparseml.transformers.oneshot` or +`sparseml.transformers.train` + +To enable compression on save after oneshot or finetuning, simply add the +`save_compressed=True` argument to `sparseml.transformers.oneshot` or +`sparseml.transformers.train` + +```python +from sparseml.transformers import train + +train( + save_compressed=True, + model="neuralmagic/TinyLlama-1.1B-Chat-v1.0-pruned2.4", + recipe=RECIPE, + dataset=DATASET +) +``` + + +## Example Code + +Loads a 60% sparse model, compresses it using the inferred bitmask compression, then +reloads the compressed model. 
+ +```python +from sparseml.transformers import SparseAutoModelForCausalLM +from sparseml.utils.pytorch.utils import measure_cuda_memory +import torch + +MODEL_PATH = "zoo:llama2-7b-open_platypus_orca_llama2_pretrain-pruned60" +OUTPUT_PATH = "./test_compress_output" +RECIPE = "zoo:llama2-7b-open_platypus_orca_llama2_pretrain-pruned60" + +torch.cuda.set_device(0) +with measure_cuda_memory() as m: + model = SparseAutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="cuda:0") +print(f"Load dense model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") + +sparsity_config = getattr(model,"sparsity_config", None) +print(f"Sparsity config before compression: {sparsity_config}") +with measure_cuda_memory() as m: + model.save_pretrained(OUTPUT_PATH, save_compressed=True) +print(f"Save compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") + +torch.cuda.set_device(1) +with measure_cuda_memory() as m: + model_again = SparseAutoModelForCausalLM.from_pretrained( + OUTPUT_PATH, device_map="cuda:1" + ) +print(f"Load compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") +sparsity_config = getattr(model_again,"sparsity_config", None) +print(f"Sparsity config after compression: {sparsity_config}") +``` diff --git a/src/compressed_tensors/__init__.py b/src/compressed_tensors/__init__.py new file mode 100644 index 00000000..0833dd42 --- /dev/null +++ b/src/compressed_tensors/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import * + +# flake8: noqa +from .compressors import * +from .config import * +from .quantization import QuantizationConfig, QuantizationStatus +from .utils import * diff --git a/src/compressed_tensors/base.py b/src/compressed_tensors/base.py new file mode 100644 index 00000000..f01a055f --- /dev/null +++ b/src/compressed_tensors/base.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SPARSITY_CONFIG_NAME = "sparsity_config" diff --git a/src/compressed_tensors/compressors/__init__.py b/src/compressed_tensors/compressors/__init__.py new file mode 100644 index 00000000..1c7362eb --- /dev/null +++ b/src/compressed_tensors/compressors/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa + +from .base import ModelCompressor +from .dense import DenseCompressor +from .sparse_bitmask import BitmaskCompressor, BitmaskTensor diff --git a/src/compressed_tensors/compressors/base.py b/src/compressed_tensors/compressors/base.py new file mode 100644 index 00000000..9c205f93 --- /dev/null +++ b/src/compressed_tensors/compressors/base.py @@ -0,0 +1,73 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +from typing import Dict, Generator, Tuple + +from compressed_tensors.base import SPARSITY_CONFIG_NAME +from compressed_tensors.config import CompressionConfig +from compressed_tensors.registry import RegistryMixin +from torch import Tensor +from torch.nn import Module, Parameter +from tqdm import tqdm + + +__all__ = ["ModelCompressor"] + + +class ModelCompressor(RegistryMixin): + """ + Base class representing a model compression algorithm. 
+ + :param config: config specifying compression parameters + """ + + def __init__(self, config: CompressionConfig): + self.config = config + + def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: + """ + Compresses a dense state dict + + :param model_state: state dict of uncompressed model + :return: compressed state dict + """ + raise NotImplementedError() + + def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]: + """ + Reads a compressed state dict located at model_path and returns a + generator for sequentially decompressing back to a dense state dict + + :param model_path: path to compressed safetensors model + :return: generator of (parameter name, dense tensor) pairs + """ + raise NotImplementedError() + + def overwrite_weights(self, model_path: str, model: Module): + """ + Overwrites the weights in model with weights decompressed from model_path + + :param model_path: path to compressed weights + :param model: pytorch model to load decompressed weights into + """ + dense_gen = self.decompress(model_path) + for name, data in tqdm(dense_gen, desc="Decompressing model"): + # loading the decompressed weights into the model + model_device = operator.attrgetter(name)(model).device + data_new = Parameter(data.to(model_device)) + data_old = operator.attrgetter(name)(model) + data_old.data = data_new.data + + setattr(model, SPARSITY_CONFIG_NAME, self.config) diff --git a/src/compressed_tensors/compressors/dense.py b/src/compressed_tensors/compressors/dense.py new file mode 100644 index 00000000..6e8785bc --- /dev/null +++ b/src/compressed_tensors/compressors/dense.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, Generator, Tuple + +from compressed_tensors.compressors import ModelCompressor +from torch import Tensor + + +@ModelCompressor.register(name="dense_sparsity") +class DenseCompressor(ModelCompressor): + """ + Identity compressor for dense models, returns the original state_dict + """ + + def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: + return model_state + + def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]: + return iter([]) diff --git a/src/compressed_tensors/compressors/sparse_bitmask.py b/src/compressed_tensors/compressors/sparse_bitmask.py new file mode 100644 index 00000000..f6f03f0b --- /dev/null +++ b/src/compressed_tensors/compressors/sparse_bitmask.py @@ -0,0 +1,233 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Dict, Generator, List, Tuple, Union + +import numpy +import torch +from compressed_tensors.compressors import ModelCompressor +from compressed_tensors.utils import get_nested_weight_mappings, merge_names +from safetensors import safe_open +from torch import Tensor +from tqdm import tqdm + + +__all__ = [ + "BitmaskCompressor", + "BitmaskTensor", + "bitmask_compress", + "bitmask_decompress", + "pack_bitmasks", + "unpack_bitmasks", +] + +_LOGGER: logging.Logger = logging.getLogger(__name__) + + +@ModelCompressor.register(name="sparse_bitmask") +class BitmaskCompressor(ModelCompressor): + """ + Compression for sparse models using bitmasks. Non-zero weights are stored in a 1d + values tensor, with their locations stored in a 2d bitmask + """ + + COMPRESSION_PARAM_NAMES = ["shape", "compressed", "bitmask", "row_offsets"] + + def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: + """ + Compresses a dense state dict using bitmask compression + + :param model_state: state dict of uncompressed model + :return: compressed state dict + """ + compressed_dict = {} + _LOGGER.debug( + f"Compressing model with {len(model_state)} parameterized layers..." + ) + for name, value in tqdm(model_state.items(), desc="Compressing model"): + bitmask_tensor = BitmaskTensor.from_dense(value) + bitmask_dict = bitmask_tensor.dict(name_prefix=name, device="cpu") + for key in bitmask_dict.keys(): + if key in compressed_dict: + _LOGGER.warn( + f"Expected all compressed state_dict keys to be unique, but " + f"found an existing entry for {key}. The existing entry will " + "be replaced." + ) + compressed_dict |= bitmask_dict + + return compressed_dict + + def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]: + """ + Reads a bitmask compressed state dict located at model_path and returns a + generator for sequentially decompressing back to a dense state dict + + :param model_path: path to compressed safetensors model + :return: iterator for generating decompressed weights + """ + weight_mappings = get_nested_weight_mappings( + model_path, self.COMPRESSION_PARAM_NAMES + ) + for weight_name in weight_mappings.keys(): + weight_data = {} + for param_name, safe_path in weight_mappings[weight_name].items(): + full_name = merge_names(weight_name, param_name) + with safe_open(safe_path, framework="pt", device="cpu") as f: + weight_data[param_name] = f.get_tensor(full_name) + data = BitmaskTensor(**weight_data) + decompressed = data.decompress() + yield weight_name, decompressed + + +class BitmaskTensor: + """ + Owns compressions and decompression for a single bitmask compressed tensor. 
Adapted from: https://github.com/mgoin/torch_bitmask/tree/main + + :param shape: shape of dense tensor + :param compressed: flat tensor of non-zero values + :param bitmask: 2d bitmask of non-zero values + :param row_offsets: flat tensor indicating what index in values each dense row starts at + """ + + def __init__( + self, + shape: Union[torch.Size, List], + compressed: Tensor, + bitmask: Tensor, + row_offsets: Tensor, + ): + self.shape = list(shape) + self.compressed = compressed + self.bitmask = bitmask + self.row_offsets = row_offsets + + @staticmethod + def from_dense(tensor: Tensor) -> "BitmaskTensor": + """ + :param tensor: dense tensor to compress + :return: instantiated compressed tensor + """ + shape = tensor.shape + compressed, bitmask, row_offsets = bitmask_compress(tensor.cpu()) + return BitmaskTensor( + shape=shape, compressed=compressed, bitmask=bitmask, row_offsets=row_offsets + ) + + def decompress(self) -> Tensor: + """ + :return: reconstructed dense tensor + """ + return bitmask_decompress(self.compressed, self.bitmask, self.shape) + + def curr_memory_size_bytes(self): + """ + :return: size in bytes required to store compressed tensor on disk + """ + + def sizeof_tensor(a): + return a.element_size() * a.nelement() + + return ( + sizeof_tensor(self.compressed) + + sizeof_tensor(self.bitmask) + + sizeof_tensor(self.row_offsets) + ) + + def dict(self, name_prefix: str, device: str = "cpu") -> Dict[str, Tensor]: + """ + :param name_prefix: name of original tensor to store compressed weight as + :param device: device to move the compressed tensors to + :return: dict of compressed data for the stored weight + """ + return { + merge_names(name_prefix, "shape"): torch.tensor(self.shape, device=device), + merge_names(name_prefix, "compressed"): self.compressed.to(device), + merge_names(name_prefix, "bitmask"): self.bitmask.to(device), + merge_names(name_prefix, "row_offsets"): self.row_offsets.to(device), + } + + def __repr__(self): + return f"BitmaskTensor(shape={self.shape}, compressed=True)" + + +def bitmask_compress(tensor: Tensor) -> Tuple[Tensor, Tensor, Tensor]: + """ + Compresses a dense tensor using bitmask compression + + :param tensor: dense tensor to compress + :return: tuple of compressed data representing tensor + """ + bytemasks = tensor != 0 + row_counts = bytemasks.sum(dim=-1) + row_offsets = torch.cumsum(row_counts, 0) - row_counts + values = tensor[bytemasks] + bitmasks_packed = pack_bitmasks(bytemasks) + + return values, bitmasks_packed, row_offsets + + +def bitmask_decompress( + values: Tensor, bitmasks: Tensor, original_shape: torch.Size +) -> Tensor: + """ + Reconstructs a dense tensor from a compressed one + + :param values: 1d tensor of non-zero values + :param bitmasks: 2d int8 tensor flagging locations of non-zero values in the + tensor's original shape + :param original_shape: shape of the dense tensor + :return: decompressed dense tensor + """ + bytemasks_unpacked = unpack_bitmasks(bitmasks, original_shape) + + decompressed_tensor = torch.zeros(original_shape, dtype=values.dtype) + decompressed_tensor[bytemasks_unpacked] = values + + return decompressed_tensor + + +def pack_bitmasks(bytemasks: Tensor) -> Tensor: + """ + Converts a bytemask tensor to a bitmask tensor to reduce memory. 
Shape RxC will be + compressed to R x ceil(C/8) + :param bytemasks: mask tensor where each byte corresponds to a weight + :return: mask tensor where each bit corresponds to a weight + """ + packed_bits_numpy = numpy.packbits(bytemasks.numpy(), axis=-1, bitorder="little") + packed_bits_torch = torch.from_numpy(packed_bits_numpy) + + return packed_bits_torch + + +def unpack_bitmasks(packed_bitmasks: Tensor, original_shape: torch.Size) -> Tensor: + """ + Converts a bitmask tensor back to a bytemask tensor for use during decompression + + :param packed_bitmasks: mask tensor where each bit corresponds to a weight + :param original_shape: dense shape to decompress to + :return: boolean mask of weights in the original dense shape + """ + # Unpack the bits + unpacked_bits = numpy.unpackbits( + packed_bitmasks.numpy(), axis=-1, count=original_shape[-1], bitorder="little" + ) + + # Reshape to match the original shape + unpacked_bitmasks_torch = torch.from_numpy( + unpacked_bits.reshape(original_shape).astype(bool) + ) + + return unpacked_bitmasks_torch diff --git a/src/compressed_tensors/config/__init__.py b/src/compressed_tensors/config/__init__.py new file mode 100644 index 00000000..ff83f5af --- /dev/null +++ b/src/compressed_tensors/config/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa +from .base import * +from .dense import * +from .sparse_bitmask import * diff --git a/src/compressed_tensors/config/base.py b/src/compressed_tensors/config/base.py new file mode 100644 index 00000000..f58b11f8 --- /dev/null +++ b/src/compressed_tensors/config/base.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional + +from compressed_tensors.registry import RegistryMixin +from pydantic import BaseModel + + +__all__ = ["CompressionConfig"] + + +class CompressionConfig(RegistryMixin, BaseModel): + """ + Base data class for storing compression parameters + + :param format: name of compression format + :param global_sparsity: average sparsity of the entire model + :param sparsity_structure: structure of the sparsity, such as + "unstructured", "2:4", "8:16" etc + """ + + format: str + global_sparsity: Optional[float] = 0.0 + sparsity_structure: Optional[str] = "unstructured" diff --git a/src/compressed_tensors/config/dense.py b/src/compressed_tensors/config/dense.py new file mode 100644 index 00000000..aa23220c --- /dev/null +++ b/src/compressed_tensors/config/dense.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +from compressed_tensors.config import CompressionConfig + + +__all__ = ["DenseSparsityConfig"] + + +@CompressionConfig.register(name="dense_sparsity") +class DenseSparsityConfig(CompressionConfig): + """ + Identity configuration for storing a sparse model in + an uncompressed dense format + + :param global_sparsity: average sparsity of the entire model + :param sparsity_structure: structure of the sparsity, such as + "unstructured", "2:4", "8:16" etc + """ + + format: str = "dense_sparsity" + global_sparsity: Optional[float] = 0.0 + sparsity_structure: Optional[str] = "unstructured" diff --git a/src/compressed_tensors/config/sparse_bitmask.py b/src/compressed_tensors/config/sparse_bitmask.py new file mode 100644 index 00000000..9b9cf211 --- /dev/null +++ b/src/compressed_tensors/config/sparse_bitmask.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional + +from compressed_tensors.config.base import CompressionConfig + + +__all__ = ["BitmaskConfig"] + + +@CompressionConfig.register(name="sparse_bitmask") +class BitmaskConfig(CompressionConfig): + """ + Configuration for storing a sparse model using + bitmask compression + + :param global_sparsity: average sparsity of the entire model + :param sparsity_structure: structure of the sparsity, such as + "unstructured", "2:4", "8:16" etc + """ + + format: str = "sparse_bitmask" + global_sparsity: Optional[float] = 0.0 + sparsity_structure: Optional[str] = "unstructured" diff --git a/src/compressed_tensors/quantization/__init__.py b/src/compressed_tensors/quantization/__init__.py new file mode 100644 index 00000000..9fde69a3 --- /dev/null +++ b/src/compressed_tensors/quantization/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa +# isort: skip_file + +from .quant_args import * +from .quant_config import * +from .quant_scheme import * +from .lifecycle import * diff --git a/src/compressed_tensors/quantization/lifecycle/__init__.py b/src/compressed_tensors/quantization/lifecycle/__init__.py new file mode 100644 index 00000000..9504597b --- /dev/null +++ b/src/compressed_tensors/quantization/lifecycle/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa +# isort: skip_file + +from .calibration import * +from .forward import * +from .frozen import * +from .initialize import * +from .apply import * diff --git a/src/compressed_tensors/quantization/lifecycle/apply.py b/src/compressed_tensors/quantization/lifecycle/apply.py new file mode 100644 index 00000000..08cb42f9 --- /dev/null +++ b/src/compressed_tensors/quantization/lifecycle/apply.py @@ -0,0 +1,105 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from collections import OrderedDict +from typing import Iterable, Optional + +from compressed_tensors.quantization.lifecycle.calibration import ( + set_module_for_calibration, +) +from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quantization +from compressed_tensors.quantization.lifecycle.initialize import ( + initialize_module_for_quantization, +) +from compressed_tensors.quantization.quant_config import ( + QuantizationConfig, + QuantizationStatus, +) +from compressed_tensors.quantization.utils import iter_named_leaf_modules +from torch.nn import Module + + +__all__ = [ + "apply_quantization_config", + "apply_quantization_status", +] + + +def apply_quantization_config(model: Module, config: QuantizationConfig): + """ + Initializes the model for quantization in-place based on the given config + + :param model: model to apply quantization config to + :param config: quantization config + """ + # build mapping of targets to schemes for easier matching + # use ordered dict to preserve target ordering in config + target_to_scheme = OrderedDict() + for scheme in config.config_groups.values(): + for target in scheme.targets: + target_to_scheme[target] = scheme + + # mark appropriate layers for quantization by setting their quantization schemes + for name, submodule in iter_named_leaf_modules(model): + if _find_first_name_or_class_match(name, submodule, config.ignore): + continue # layer matches ignore list, continue + target = _find_first_name_or_class_match(name, submodule, target_to_scheme) + if target is not None: + # target matched - add layer and scheme to target list + submodule.quantization_scheme = target_to_scheme[target] + + # apply current quantization status across all targeted layers + apply_quantization_status(model, config.quantization_status) + + +def apply_quantization_status(model: Module, status: QuantizationStatus): + """ + Applies in place the quantization lifecycle up to the given status + + :param model: model to apply quantization to + :param status: status to update the module to + """ + if status >= QuantizationStatus.INITIALIZED: + model.apply(initialize_module_for_quantization) + if status >= QuantizationStatus.CALIBRATION: + model.apply(set_module_for_calibration) + if status >= QuantizationStatus.FROZEN: + model.apply(freeze_module_quantization) + + +def _find_first_name_or_class_match( + name: str, + module: Module, + targets: Iterable[str], +) -> Optional[str]: + # first element of targets that matches the given name + # if no name matches returns first target that matches the class name + # returns None otherwise + return _find_first_match(name, targets) or _find_first_match( + module.__class__.__name__, targets + ) + + +def _find_first_match(value: str, targets: Iterable[str]) -> Optional[str]: + # returns first element of target that matches value either + # exactly or as a regex after 're:' + for target in targets: + if target.startswith("re:"): + pattern = target[3:] + if re.match(pattern, value): + return target + elif target == value: + return target + return None diff --git a/src/compressed_tensors/quantization/lifecycle/calibration.py b/src/compressed_tensors/quantization/lifecycle/calibration.py new file mode 100644 index 00000000..7ab1d896 --- /dev/null +++ b/src/compressed_tensors/quantization/lifecycle/calibration.py @@ -0,0 +1,51 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from torch.nn import Module
+
+
+__all__ = [
+    "set_module_for_calibration",
+]
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def set_module_for_calibration(module: Module):
+    """
+    marks a layer as ready for calibration which activates observers
+    to update scales and zero points on each forward pass
+
+    apply to full model with `model.apply(set_module_for_calibration)`
+
+    :param module: module to set for calibration
+    """
+    if not getattr(module, "quantization_scheme", None):
+        # no quantization scheme nothing to do
+        return
+    status = getattr(module, "quantization_status", None)
+    if not status or status != QuantizationStatus.INITIALIZED:
+        # warn and continue; logger.warning returns None and cannot be raised
+        _LOGGER.warning(
+            f"Attempting to set module with status {status} to calibration mode, "
+            f"but status is not {QuantizationStatus.INITIALIZED} - you may "
+            "be calibrating an uninitialized module, which may fail, or attempting "
+            "to re-calibrate a frozen module"
+        )
+
+    module.quantization_status = QuantizationStatus.CALIBRATION
diff --git a/src/compressed_tensors/quantization/lifecycle/forward.py b/src/compressed_tensors/quantization/lifecycle/forward.py
new file mode 100644
index 00000000..2118cf74
--- /dev/null
+++ b/src/compressed_tensors/quantization/lifecycle/forward.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
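A runnable sketch tying the lifecycle entrypoints together on a toy model; the `QuantizationConfig`/`QuantizationScheme`/`QuantizationArgs` classes appear later in this patch:

```python
import torch
from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationConfig,
    QuantizationScheme,
    apply_quantization_config,
)

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 2))
config = QuantizationConfig(
    config_groups={
        "group_0": QuantizationScheme(
            targets=["Linear"],  # match every Linear layer by class name
            weights=QuantizationArgs(num_bits=8, symmetric=True),
        )
    }
)
apply_quantization_config(model, config)  # attach schemes, status -> INITIALIZED
assert hasattr(model[0], "weight_scale") and hasattr(model[0], "weight_observer")
```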
+
+from functools import wraps
+
+import torch
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from torch.nn import Module
+
+
+__all__ = [
+    "wrap_module_forward_quantized",
+    "quantize",
+    "dequantize",
+    "fake_quantize",
+    "maybe_calibrate_or_quantize",
+]
+
+
+@torch.no_grad()
+def quantize(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    q_min: torch.Tensor,
+    q_max: torch.Tensor,
+) -> torch.Tensor:
+    return torch.clamp(
+        torch.round(
+            x / scale + zero_point,
+        ),
+        q_min,
+        q_max,
+    )
+
+
+@torch.no_grad()
+def dequantize(
+    x_q: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+) -> torch.Tensor:
+    return (x_q - zero_point) * scale
+
+
+@torch.no_grad()
+def fake_quantize(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    args: QuantizationArgs,
+) -> torch.Tensor:
+    bit_range = 2**args.num_bits
+    max_q = torch.tensor(bit_range / 2 - 1, device=x.device)
+    min_q = torch.tensor(-bit_range / 2, device=x.device)
+    Q = quantize(x, scale, zero_point, min_q, max_q)
+    return dequantize(Q, scale, zero_point)
+
+
+def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
+    # expects a module already initialized and injected with the parameters in
+    # initialize_module_for_quantization
+    forward_func_orig = module.forward.__func__
+
+    @wraps(forward_func_orig)  # ensures docstring, names, etc are propagated
+    def wrapped_forward(self, *args, **kwargs):
+        input_ = args[0]
+
+        if scheme.input_activations is not None:
+            # calibrate and (fake) quantize input activations when applicable
+            input_ = maybe_calibrate_or_quantize(
+                module, input_, "input", scheme.input_activations
+            )
+
+        if scheme.weights is not None:
+            # calibrate and (fake) quantize weights when applicable
+            self.weight.data = maybe_calibrate_or_quantize(
+                module, self.weight, "weight", scheme.weights
+            )
+
+        # perform wrapped forward call
+        output = forward_func_orig.__get__(module, module.__class__)(
+            input_, *args[1:], **kwargs
+        )
+
+        if scheme.output_activations is not None:
+            # calibrate and (fake) quantize output activations when applicable
+            output = maybe_calibrate_or_quantize(
+                module, output, "output", scheme.output_activations
+            )
+
+        return output
+
+    # bind wrapped forward to module class so reference to `self` is correct
+    bound_wrapped_forward = wrapped_forward.__get__(module, module.__class__)
+    # set forward to wrapped forward
+    setattr(module, "forward", bound_wrapped_forward)
+
+
+def maybe_calibrate_or_quantize(
+    module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs"
+) -> torch.Tensor:
+    # only run quantization for the included stages
+    if module.quantization_status not in {
+        QuantizationStatus.CALIBRATION,
+        QuantizationStatus.FROZEN,
+    }:
+        return value
+
+    device = next(module.parameters()).device
+    scale = getattr(module, f"{base_name}_scale")
+    zero_point = getattr(module, f"{base_name}_zero_point")
+
+    if module.quantization_status == QuantizationStatus.CALIBRATION:
+        # get observer and get new quant params from observation
+        observer = getattr(module, f"{base_name}_observer")
+        updated_scale, updated_zero_point = observer(value)
+
+        # update scale and zero point
+        scale.data = updated_scale.to(device)
+        zero_point.data = updated_zero_point.to(device)
+
+    return fake_quantize(value, scale, zero_point, args)
diff --git a/src/compressed_tensors/quantization/lifecycle/frozen.py b/src/compressed_tensors/quantization/lifecycle/frozen.py
new file mode 100644
index 00000000..3fa91fa9
--- /dev/null
+++ b/src/compressed_tensors/quantization/lifecycle/frozen.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from torch.nn import Module
+
+
+__all__ = [
+    "freeze_module_quantization",
+]
+
+
+def freeze_module_quantization(module: Module):
+    """
+    deletes observers so static quantization is completed.
+
+    apply to full model with `model.apply(freeze_module_quantization)`
+
+    :param module: module to freeze quantization for
+    """
+    if not getattr(module, "quantization_scheme", None):
+        # no quantization scheme nothing to do
+        return
+
+    # delete observers from module
+    submodule_names_to_delete = set()
+    for submodule_name, _ in module.named_modules():
+        if "." not in submodule_name and submodule_name.endswith("_observer"):
+            # delete any observers that belong directly to this module
+            submodule_names_to_delete.add(submodule_name)
+
+    for submodule_name in submodule_names_to_delete:
+        delattr(module, submodule_name)
+
+    module.quantization_status = QuantizationStatus.FROZEN
diff --git a/src/compressed_tensors/quantization/lifecycle/initialize.py b/src/compressed_tensors/quantization/lifecycle/initialize.py
new file mode 100644
index 00000000..4ef6379b
--- /dev/null
+++ b/src/compressed_tensors/quantization/lifecycle/initialize.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
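Worked numbers for the `quantize`/`dequantize` pair defined in forward.py above, assuming an 8-bit signed range:

```python
import torch
from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize

x = torch.tensor([-1.5, 0.0, 2.5])
scale = torch.tensor(0.1)
zero_point = torch.tensor(0)
q_min, q_max = torch.tensor(-128.0), torch.tensor(127.0)

x_q = quantize(x, scale, zero_point, q_min, q_max)  # tensor([-15., 0., 25.])
x_dq = dequantize(x_q, scale, zero_point)           # exact round trip here
assert torch.allclose(x_dq, x)
```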
+
+
+import logging
+from typing import Optional
+
+import torch
+from compressed_tensors.quantization.lifecycle.forward import (
+    wrap_module_forward_quantized,
+)
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from torch.nn import Module, Parameter
+
+
+__all__ = [
+    "initialize_module_for_quantization",
+]
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def initialize_module_for_quantization(
+    module: Module,
+    scheme: Optional[QuantizationScheme] = None,
+):
+    """
+    attaches appropriate scales, zero points, and observers to a layer
+    given its target quantization scheme
+
+    apply to full model with `model.apply(initialize_module_for_quantization)`
+
+    :param module: module to initialize for quantization
+    :param scheme: scheme to use for quantization. If None is provided, the scheme
+        stored in the module under `quantization_scheme` is used; if that is also
+        missing, the layer is skipped
+    """
+    scheme = scheme or getattr(module, "quantization_scheme", None)
+    if scheme is None:
+        # no scheme passed and layer not targeted for quantization - skip
+        return
+
+    if scheme.input_activations is not None:
+        _initialize_scale_zero_point_observer(module, "input", scheme.input_activations)
+    if scheme.weights is not None:
+        if hasattr(module, "weight"):
+            _initialize_scale_zero_point_observer(module, "weight", scheme.weights)
+        else:
+            _LOGGER.warning(
+                f"module type {type(module)} targeted for weight quantization but "
+                "has no attribute weight, skipping weight quantization"
+            )
+    if scheme.output_activations is not None:
+        _initialize_scale_zero_point_observer(
+            module, "output", scheme.output_activations
+        )
+
+    module.quantization_scheme = scheme
+    module.quantization_status = QuantizationStatus.INITIALIZED
+
+    # wrap forward call of module to perform quantized actions based on calltime status
+    wrap_module_forward_quantized(module, scheme)
+
+
+def _initialize_scale_zero_point_observer(
+    module: Module, base_name: str, quantization_args: QuantizationArgs
+):
+    device = next(module.parameters()).device
+
+    # initializes empty scale and zero point parameters for the module
+    init_scale = Parameter(torch.empty(0, device=device), requires_grad=False)
+    module.register_parameter(f"{base_name}_scale", init_scale)
+
+    init_zero_point = Parameter(
+        torch.empty(0, device=device, dtype=int), requires_grad=False
+    )
+    module.register_parameter(f"{base_name}_zero_point", init_zero_point)
+
+    # initialize observer module and attach as submodule
+    observer = quantization_args.get_observer()
+    module.register_module(f"{base_name}_observer", observer)
diff --git a/src/compressed_tensors/quantization/observers/__init__.py b/src/compressed_tensors/quantization/observers/__init__.py
new file mode 100644
index 00000000..d0362b8f
--- /dev/null
+++ b/src/compressed_tensors/quantization/observers/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa + +from .base import * +from .memoryless import * +from .min_max import * diff --git a/src/compressed_tensors/quantization/observers/base.py b/src/compressed_tensors/quantization/observers/base.py new file mode 100644 index 00000000..96fe1049 --- /dev/null +++ b/src/compressed_tensors/quantization/observers/base.py @@ -0,0 +1,69 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple + +from compressed_tensors.quantization.quant_args import QuantizationArgs +from compressed_tensors.registry.registry import RegistryMixin +from torch import FloatTensor, IntTensor, Tensor +from torch.nn import Module + + +__all__ = ["Observer"] + + +class Observer(Module, RegistryMixin): + """ + Base Observer class to be subclassed for specific implementation. 
+    Subclasses should override `calculate_qparams` to return a scale, zero_point
+    pair
+    """
+
+    def __init__(self, quantization_args: QuantizationArgs):
+        # initialize Module state before assigning attributes
+        super().__init__()
+        self.quantization_args: QuantizationArgs = quantization_args
+        self._scale = None
+        self._zero_point = None
+
+    def forward(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
+        """
+        maps directly to get_qparams
+
+        :param observed: optional observed tensor to calculate quantization parameters
+            from
+        :return: tuple of scale and zero point based on last observed value
+        """
+        return self.get_qparams(observed=observed)
+
+    def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
+        """
+        :param observed: observed tensor to calculate quantization parameters for
+        :return: tuple of scale and zero point derived from the observed tensor
+        """
+        raise NotImplementedError(f"{self.__class__} must implement calculate_qparams")
+
+    def get_qparams(
+        self, observed: Optional[Tensor] = None
+    ) -> Tuple[FloatTensor, IntTensor]:
+        """
+        Convenience function to wrap overwritten calculate_qparams
+        adds support to make observed tensor optional and support for tracking latest
+        calculated scale and zero point
+
+        :param observed: optional observed tensor to calculate quantization parameters
+            from
+        :return: tuple of scale and zero point based on last observed value
+        """
+        if observed is not None:
+            # re-calculate scale and zero point, update the stored value
+            self._scale, self._zero_point = self.calculate_qparams(observed)
+        return self._scale, self._zero_point
diff --git a/src/compressed_tensors/quantization/observers/memoryless.py b/src/compressed_tensors/quantization/observers/memoryless.py
new file mode 100644
index 00000000..0ba4d9f6
--- /dev/null
+++ b/src/compressed_tensors/quantization/observers/memoryless.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
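A toy `Observer` subclass sketch; the `toy` name and the fixed 8-bit math are hypothetical, for illustration only:

```python
from typing import Tuple

import torch
from compressed_tensors.quantization.observers.base import Observer
from torch import FloatTensor, IntTensor, Tensor


@Observer.register("toy")  # hypothetical registry name
class ToyObserver(Observer):
    def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
        # fixed 8-bit asymmetric range; ignores self.quantization_args
        scale = (observed.max() - observed.min()) / 255
        zero_point = torch.tensor(0, dtype=torch.int8)
        return scale, zero_point
```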
+ +from typing import Tuple + +import torch +from compressed_tensors.quantization.observers.base import Observer +from torch import FloatTensor, IntTensor, Tensor + + +__all__ = ["MemorylessObserver"] + + +@Observer.register("memoryless") +class MemorylessObserver(Observer): + """ + Implements a dynamic quantization observer that sets the scale and + zero point based on the latest observed value + """ + + def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]: + """ + :param observed: observed tensor to calculate quantization parameters for + :return: tuple of scale and zero point derived from the observed tensor + """ + # TODO: Add support for full range of quantization Args, only supports 8bit + # per tensor + bit_range = 255 + min_val = observed.min() + max_val = observed.max() + + # ensure zero is in the range + min_val = torch.min(min_val, torch.zeros_like(min_val)) + max_val = torch.max(max_val, torch.zeros_like(max_val)) + + if self.quantization_args.symmetric: + symmetric_range = 2 * max(min_val.abs(), max_val.abs()) + scale = symmetric_range / bit_range + zero_point = torch.tensor(0).to(torch.int8) + else: + # non-symmetric + observed_range = max_val - min_val + scale = observed_range / bit_range + + # scales from a 0 range should be set to 1 + scale[observed_range == 0] = 1 + + zero_point = ((0 - min_val) / scale).to(torch.int8) + + return scale, zero_point diff --git a/src/compressed_tensors/quantization/observers/min_max.py b/src/compressed_tensors/quantization/observers/min_max.py new file mode 100644 index 00000000..eb575df1 --- /dev/null +++ b/src/compressed_tensors/quantization/observers/min_max.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
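Worked numbers for the asymmetric branch of `MemorylessObserver` above:

```python
import torch
from compressed_tensors.quantization.observers.memoryless import MemorylessObserver
from compressed_tensors.quantization.quant_args import QuantizationArgs

observer = MemorylessObserver(QuantizationArgs(num_bits=8, symmetric=False))
scale, zero_point = observer(torch.tensor([[-1.0, 0.0, 3.0]]))
# observed_range = 3 - (-1) = 4, so scale = 4 / 255 ~= 0.0157
# zero_point = (0 - (-1)) / scale = 63.75, truncated to 63 by the int8 cast
```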
+
+from typing import Tuple
+
+import torch
+from compressed_tensors.quantization.observers.base import Observer
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from torch import FloatTensor, IntTensor, Tensor
+
+
+__all__ = ["MinMaxObserver"]
+
+
+@Observer.register("minmax")
+class MinMaxObserver(Observer):
+    """
+    Implements a quantization observer that calculates scale and zero point based on
+    a running average of the observed min and max values
+    """
+
+    def __init__(self, quantization_args: QuantizationArgs):
+        super().__init__(quantization_args=quantization_args)
+
+        self.min_val = float("inf")
+        self.max_val = -float("inf")
+        self.counter = 0
+
+    def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
+        """
+        :param observed: observed tensor to calculate quantization parameters for
+        :return: tuple of scale and zero point derived from the observed tensor
+        """
+        # TODO: Add support for full range of quantization args, only supports 8bit
+        # per tensor
+        bit_range = 255
+        min_val = torch.tensor([observed.min()])
+        max_val = torch.tensor([observed.max()])
+
+        # update running average
+        if self.counter > 0:
+            self.min_val = (self.min_val * self.counter + min_val) / (self.counter + 1)
+            self.max_val = (self.max_val * self.counter + max_val) / (self.counter + 1)
+        else:
+            self.min_val = min_val
+            self.max_val = max_val
+
+        # ensure that the zeros are in the range
+        min_val = torch.min(self.min_val, torch.zeros_like(self.min_val))
+        max_val = torch.max(self.max_val, torch.zeros_like(self.max_val))
+
+        self.counter += 1
+
+        if self.quantization_args.symmetric:
+            symmetric_range = 2 * max(min_val.abs(), max_val.abs())
+            scale = symmetric_range / bit_range
+            zero_point = torch.tensor(0).to(torch.int8)
+        else:
+            # non-symmetric
+            observed_range = max_val - min_val
+            scale = observed_range / bit_range
+
+            # scales from a 0 range should be set to 1
+            scale[observed_range == 0] = 1
+
+            zero_point = ((0 - min_val) / scale).to(torch.int8)
+
+        return scale, zero_point
diff --git a/src/compressed_tensors/quantization/quant_args.py b/src/compressed_tensors/quantization/quant_args.py
new file mode 100644
index 00000000..64b5005f
--- /dev/null
+++ b/src/compressed_tensors/quantization/quant_args.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
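The running-average behavior of `MinMaxObserver` over two batches, with symmetric scales:

```python
import torch
from compressed_tensors.quantization.observers.min_max import MinMaxObserver
from compressed_tensors.quantization.quant_args import QuantizationArgs

observer = MinMaxObserver(QuantizationArgs(num_bits=8, symmetric=True))
scale1, _ = observer(torch.tensor([[-1.0, 1.0]]))  # running min/max start at -1/1
scale2, _ = observer(torch.tensor([[-3.0, 3.0]]))  # averaged to -2/2
# symmetric scale grows from 2 * 1 / 255 to 2 * 2 / 255
assert float(scale2) > float(scale1)
```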
+
+from enum import Enum
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field
+
+
+__all__ = ["QuantizationType", "QuantizationStrategy", "QuantizationArgs"]
+
+
+class QuantizationType(str, Enum):
+    """
+    Enum storing quantization type options
+    """
+
+    INT = "int"
+    FLOAT = "float"
+
+
+class QuantizationStrategy(str, Enum):
+    """
+    Enum storing quantization strategy options
+    """
+
+    TENSOR = "tensor"
+    CHANNEL = "channel"
+    GROUP = "group"
+    BLOCK = "block"
+
+
+class QuantizationArgs(BaseModel):
+    """
+    User facing arguments used to define a quantization config for weights or
+    activations
+
+    :param num_bits: quantization bit depth
+    :param type: dtype to quantize to, either int or float
+    :param symmetric: whether or not quantization scale is symmetric about zero-point
+    :param strategy: string id determining the scope of scale/zero-point to apply
+    :param group_size: group length to use for the group strategy
+    :param block_structure: 2d block structure to use for the block strategy, must be
+        of the format "2x4", "8x16", etc.
+    """
+
+    num_bits: int = 8
+    type: QuantizationType = QuantizationType.INT
+    symmetric: bool = True
+    strategy: QuantizationStrategy = QuantizationStrategy.TENSOR
+    group_size: Optional[int] = None
+    block_structure: Optional[str] = None
+    observer: str = Field(
+        default="minmax",
+        description=(
+            "The class to use to compute the quantization params - "
+            "scale and zero-point"
+        ),
+    )
+    observer_kwargs: Dict[str, Any] = Field(
+        default_factory=dict,
+        description=(
+            "optional dict of kwargs to be passed directly to torch quantization "
+            "Observers constructor excluding quantization range or symmetry"
+        ),
+    )
+
+    def get_observer(self):
+        """
+        :return: Observer built based on these QuantizationArgs, loaded from the
+            Observer registry
+        """
+        from compressed_tensors.quantization.observers.base import Observer
+
+        return Observer.load_from_registry(self.observer, quantization_args=self)
diff --git a/src/compressed_tensors/quantization/quant_config.py b/src/compressed_tensors/quantization/quant_config.py
new file mode 100644
index 00000000..a62a79bd
--- /dev/null
+++ b/src/compressed_tensors/quantization/quant_config.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
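Constructing args and resolving their observer through the registry; a short sketch:

```python
from compressed_tensors.quantization.quant_args import (
    QuantizationArgs,
    QuantizationStrategy,
)

args = QuantizationArgs(
    num_bits=8,
    symmetric=True,
    strategy=QuantizationStrategy.TENSOR,
)
observer = args.get_observer()  # resolves the default "minmax" observer by name
```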
+
+from enum import Enum
+from typing import Dict, List, Optional
+
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from compressed_tensors.quantization.utils import (
+    calculate_compression_ratio,
+    is_module_quantized,
+    iter_named_leaf_modules,
+    module_type,
+)
+from pydantic import BaseModel, Field
+from torch.nn import Module
+
+
+__all__ = [
+    "QuantizationStatus",
+    "QuantizationConfig",
+    "LIFECYCLE_ORDER",
+]
+
+
+class QuantizationStatus(str, Enum):
+    """
+    Enum storing the different states a quantized layer can be in
+
+    Initialized: scale, zero points and observers have been attached to the layer but
+    are set to dummy values (not yet calibrated)
+    Calibration: scale and zero points have been calibrated through OBCQ or similar
+    algorithm, observers are still attached
+    Frozen: scale and zero points are finalized, observers have been deleted, weights
+    are still in their original precision
+    Compressed: weights have been converted to their target type or compressed to
+    their closest approximation
+    """
+
+    INITIALIZED = "initialized"
+    CALIBRATION = "calibration"
+    FROZEN = "frozen"
+    COMPRESSED = "compressed"
+
+    @classmethod
+    def lifecycle_order(cls) -> List["QuantizationStatus"]:
+        """
+        :return: list of correct quantization lifecycle order
+        """
+        return LIFECYCLE_ORDER
+
+    def __ge__(self, other):
+        if not isinstance(other, self.__class__):
+            raise NotImplementedError
+        return LIFECYCLE_ORDER.index(self) >= LIFECYCLE_ORDER.index(other)
+
+
+LIFECYCLE_ORDER = [
+    QuantizationStatus.INITIALIZED,
+    QuantizationStatus.CALIBRATION,
+    QuantizationStatus.FROZEN,
+    QuantizationStatus.COMPRESSED,
+]
+
+
+class QuantizationConfig(BaseModel):
+    """
+    Full configuration specifying how a model is quantized. Each quantized layer is
+    mapped to a QuantizationScheme in config_groups.
+
+    :param config_groups: dict of QuantizationSchemes specifying the quantization
+        settings for each quantized layer
+    :param quant_method: a constant used to differentiate sparseML quantization from
+        other quantization configs
+    :param format: specifies how the quantized model is stored on disk
+    :param quantization_status: specifies the current status of all quantized layers.
+        It is assumed all layers are in the same state.
+    :param global_compression_ratio: optional informational config to report the model
+        compression ratio achieved by the quantization config
+    :param ignore: optional list of layers to ignore from config_groups.
+        Layers in this list are not quantized even if they match up with a target in
+        config_groups
+    """
+
+    config_groups: Dict[str, QuantizationScheme]
+    quant_method: str = "sparseml"
+    format: str = "fakequant"
+    quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
+    global_compression_ratio: Optional[float] = None
+    ignore: Optional[List[str]] = Field(default_factory=list)
+
+    @staticmethod
+    def from_pretrained(model: Module) -> "QuantizationConfig":
+        """
+        Converts a model into its associated QuantizationConfig based on the
+        QuantizationScheme attached to each quantized module
+
+        :param model: model to calculate quantization scheme of
+        :return: filled out QuantizationConfig for the input model
+        """
+        quant_scheme_to_layers = []
+        quantization_status = None
+        ignore = {}
+        quantization_type_names = set()
+        for name, submodule in iter_named_leaf_modules(model):
+            layer_type = module_type(submodule)
+            if not is_module_quantized(submodule):
+                if layer_type not in ignore:
+                    ignore[layer_type] = []
+                ignore[layer_type].append(name)
+            else:
+                quantization_status = submodule.quantization_status
+                scheme = submodule.quantization_scheme
+                quantization_type_names.add(layer_type)
+
+                match_found = False
+                for existing_scheme in quant_scheme_to_layers:
+                    if scheme == existing_scheme:
+                        match_found = True
+                        break
+                if not match_found:
+                    quant_scheme_to_layers.append(scheme)
+
+        # clean up ignore list, we can leave out layer types if none of the
+        # instances are quantized
+        consolidated_ignore = []
+        for layer_type, ignore_names in ignore.items():
+            if layer_type in quantization_type_names:
+                # specific layers of a quantized type are ignored
+                consolidated_ignore += ignore_names
+            # else we leave it off the ignore list, doesn't fall under any of the
+            # existing quantization schemes so it won't be quantized
+
+        config_groups = {}
+        for idx, scheme in enumerate(quant_scheme_to_layers):
+            group_name = "group_" + str(idx)
+            config_groups[group_name] = scheme
+
+        compression_ratio = calculate_compression_ratio(model)
+        return QuantizationConfig(
+            config_groups=config_groups,
+            quantization_status=quantization_status,
+            global_compression_ratio=compression_ratio,
+            ignore=consolidated_ignore,
+        )
diff --git a/src/compressed_tensors/quantization/quant_scheme.py b/src/compressed_tensors/quantization/quant_scheme.py
new file mode 100644
index 00000000..ed0f8245
--- /dev/null
+++ b/src/compressed_tensors/quantization/quant_scheme.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
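The `__ge__` defined on `QuantizationStatus` above is what drives `apply_quantization_status`; a quick illustration of the lifecycle ordering:

```python
from compressed_tensors.quantization import QuantizationStatus

# ordering follows LIFECYCLE_ORDER, not string comparison
assert QuantizationStatus.FROZEN >= QuantizationStatus.CALIBRATION
assert not (QuantizationStatus.INITIALIZED >= QuantizationStatus.FROZEN)
```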
+ +from typing import List, Optional + +from compressed_tensors.quantization.quant_args import QuantizationArgs +from pydantic import BaseModel + + +__all__ = ["QuantizationScheme"] + + +class QuantizationScheme(BaseModel): + """ + Set of QuantizationArgs defining how the weights, inputs and outputs of target list + of modules should be quantized + + :param targets: list of modules to apply the QuantizationArgs to, can be layer + names, layer types or a regular expression + :param weights: quantization config for layer weights + :param input_activations: quantization config for layer inputs + :param output_activations: quantization config for layer outputs + """ + + targets: List[str] + weights: Optional[QuantizationArgs] = None + input_activations: Optional[QuantizationArgs] = None + output_activations: Optional[QuantizationArgs] = None diff --git a/src/compressed_tensors/quantization/utils/__init__.py b/src/compressed_tensors/quantization/utils/__init__.py new file mode 100644 index 00000000..a91f9e5d --- /dev/null +++ b/src/compressed_tensors/quantization/utils/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa +from .helpers import * diff --git a/src/compressed_tensors/quantization/utils/helpers.py b/src/compressed_tensors/quantization/utils/helpers.py new file mode 100644 index 00000000..3c00cdbe --- /dev/null +++ b/src/compressed_tensors/quantization/utils/helpers.py @@ -0,0 +1,115 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
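A scheme construction sketch; the `re:` prefix is matched as a regex by `_find_first_name_or_class_match` in apply.py earlier in this patch, and the projection layer names here are hypothetical:

```python
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme

scheme = QuantizationScheme(
    targets=["re:.*q_proj$", "re:.*k_proj$"],  # regex targets on layer names
    weights=QuantizationArgs(num_bits=8, symmetric=True),
    input_activations=QuantizationArgs(num_bits=8, symmetric=False),
)
```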
+
+from typing import Generator, Tuple
+
+import torch
+from torch.nn import Module
+from tqdm import tqdm
+
+
+__all__ = [
+    "is_module_quantized",
+    "is_model_quantized",
+    "iter_named_leaf_modules",
+    "module_type",
+    "calculate_compression_ratio",
+]
+
+
+def is_module_quantized(module: Module) -> bool:
+    """
+    Check if a module is quantized, based on the existence of a non-empty quantization
+    scheme
+
+    :param module: pytorch module to check
+    :return: True if module is quantized, False otherwise
+    """
+    if not hasattr(module, "quantization_scheme"):
+        return False
+
+    if module.quantization_scheme.weights is not None:
+        return True
+
+    if module.quantization_scheme.input_activations is not None:
+        return True
+
+    if module.quantization_scheme.output_activations is not None:
+        return True
+
+    return False
+
+
+def is_model_quantized(model: Module) -> bool:
+    """
+    Check if any modules in a model are quantized, based on the existence of a
+    non-empty quantization scheme in at least one module
+
+    :param model: pytorch model
+    :return: True if model is quantized, False otherwise
+    """
+
+    for _, submodule in iter_named_leaf_modules(model):
+        if is_module_quantized(submodule):
+            return True
+
+    return False
+
+
+def module_type(module: Module) -> str:
+    """
+    Gets a string representation of a module type
+
+    :module: pytorch module to get type of
+    :return: module type as a string
+    """
+    return type(module).__name__
+
+
+def iter_named_leaf_modules(model: Module) -> Generator[Tuple[str, Module], None, None]:
+    # yields modules that do not have any submodules
+    # TODO: potentially expand to add list of allowed submodules such as observers
+    for name, submodule in model.named_modules():
+        if len(list(submodule.children())) == 0:
+            yield name, submodule
+
+
+def calculate_compression_ratio(model: Module) -> float:
+    """
+    Calculates the quantization compression ratio of a pytorch model, based on the
+    number of bits needed to represent the total weights in compressed form. Does not
+    take into account activation quantizations.
+
+    :param model: pytorch module to calculate compression ratio for
+    :return: compression ratio of the whole model
+    """
+    total_compressed = 0.0
+    total_uncompressed = 0.0
+    for _, submodule in tqdm(
+        iter_named_leaf_modules(model),
+        desc="Calculating quantization compression ratio",
+    ):
+        # count only this submodule's own parameters so layers are not double
+        # counted and quantized layers are weighted correctly
+        for parameter in submodule.parameters():
+            try:
+                uncompressed_bits = torch.finfo(parameter.dtype).bits
+            except TypeError:
+                uncompressed_bits = torch.iinfo(parameter.dtype).bits
+            compressed_bits = uncompressed_bits
+            if is_module_quantized(submodule):
+                compressed_bits = submodule.quantization_scheme.weights.num_bits
+            num_weights = parameter.numel()
+            total_compressed += compressed_bits * num_weights
+            total_uncompressed += uncompressed_bits * num_weights
+
+    return total_uncompressed / total_compressed
diff --git a/src/compressed_tensors/registry/__init__.py b/src/compressed_tensors/registry/__init__.py
new file mode 100644
index 00000000..241d9d55
--- /dev/null
+++ b/src/compressed_tensors/registry/__init__.py
@@ -0,0 +1,17 @@
+# flake8: noqa
+
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .registry import *
diff --git a/src/compressed_tensors/registry/registry.py b/src/compressed_tensors/registry/registry.py
new file mode 100644
index 00000000..d8d8bc6d
--- /dev/null
+++ b/src/compressed_tensors/registry/registry.py
@@ -0,0 +1,360 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Universal registry to support registration and loading of child classes and plugins
+of neuralmagic utilities
+"""
+
+import importlib.util  # explicit submodule import, needed for plugin loading below
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Type, Union
+
+
+__all__ = [
+    "RegistryMixin",
+    "register",
+    "get_from_registry",
+    "registered_names",
+    "registered_aliases",
+    "standardize_lookup_name",
+]
+
+
+_ALIAS_REGISTRY: Dict[Type, Dict[str, str]] = defaultdict(dict)
+_REGISTRY: Dict[Type, Dict[str, Any]] = defaultdict(dict)
+
+
+def standardize_lookup_name(name: str) -> str:
+    """
+    Standardize the given name for lookup in the registry.
+    This will replace all underscores and spaces with hyphens and
+    convert the name to lowercase.
+
+    example:
+    ```
+    standardize_lookup_name("Foo_bar baz") == "foo-bar-baz"
+    ```
+
+    :param name: name to standardize
+    :return: standardized name
+    """
+    return name.replace("_", "-").replace(" ", "-").lower()
+
+
+def standardize_alias_name(
+    name: Union[None, str, List[str]]
+) -> Union[None, str, List[str]]:
+    if name is None:
+        return None
+    elif isinstance(name, str):
+        return standardize_lookup_name(name)
+    else:  # isinstance(name, list)
+        return [standardize_lookup_name(n) for n in name]
+
+
+class RegistryMixin:
+    """
+    Universal registry to support registration and loading of child classes and
+    plugins of neuralmagic utilities.
+
+    Classes that require a registry or plugins may add the `RegistryMixin` and use
+    `register` and `load_from_registry` as the main entrypoints for adding new
+    implementations and loading requested values from the registry.
+
+    If a class should only have its child classes in its registry, the class should
+    set the static attribute `registry_requires_subclass` to True
+
+    example
+    ```python
+    class Dataset(RegistryMixin):
+        pass
+
+
+    # register with default name
+    @Dataset.register()
+    class ImageNetDataset(Dataset):
+        pass
+
+    # load as "ImageNetDataset"
+    imagenet = Dataset.load_from_registry("ImageNetDataset")
+
+    # register with custom name
+    @Dataset.register(name="cifar-dataset")
+    class Cifar(Dataset):
+        pass
+
+    Note: the name will be standardized for lookup in the registry.
+    For example, if a class is registered as "cifar_dataset" or
+    "cifar dataset", it will be stored as "cifar-dataset". The user
+    will be able to load the class with any of the three name variants.
+
+    # register with multiple aliases
+    @Dataset.register(alias=["cifar-10-dataset", "cifar_100_dataset"])
+    class Cifar(Dataset):
+        pass
+
+    # load as "cifar-dataset"
+    cifar = Dataset.load_from_registry("cifar-dataset")
+
+    # load from custom file that implements a dataset
+    mnist = Dataset.load_from_registry("/path/to/mnist_dataset.py:MnistDataset")
+    ```
+    """
+
+    # set to True in child class to add check that registered/retrieved values
+    # implement the class it is registered to
+    registry_requires_subclass: bool = False
+
+    @classmethod
+    def register(
+        cls, name: Optional[str] = None, alias: Union[List[str], str, None] = None
+    ):
+        """
+        Decorator for registering a value (i.e. class or function) wrapped by this
+        decorator to the base class (class that .register is called from)
+
+        :param name: name or list of names to register the wrapped value as,
+            defaults to value.__name__
+        :param alias: alias or list of aliases to register the wrapped value as,
+            defaults to None
+        :return: register decorator
+        """
+
+        def decorator(value: Any):
+            cls.register_value(value, name=name, alias=alias)
+            return value
+
+        return decorator
+
+    @classmethod
+    def register_value(
+        cls,
+        value: Any,
+        name: Optional[str] = None,
+        alias: Union[str, List[str], None] = None,
+    ):
+        """
+        Registers the given value to the class `.register_value` is called from
+
+        :param value: value to register
+        :param name: name to register the wrapped value as,
+            defaults to value.__name__
+        :param alias: alias or list of aliases to register the wrapped value as,
+            defaults to None
+        """
+        register(
+            parent_class=cls,
+            value=value,
+            name=name,
+            alias=alias,
+            require_subclass=cls.registry_requires_subclass,
+        )
+
+    @classmethod
+    def load_from_registry(cls, name: str, **constructor_kwargs) -> object:
+        """
+        :param name: name of registered class to load
+        :param constructor_kwargs: arguments to pass to the constructor retrieved
+            from the registry
+        :return: loaded object registered to this class under the given name,
+            constructed with the given kwargs. Raises an error if the name is
+            not found in the registry
+        """
+        constructor = cls.get_value_from_registry(name=name)
+        return constructor(**constructor_kwargs)
+
+    @classmethod
+    def get_value_from_registry(cls, name: str):
+        """
+        :param name: name to retrieve from the registry
+        :return: value retrieved from the registry for the given name, raises
+            error if not found
+        """
+        return get_from_registry(
+            parent_class=cls,
+            name=name,
+            require_subclass=cls.registry_requires_subclass,
+        )
+
+    @classmethod
+    def registered_names(cls) -> List[str]:
+        """
+        :return: list of all names registered to this class
+        """
+        return registered_names(cls)
+
+    @classmethod
+    def registered_aliases(cls) -> List[str]:
+        """
+        :return: list of all aliases registered to this class
+        """
+        return registered_aliases(cls)
+
+
+def register(
+    parent_class: Type,
+    value: Any,
+    name: Optional[str] = None,
+    alias: Union[List[str], str, None] = None,
+    require_subclass: bool = False,
+):
+    """
+    :param parent_class: class to register the name under
+    :param value: the value to register
+    :param name: name to register the wrapped value as, defaults to value.__name__
+    :param alias: alias or list of aliases to register the wrapped value as,
+        defaults to None
+    :param require_subclass: require that value is a subclass of the class this
+        method is called from
+    """
+    if name is None:
+        # default name
+        name = value.__name__
+
+    name = standardize_lookup_name(name)
+    alias = standardize_alias_name(alias)
+    register_alias(name=name, alias=alias, parent_class=parent_class)
+
+    if require_subclass:
+        _validate_subclass(parent_class, value)
+
+    if name in _REGISTRY[parent_class]:
+        # name already exists - raise error if two different values are attempting
+        # to share the same name
+        registered_value = _REGISTRY[parent_class][name]
+        if registered_value is not value:
+            raise RuntimeError(
+                f"Attempting to register name {name} as {value} "
+                f"however {name} has already been registered as {registered_value}"
+            )
+    else:
+        _REGISTRY[parent_class][name] = value
+
+
+def get_from_registry(
+    parent_class: Type, name: str, require_subclass: bool = False
+) -> Any:
+    """
+    :param parent_class: class that the name is registered under
+    :param name: name to retrieve from the registry of the class
+    :param require_subclass: require that value is a subclass of the class this
+        method is called from
+    :return: value retrieved from the registry for the given name, raises
+        error if not found
+    """
+    name = standardize_lookup_name(name)
+
+    if ":" in name:
+        # user specifying specific module to load and value to import
+        module_path, value_name = name.split(":")
+        retrieved_value = _import_and_get_value_from_module(module_path, value_name)
+    else:
+        # look up name in alias registry
+        name = _ALIAS_REGISTRY[parent_class].get(name)
+        # look up name in registry
+        retrieved_value = _REGISTRY[parent_class].get(name)
+        if retrieved_value is None:
+            raise KeyError(
+                f"Unable to find {name} registered under type {parent_class}.\n"
+                f"Registered values for {parent_class}: "
+                f"{registered_names(parent_class)}\n"
+                f"Registered aliases for {parent_class}: "
+                f"{registered_aliases(parent_class)}"
+            )
+
+    if require_subclass:
+        _validate_subclass(parent_class, retrieved_value)
+
+    return retrieved_value
+
+
+def registered_names(parent_class: Type) -> List[str]:
+    """
+    :param parent_class: class to look up the registry of
+    :return: all names registered to the given class
+    """
+    return list(_REGISTRY[parent_class].keys())
+
+
+def registered_aliases(parent_class: Type) -> List[str]:
+    """
+    :param parent_class: class to look up the registry of
+    :return: all aliases registered to the given class
+    """
+    registered_aliases_plus_names = list(_ALIAS_REGISTRY[parent_class].keys())
+    registered_aliases = list(
+        set(registered_aliases_plus_names) - set(registered_names(parent_class))
+    )
+    return registered_aliases
+
+
+def register_alias(
+    name: str, parent_class: Type, alias: Union[str, List[str], None] = None
+):
+    """
+    Updates the mapping from the alias(es) to the given name.
+    If the alias is None, the name is used as the alias.
+
+    :param name: name that the alias refers to
+    :param parent_class: class that the name is registered under
+    :param alias: single alias or list of aliases that
+        refer to the name, defaults to None
+    """
+    if alias is not None:
+        alias = alias if isinstance(alias, list) else [alias]
+    else:
+        alias = []
+
+    if name in alias:
+        raise KeyError(
+            f"Attempting to register alias {name} "
+            f"that is identical to its standardized name."
+        )
+    alias.append(name)
+
+    for alias_name in alias:
+        if alias_name in _ALIAS_REGISTRY[parent_class]:
+            raise KeyError(
+                f"Attempting to register alias {alias_name} as {name} "
+                f"however {alias_name} has already been registered as "
+                f"{_ALIAS_REGISTRY[parent_class][alias_name]}"
+            )
+        _ALIAS_REGISTRY[parent_class][alias_name] = name
+
+
+def _import_and_get_value_from_module(module_path: str, value_name: str) -> Any:
+    # import the given module path and try to get the value_name if it is included
+    # in the module
+
+    # load module
+    spec = importlib.util.spec_from_file_location(
+        f"plugin_module_for_{value_name}", module_path
+    )
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+
+    # get value from module
+    value = getattr(module, value_name, None)
+
+    if value is None:
+        raise RuntimeError(
+            f"Unable to find attribute {value_name} in module {module_path}"
+        )
+    return value
+
+
+def _validate_subclass(parent_class: Type, child_class: Type):
+    if not issubclass(child_class, parent_class):
+        raise ValueError(
+            f"class {child_class} is not a subclass of the class it is "
+            f"registered for: {parent_class}."
+        )
diff --git a/src/compressed_tensors/utils/__init__.py b/src/compressed_tensors/utils/__init__.py
new file mode 100644
index 00000000..e9e78d44
--- /dev/null
+++ b/src/compressed_tensors/utils/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# flake8: noqa
+
+from .helpers import *
+from .safetensors_load import *
diff --git a/src/compressed_tensors/utils/helpers.py b/src/compressed_tensors/utils/helpers.py
new file mode 100644
index 00000000..ac9ed229
--- /dev/null
+++ b/src/compressed_tensors/utils/helpers.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +from compressed_tensors.base import SPARSITY_CONFIG_NAME +from compressed_tensors.compressors import ModelCompressor +from compressed_tensors.config import CompressionConfig +from transformers import AutoConfig + + +__all__ = ["infer_compressor_from_model_config"] + + +def infer_compressor_from_model_config( + pretrained_model_name_or_path: str, +) -> Optional[ModelCompressor]: + """ + Given a path to a model config, extract a sparsity config if it exists and return + the associated ModelCompressor + + :param pretrained_model_name_or_path: path to model config on disk or HF hub + :return: matching compressor if config contains a sparsity config + """ + config = AutoConfig.from_pretrained(pretrained_model_name_or_path) + sparsity_config = getattr(config, SPARSITY_CONFIG_NAME, None) + if sparsity_config is None: + return None + + format = sparsity_config.get("format") + sparsity_config = CompressionConfig.load_from_registry(format, **sparsity_config) + compressor = ModelCompressor.load_from_registry(format, config=sparsity_config) + return compressor diff --git a/src/compressed_tensors/utils/safetensors_load.py b/src/compressed_tensors/utils/safetensors_load.py new file mode 100644 index 00000000..4d71482a --- /dev/null +++ b/src/compressed_tensors/utils/safetensors_load.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
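Usage sketch for `infer_compressor_from_model_config` above; the model stub is hypothetical, and dense checkpoints without a sparsity config simply return None:

```python
from compressed_tensors.utils import infer_compressor_from_model_config

compressor = infer_compressor_from_model_config("some-org/some-sparse-model")
if compressor is None:
    print("config carries no sparsity_config; nothing to decompress")
```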
+
+import json
+import os
+import re
+import struct
+from typing import Dict, List, Optional
+
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, cached_file
+
+
+__all__ = [
+    "get_safetensors_folder",
+    "get_safetensors_header",
+    "match_param_name",
+    "merge_names",
+    "get_weight_mappings",
+    "get_nested_weight_mappings",
+]
+
+
+def get_safetensors_folder(
+    pretrained_model_name_or_path: str, cache_dir: Optional[str] = None
+) -> str:
+    """
+    Given a Hugging Face stub or a local path, return the folder containing the
+    safetensors weight files
+
+    :param pretrained_model_name_or_path: local path to model or HF stub
+    :param cache_dir: optional cache dir to search through, if none is specified the
+    model will be searched for in the default TRANSFORMERS_CACHE
+    :return: local folder containing model data
+    """
+    if os.path.exists(pretrained_model_name_or_path):
+        # argument is a path to a local folder
+        return pretrained_model_name_or_path
+
+    safetensors_path = cached_file(
+        pretrained_model_name_or_path,
+        SAFE_WEIGHTS_NAME,
+        cache_dir=cache_dir,
+        _raise_exceptions_for_missing_entries=False,
+    )
+    index_path = cached_file(
+        pretrained_model_name_or_path,
+        SAFE_WEIGHTS_INDEX_NAME,
+        cache_dir=cache_dir,
+        _raise_exceptions_for_missing_entries=False,
+    )
+    if safetensors_path is not None:
+        # found a single cached safetensors file
+        return os.path.split(safetensors_path)[0]
+    if index_path is not None:
+        # found a cached safetensors weight index file
+        return os.path.split(index_path)[0]
+
+    # model weights could not be found locally or cached from HF Hub
+    raise ValueError(
+        "Could not locate safetensors weight or index file from "
+        f"{pretrained_model_name_or_path}."
+    )
+
+
+def get_safetensors_header(safetensors_path: str) -> Dict[str, str]:
+    """
+    Extracts the metadata from a safetensors file as JSON
+
+    :param safetensors_path: path to a safetensors file
+    :return: dictionary of metadata extracted from the safetensors file
+    """
+    with open(safetensors_path, "rb") as f:
+        # the first 8 bytes encode the length of the JSON header that follows
+        length_of_header = struct.unpack("<Q", f.read(8))[0]
+        header_data = f.read(length_of_header)
+        header = json.loads(header_data)
+
+    return header
+
+
+def match_param_name(full_name: str, param_name: str) -> Optional[str]:
+    """
+    Helper function extracting the uncompressed parameterized layer name from a
+    compressed name. Assumes the compressed name was merged using merge_names.
+
+    :param full_name: full name of parameter in compressed model
+    :param param_name: compression parameter name
+    :return: uncompressed name of the uncompressed parameterized layer, or None if
+    the parameter name does not match
+    """
+    pattern = r"^(.*)\." + param_name + r"$"
+    regex = re.findall(pattern, full_name)
+    if len(regex) == 0:
+        return None
+    return regex[0]
+
+
+def merge_names(parent_name: str, child_name: str) -> str:
+    """
+    Helper function for merging an uncompressed parameterized layer name with a
+    compression parameter. Names merged with this function can then be parsed by
+    match_param_name.
+
+    :param parent_name: uncompressed parameterized layer name
+    :param child_name: compression parameter name
+    :return: merged compressed name
+    """
+    return parent_name + "." + child_name
+
+
+def get_weight_mappings(model_path: str) -> Dict[str, str]:
+    """
+    Takes a path to a state dict saved in safetensors format and returns a mapping
+    from parameterized layer name to file location.
+
+    {
+        layer.weight.bitmask: file_location,
+        layer.weight.row_offsets: file_location,
+        layer.weight.shape: file_location,
+        layer.weight.compressed: file_location
+    }
+
+    This generalizes to cases where the model is split into multiple safetensors files
+
+    :param model_path: path to safetensors state dict, must contain either a single
+    safetensors file or multiple files with an index
+    :return: mapping of parameterized layer name to file location
+    """
+    safetensors_path = os.path.join(model_path, SAFE_WEIGHTS_NAME)
+    index_path = os.path.join(model_path, SAFE_WEIGHTS_INDEX_NAME)
+    if os.path.exists(safetensors_path):
+        # we have a single safetensors file to read
+        header = get_safetensors_header(safetensors_path)
+        for key in header.keys():
+            header[key] = SAFE_WEIGHTS_NAME
+        header.pop("__metadata__", None)
+    elif os.path.exists(index_path):
+        # we have multiple safetensors files, read from the index
+        with open(index_path, "r", encoding="utf-8") as f:
+            index = json.load(f)
+        header = index["weight_map"]
+    else:
+        raise ValueError(
+            f"Could not find a safetensors weight or index file at {model_path}"
+        )
+
+    # convert weight locations to full paths
+    for key, value in header.items():
+        header[key] = os.path.join(model_path, value)
+
+    return header
+
+
+def get_nested_weight_mappings(
+    model_path: str, params_to_nest: List[str]
+) -> Dict[str, Dict[str, str]]:
+    """
+    Takes a path to a state dict saved in safetensors format and returns a nested
+    mapping from uncompressed parameterized layer names to the file locations of each
+    of the layer's compression parameters.
+
+    layer.weight: {
+        bitmask: file_location,
+        row_offsets: file_location,
+        shape: file_location,
+        compressed: file_location
+    }
+
+    This generalizes to cases where the model is split into multiple safetensors files
+
+    :param model_path: path to safetensors state dict, must contain either a single
+    safetensors file or multiple files with an index
+    :param params_to_nest: compression parameter names to nest under each layer
+    :return: nested mapping of parameterized layer name to file location
+    """
+    weight_mappings = get_weight_mappings(model_path)
+
+    nested_weight_mappings = {}
+    for key in weight_mappings.keys():
+        for param_name in params_to_nest:
+            maybe_match = match_param_name(key, param_name)
+            if maybe_match is not None:
+                dense_param = maybe_match
+                if dense_param not in nested_weight_mappings:
+                    nested_weight_mappings[dense_param] = {}
+                nested_weight_mappings[dense_param][param_name] = weight_mappings[key]
+
+    return nested_weight_mappings
diff --git a/tests/compressed_tensors/quantization/observers/quantization/__init__.py b/tests/compressed_tensors/quantization/observers/quantization/__init__.py
new file mode 100644
index 00000000..0c44f887
--- /dev/null
+++ b/tests/compressed_tensors/quantization/observers/quantization/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
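
A minimal usage sketch of how the two mapping helpers above compose, assuming a hypothetical local checkpoint compressed with the bitmask parameters shown in the docstrings:

from compressed_tensors.utils.safetensors_load import (
    get_nested_weight_mappings,
    get_weight_mappings,
)

model_path = "/path/to/compressed-model"  # hypothetical local checkpoint folder

# flat view: "layer.weight.bitmask" -> full path of the safetensors file
weight_mappings = get_weight_mappings(model_path)

# nested view: "layer.weight" -> {"bitmask": path, "row_offsets": path, ...}
nested = get_nested_weight_mappings(
    model_path, params_to_nest=["bitmask", "row_offsets", "shape", "compressed"]
)
for dense_name, param_files in nested.items():
    print(dense_name, sorted(param_files))
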
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/__init__.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/__init__.py new file mode 100644 index 00000000..0c44f887 --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/conftest.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/conftest.py new file mode 100644 index 00000000..97bf8b0c --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/conftest.py @@ -0,0 +1,37 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional + +import pytest +from compressed_tensors.quantization.quant_args import QuantizationArgs +from compressed_tensors.quantization.quant_scheme import QuantizationScheme + + +@pytest.fixture +def create_quantization_scheme(): + def quantization_scheme( + targets: List[str], + weights: Optional[QuantizationArgs] = None, + input_activations: Optional[QuantizationArgs] = None, + output_activations: Optional[QuantizationArgs] = None, + ): + return QuantizationScheme( + targets=targets, + weights=weights, + input_activations=input_activations, + output_activations=output_activations, + ) + + return quantization_scheme diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py new file mode 100644 index 00000000..6a3d17af --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_apply.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from compressed_tensors.quantization.lifecycle import apply_quantization_config +from compressed_tensors.quantization.quant_config import ( + QuantizationConfig, + QuantizationStatus, +) +from transformers import AutoModelForCausalLM + + +def test_apply_quantization_config_tinyllama(): + quant_config = get_sample_tinyllama_quant_config() + model = get_tinyllama_model() + + # check that model is not already quantized + for module in model.modules(): + _test_layer_quantization_status(module, inputs=False, weights=False) + + # apply quant config to model + apply_quantization_config(model, quant_config) + + # check for correct application of quant config + num_linears = 0 + num_embeddings = 0 + num_rotary_embeddings = 0 + for name, module in model.named_modules(): + if name in quant_config.ignore: + continue + module_type = module.__class__.__name__ + if module_type == "Linear": + num_linears += 1 + _test_layer_quantization_status(module, inputs=True, weights=True) + elif module_type == "Embedding": + num_embeddings += 1 + _test_layer_quantization_status(module, inputs=False, weights=True) + elif module_type == "LlamaRotaryEmbedding": + num_rotary_embeddings += 1 + _test_layer_quantization_status(module, inputs=False, weights=False) + + # sanity check correct number of layers targeted + assert num_linears == 154 # 155 Linear layers - 1 that gets ignored + assert num_embeddings == 1 + assert num_rotary_embeddings == 22 + + +def test_serialize_config_tinyllama(): + quant_config = get_sample_tinyllama_quant_config() + model = get_tinyllama_model() + + # check that model is not already quantized + for module in model.modules(): + _test_layer_quantization_status(module, inputs=False, weights=False) + + # apply quant config to model + apply_quantization_config(model, quant_config) + + serialized_config = QuantizationConfig.from_pretrained(model) + assert len(serialized_config.config_groups) == 2 + assert serialized_config.config_groups["group_0"].targets == ["Embedding"] + assert serialized_config.config_groups["group_0"].input_activations is None + assert serialized_config.config_groups["group_1"].targets == ["Linear"] + assert serialized_config.config_groups["group_1"].input_activations is not None + assert serialized_config.quantization_status == QuantizationStatus.FROZEN + assert serialized_config.format == "fakequant" + assert serialized_config.quant_method == "sparseml" + assert serialized_config.ignore == ["model.layers.1.mlp.down_proj"] + assert serialized_config.global_compression_ratio > 1.0 + assert serialized_config.global_compression_ratio < 8.0 + + +def _test_layer_quantization_status(module, inputs: bool, weights: bool): + # check if quantization is applied at all (true if inputs or weights targeted) + quantized = inputs or weights + assert hasattr(module, "quantization_scheme") == quantized + assert hasattr(module, "quantization_status") == quantized + + # check inputs matches expected + assert hasattr(module, "input_scale") == inputs + assert hasattr(module, "input_zero_point") == inputs + + # check weights matches expected + assert hasattr(module, "weight_scale") == weights + assert hasattr(module, "weight_zero_point") == weights + + +def get_tinyllama_model(): + return AutoModelForCausalLM.from_pretrained( + "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" + ) + + +def get_sample_tinyllama_quant_config(): + config_dict = { + "quant_method": "sparseml", + "format": 
"fakequant", + "quantization_status": "frozen", + "global_compression_ratio": None, + "config_groups": { + "group_1": { + "weights": { + "num_bits": 8, + "type": "int", + "symmetric": True, + "strategy": "tensor", + }, + "input_activations": { + "num_bits": 8, + "type": "int", + "symmetric": True, + "strategy": "tensor", + }, + "targets": ["Linear"], + }, + "group_2": { + "weights": { + "num_bits": 8, + "type": "int", + "symmetric": False, + "strategy": "tensor", + }, + "input_activations": None, + "targets": ["Embedding"], + }, + }, + "ignore": ["LlamaRotaryEmbedding", "model.layers.1.mlp.down_proj"], + } + return QuantizationConfig.parse_obj(config_dict) diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_forward.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_forward.py new file mode 100644 index 00000000..00c95d16 --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_forward.py @@ -0,0 +1,82 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest +import torch +from compressed_tensors.quantization.lifecycle.forward import ( + maybe_calibrate_or_quantize, + wrap_module_forward_quantized, +) +from compressed_tensors.quantization.lifecycle.initialize import ( + initialize_module_for_quantization, +) +from compressed_tensors.quantization.quant_args import QuantizationArgs +from compressed_tensors.quantization.quant_config import QuantizationStatus +from torch.nn import Linear + + +def test_wrap_module_forward_quantized(create_quantization_scheme): + num_bits = 8 + quantization_scheme = create_quantization_scheme( + targets=["*"], + weights=QuantizationArgs(num_bits=num_bits, symmetric=True), + input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), + ) + layer = Linear(4, 4) + + func_forward = layer.forward.__func__ + + # check that the forward call is overwritten + wrap_module_forward_quantized(layer, quantization_scheme) + + assert not func_forward == layer.forward.__func__ + + +@pytest.mark.parametrize( + "quantization_status", ["initialized", "calibration", "frozen"] +) +def test_maybe_calibrate_or_quantize(create_quantization_scheme, quantization_status): + num_bits = 8 + quantization_scheme = create_quantization_scheme( + targets=["*"], + weights=QuantizationArgs(num_bits=num_bits, symmetric=True), + input_activations=QuantizationArgs(num_bits=num_bits, symmetric=True), + ) + quantization_args = QuantizationArgs(num_bits=num_bits, symmetric=True) + layer = Linear(4, 4) + layer.weight.data *= 100 + + initialize_module_for_quantization(layer, quantization_scheme) + layer.quantization_status = QuantizationStatus(quantization_status) + + # only calibration updates the scale and zero-point + if layer.quantization_status == QuantizationStatus.INITIALIZED: + out = maybe_calibrate_or_quantize( + layer, layer.weight.data, "input", quantization_args + ) + assert torch.allclose(out, 
layer.weight.data)
+    elif layer.quantization_status == QuantizationStatus.CALIBRATION:
+        out = maybe_calibrate_or_quantize(
+            layer, layer.weight.data, "input", quantization_args
+        )
+        assert torch.allclose(out, layer.weight.data, atol=0.2)
+
+    elif layer.quantization_status == QuantizationStatus.FROZEN:
+        # scale and zero points are empty -- cannot quantize
+        with pytest.raises(Exception):
+            out = maybe_calibrate_or_quantize(
+                layer, layer.weight.data, "input", quantization_args
+            )
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_frozen.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_frozen.py
new file mode 100644
index 00000000..056c6089
--- /dev/null
+++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_frozen.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quantization
+from compressed_tensors.quantization.lifecycle.initialize import (
+    initialize_module_for_quantization,
+)
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from torch.nn import Linear
+
+
+def test_freeze_module_quantization(create_quantization_scheme):
+    num_bits = 8
+    quantization_scheme = create_quantization_scheme(
+        targets=["*"],
+        weights=QuantizationArgs(num_bits=num_bits, symmetric=True),
+        input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False),
+    )
+
+    layer = Linear(4, 4)
+
+    initialize_module_for_quantization(layer, quantization_scheme)
+    layer.quantization_status = QuantizationStatus("calibration")
+
+    # should have both input and weight observer after initializing
+    assert hasattr(layer, "input_observer")
+    assert hasattr(layer, "weight_observer")
+
+    # observers should get deleted after freezing
+    freeze_module_quantization(layer)
+    assert not hasattr(layer, "input_observer")
+    assert not hasattr(layer, "weight_observer")
+
+    assert layer.quantization_status == QuantizationStatus("frozen")
diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_initialize.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_initialize.py
new file mode 100644
index 00000000..987b2ae2
--- /dev/null
+++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_initialize.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest +from compressed_tensors.quantization.lifecycle.initialize import ( + initialize_module_for_quantization, +) +from compressed_tensors.quantization.quant_args import QuantizationArgs +from compressed_tensors.quantization.quant_config import QuantizationStatus +from torch.nn import Linear + + +NUM_BITS = 8 + + +@pytest.mark.parametrize( + "weights,input_activations", + [ + ( + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + None, + ), + ( + None, + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + ), + ( + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + QuantizationArgs(num_bits=NUM_BITS, symmetric=True), + ), + ], +) +def test_initialize_module_for_quantization( + create_quantization_scheme, weights, input_activations +): + quantization_scheme = create_quantization_scheme( + targets=["*"], + weights=weights, + input_activations=input_activations, + ) + layer = Linear(4, 4) + + assert not hasattr(layer, "quantization_scheme") + assert not hasattr(layer, "quantization_status") + + # add attributes, zero_points and scale + initialize_module_for_quantization(layer, quantization_scheme) + + registered_params = {"weight", "bias"} + if weights is not None: + registered_params.add("weight_scale") + registered_params.add("weight_zero_point") + + if input_activations is not None: + registered_params.add("input_scale") + registered_params.add("input_zero_point") + + for key in layer.state_dict().keys(): + assert key in registered_params + registered_params.remove(key) + + assert len(registered_params) == 0 + + assert hasattr(layer, "quantization_scheme") + assert hasattr(layer, "quantization_status") + + assert layer.quantization_status == QuantizationStatus.INITIALIZED diff --git a/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_lifecycle.py b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_lifecycle.py new file mode 100644 index 00000000..44932778 --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/quantization/lifecycle/test_lifecycle.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
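+
+# This test drives a Linear module through the whole lifecycle: initialize
+# (scale/zero-point parameters registered, forward pass wrapped), calibration
+# (observers update scale and zero point on every forward pass), and frozen
+# (observers removed, quantization parameters no longer change).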
+
+from copy import deepcopy
+
+import torch
+from compressed_tensors.quantization.lifecycle.calibration import (
+    set_module_for_calibration,
+)
+from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quantization
+from compressed_tensors.quantization.lifecycle.initialize import (
+    initialize_module_for_quantization,
+)
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from torch.nn import Linear
+
+
+def test_lifecycle(create_quantization_scheme):
+    num_bits = 8
+
+    quantization_scheme = create_quantization_scheme(
+        input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False),
+        weights=QuantizationArgs(num_bits=num_bits, symmetric=True),
+        targets=["*"],
+    )
+
+    layer = Linear(4, 4)
+    layer.weight.data *= 100
+
+    # updated layer keys check
+    expected_layer_keys = {"weight", "bias"}
+    for key in layer.state_dict().keys():
+        expected_layer_keys.remove(key)
+    assert len(expected_layer_keys) == 0
+
+    # overwrite forward pass and register zero_point and scale
+    initialize_module_for_quantization(layer, quantization_scheme)
+    expected_layer_keys = {
+        "input_scale",
+        "input_zero_point",
+        "weight_scale",
+        "weight_zero_point",
+        "weight",
+        "bias",
+    }
+    for key in layer.state_dict().keys():
+        expected_layer_keys.remove(key)
+    assert len(expected_layer_keys) == 0
+
+    # should have both input and weight observer after initializing
+    assert hasattr(layer, "input_observer")
+    assert hasattr(layer, "weight_observer")
+
+    assert hasattr(layer, "quantization_scheme")
+    assert hasattr(layer, "quantization_status")
+    assert layer.quantization_status == QuantizationStatus.INITIALIZED
+
+    set_module_for_calibration(layer)
+    assert layer.quantization_status == QuantizationStatus.CALIBRATION
+
+    # do a calibration step
+    assert torch.numel(layer.input_zero_point.data) == 0
+    assert torch.numel(layer.input_scale) == 0
+    assert torch.numel(layer.weight_scale) == 0
+    assert torch.numel(layer.weight_zero_point) == 0
+
+    layer(torch.randn(4, 4))
+
+    # zero-points and scale should be updated after forward pass
+    assert torch.numel(layer.input_zero_point.data) > 0
+    assert torch.numel(layer.input_scale) > 0
+    assert torch.numel(layer.weight_scale) > 0
+    assert torch.numel(layer.weight_zero_point) > 0
+
+    # symmetric zero points should center at 0
+    assert layer.weight_zero_point.data == 0
+
+    # check high and low bound of the weights
+    assert torch.all(layer.weight.data >= -128) and torch.all(layer.weight.data <= 127)
+
+    initialized_layer = deepcopy(layer)
+
+    # calibrate the layers with each iteration
+    for _ in range(10):
+        layer(torch.randn(4, 4))
+
+    assert initialized_layer.input_zero_point != layer.input_zero_point
+    assert initialized_layer.input_scale != layer.input_scale
+    assert initialized_layer.weight_scale != layer.weight_scale
+
+    # check quantization f_q(x) is applied after frozen without update
+    input_check_for_quant = torch.randn(4, 4)
+    out_calibration = layer(input_check_for_quant)
+
+    layer_before_freeze = deepcopy(layer)
+
+    # Freeze, no update after any forward pass
+    freeze_module_quantization(layer)
+
+    for _ in range(10):
+        layer(torch.randn(4, 4))
+    assert layer_before_freeze.input_zero_point == layer.input_zero_point
+    assert layer_before_freeze.input_scale == layer.input_scale
+    assert layer_before_freeze.weight_scale == layer.weight_scale
+
+    # check that the same quantization is applied as calibration to frozen
+    assert torch.all(out_calibration == 
layer(input_check_for_quant)) diff --git a/tests/compressed_tensors/quantization/observers/quantization/test_quant_args.py b/tests/compressed_tensors/quantization/observers/quantization/test_quant_args.py new file mode 100644 index 00000000..c1c84be6 --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/quantization/test_quant_args.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from compressed_tensors.quantization import ( + QuantizationArgs, + QuantizationStrategy, + QuantizationType, +) +from pydantic import ValidationError + + +def test_defaults(): + default = QuantizationArgs() + + assert default.num_bits == 8 + assert default.type == QuantizationType.INT + assert default.symmetric + assert default.strategy == QuantizationStrategy.TENSOR + assert default.group_size is None + assert default.block_structure is None + + +def test_group(): + kwargs = {"strategy": "group", "group_size": 128} + + group = QuantizationArgs(**kwargs) + assert group.strategy == QuantizationStrategy.GROUP + assert group.group_size == kwargs["group_size"] + + +def test_block(): + kwargs = {"strategy": "block", "block_structure": "2x4"} + + block = QuantizationArgs(**kwargs) + assert block.strategy == QuantizationStrategy.BLOCK + assert block.block_structure == kwargs["block_structure"] + + +def test_invalid(): + with pytest.raises(ValidationError): + _ = QuantizationArgs(type="invalid") + with pytest.raises(ValidationError): + _ = QuantizationArgs(strategy="invalid") diff --git a/tests/compressed_tensors/quantization/observers/quantization/test_quant_config.py b/tests/compressed_tensors/quantization/observers/quantization/test_quant_config.py new file mode 100644 index 00000000..091be723 --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/quantization/test_quant_config.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+import pytest
+from compressed_tensors.quantization import (
+    QuantizationConfig,
+    QuantizationScheme,
+    QuantizationStatus,
+)
+from pydantic import ValidationError
+
+
+def test_basic_config():
+    config_groups = {"group_1": QuantizationScheme(targets=[])}
+    config = QuantizationConfig(config_groups=config_groups)
+
+    assert config.config_groups == config_groups
+    assert config.quant_method == "sparseml"
+    assert config.format == "fakequant"
+    assert config.quantization_status == QuantizationStatus.INITIALIZED
+    assert config.global_compression_ratio is None
+    assert isinstance(config.ignore, list) and len(config.ignore) == 0
+
+
+def test_full_config():
+    config_groups = {
+        "group_1": QuantizationScheme(targets=[]),
+        "group_2": QuantizationScheme(targets=[]),
+    }
+    global_compression_ratio = 3.5
+    ignore = ["model.layers.0"]
+    quantization_status = "compressed"
+
+    config = QuantizationConfig(
+        config_groups=config_groups,
+        global_compression_ratio=global_compression_ratio,
+        ignore=ignore,
+        quantization_status=quantization_status,
+    )
+    assert config.config_groups == config_groups
+    assert config.global_compression_ratio == global_compression_ratio
+    assert config.ignore == ignore
+    assert config.quantization_status == QuantizationStatus.COMPRESSED
+
+
+def test_need_config_groups():
+    with pytest.raises(ValidationError):
+        _ = QuantizationConfig()
diff --git a/tests/compressed_tensors/quantization/observers/quantization/test_quant_scheme.py b/tests/compressed_tensors/quantization/observers/quantization/test_quant_scheme.py
new file mode 100644
index 00000000..14ba9f7e
--- /dev/null
+++ b/tests/compressed_tensors/quantization/observers/quantization/test_quant_scheme.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
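+
+# A QuantizationScheme pairs a list of target module patterns with optional
+# weight, input-activation, and output-activation args; only `targets` is
+# required, as test_needs_targets checks below.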
+ +import pytest +from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme +from pydantic import ValidationError + + +def test_basic_scheme(): + targets = ["model.layer.0", "model.layer.3"] + weights = QuantizationArgs() + + scheme = QuantizationScheme(targets=targets, weights=weights) + assert scheme.targets == targets + assert scheme.weights == weights + assert scheme.input_activations is None + assert scheme.output_activations is None + + +def test_full_scheme(): + targets = ["Linear"] + weights = QuantizationArgs() + input_activations = QuantizationArgs(num_bits=4) + output_activations = QuantizationArgs(num_bits=8, type="float", symmetric=False) + + scheme = QuantizationScheme( + targets=targets, + weights=weights, + input_activations=input_activations, + output_activations=output_activations, + ) + assert scheme.targets == targets + assert scheme.weights == weights + assert scheme.input_activations == input_activations + assert scheme.output_activations == output_activations + + +def test_needs_targets(): + with pytest.raises(ValidationError): + _ = QuantizationScheme() diff --git a/tests/compressed_tensors/quantization/observers/test_min_max.py b/tests/compressed_tensors/quantization/observers/test_min_max.py new file mode 100644 index 00000000..a14866ef --- /dev/null +++ b/tests/compressed_tensors/quantization/observers/test_min_max.py @@ -0,0 +1,89 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
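+
+# The expected values below assume the observer returned by
+# QuantizationArgs.get_observer() keeps a running average of the observed min
+# and max, e.g. maxes of [1, 1, 127] average to (127 + 2) / 3 = 43, matching
+# the inline comments in test_min_max_observer_value_update.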
+
+
+import pytest
+import torch
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+
+
+@pytest.mark.parametrize(
+    "symmetric,expected_scale,expected_zero_point",
+    [
+        (True, 0.0078, 0),
+        (False, 0.0039, 0),
+    ],
+)
+def test_min_max_observer(symmetric, expected_scale, expected_zero_point):
+    tensor = torch.tensor([1, 1, 1, 1, 1])
+    num_bits = 8
+    weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric)
+
+    observer = weights.get_observer()
+    scale, zero_point = observer(tensor)
+
+    assert round(scale.item(), 4) == expected_scale
+    assert round(zero_point.item(), 4) == expected_zero_point
+
+
+def test_min_max_observer_symmetric_scale_range():
+    tensor = torch.rand(4, 4)
+    tensor *= 127
+
+    num_bits = 8
+    weights = QuantizationArgs(num_bits=num_bits, symmetric=True)
+
+    observer = weights.get_observer()
+    scale, zero_point = observer(tensor)
+
+    # if symmetric, max symmetric_range = abs(-128) / 255
+    assert round(scale.item(), 4) <= 1.0039
+    assert round(zero_point.item(), 4) == 0
+
+
+def test_min_max_observer_value_update():
+    inp = torch.tensor([1, 1, 1, 1, 1])
+    inp_update_max = torch.tensor([127, 1, 1, 1, 1])
+    inp_update_min = torch.tensor([-128, 1, 1, 1, 1])
+
+    # update the min and max twice total
+    tensors = [
+        inp,
+        inp,
+        inp_update_max, # update max
+        inp,
+        inp_update_min, # update min
+    ]
+
+    tensor = inp
+    num_bits = 8
+    weights = QuantizationArgs(num_bits=num_bits, symmetric=True)
+
+    observer = weights.get_observer()
+    curr_max = 1
+    curr_min = 1
+    for i, tensor in enumerate(tensors):
+        observer(tensor)
+        curr_max = max(observer.max_val, curr_max)
+        curr_min = min(observer.min_val, curr_min)
+
+        if i < 2:
+            assert curr_max == 1
+            assert curr_min == 1
+        elif i < 4:
+            assert curr_max == 43 # (127 + 2) / 3
+            assert curr_min == 1
+        else:
+            assert curr_max == 43
+            assert curr_min == -24.8 # (-128 + 4) / 5
From 2125b15a9d5b49d8c22515ef99ca5d9bc04aa20d Mon Sep 17 00:00:00 2001
From: George Ohashi
Date: Mon, 29 Apr 2024 13:50:00 +0000
Subject: [PATCH 09/10] fix

---
 setup.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/setup.py b/setup.py
index d111af24..896609ed 100644
--- a/setup.py
+++ b/setup.py
@@ -31,13 +31,8 @@ def _setup_extras() -> Dict:
     return {"dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3"]}
 
 setup(
-<<<<<<< HEAD
-    name="compressed_tensors",
-    version="0.3.0",
-=======
     name="compressed-tensors",
     version="0.3.1",
->>>>>>> main
     author="Neuralmagic, Inc.",
     author_email="support@neuralmagic.com",
     license="Apache 2.0",
From 8ea5b0d62f176c8816e96a68fd450a3799382bb9 Mon Sep 17 00:00:00 2001
From: George Ohashi
Date: Mon, 29 Apr 2024 13:57:04 +0000
Subject: [PATCH 10/10] fix

---
 README.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/README.md b/README.md
index 3c60a838..09851d18 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,4 @@
-<<<<<<< HEAD
 # compressed_tensors
-=======
-# compressed-tensors
 
 This repository extends a [safetensors](https://github.com/huggingface/safetensors) format to efficiently store sparse and/or quantized tensors on disk. `compressed-tensors` format supports multiple compression types to minimize the disk space and facilitate the tensor manipulation.
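
To make the config-inference helper added in src/compressed_tensors/utils/helpers.py concrete, a minimal usage sketch; the model stub is hypothetical, and a sparsity config entry is assumed to be present in its config.json (the helper returns None when there is none):

from compressed_tensors.utils import infer_compressor_from_model_config

# hypothetical stub; any local path or HF model id with a config.json works
compressor = infer_compressor_from_model_config("org/sparse-model-stub")
if compressor is None:
    print("no sparsity config found; nothing to decompress")
else:
    print(f"inferred compressor: {type(compressor).__name__}")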