From 20dce6923aa038e45f3d5a643e06e950ddda5af5 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Fri, 19 Apr 2024 09:30:15 +0000 Subject: [PATCH 01/13] initial commit --- src/compressed_tensors/compressors/base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/compressed_tensors/compressors/base.py b/src/compressed_tensors/compressors/base.py index 9c205f93..f92f85fa 100644 --- a/src/compressed_tensors/compressors/base.py +++ b/src/compressed_tensors/compressors/base.py @@ -71,3 +71,7 @@ def overwrite_weights(self, model_path: str, model: Module): data_old.data = data_new.data setattr(model, SPARSITY_CONFIG_NAME, self.config) + + +def dummy_test(*args, **kwargs): + pass From 71a5627651e03c618f29affffc8c3db460c84d2e Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Fri, 19 Apr 2024 09:41:08 +0000 Subject: [PATCH 02/13] is this a version problem --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6ab8dc09..8bb3a535 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ def _setup_extras() -> Dict: setup( name="compressed-tensors", - version="0.3.0", + #version="0.3.0", author="Neuralmagic, Inc.", author_email="support@neuralmagic.com", description="Library for utilization of compressed safetensors of neural network models", From bff9cfe7f0358d37cf20b7eeadea4bf7ed908392 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Fri, 19 Apr 2024 09:46:31 +0000 Subject: [PATCH 03/13] or wrong find_packages logic --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 8bb3a535..225d7b8d 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ def _setup_packages() -> List: return find_packages( - "src", include=["compressed-tensors", "compressed-tensors.*"], exclude=["*.__pycache__.*"] + "src", include=["compressed_tensors", "compressed_tensors.*"], exclude=["*.__pycache__.*"] ) def _setup_install_requires() -> List: @@ -29,7 +29,7 @@ def _setup_extras() -> Dict: setup( name="compressed-tensors", - #version="0.3.0", + version="0.3.0", author="Neuralmagic, Inc.", author_email="support@neuralmagic.com", description="Library for utilization of compressed safetensors of neural network models", From 4ac3f5f766cb313772f1b66b473be254a93cbb48 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Fri, 19 Apr 2024 09:50:17 +0000 Subject: [PATCH 04/13] all_right --- src/compressed_tensors/compressors/base.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/compressed_tensors/compressors/base.py b/src/compressed_tensors/compressors/base.py index f92f85fa..9c205f93 100644 --- a/src/compressed_tensors/compressors/base.py +++ b/src/compressed_tensors/compressors/base.py @@ -71,7 +71,3 @@ def overwrite_weights(self, model_path: str, model: Module): data_old.data = data_new.data setattr(model, SPARSITY_CONFIG_NAME, self.config) - - -def dummy_test(*args, **kwargs): - pass From b614d9fc67b5a4ad833b63a4432ddd67097a61e1 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Fri, 19 Apr 2024 12:06:15 +0000 Subject: [PATCH 05/13] initial commit --- README.md | 99 ++++++++++++++++++++++ src/compressed_tensors/base.py | 2 +- src/compressed_tensors/compressors/base.py | 4 +- src/compressed_tensors/utils/helpers.py | 56 ++++++++++-- tests/quantization/lifecycle/test_apply.py | 1 - tests/test_utils/test_helpers.py | 79 +++++++++++++++++ 6 files changed, 232 insertions(+), 9 deletions(-) create mode 100644 tests/test_utils/test_helpers.py diff --git a/README.md b/README.md index 05fa83a3..fa3b812e 100644 --- a/README.md +++ 
b/README.md @@ -1 +1,100 @@ # compressed-tensors + +This repository extends a [safetensors](https://github.com/huggingface/safetensors) format to efficiently store sparse and/or quantized tensors on disk. `compressed-tensors` format supports multiple compression types to minimize the disk space and facilitate the tensor manipulation. + +## Motivation + +### Reduce disk space by saving sparse tensors in a compressed format + +The compressed format stores the data much more efficiently by taking advantage of two properties of tensors: + +- Sparse tensors -> due to a large number of entries that are equal to zero. +- Quantized -> due to their low precision representation. + + +### Introduce an elegant interface to save/load compressed tensors + +The library provides the user with the ability to compress/decompress tensors. The properties of tensors are defined by human-readable configs, allowing the users to understand the compression format at a quick glance. + +## Installation + +### Pip + +```bash +pip install compressed-tensors +``` + +### From source + +```bash +git clone https://github.com/neuralmagic/compressed-tensors +cd compressed-tensors +pip install -e . +``` + +## Getting started + +### Saving + +The function `save_compressed` returns an optional `compression_config` (if compression has been applied). It can be used to inspect the applied compression. + +```python +from compressed_tensors import save_compressed +from torch import Tensor + +tensors: Dict[str, Tensor] = ... +compression_config: Dict = save_compressed(tensors, "model.safetensors") + + +``` + +### Loading + +```python +from compressed_tensors import load_compressed +from torch import Tensor + +tensors: Dict[str, Tensor] = load_compressed("model.safetensors", device="cpu") +``` + +## Benefits +TODO + +## SafeTensors File Format + +For each parameter in the uncompressed state_dict, we store the following attributes needed for decompression in the compressed state_dict: + +- Compressed tensor +- Bitmask +- Uncompressed shape +- Row offsets + +```python +# Dense +{ + PARAM_NAME: uncompressed_tensor +} + +# Compressed +{ + PARAM_NAME.compressed: compressed_tensor, # 1d tensor + PARAM_NAME.bitmask: value, # 2d bitmask tensor (nrows x (ncols / 8)) + PARAM_NAME.shape: value, # Uncompressed shape tensor + PARAM_NAME.row_offsets: value # 1d offsets tensor +} +``` + +The library provides pathways to automatically add the config information to the HF config file. + +```json +// config.json +{ + "sparsity_config": { + "format": "sparse_bitmask", // "dense_sparsity" for the original tensor format + + // Informational + "sparsity_structure": "unstructured", // Or 2:4, 8:16, etc. + "global_sparsity": "0.5" + } +} +``` \ No newline at end of file diff --git a/src/compressed_tensors/base.py b/src/compressed_tensors/base.py index f01a055f..964b3087 100644 --- a/src/compressed_tensors/base.py +++ b/src/compressed_tensors/base.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-SPARSITY_CONFIG_NAME = "sparsity_config" +CONFIG_NAME = "compression_config" diff --git a/src/compressed_tensors/compressors/base.py b/src/compressed_tensors/compressors/base.py index 9c205f93..aed54069 100644 --- a/src/compressed_tensors/compressors/base.py +++ b/src/compressed_tensors/compressors/base.py @@ -15,7 +15,7 @@ import operator from typing import Dict, Generator, Tuple -from compressed_tensors.base import SPARSITY_CONFIG_NAME +from compressed_tensors.base import CONFIG_NAME from compressed_tensors.config import CompressionConfig from compressed_tensors.registry import RegistryMixin from torch import Tensor @@ -70,4 +70,4 @@ def overwrite_weights(self, model_path: str, model: Module): data_old = operator.attrgetter(name)(model) data_old.data = data_new.data - setattr(model, SPARSITY_CONFIG_NAME, self.config) + setattr(model, CONFIG_NAME, self.config) diff --git a/src/compressed_tensors/utils/helpers.py b/src/compressed_tensors/utils/helpers.py index ac9ed229..e776f9f7 100644 --- a/src/compressed_tensors/utils/helpers.py +++ b/src/compressed_tensors/utils/helpers.py @@ -12,16 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path +from typing import Dict, Optional, Union -from typing import Optional - -from compressed_tensors.base import SPARSITY_CONFIG_NAME +from compressed_tensors.base import CONFIG_NAME from compressed_tensors.compressors import ModelCompressor from compressed_tensors.config import CompressionConfig +from safetensors.torch import save_file +from torch import Tensor from transformers import AutoConfig -__all__ = ["infer_compressor_from_model_config"] +__all__ = ["infer_compressor_from_model_config", "load_compressed", "save_compressed"] def infer_compressor_from_model_config( @@ -35,7 +37,7 @@ def infer_compressor_from_model_config( :return: matching compressor if config contains a sparsity config """ config = AutoConfig.from_pretrained(pretrained_model_name_or_path) - sparsity_config = getattr(config, SPARSITY_CONFIG_NAME, None) + sparsity_config = getattr(config, CONFIG_NAME, None) if sparsity_config is None: return None @@ -43,3 +45,47 @@ def infer_compressor_from_model_config( sparsity_config = CompressionConfig.load_from_registry(format, **sparsity_config) compressor = ModelCompressor.load_from_registry(format, config=sparsity_config) return compressor + + +def save_compressed( + tensors: Dict[str, Tensor], + save_path: Union[str, Path], + compression_config: Optional[CompressionConfig] = None, +) -> Optional[CompressionConfig]: + """ + Save compressed tensors to disk. If tensors are not compressed, + save them as is. + + :param tensors: dictionary of tensors to compress + :param save_path: path to save compressed tensors + :param compression_config: compression config to use for compressing tensors. 
+ Can be either inferred from tensors or provided explicitly + :return: compression config, if tensors were compressed - None otherwise + """ + if tensors is None or len(tensors) == 0: + raise ValueError("No tensors or empty tensors provided to compress") + + # create compression config if not provided + # TODO: Not implemented, need to get this in ASAP + # compression_config = compression_config or infer_compression_config(tensors) + + if compression_config is None: + # no compression applied + save_file(tensors, save_path) + return None + + # compress + compression_format = compression_config.format + compressor = ModelCompressor.load_from_registry( + compression_format, config=compression_config + ) + # save compressed tensors + compressed_tensors = compressor.compress(tensors) + save_file(compressed_tensors, save_path) + + # return compression_config as dict + return {CONFIG_NAME: compression_config.model_dump(exclude_unset=True)} + + +def load_compressed(compressed_tensors: Union[str, Path], device: str): + pass diff --git a/tests/quantization/lifecycle/test_apply.py b/tests/quantization/lifecycle/test_apply.py index 6a3d17af..7e4e0f77 100644 --- a/tests/quantization/lifecycle/test_apply.py +++ b/tests/quantization/lifecycle/test_apply.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - from compressed_tensors.quantization.lifecycle import apply_quantization_config from compressed_tensors.quantization.quant_config import ( QuantizationConfig, diff --git a/tests/test_utils/test_helpers.py b/tests/test_utils/test_helpers.py new file mode 100644 index 00000000..d4d91e97 --- /dev/null +++ b/tests/test_utils/test_helpers.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import torch +from compressed_tensors import save_compressed +from compressed_tensors.config import BitmaskConfig + + +@pytest.fixture +def tensors_and_config_sparse(): + tensors = {"tensor_1": torch.Tensor([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])} + expected_config_json = { + "compression_config": { + "format": "sparse_bitmask", + "global_sparsity": ( + tensors["tensor_1"].sum() / tensors["tensor_1"].numel() + ).item(), + "sparsity_structure": "unstructured", + } + } + return tensors, expected_config_json + + +@pytest.fixture +def tensors_dense(): + tensors = {"tensor_1": torch.Tensor([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]])} + return tensors + + +def test_save_compressed_sparse(tmp_path, tensors_and_config_sparse): + tensors, expected_config_json = tensors_and_config_sparse + + config_json = save_compressed( + tensors, + compression_config=BitmaskConfig( + format=expected_config_json["compression_config"]["format"], + global_sparsity=expected_config_json["compression_config"][ + "global_sparsity" + ], + sparsity_structure=expected_config_json["compression_config"][ + "sparsity_structure" + ], + ), + save_path=tmp_path / "model.safetensors", + ) + assert (tmp_path / "model.safetensors").exists() + assert config_json == expected_config_json + + +def test_save_compressed_dense(tmp_path, tensors_dense): + tensors = tensors_dense + + config_json = save_compressed( + tensors, + save_path=tmp_path / "model.safetensors", + ) + assert (tmp_path / "model.safetensors").exists() + assert config_json is None + + +def test_save_compressed_empty(): + # make sure function raises error + with pytest.raises(Exception): + save_compressed({}, "") + + with pytest.raises(Exception): + save_compressed(None, "") From e394eb2a4621dc5d8823280a6f2452466104a77e Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Fri, 19 Apr 2024 13:53:50 +0000 Subject: [PATCH 06/13] add load_compress func --- README.md | 2 - src/compressed_tensors/compressors/base.py | 12 +++-- src/compressed_tensors/compressors/dense.py | 4 +- .../compressors/sparse_bitmask.py | 16 +++--- src/compressed_tensors/utils/helpers.py | 40 ++++++++++++++- .../utils/safetensors_load.py | 51 +++++++++++-------- tests/quantization/lifecycle/test_apply.py | 1 + tests/test_utils/test_helpers.py | 25 +++++---- 8 files changed, 106 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index fa3b812e..294c6a2f 100644 --- a/README.md +++ b/README.md @@ -44,8 +44,6 @@ from torch import Tensor tensors: Dict[str, Tensor] = ... 
compression_config: Dict = save_compressed(tensors, "model.safetensors") - - ``` ### Loading diff --git a/src/compressed_tensors/compressors/base.py b/src/compressed_tensors/compressors/base.py index aed54069..9d7033d3 100644 --- a/src/compressed_tensors/compressors/base.py +++ b/src/compressed_tensors/compressors/base.py @@ -45,12 +45,16 @@ def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: """ raise NotImplementedError() - def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]: + def decompress( + self, path_to_model_or_tensors: str + ) -> Generator[Tuple[str, Tensor], None, None]: """ - Reads a compressed state dict located at model_path and returns a - generator for sequentially decompressing back to a dense state dict + Reads a compressed state dict located at path_to_model_or_tensors + and returns a generator for sequentially decompressing back to a + dense state dict - :param model_path: path to compressed safetensors model + :param path_to_model_or_tensors: path to compressed safetensors model (directory with + one or more safetensors files) or compressed tensors file :return: generator of the decompressed state dict """ raise NotImplementedError() diff --git a/src/compressed_tensors/compressors/dense.py b/src/compressed_tensors/compressors/dense.py index 6e8785bc..2fa1603c 100644 --- a/src/compressed_tensors/compressors/dense.py +++ b/src/compressed_tensors/compressors/dense.py @@ -27,5 +27,7 @@ class DenseCompressor(ModelCompressor): def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: return model_state - def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]: + def decompress( + self, path_to_model_or_tensors: str + ) -> Generator[Tuple[str, Tensor], None, None]: return iter([]) diff --git a/src/compressed_tensors/compressors/sparse_bitmask.py b/src/compressed_tensors/compressors/sparse_bitmask.py index f6f03f0b..cb19b633 100644 --- a/src/compressed_tensors/compressors/sparse_bitmask.py +++ b/src/compressed_tensors/compressors/sparse_bitmask.py @@ -70,22 +70,26 @@ def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: return compressed_dict - def decompress(self, model_path: str) -> Generator[Tuple[str, Tensor], None, None]: + def decompress( + self, path_to_model_or_tensors: str, device: str = "cpu" + ) -> Generator[Tuple[str, Tensor], None, None]: """ - Reads a bitmask compressed state dict located at model_path and returns a - generator for sequentially decompressing back to a dense state dict + Reads a bitmask compressed state dict located at path_to_model_or_tensors + and returns a generator for sequentially decompressing back to a dense state dict - :param model_path: path to compressed safetensors model + :param path_to_model_or_tensors: path to compressed safetensors model (directory with + one or more safetensors files) or compressed tensors file + :param device: device to load decompressed weights onto :return: iterator for generating decompressed weights """ weight_mappings = get_nested_weight_mappings( - model_path, self.COMPRESSION_PARAM_NAMES + path_to_model_or_tensors, self.COMPRESSION_PARAM_NAMES ) for weight_name in weight_mappings.keys(): weight_data = {} for param_name, safe_path in weight_mappings[weight_name].items(): full_name = merge_names(weight_name, param_name) - with safe_open(safe_path, framework="pt", device="cpu") as f: + with safe_open(safe_path, framework="pt", device=device) as f: weight_data[param_name] = f.get_tensor(full_name) data = BitmaskTensor(**weight_data) decompressed
= data.decompress() diff --git a/src/compressed_tensors/utils/helpers.py b/src/compressed_tensors/utils/helpers.py index e776f9f7..8de3c233 100644 --- a/src/compressed_tensors/utils/helpers.py +++ b/src/compressed_tensors/utils/helpers.py @@ -18,6 +18,7 @@ from compressed_tensors.base import CONFIG_NAME from compressed_tensors.compressors import ModelCompressor from compressed_tensors.config import CompressionConfig +from safetensors import safe_open from safetensors.torch import save_file from torch import Tensor from transformers import AutoConfig @@ -87,5 +88,40 @@ def save_compressed( return {CONFIG_NAME: compression_config.model_dump(exclude_unset=True)} -def load_compressed(compressed_tensors: Union[str, Path], device: str): - pass +def load_compressed( + compressed_tensors: Union[str, Path], + compression_config: Optional[CompressionConfig] = None, + device: Optional[str] = "cpu", +) -> Dict[str, Tensor]: + """ + Load compressed tensors from disk. If tensors are not compressed, + load them as is. + + :param compressed_tensors: path to compressed tensors + :param compression_config: compression config to use for decompressing tensors. + Can be either inferred from tensors or provided explicitly. + :param device: device to move tensors to. If None, tensors are loaded on CPU. + :return decompressed tensors + """ + + if compressed_tensors is None or not Path(compressed_tensors).exists(): + raise ValueError("No compressed tensors provided to load") + + # create compression config if not provided + # TODO: Not implemented, need to get this in ASAP + # compression_config = compression_config or infer_compression_config(tensors) + + if compression_config is None: + # no compression applied + tensors = {} + with safe_open(compressed_tensors, framework="pt", device="cpu") as f: + for key in f.keys(): + tensors[key] = f.get_tensor(key) + return tensors + + # decompress + compression_format = compression_config.format + compressor = ModelCompressor.load_from_registry( + compression_format, config=compression_config + ) + return dict(compressor.decompress(compressed_tensors)) diff --git a/src/compressed_tensors/utils/safetensors_load.py b/src/compressed_tensors/utils/safetensors_load.py index 4d71482a..64abc137 100644 --- a/src/compressed_tensors/utils/safetensors_load.py +++ b/src/compressed_tensors/utils/safetensors_load.py @@ -117,7 +117,7 @@ def merge_names(parent_name: str, child_name: str) -> str: return parent_name + "." + child_name -def get_weight_mappings(model_path: str) -> Dict[str, str]: +def get_weight_mappings(path_to_model_or_tensors: str) -> Dict[str, str]: """ Takes a path to a state dict saved in safetensors format and returns a mapping from parameterized layer name to file location. 
@@ -131,31 +131,42 @@ def get_weight_mappings(model_path: str) -> Dict[str, str]: This generalizes to cases where the model is split into multiple safetensors files - :param model_path: path to safetensors state dict, must contain either a single - safetensors file or multiple files with an index + :param path_to_model_or_tensors: path to directory that contains + safetensors (must contain either a single file or multiple files with an index), + or a path to a single safetensors file :return: mapping of parameterized layer name to file location """ - safetensors_path = os.path.join(model_path, SAFE_WEIGHTS_NAME) - index_path = os.path.join(model_path, SAFE_WEIGHTS_INDEX_NAME) - if os.path.exists(safetensors_path): + + if os.path.isfile(path_to_model_or_tensors): # we have a single safetensors file to read - header = get_safetensors_header(safetensors_path) + header = get_safetensors_header(path_to_model_or_tensors) for key in header.keys(): - header[key] = SAFE_WEIGHTS_NAME + header[key] = path_to_model_or_tensors header.pop("__metadata__", None) - elif os.path.exists(index_path): - # we have multiple safetensors file, read from index - with open(index_path, "r", encoding="utf-8") as f: - index = json.load(f) - header = index["weight_map"] else: - raise ValueError( - f"Could not find a safetensors weight or index file at {model_path}" - ) - - # convert weight locations to full paths - for key, value in header.items(): - header[key] = os.path.join(model_path, value) + # we have a directory with multiple safetensors files + safetensors_path = os.path.join(path_to_model_or_tensors, SAFE_WEIGHTS_NAME) + index_path = os.path.join(path_to_model_or_tensors, SAFE_WEIGHTS_INDEX_NAME) + if os.path.exists(safetensors_path): + # we have a single safetensors file to read + header = get_safetensors_header(safetensors_path) + for key in header.keys(): + header[key] = SAFE_WEIGHTS_NAME + header.pop("__metadata__", None) + elif os.path.exists(index_path): + # we have multiple safetensors file, read from index + with open(index_path, "r", encoding="utf-8") as f: + index = json.load(f) + header = index["weight_map"] + else: + raise ValueError( + "Could not find a safetensors weight " + f"or index file at {path_to_model_or_tensors}" + ) + + # convert weight locations to full paths + for key, value in header.items(): + header[key] = os.path.join(path_to_model_or_tensors, value) return header diff --git a/tests/quantization/lifecycle/test_apply.py b/tests/quantization/lifecycle/test_apply.py index 7e4e0f77..6a3d17af 100644 --- a/tests/quantization/lifecycle/test_apply.py +++ b/tests/quantization/lifecycle/test_apply.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ from compressed_tensors.quantization.lifecycle import apply_quantization_config from compressed_tensors.quantization.quant_config import ( QuantizationConfig, diff --git a/tests/test_utils/test_helpers.py b/tests/test_utils/test_helpers.py index d4d91e97..2770604e 100644 --- a/tests/test_utils/test_helpers.py +++ b/tests/test_utils/test_helpers.py @@ -14,7 +14,7 @@ import pytest import torch -from compressed_tensors import save_compressed +from compressed_tensors import load_compressed, save_compressed from compressed_tensors.config import BitmaskConfig @@ -44,15 +44,7 @@ def test_save_compressed_sparse(tmp_path, tensors_and_config_sparse): config_json = save_compressed( tensors, - compression_config=BitmaskConfig( - format=expected_config_json["compression_config"]["format"], - global_sparsity=expected_config_json["compression_config"][ - "global_sparsity" - ], - sparsity_structure=expected_config_json["compression_config"][ - "sparsity_structure" - ], - ), + compression_config=BitmaskConfig(**expected_config_json["compression_config"]), save_path=tmp_path / "model.safetensors", ) assert (tmp_path / "model.safetensors").exists() @@ -77,3 +69,16 @@ def test_save_compressed_empty(): with pytest.raises(Exception): save_compressed(None, "") + + +def test_load_compressed_sparse(tmp_path, tensors_and_config_sparse): + tensors, expected_config_json = tensors_and_config_sparse + compression_config = BitmaskConfig(**expected_config_json["compression_config"]) + save_compressed( + tensors, + compression_config=compression_config, + save_path=tmp_path / "model.safetensors", + ) + loaded_tensors = load_compressed(tmp_path / "model.safetensors", compression_config) + for key in tensors: + assert torch.allclose(tensors[key], loaded_tensors[key]) From 26192e960f3ad126396918c8812519d4f0a18ca2 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Fri, 19 Apr 2024 16:09:24 +0200 Subject: [PATCH 07/13] More tests (loading dense tensors) --- tests/test_utils/test_helpers.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_utils/test_helpers.py b/tests/test_utils/test_helpers.py index 2770604e..54b6b1f9 100644 --- a/tests/test_utils/test_helpers.py +++ b/tests/test_utils/test_helpers.py @@ -82,3 +82,13 @@ def test_load_compressed_sparse(tmp_path, tensors_and_config_sparse): loaded_tensors = load_compressed(tmp_path / "model.safetensors", compression_config) for key in tensors: assert torch.allclose(tensors[key], loaded_tensors[key]) + + +def test_load_compressed_dense(tmp_path, tensors_dense): + save_compressed( + tensors_dense, + save_path=tmp_path / "model.safetensors", + ) + loaded_tensors = load_compressed(tmp_path / "model.safetensors") + for key in tensors_dense: + assert torch.allclose(tensors_dense[key], loaded_tensors[key]) From a5cfaa1b621360aeda5a226bbfb8a6da6c5033ca Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Fri, 19 Apr 2024 15:54:16 +0000 Subject: [PATCH 08/13] simplify UX --- README.md | 27 ++++--- src/compressed_tensors/README.md | 2 +- src/compressed_tensors/compressors/base.py | 4 +- src/compressed_tensors/compressors/dense.py | 3 +- .../compressors/sparse_bitmask.py | 3 +- src/compressed_tensors/config/base.py | 8 +- src/compressed_tensors/config/dense.py | 6 +- .../config/sparse_bitmask.py | 6 +- src/compressed_tensors/utils/helpers.py | 45 +++++------ tests/test_registry.py | 10 ++- tests/test_utils/test_helpers.py | 78 +++++++++++-------- 11 files changed, 107 insertions(+), 85 deletions(-) diff --git a/README.md 
b/README.md index 294c6a2f..cf292014 100644 --- a/README.md +++ b/README.md @@ -36,23 +36,28 @@ pip install -e . ### Saving -The function `save_compressed` returns an optional `compression_config` (if compression has been applied). It can be used to inspect the applied compression. +The function `save_compressed` uses the `compression_format` argument to apply compression to tensors. +The function `load_compressed` reverses the process: converts the compressed weights on disk to decompressed weights in device memory. ```python -from compressed_tensors import save_compressed +from compressed_tensors import save_compressed, load_compressed, BitmaskConfig from torch import Tensor +from typing import Dict -tensors: Dict[str, Tensor] = ... -compression_config: Dict = save_compressed(tensors, "model.safetensors") -``` +# the example BitmaskConfig compression format efficiently compresses +# tensors with a large number of zero entries +compression_config = BitmaskConfig() -### Loading -```python -from compressed_tensors import load_compressed -from torch import Tensor +tensors: Dict[str, Tensor] = {"tensor_1": Tensor( + [[0.0, 0.0, 0.0], + [1.0, 1.0, 1.0]] +)} +# compress tensors using BitmaskConfig compression format (save them efficiently on disk) +save_compressed(tensors, "model.safetensors", compression_format=compression_config.format) -tensors: Dict[str, Tensor] = load_compressed("model.safetensors", device="cpu") +# decompress tensors (load the uncompressed representation to device memory) +tensors = load_compressed("model.safetensors", device="cpu", compression_config = compression_config) ``` ## Benefits TODO @@ -87,7 +92,7 @@ The library provides pathways to automatically add the config information to the ```json // config.json { - "sparsity_config": { + "compression_config": { "format": "sparse_bitmask", // "dense_sparsity" for the original tensor format // Informational diff --git a/src/compressed_tensors/README.md b/src/compressed_tensors/README.md index 5b1c8ece..107eca65 100644 --- a/src/compressed_tensors/README.md +++ b/src/compressed_tensors/README.md @@ -34,7 +34,7 @@ Config information gets stored in the HF config file ```json // config.json { - "sparsity_config": { + "compression_config": { "format": "sparse_bitmask", // "dense_sparsity" for original tensor format // informational diff --git a/src/compressed_tensors/compressors/base.py b/src/compressed_tensors/compressors/base.py index 9d7033d3..50c34da8 100644 --- a/src/compressed_tensors/compressors/base.py +++ b/src/compressed_tensors/compressors/base.py @@ -13,7 +13,7 @@ # limitations under the License.
import operator -from typing import Dict, Generator, Tuple +from typing import Dict, Generator, Optional, Tuple from compressed_tensors.base import CONFIG_NAME from compressed_tensors.config import CompressionConfig @@ -33,7 +33,7 @@ class ModelCompressor(RegistryMixin): :param config: config specifying compression parameters """ - def __init__(self, config: CompressionConfig): + def __init__(self, config: Optional[CompressionConfig] = None): self.config = config def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: diff --git a/src/compressed_tensors/compressors/dense.py b/src/compressed_tensors/compressors/dense.py index 2fa1603c..97a62a81 100644 --- a/src/compressed_tensors/compressors/dense.py +++ b/src/compressed_tensors/compressors/dense.py @@ -15,10 +15,11 @@ from typing import Dict, Generator, Tuple from compressed_tensors.compressors import ModelCompressor +from compressed_tensors.config import CompressionFormat from torch import Tensor -@ModelCompressor.register(name="dense_sparsity") +@ModelCompressor.register(name=CompressionFormat.dense_sparsity.value) class DenseCompressor(ModelCompressor): """ Identity compressor for dense models, returns the original state_dict diff --git a/src/compressed_tensors/compressors/sparse_bitmask.py b/src/compressed_tensors/compressors/sparse_bitmask.py index cb19b633..4a946fb9 100644 --- a/src/compressed_tensors/compressors/sparse_bitmask.py +++ b/src/compressed_tensors/compressors/sparse_bitmask.py @@ -18,6 +18,7 @@ import numpy import torch from compressed_tensors.compressors import ModelCompressor +from compressed_tensors.config import CompressionFormat from compressed_tensors.utils import get_nested_weight_mappings, merge_names from safetensors import safe_open from torch import Tensor @@ -36,7 +37,7 @@ _LOGGER: logging.Logger = logging.getLogger(__name__) -@ModelCompressor.register(name="sparse_bitmask") +@ModelCompressor.register(name=CompressionFormat.sparse_bitmask.value) class BitmaskCompressor(ModelCompressor): """ Compression for sparse models using bitmasks. Non-zero weights are stored in a 1d diff --git a/src/compressed_tensors/config/base.py b/src/compressed_tensors/config/base.py index f58b11f8..96778995 100644 --- a/src/compressed_tensors/config/base.py +++ b/src/compressed_tensors/config/base.py @@ -12,13 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from enum import Enum from typing import Optional from compressed_tensors.registry import RegistryMixin from pydantic import BaseModel -__all__ = ["CompressionConfig"] +__all__ = ["CompressionConfig", "CompressionFormat"] + + +class CompressionFormat(Enum): + dense_sparsity = "dense-sparsity" + sparse_bitmask = "sparse-bitmask" class CompressionConfig(RegistryMixin, BaseModel): diff --git a/src/compressed_tensors/config/dense.py b/src/compressed_tensors/config/dense.py index aa23220c..0a18309e 100644 --- a/src/compressed_tensors/config/dense.py +++ b/src/compressed_tensors/config/dense.py @@ -14,13 +14,13 @@ from typing import Optional -from compressed_tensors.config import CompressionConfig +from compressed_tensors.config import CompressionConfig, CompressionFormat __all__ = ["DenseSparsityConfig"] -@CompressionConfig.register(name="dense_sparsity") +@CompressionConfig.register(name=CompressionFormat.dense_sparsity.value) class DenseSparsityConfig(CompressionConfig): """ Identity configuration for storing a sparse model in @@ -31,6 +31,6 @@ class DenseSparsityConfig(CompressionConfig): "unstructured", "2:4", "8:16" etc """ - format: str = "dense_sparsity" + format: str = CompressionFormat.dense_sparsity.value global_sparsity: Optional[float] = 0.0 sparsity_structure: Optional[str] = "unstructured" diff --git a/src/compressed_tensors/config/sparse_bitmask.py b/src/compressed_tensors/config/sparse_bitmask.py index 9b9cf211..9d2015f3 100644 --- a/src/compressed_tensors/config/sparse_bitmask.py +++ b/src/compressed_tensors/config/sparse_bitmask.py @@ -14,13 +14,13 @@ from typing import Optional -from compressed_tensors.config.base import CompressionConfig +from compressed_tensors.config import CompressionConfig, CompressionFormat __all__ = ["BitmaskConfig"] -@CompressionConfig.register(name="sparse_bitmask") +@CompressionConfig.register(name=CompressionFormat.sparse_bitmask.value) class BitmaskConfig(CompressionConfig): """ Configuration for storing a sparse model using @@ -31,6 +31,6 @@ class BitmaskConfig(CompressionConfig): "unstructured", "2:4", "8:16" etc """ - format: str = "sparse_bitmask" + format: str = CompressionFormat.sparse_bitmask.value global_sparsity: Optional[float] = 0.0 sparsity_structure: Optional[str] = "unstructured" diff --git a/src/compressed_tensors/utils/helpers.py b/src/compressed_tensors/utils/helpers.py index 8de3c233..1c8dd29f 100644 --- a/src/compressed_tensors/utils/helpers.py +++ b/src/compressed_tensors/utils/helpers.py @@ -13,11 +13,11 @@ # limitations under the License. from pathlib import Path -from typing import Dict, Optional, Union +from typing import Dict, Literal, Optional, Union from compressed_tensors.base import CONFIG_NAME from compressed_tensors.compressors import ModelCompressor -from compressed_tensors.config import CompressionConfig +from compressed_tensors.config import CompressionConfig, CompressionFormat from safetensors import safe_open from safetensors.torch import save_file from torch import Tensor @@ -51,46 +51,46 @@ def infer_compressor_from_model_config( def save_compressed( tensors: Dict[str, Tensor], save_path: Union[str, Path], - compression_config: Optional[CompressionConfig] = None, -) -> Optional[CompressionConfig]: + compression_format: Optional[ + Literal[CompressionFormat.sparse_bitmask, CompressionFormat.dense_sparsity] + ] = None, +): """ Save compressed tensors to disk. If tensors are not compressed, save them as is. 
:param tensors: dictionary of tensors to compress :param save_path: path to save compressed tensors - :param compression_config: compression config to use for compressing tensors. - Can be either inferred from tensors or provided explicitly + :param compression_format: compression format used for the tensors :return: compression config, if tensors were compressed - None otherwise """ if tensors is None or len(tensors) == 0: raise ValueError("No tensors or empty tensors provided to compress") - # create compression config if not provided - # TODO: Not implemented, need to get this in ASAP - # compression_config = compression_config or infer_compression_config(tensors) - - if compression_config is None: + if compression_format is None: # no compression applied save_file(tensors, save_path) - return None + return + + if not ( + compression_format in ModelCompressor.registered_names() + or compression_format in ModelCompressor.registered_aliases() + ): + raise ValueError( + f"Unknown compression format: {compression_format}. " + f"Must be one of {set(ModelCompressor.registered_names() + ModelCompressor.registered_aliases())}" # noqa E501 + ) # compress - compression_format = compression_config.format - compressor = ModelCompressor.load_from_registry( - compression_format, config=compression_config - ) + compressor = ModelCompressor.load_from_registry(compression_format) # save compressed tensors compressed_tensors = compressor.compress(tensors) save_file(compressed_tensors, save_path) - # return compression_config as dict - return {CONFIG_NAME: compression_config.model_dump(exclude_unset=True)} - def load_compressed( compressed_tensors: Union[str, Path], - compression_config: Optional[CompressionConfig] = None, + compression_config: CompressionConfig = None, device: Optional[str] = "cpu", ) -> Dict[str, Tensor]: """ @@ -99,7 +99,6 @@ def load_compressed( :param compressed_tensors: path to compressed tensors :param compression_config: compression config to use for decompressing tensors. - Can be either inferred from tensors or provided explicitly. :param device: device to move tensors to. If None, tensors are loaded on CPU. 
:return decompressed tensors """ @@ -107,10 +106,6 @@ def load_compressed( if compressed_tensors is None or not Path(compressed_tensors).exists(): raise ValueError("No compressed tensors provided to load") - # create compression config if not provided - # TODO: Not implemented, need to get this in ASAP - # compression_config = compression_config or infer_compression_config(tensors) - if compression_config is None: # no compression applied tensors = {} diff --git a/tests/test_registry.py b/tests/test_registry.py index a183d77d..ffe66b85 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -17,6 +17,7 @@ BitmaskCompressor, BitmaskConfig, CompressionConfig, + CompressionFormat, DenseCompressor, DenseSparsityConfig, ModelCompressor, @@ -26,8 +27,8 @@ @pytest.mark.parametrize( "name,type", [ - ["sparse_bitmask", BitmaskConfig], - ["dense_sparsity", DenseSparsityConfig], + [CompressionFormat.sparse_bitmask.value, BitmaskConfig], + [CompressionFormat.dense_sparsity.value, DenseSparsityConfig], ], ) def test_configs(name, type): @@ -38,7 +39,10 @@ def test_configs(name, type): @pytest.mark.parametrize( "name,type", - [["sparse_bitmask", BitmaskCompressor], ["dense_sparsity", DenseCompressor]], + [ + [CompressionFormat.sparse_bitmask.value, BitmaskCompressor], + [CompressionFormat.dense_sparsity.value, DenseCompressor], + ], ) def test_compressors(name, type): compressor = ModelCompressor.load_from_registry( diff --git a/tests/test_utils/test_helpers.py b/tests/test_utils/test_helpers.py index 54b6b1f9..f643233c 100644 --- a/tests/test_utils/test_helpers.py +++ b/tests/test_utils/test_helpers.py @@ -19,47 +19,44 @@ @pytest.fixture -def tensors_and_config_sparse(): +def tensors(): tensors = {"tensor_1": torch.Tensor([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])} - expected_config_json = { - "compression_config": { - "format": "sparse_bitmask", - "global_sparsity": ( - tensors["tensor_1"].sum() / tensors["tensor_1"].numel() - ).item(), - "sparsity_structure": "unstructured", - } - } - return tensors, expected_config_json - - -@pytest.fixture -def tensors_dense(): - tensors = {"tensor_1": torch.Tensor([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]])} return tensors -def test_save_compressed_sparse(tmp_path, tensors_and_config_sparse): - tensors, expected_config_json = tensors_and_config_sparse - - config_json = save_compressed( +def test_save_compressed_sparse_bitmask(tmp_path, tensors): + save_compressed( tensors, - compression_config=BitmaskConfig(**expected_config_json["compression_config"]), + compression_format="sparse-bitmask", save_path=tmp_path / "model.safetensors", ) assert (tmp_path / "model.safetensors").exists() - assert config_json == expected_config_json -def test_save_compressed_dense(tmp_path, tensors_dense): - tensors = tensors_dense +def test_save_compressed_dense_sparsity(tmp_path, tensors): + save_compressed( + tensors, + compression_format="dense-sparsity", + save_path=tmp_path / "model.safetensors", + ) + assert (tmp_path / "model.safetensors").exists() - config_json = save_compressed( + +def test_save_compressed_no_compression(tmp_path, tensors): + save_compressed( tensors, save_path=tmp_path / "model.safetensors", ) assert (tmp_path / "model.safetensors").exists() - assert config_json is None + + +def test_save_compressed_rubbish_compression_format(tmp_path, tensors): + with pytest.raises(Exception): + save_compressed( + tensors, + compression_format="this_is_not_a_valid_format", + save_path=tmp_path / "model.safetensors", + ) def test_save_compressed_empty(): @@ -71,24 +68,37 @@ def 
test_save_compressed_empty(): save_compressed(None, "") -def test_load_compressed_sparse(tmp_path, tensors_and_config_sparse): - tensors, expected_config_json = tensors_and_config_sparse - compression_config = BitmaskConfig(**expected_config_json["compression_config"]) +def test_load_compressed_sparse_bitmask(tmp_path, tensors): save_compressed( tensors, - compression_config=compression_config, + compression_format="sparse-bitmask", save_path=tmp_path / "model.safetensors", ) + compression_config = BitmaskConfig( + format="sparse-bitmask", + ) loaded_tensors = load_compressed(tmp_path / "model.safetensors", compression_config) for key in tensors: assert torch.allclose(tensors[key], loaded_tensors[key]) -def test_load_compressed_dense(tmp_path, tensors_dense): +def test_load_compressed_dense_sparsity(tmp_path, tensors): save_compressed( - tensors_dense, + tensors, + compression_format="dense-sparsity", + save_path=tmp_path / "model.safetensors", + ) + compression_config = BitmaskConfig(format="dense-sparsity") + loaded_tensors = load_compressed(tmp_path / "model.safetensors", compression_config) + # loaded_tensors is empty -> decompression returns empty dict + assert not loaded_tensors + + +def test_load_compressed_no_compression(tmp_path, tensors): + save_compressed( + tensors, save_path=tmp_path / "model.safetensors", ) loaded_tensors = load_compressed(tmp_path / "model.safetensors") - for key in tensors_dense: - assert torch.allclose(tensors_dense[key], loaded_tensors[key]) + for key in tensors: + assert torch.allclose(tensors[key], loaded_tensors[key]) From b074530b8b41b9ad293263f4844483569f657727 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Tue, 23 Apr 2024 09:17:20 +0000 Subject: [PATCH 09/13] cosmetic changes --- README.md | 48 +++---------------------- src/compressed_tensors/utils/helpers.py | 6 ++-- 2 files changed, 6 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index cf292014..aae87d21 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,6 @@ The compressed format stores the data much more efficiently by taking advantage - Sparse tensors -> due to a large number of entries that are equal to zero. - Quantized -> due to their low precision representation. - ### Introduce an elegant interface to save/load compressed tensors The library provides the user with the ability to compress/decompress tensors. The properties of tensors are defined by human-readable configs, allowing the users to understand the compression format at a quick glance. @@ -34,7 +33,7 @@ pip install -e . ## Getting started -### Saving +### Saving/Loading Compressed Tensors (Bitmask Compression) The function `save_compressed` uses the `compression_format` argument to apply compression to tensors. The function `load_compressed` reverses the process: converts the compressed weights on disk to decompressed weights in device memory. 
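
For intuition, here is a minimal, self-contained sketch of the idea behind the `sparse-bitmask` format: keep only the non-zero values, plus one bit per entry recording where those values belong. This is an illustrative sketch only; the `bitmask_pack`/`bitmask_unpack` helpers are hypothetical names, and the library's actual on-disk layout also stores shapes and row offsets.

```python
import numpy as np
import torch

def bitmask_pack(dense: torch.Tensor):
    # store the non-zero values and a packed bitmask of their positions
    mask = dense != 0
    values = dense[mask]
    packed_mask = np.packbits(mask.numpy(), axis=-1)  # 8 mask bits per byte
    return values, packed_mask, dense.shape

def bitmask_unpack(values, packed_mask, shape):
    # unpack the bitmask and scatter the saved values back into place
    bits = np.unpackbits(packed_mask, axis=-1)[..., : shape[-1]]
    mask = torch.from_numpy(bits).bool().reshape(shape)
    dense = torch.zeros(shape, dtype=values.dtype)
    dense[mask] = values
    return dense

dense = torch.tensor([[0.0, 0.0, 0.0], [1.0, 2.0, 3.0]])
assert torch.equal(dense, bitmask_unpack(*bitmask_pack(dense)))
```

For the 2x3 example above, this keeps three floats and two mask bytes (plus the shape) instead of six floats.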
@@ -57,47 +56,8 @@ tensors: Dict[str, Tensor] = {"tensor_1": Tensor( save_compressed(tensors, "model.safetensors", compression_format=compression_config.format) # decompress tensors (load the uncompressed representation to device memory) -tensors = load_compressed("model.safetensors", device="cpu", compression_config = compression_config) -``` - -## Benefits -TODO - -## SafeTensors File Format - -For each parameter in the uncompressed state_dict, we store the following attributes needed for decompression in the compressed state_dict: - -- Compressed tensor -- Bitmask -- Uncompressed shape -- Row offsets - -```python -# Dense -{ - PARAM_NAME: uncompressed_tensor -} - -# Compressed -{ - PARAM_NAME.compressed: compressed_tensor, # 1d tensor - PARAM_NAME.bitmask: value, # 2d bitmask tensor (nrows x (ncols / 8)) - PARAM_NAME.shape: value, # Uncompressed shape tensor - PARAM_NAME.row_offsets: value # 1d offsets tensor -} +tensors = load_compressed("model.safetensors", compression_config = compression_config) ``` -The library provides pathways to automatically add the config information to the HF config file. - -```json -// config.json -{ - "compression_config": { - "format": "sparse_bitmask", // "dense_sparsity" for the original tensor format - - // Informational - "sparsity_structure": "unstructured", // Or 2:4, 8:16, etc. - "global_sparsity": "0.5" - } -} -``` \ No newline at end of file +### Saving Compressed Model Weights (Using Quantization) +// TODO diff --git a/src/compressed_tensors/utils/helpers.py b/src/compressed_tensors/utils/helpers.py index 1c8dd29f..71b2170b 100644 --- a/src/compressed_tensors/utils/helpers.py +++ b/src/compressed_tensors/utils/helpers.py @@ -13,7 +13,7 @@ # limitations under the License. from pathlib import Path -from typing import Dict, Literal, Optional, Union +from typing import Dict, Optional, Union from compressed_tensors.base import CONFIG_NAME from compressed_tensors.compressors import ModelCompressor @@ -51,9 +51,7 @@ def infer_compressor_from_model_config( def save_compressed( tensors: Dict[str, Tensor], save_path: Union[str, Path], - compression_format: Optional[ - Literal[CompressionFormat.sparse_bitmask, CompressionFormat.dense_sparsity] - ] = None, + compression_format: Optional[CompressionFormat] = None, ): """ Save compressed tensors to disk. 
If tensors are not compressed, + save them as is. From fe6bb7ea880bea2987fcf0914d5096dfdf4f43e5 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Wed, 24 Apr 2024 10:40:23 +0000 Subject: [PATCH 10/13] finishing the PR --- README.md | 22 +- examples/bitmask_compression.ipynb | 252 ++++++++++++++++++ setup.py | 2 +- src/compressed_tensors/README.md | 162 ----------- src/compressed_tensors/base.py | 2 +- src/compressed_tensors/compressors/base.py | 4 +- src/compressed_tensors/compressors/dense.py | 2 +- src/compressed_tensors/utils/helpers.py | 65 +++-- .../test_bitmask_compression_ipynb.py | 28 ++ tests/test_utils/test_helpers.py | 50 +++- 10 files changed, 393 insertions(+), 196 deletions(-) create mode 100644 examples/bitmask_compression.ipynb delete mode 100644 src/compressed_tensors/README.md create mode 100644 tests/test_examples/test_bitmask_compression_ipynb.py diff --git a/README.md b/README.md index aae87d21..29bb0b11 100644 --- a/README.md +++ b/README.md @@ -59,5 +59,23 @@ save_compressed(tensors, "model.safetensors", compression_format=compression_con tensors = load_compressed("model.safetensors", compression_config = compression_config) ``` -### Saving Compressed Model Weights (Using Quantization) -// TODO +## Saving/Loading Compressed Models (Bitmask Compression) + +We can apply bitmask compression to a whole model. For a more detailed example, see the `examples` directory. +```python +from compressed_tensors import save_compressed_model, load_compressed, BitmaskConfig +from transformers import AutoModelForCausalLM + +model_name = "neuralmagic/llama2.c-stories110M-pruned50" +model = AutoModelForCausalLM.from_pretrained(model_name) + +original_state_dict = model.state_dict() + +compression_config = BitmaskConfig() + +# save compressed model weights +save_compressed_model(model, "compressed_model.safetensors", compression_format=compression_config.format) + +# load compressed model weights +state_dict = load_compressed("compressed_model.safetensors", compression_config) ``` diff --git a/examples/bitmask_compression.ipynb b/examples/bitmask_compression.ipynb new file mode 100644 index 00000000..7658a67a --- /dev/null +++ b/examples/bitmask_compression.ipynb @@ -0,0 +1,252 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bitmask Compression Example ##\n", + "\n", + "Bitmask compression allows for storing sparse tensors efficiently on disk.\n", + "\n", + "Instead of storing each zero element as an actual number, we use a bitmask to indicate which tensor entries are zero. This approach is useful when the tensor is mostly zeros, as it saves space by not storing those zeros explicitly.\n", + "\n", + "The example below shows how to save and load sparse tensors using bitmask compression. It also demonstrates the benefits of bitmask compression over a \"dense\" representation and, finally, introduces the enhanced `safetensors` file format for storing sparse weights.\n",
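+    "\n",
+    "As a toy illustration (assuming most-significant-bit-first packing, which may differ from the library's actual layout): the row `[0, 3, 0, 0, 7, 0, 0, 0]` can be stored as the values `[3, 7]` plus the single mask byte `0b01001000`, instead of eight full-precision numbers."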
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import os\n", + "from safetensors import safe_open\n", + "from safetensors.torch import save_model\n", + "from compressed_tensors import save_compressed_model, load_compressed, BitmaskConfig\n", + "from transformers import AutoModelForCausalLM" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LlamaForCausalLM(\n", + " (model): LlamaModel(\n", + " (embed_tokens): Embedding(32000, 768)\n", + " (layers): ModuleList(\n", + " (0-11): 12 x LlamaDecoderLayer(\n", + " (self_attn): LlamaSdpaAttention(\n", + " (q_proj): Linear(in_features=768, out_features=768, bias=False)\n", + " (k_proj): Linear(in_features=768, out_features=768, bias=False)\n", + " (v_proj): Linear(in_features=768, out_features=768, bias=False)\n", + " (o_proj): Linear(in_features=768, out_features=768, bias=False)\n", + " (rotary_emb): LlamaRotaryEmbedding()\n", + " )\n", + " (mlp): LlamaMLP(\n", + " (gate_proj): Linear(in_features=768, out_features=2048, bias=False)\n", + " (up_proj): Linear(in_features=768, out_features=2048, bias=False)\n", + " (down_proj): Linear(in_features=2048, out_features=768, bias=False)\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): LlamaRMSNorm()\n", + " (post_attention_layernorm): LlamaRMSNorm()\n", + " )\n", + " )\n", + " (norm): LlamaRMSNorm()\n", + " )\n", + " (lm_head): Linear(in_features=768, out_features=32000, bias=False)\n", + ")" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# load a tiny, pruned llama2 model\n", + "model_name = \"neuralmagic/llama2.c-stories110M-pruned50\"\n", + "model = AutoModelForCausalLM.from_pretrained(model_name)\n", + "model" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The example layer model.layers.0.self_attn.q_proj.weight has sparsity 0.50%\n" + ] + } + ], + "source": [ + "# most of the weights of the model are pruned to 50% (except for few layers such as lm_head or embeddings)\n", + "state_dict = model.state_dict()\n", + "state_dict.keys()\n", + "example_layer = \"model.layers.0.self_attn.q_proj.weight\"\n", + "print(f\"The example layer {example_layer} has sparsity {torch.sum(state_dict[example_layer] == 0).item() / state_dict[example_layer].numel():.2f}%\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The model is 31.67% sparse overall\n" + ] + } + ], + "source": [ + "# we can inspect to total sparisity of the state_dict\n", + "total_num_parameters = 0\n", + "total_num_zero_parameters = 0\n", + "for key in state_dict:\n", + " total_num_parameters += state_dict[key].numel()\n", + " total_num_zero_parameters += state_dict[key].eq(0).sum().item()\n", + "print(f\"The model is {total_num_zero_parameters/total_num_parameters*100:.2f}% sparse overall\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compressing model: 100%|██████████| 111/111 [00:06<00:00, 17.73it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Size of the model's weights on disk using safetensors: 417.83 MB\n", + "Size of the model's weights 
on disk using compressed-tensors: 366.82 MB\n", + "The compression ratio is x1.14\n" + ] + } + ], + "source": [ + "# let's save the model on disk using safetensors and compressed-tensors and compare the size on disk\n", + "\n", + "## save the model using safetensors ##\n", + "save_model(model, \"model.safetensors\")\n", + "size_on_disk_mb = os.path.getsize('model.safetensors') / 1024 / 1024\n", + "\n", + "## save the model using compressed-tensors ##\n", + "save_compressed_model(model, \"compressed_model.safetensors\", compression_format=\"sparse-bitmask\")\n", + "compressed_size_on_disk_mb = os.path.getsize('compressed_model.safetensors') / 1024 / 1024\n", + "\n", + "print(f\"Size of the model's weights on disk using safetensors: {size_on_disk_mb:.2f} MB\")\n", + "print(f\"Size of the model's weights on disk using compressed-tensors: {compressed_size_on_disk_mb:.2f} MB\")\n", + "print(\"The compression ratio is x{:.2f}\".format(size_on_disk_mb / compressed_size_on_disk_mb))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Storing weights with around 30% of zero entries requires significantly less disk space when using `compressed-tensors`. The compression ratio improves dramatically for sparser models.\n", "\n", "We can load the `state_dict` back from both the compressed and uncompressed representations on disk and confirm that they represent the same tensors in memory." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Once loaded, the state_dicts from safetensors and compressed-tensors are equal: True\n" ] } ], "source": [ "# load the safetensors and compressed-tensors files and show that they have the same representation\n", "\n", "## load the uncompressed safetensors to memory ##\n", "state_dict_1 = {}\n", "with safe_open('model.safetensors', framework=\"pt\") as f:\n", " for key in f.keys():\n", " state_dict_1[key] = f.get_tensor(key)\n", "\n", "## load the compressed-tensors to memory ##\n", "config = BitmaskConfig() # we need to specify the method for decompression\n", "state_dict_2 = load_compressed(\"compressed_model.safetensors\", config)\n", "\n", "tensors_equal = all(torch.equal(state_dict_1[key], state_dict_2[key]) for key in state_dict_1)\n", "\n", "print(f\"Once loaded, the state_dicts from safetensors and compressed-tensors are equal: {tensors_equal}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### SafeTensors File Format\n", "\n", "The introduced bitmask compression is much more efficient because it embeds the information needed for decompression directly in the header of the `.safetensors` file.\n", "For each parameter in the uncompressed `state_dict`, we store the following attributes needed for decompression in the compressed `state_dict`:\n", "\n", "* Compressed tensor\n", "* Bitmask\n", "* Uncompressed shape\n", "* Row offsets\n", "\n", "```python\n", "# Dense\n", "{\n", " PARAM_NAME: uncompressed_tensor\n", "}\n", "\n", "# Compressed\n", "{\n", " PARAM_NAME.compressed: compressed_tensor, # 1d tensor\n", " PARAM_NAME.bitmask: value, # 2d bitmask tensor (nrows x (ncols / 8))\n", " PARAM_NAME.shape: value, # Uncompressed shape tensor\n", " PARAM_NAME.row_offsets: value # 1d offsets tensor\n", "}\n", "```" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" },
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/setup.py b/setup.py index 225d7b8d..c6e4b380 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def _setup_install_requires() -> List: return ["torch>=1.7.0", "transformers<=4.40", "pydantic<2.7"] def _setup_extras() -> Dict: - return {"dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0",]} + return {"dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3"]} setup( name="compressed-tensors", diff --git a/src/compressed_tensors/README.md b/src/compressed_tensors/README.md deleted file mode 100644 index 107eca65..00000000 --- a/src/compressed_tensors/README.md +++ /dev/null @@ -1,162 +0,0 @@ -# Save/Load Compressed SafeTensors - -## Motivation - -* Reduce disk space by saving in a compressed format for sparse models. Models in this compressed format will be loaded by vLLM for more efficient inference -* Set up the save/load architecture such that we can easily expand to additional compression formats in the future. The config should be human readable so users can understand the compression format at a quick glance - -## SafeTensors File Format - -For each parameter in the uncompressed state_dict, we store the following attributes -needed for decompression in the compressed state_dict: - -* compressed tensor -* bitmask -* uncompressed shape -* row offsets - -```python -# dense -{ - PARAM_NAME: uncompressed_tensor -} - -# compressed -{ - PARAM_NAME.compressed: compressed_tensor # 1d tensor - PARAM_NAME.bitmask: value # 2d bitmask tensor (nrows x (ncols / 8)) - PARAM_NAME.shape: value # uncompressed shape tensor - PARAM_NAME.row_offsets: value # 1d offsets tensor -} -``` - -Config information gets stored in the HF config file -```json -// config.json -{ - "compression_config": { - "format": "sparse_bitmask", // "dense_sparsity" for original tensor format - - // informational - "sparsity_structure": "unstructured", // or 2:4, 8:16 etc... - "global_sparsity": "0.5" - } -} -``` - -## Saving/Loading Interface - -Loading in a compressed model requires no interface changes - -```python -from sparseml.transformers.utils import SparseAutoModelForCausalLM - -# should contain model.safetensors or model.safetensors.index.json -model_path = "/PATH/TO/COMPRESSED_MODEL" - -model = SparseAutoModelForCausalLM.from_pretrained( - model_name_or_path=model_path, - **model_kwargs, -) -``` - -Saving a compressed model with an explicitly provided compression config. The config -is saved to the model's `config.json` file. **Note:** the model must have been -initialized with SparseAutoModelForCausalLM.from_pretrained() - -```python -from compressed_tensors import BitmaskConfig - -output_dir = "/PATH/TO/SAVE/COMPRESSED_MODEL" -sparsity_config = BitmaskConfig() - -model.save_pretrained( - save_directory=output_dir, - sparsity_config=sparsity_config, -) -``` - -Saving a compressed model, inferring the config from the model attributes - -```python -model.save_pretrained( - save_directory=output_dir, - save_compressed=True -) -``` - -Saving a model in the dense format. 
If the model has at least 5% global sparsity a -sparsity config will still be included in `config.json` with format `dense_sparsity` - -```python -model.save_pretrained( - save_directory=output_dir -) -``` - -Saving a model in the dense format, bypassing the sparsity config calculation. When the -`skip_compression_stats` flag is set, no sparsity config will be written to -`config.json` - -```python -model.save_pretrained( - save_directory=output_dir - skip_compression_stats=True -) -``` - -## Enable Compression During One-Shot and Sparse Finetunining -Models that are saved in a supported compressed format on disk will automatically be -decompressed when loaded as input to `sparseml.transformers.oneshot` or -`sparseml.transformers.train` - -To enable compression on save after oneshot or finetuning simply add the -`save_compressed=True` argument to `sparseml.transformers.oneshot` or -`sparseml.transformers.train` - -```python -from sparseml.transformers import train - -train( - save_compressed=True, - model="neuralmagic/TinyLlama-1.1B-Chat-v1.0-pruned2.4", - recipe=RECIPE, - dataset=DATASET -) -``` - - -## Example Code - -Loads a 60% sparse model, compresses it using the inferred bitmask compression, then -reloads the compressed model. - -```python -from sparseml.transformers import SparseAutoModelForCausalLM -from sparseml.utils.pytorch.utils import measure_cuda_memory -import torch - -MODEL_PATH = "zoo:llama2-7b-open_platypus_orca_llama2_pretrain-pruned60" -OUTPUT_PATH = "./test_compress_output" -RECIPE = "zoo:llama2-7b-open_platypus_orca_llama2_pretrain-pruned60" - -torch.cuda.set_device(0) -with measure_cuda_memory() as m: - model = SparseAutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="cuda:0") -print(f"Load dense model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") - -sparsity_config = getattr(model,"sparsity_config", None) -print(f"Sparsity config before compression: {sparsity_config}") -with measure_cuda_memory() as m: - model.save_pretrained(OUTPUT_PATH, save_compressed=True) -print(f"Save compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") - -torch.cuda.set_device(1) -with measure_cuda_memory() as m: - model_again = SparseAutoModelForCausalLM.from_pretrained( - OUTPUT_PATH, device_map="cuda:1" - ) -print(f"Load compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") -sparsity_config = getattr(model_again,"sparsity_config", None) -print(f"Sparsity config after compression: {sparsity_config}") -``` diff --git a/src/compressed_tensors/base.py b/src/compressed_tensors/base.py index 964b3087..f01a055f 100644 --- a/src/compressed_tensors/base.py +++ b/src/compressed_tensors/base.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-CONFIG_NAME = "compression_config" +SPARSITY_CONFIG_NAME = "sparsity_config" diff --git a/src/compressed_tensors/compressors/base.py b/src/compressed_tensors/compressors/base.py index 50c34da8..f40c8106 100644 --- a/src/compressed_tensors/compressors/base.py +++ b/src/compressed_tensors/compressors/base.py @@ -15,7 +15,7 @@ import operator from typing import Dict, Generator, Optional, Tuple -from compressed_tensors.base import CONFIG_NAME +from compressed_tensors.base import SPARSITY_CONFIG_NAME from compressed_tensors.config import CompressionConfig from compressed_tensors.registry import RegistryMixin from torch import Tensor @@ -74,4 +74,4 @@ def overwrite_weights(self, model_path: str, model: Module): data_old = operator.attrgetter(name)(model) data_old.data = data_new.data - setattr(model, CONFIG_NAME, self.config) + setattr(model, SPARSITY_CONFIG_NAME, self.config) diff --git a/src/compressed_tensors/compressors/dense.py b/src/compressed_tensors/compressors/dense.py index 97a62a81..c9a1c00c 100644 --- a/src/compressed_tensors/compressors/dense.py +++ b/src/compressed_tensors/compressors/dense.py @@ -29,6 +29,6 @@ def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: return model_state def decompress( - self, path_to_model_or_tensors: str + self, path_to_model_or_tensors: str, device: str ) -> Generator[Tuple[str, Tensor], None, None]: return iter([]) diff --git a/src/compressed_tensors/utils/helpers.py b/src/compressed_tensors/utils/helpers.py index 71b2170b..8d6c26cf 100644 --- a/src/compressed_tensors/utils/helpers.py +++ b/src/compressed_tensors/utils/helpers.py @@ -15,16 +15,25 @@ from pathlib import Path from typing import Dict, Optional, Union -from compressed_tensors.base import CONFIG_NAME +import torch +from compressed_tensors.base import SPARSITY_CONFIG_NAME from compressed_tensors.compressors import ModelCompressor -from compressed_tensors.config import CompressionConfig, CompressionFormat -from safetensors import safe_open +from compressed_tensors.config import ( + CompressionConfig, + CompressionFormat, + DenseSparsityConfig, +) from safetensors.torch import save_file from torch import Tensor from transformers import AutoConfig -__all__ = ["infer_compressor_from_model_config", "load_compressed", "save_compressed"] +__all__ = [ + "infer_compressor_from_model_config", + "load_compressed", + "save_compressed", + "save_compressed_model", +] def infer_compressor_from_model_config( @@ -38,7 +47,7 @@ def infer_compressor_from_model_config( :return: matching compressor if config contains a sparsity config """ config = AutoConfig.from_pretrained(pretrained_model_name_or_path) - sparsity_config = getattr(config, CONFIG_NAME, None) + sparsity_config = getattr(config, SPARSITY_CONFIG_NAME, None) if sparsity_config is None: return None @@ -65,10 +74,8 @@ def save_compressed( if tensors is None or len(tensors) == 0: raise ValueError("No tensors or empty tensors provided to compress") - if compression_format is None: - # no compression applied - save_file(tensors, save_path) - return + # if no compression_format specified, default to `dense_sparsity` + compression_format = compression_format or CompressionFormat.dense_sparsity.value if not ( compression_format in ModelCompressor.registered_names() @@ -104,17 +111,41 @@ def load_compressed( if compressed_tensors is None or not Path(compressed_tensors).exists(): raise ValueError("No compressed tensors provided to load") - if compression_config is None: - # no compression applied - tensors = {} - with 
safe_open(compressed_tensors, framework="pt", device="cpu") as f:
-            for key in f.keys():
-                tensors[key] = f.get_tensor(key)
-        return tensors
+    # if no compression_config specified, default to `dense_sparsity`
+    compression_config = compression_config or DenseSparsityConfig()
 
     # decompress
     compression_format = compression_config.format
     compressor = ModelCompressor.load_from_registry(
         compression_format, config=compression_config
     )
-    return dict(compressor.decompress(compressed_tensors))
+    return dict(compressor.decompress(compressed_tensors, device=device))
+
+
+def save_compressed_model(
+    model: torch.nn.Module,
+    filename: str,
+    compression_format: Optional[CompressionFormat] = None,
+    force_contiguous: bool = True,
+):
+    """
+    Wrapper around the safetensors `save_model` helper function, which allows for
+    saving a compressed model to disk.
+
+    Note: the model is assumed to have a
+    state_dict with unique entries.
+
+    :param model: model to save on disk
+    :param filename: filename location to save the file
+    :param compression_format: compression format used for the model
+    :param force_contiguous: whether to force the state_dict to be saved as contiguous tensors
+    """
+    state_dict = model.state_dict()
+    if force_contiguous:
+        state_dict = {k: v.contiguous() for k, v in state_dict.items()}
+    try:
+        save_compressed(state_dict, filename, compression_format=compression_format)
+    except ValueError as e:
+        msg = str(e)
+        msg += " Or use save_compressed_model(..., force_contiguous=True) and read the docs for potential caveats."  # noqa E501
+        raise ValueError(msg)
diff --git a/tests/test_examples/test_bitmask_compression_ipynb.py b/tests/test_examples/test_bitmask_compression_ipynb.py
new file mode 100644
index 00000000..bf0bddd8
--- /dev/null
+++ b/tests/test_examples/test_bitmask_compression_ipynb.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import nbformat
+import pytest
+from nbconvert.preprocessors import ExecutePreprocessor
+
+
+@pytest.mark.parametrize("notebook", ["examples/bitmask_compression.ipynb"])
+def test_notebook_exec(notebook):
+    with open(notebook) as f:
+        nb = nbformat.read(f, as_version=4)
+        ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
+        try:
+            assert ep.preprocess(nb) is not None, f"Got empty notebook for {notebook}"
+        except Exception:
+            assert False, f"Failed executing {notebook}"
diff --git a/tests/test_utils/test_helpers.py b/tests/test_utils/test_helpers.py
index f643233c..eeff70de 100644
--- a/tests/test_utils/test_helpers.py
+++ b/tests/test_utils/test_helpers.py
@@ -14,8 +14,11 @@
 
 import pytest
 import torch
-from compressed_tensors import load_compressed, save_compressed
+from compressed_tensors import load_compressed, save_compressed, save_compressed_model
 from compressed_tensors.config import BitmaskConfig
+from safetensors import safe_open
+from safetensors.torch import save_model
+from transformers import AutoModelForCausalLM
 
 
 @pytest.fixture
@@ -24,6 +27,13 @@ def tensors():
     return tensors
 
 
+@pytest.fixture
+def llama_model(tmp_path):
+    model_name = "neuralmagic/llama2.c-stories110M-pruned50"
+    model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=tmp_path)
+    yield model
+
+
 def test_save_compressed_sparse_bitmask(tmp_path, tensors):
     save_compressed(
         tensors,
@@ -88,17 +98,37 @@ def test_load_compressed_dense_sparsity(tmp_path, tensors):
         compression_format="dense-sparsity",
         save_path=tmp_path / "model.safetensors",
     )
-    compression_config = BitmaskConfig(format="dense-sparsity")
-    loaded_tensors = load_compressed(tmp_path / "model.safetensors", compression_config)
+
+    loaded_tensors = load_compressed(tmp_path / "model.safetensors")
     # loaded_tensors is empty -> decompression returns empty dict
     assert not loaded_tensors
 
 
-def test_load_compressed_no_compression(tmp_path, tensors):
-    save_compressed(
-        tensors,
-        save_path=tmp_path / "model.safetensors",
+def test_save_compressed_model(tmp_path, llama_model):
+    path_to_uncompressed = tmp_path / "model_uncompressed.safetensors"
+    path_to_compressed = tmp_path / "model_compressed.safetensors"
+
+    # save uncompressed model
+    save_model(llama_model, path_to_uncompressed)
+    size_uncompressed_kb = path_to_uncompressed.stat().st_size / 1024
+
+    # save compressed model
+    save_compressed_model(
+        llama_model, path_to_compressed, compression_format="sparse-bitmask"
    )
-    loaded_tensors = load_compressed(tmp_path / "model.safetensors")
-    for key in tensors:
-        assert torch.allclose(tensors[key], loaded_tensors[key])
+    size_compressed_kb = path_to_compressed.stat().st_size / 1024
+
+    # compare that they are the same after loading
+    state_dict_1 = {}
+    with safe_open(path_to_uncompressed, framework="pt") as f:
+        for key in f.keys():
+            state_dict_1[key] = f.get_tensor(key)
+    state_dict_2 = load_compressed(
+        path_to_compressed, BitmaskConfig(format="sparse-bitmask")
+    )
+    assert all(
+        torch.allclose(state_dict_1[key], state_dict_2[key]) for key in state_dict_1
+    )
+    # make sure that the compressed model is smaller
+    # than the uncompressed one by roughly 1.14 (value established empirically)
+    assert pytest.approx(size_uncompressed_kb / size_compressed_kb, 0.01) == 1.14

From 10f976b7362062f8a86762a9565c71c67d8d522c Mon Sep 17 00:00:00 2001
From: dbogunowicz
Date: Thu, 25 Apr 2024 12:02:16 +0000
Subject: [PATCH 11/13] finalize the PR

---
 README.md                          | 11 ++--
 examples/bitmask_compression.ipynb | 18 +++---
 makefile                           |  6 +-
 .../compressors/__init__.py        |  7 
++- src/compressed_tensors/compressors/helpers.py | 52 +++++++++------- tests/test_utils/test_helpers.py | 59 ++++++++++++------- 6 files changed, 95 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 29bb0b11..361a68f9 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,6 @@ from typing import Dict # tensors with large number of zero entries compression_config = BitmaskConfig() - tensors: Dict[str, Tensor] = {"tensor_1": Tensor( [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]] @@ -55,8 +54,10 @@ tensors: Dict[str, Tensor] = {"tensor_1": Tensor( # compress tensors using BitmaskConfig compression format (save them efficiently on disk) save_compressed(tensors, "model.safetensors", compression_format=compression_config.format) -# decompress tensors (load the uncompressed representation to device memory) -tensors = load_compressed("model.safetensors", compression_config = compression_config) +# decompress tensors (load_compressed returns a generator for memory efficiency) +decompressed_tensors = {} +for tensor_name, tensor in load_compressed("model.safetensors", compression_config = compression_config): + decompressed_tensors[tensor_name] = tensor ``` ## Saving/Loading Compressed Models (Bitmask Compression) @@ -76,6 +77,6 @@ compression_config = BitmaskConfig() # save compressed model weights save_compressed_model(model, "compressed_model.safetensors", compression_format=compression_config.format) -# load compressed model weights -state_dict = load_compressed("compressed_model.safetensors", compression_config) +# load compressed model weights (`dict` turns generator into a dictionary) +state_dict = dict(load_compressed("compressed_model.safetensors", compression_config)) ``` diff --git a/examples/bitmask_compression.ipynb b/examples/bitmask_compression.ipynb index 7658a67a..995629c4 100644 --- a/examples/bitmask_compression.ipynb +++ b/examples/bitmask_compression.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -63,7 +63,7 @@ ")" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -77,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -98,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -121,14 +121,14 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Compressing model: 100%|██████████| 111/111 [00:06<00:00, 17.73it/s]\n" + "Compressing model: 100%|██████████| 111/111 [00:06<00:00, 17.92it/s]\n" ] }, { @@ -168,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -190,7 +190,7 @@ "\n", "## load the compressed-tensors to memory ##\n", "config = BitmaskConfig() # we need to specify the method for decompression\n", - "state_dict_2 = load_compressed(\"compressed_model.safetensors\", config)\n", + "state_dict_2 = dict(load_compressed(\"compressed_model.safetensors\", config)) # load_compressed returns a generator, we convert it to a dict\n", "\n", "tensors_equal = all(torch.equal(state_dict_1[key], state_dict_2[key]) for key in state_dict_1)\n", "\n", diff --git a/makefile b/makefile index 
435a37b9..255514f9 100644
--- a/makefile
+++ b/makefile
@@ -1,4 +1,4 @@
-BUILDDIR := $(PWD)
+
 PYCHECKDIRS := src tests
 PYCHECKGLOBS := 'src/**/*.py' 'tests/**/*.py' 'utils/**/*.py' 'examples/**/*.py' setup.py
 # run checks on all files for the repo
@@ -23,6 +23,10 @@ test:
 	@echo "Running python tests";
 	pytest tests;
 
+# creates wheel file
+build:
+	python3 setup.py sdist bdist_wheel $(BUILD_ARGS)
+
 # clean package
 clean:
 	@echo "Cleaning up";
diff --git a/src/compressed_tensors/compressors/__init__.py b/src/compressed_tensors/compressors/__init__.py
index 50d569e4..c93f1346 100644
--- a/src/compressed_tensors/compressors/__init__.py
+++ b/src/compressed_tensors/compressors/__init__.py
@@ -16,5 +16,10 @@
 
 from .base import ModelCompressor
 from .dense import DenseCompressor
-from .helpers import infer_compressor_from_model_config
+from .helpers import (
+    infer_compressor_from_model_config,
+    load_compressed,
+    save_compressed,
+    save_compressed_model,
+)
 from .sparse_bitmask import BitmaskCompressor, BitmaskTensor
diff --git a/src/compressed_tensors/compressors/helpers.py b/src/compressed_tensors/compressors/helpers.py
index 8d6c26cf..1ba75636 100644
--- a/src/compressed_tensors/compressors/helpers.py
+++ b/src/compressed_tensors/compressors/helpers.py
@@ -13,16 +13,14 @@
 # limitations under the License.
 
 from pathlib import Path
-from typing import Dict, Optional, Union
+from typing import Dict, Generator, Optional, Tuple, Union
 
 import torch
 from compressed_tensors.base import SPARSITY_CONFIG_NAME
 from compressed_tensors.compressors import ModelCompressor
-from compressed_tensors.config import (
-    CompressionConfig,
-    CompressionFormat,
-    DenseSparsityConfig,
-)
+from compressed_tensors.config import CompressionConfig, CompressionFormat
+from compressed_tensors.utils.safetensors_load import get_weight_mappings
+from safetensors import safe_open
 from safetensors.torch import save_file
 from torch import Tensor
 from transformers import AutoConfig
@@ -97,29 +95,41 @@ def load_compressed(
     compressed_tensors: Union[str, Path],
     compression_config: CompressionConfig = None,
     device: Optional[str] = "cpu",
-) -> Dict[str, Tensor]:
+) -> Generator[Tuple[str, Tensor], None, None]:
     """
-    Load compressed tensors from disk. If tensors are not compressed,
-    load them as is.
+    Load compressed tensors from disk.
+    If tensors are not compressed, load them as is.
 
-    :param compressed_tensors: path to compressed tensors
+    :param compressed_tensors: path to compressed tensors.
+        This can be a path to a file or a directory containing
+        one or multiple safetensors files (if multiple, in the sharded
+        layout used by Hugging Face)
     :param compression_config: compression config to use for decompressing tensors.
     :param device: device to move tensors to. If None, tensors are loaded on CPU.
-    :return decompressed tensors
+    :return: a generator that yields the name and tensor of each decompressed tensor
     """
-
     if compressed_tensors is None or not Path(compressed_tensors).exists():
         raise ValueError("No compressed tensors provided to load")
 
-    # if no compression_config specified, default to `dense_sparsity`
-    compression_config = compression_config or DenseSparsityConfig()
-
-    # decompress
-    compression_format = compression_config.format
-    compressor = ModelCompressor.load_from_registry(
-        compression_format, config=compression_config
-    )
-    return dict(compressor.decompress(compressed_tensors, device=device))
+    if (
+        compression_config is None
+        or compression_config.format == CompressionFormat.dense_sparsity.value
+    ):
+        # if no compression_config is specified, or the `dense_sparsity` format
+        # is specified, assume tensors are not compressed on disk
+        weight_mappings = get_weight_mappings(compressed_tensors)
+        for weight_name, file_with_weight_name in weight_mappings.items():
+            with safe_open(file_with_weight_name, framework="pt", device=device) as f:
+                weight = f.get_tensor(weight_name)
+                yield weight_name, weight
+    else:
+        # decompress tensors
+        compression_format = compression_config.format
+        compressor = ModelCompressor.load_from_registry(
+            compression_format, config=compression_config
+        )
+        yield from compressor.decompress(compressed_tensors, device=device)
 
 
 def save_compressed_model(
diff --git a/tests/test_utils/test_helpers.py b/tests/test_utils/test_helpers.py
index eeff70de..7ae0799d 100644
--- a/tests/test_utils/test_helpers.py
+++ b/tests/test_utils/test_helpers.py
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
+
 import pytest
 import torch
 from compressed_tensors import load_compressed, save_compressed, save_compressed_model
 from compressed_tensors.config import BitmaskConfig
-from safetensors import safe_open
 from safetensors.torch import save_model
 from transformers import AutoModelForCausalLM
 
@@ -60,7 +61,13 @@ def test_save_compressed_no_compression(tmp_path, tensors):
     assert (tmp_path / "model.safetensors").exists()
 
 
-def test_save_compressed_rubbish_compression_format(tmp_path, tensors):
+def test_save_compressed_error(tmp_path, tensors):
+    with pytest.raises(Exception):
+        save_compressed({}, "")
+
+    with pytest.raises(Exception):
+        save_compressed(None, "")
+
     with pytest.raises(Exception):
         save_compressed(
             tensors,
@@ -69,15 +76,6 @@ def test_save_compressed_rubbish_compression_format(tmp_path, tensors):
         )
 
 
-def test_save_compressed_empty():
-    # make sure function raises error
-    with pytest.raises(Exception):
-        save_compressed({}, "")
-
-    with pytest.raises(Exception):
-        save_compressed(None, "")
-
-
 def test_load_compressed_sparse_bitmask(tmp_path, tensors):
     save_compressed(
         tensors,
@@ -87,7 +85,9 @@ def test_load_compressed_sparse_bitmask(tmp_path, tensors):
     compression_config = BitmaskConfig(
         format="sparse-bitmask",
     )
-    loaded_tensors = load_compressed(tmp_path / "model.safetensors", compression_config)
+    loaded_tensors = dict(
+        load_compressed(tmp_path / "model.safetensors", compression_config)
+    )
     for key in tensors:
         assert torch.allclose(tensors[key], loaded_tensors[key])
 
@@ -98,10 +98,30 @@ def test_load_compressed_dense_sparsity(tmp_path, tensors):
         compression_format="dense-sparsity",
         save_path=tmp_path / "model.safetensors",
     )
+    save_compressed(
+        tensors,
+        save_path=tmp_path / "model_.safetensors",
+    )
+
+    loaded_tensors = dict(load_compressed(tmp_path / "model.safetensors"))
+    loaded_tensors_ = dict(load_compressed(tmp_path / "model_.safetensors"))
+    # loaded_tensors should be equal to loaded_tensors_
+    for key in tensors:
+        assert torch.allclose(loaded_tensors[key], loaded_tensors_[key])
+
 
-    loaded_tensors = load_compressed(tmp_path / "model.safetensors")
-    # loaded_tensors is empty -> decompression returns empty dict
-    assert not loaded_tensors
+def test_load_compressed_sharded(tmp_path, llama_model):
+    sharded_model_path = tmp_path / "sharded_model"
+    llama_model.save_pretrained(sharded_model_path, max_shard_size="2MB")
+    # make sure that the model is sharded on disk
+    assert len(os.listdir(sharded_model_path)) > 1
+    loaded_state_dict = dict(load_compressed(sharded_model_path))
+    for key, value in llama_model.state_dict().items():
+        if key == "lm_head.weight":
+            # lm_head doesn't have separate weights.
+            # It shares its weight tensor with the token embedding layer.
+            continue
+        assert torch.allclose(value, loaded_state_dict[key])
 
 
 def test_save_compressed_model(tmp_path, llama_model):
@@ -119,12 +139,9 @@ def test_save_compressed_model(tmp_path, llama_model):
     size_compressed_kb = path_to_compressed.stat().st_size / 1024
 
     # compare that they are the same after loading
-    state_dict_1 = {}
-    with safe_open(path_to_uncompressed, framework="pt") as f:
-        for key in f.keys():
-            state_dict_1[key] = f.get_tensor(key)
-    state_dict_2 = load_compressed(
-        path_to_compressed, BitmaskConfig(format="sparse-bitmask")
+    state_dict_1 = dict(load_compressed(path_to_uncompressed))
+    state_dict_2 = dict(
+        load_compressed(path_to_compressed, BitmaskConfig(format="sparse-bitmask"))
     )
     assert all(
         torch.allclose(state_dict_1[key], state_dict_2[key]) for key in state_dict_1

From 9c15e3ebf4f8f564bc60c4c9131c94c260764e4f Mon Sep 17 00:00:00 2001
From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com>
Date: Thu, 25 Apr 2024 14:08:23 +0200
Subject: [PATCH 12/13] Update src/compressed_tensors/compressors/sparse_bitmask.py

---
 src/compressed_tensors/compressors/sparse_bitmask.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compressed_tensors/compressors/sparse_bitmask.py b/src/compressed_tensors/compressors/sparse_bitmask.py
index 4a946fb9..dec359c3 100644
--- a/src/compressed_tensors/compressors/sparse_bitmask.py
+++ b/src/compressed_tensors/compressors/sparse_bitmask.py
@@ -76,7 +76,7 @@ def decompress(
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a bitmask compressed state dict located at path_to_model_or_tensors
-        and returns agenerator for sequentially decompressing back to a dense state dict
+        and returns a generator for sequentially decompressing back to a dense state dict
 
         :param model_path: path to compressed safetensors model (directory with
             one or more safetensors files) or compressed tensors file

From 354ef8b603fcff66a8f1f95c19d0873146c2cd08 Mon Sep 17 00:00:00 2001
From: dbogunowicz
Date: Thu, 25 Apr 2024 12:16:22 +0000
Subject: [PATCH 13/13] disable ipynb test

---
 tests/test_examples/test_bitmask_compression_ipynb.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_examples/test_bitmask_compression_ipynb.py b/tests/test_examples/test_bitmask_compression_ipynb.py
index bf0bddd8..a3ee9d2a 100644
--- a/tests/test_examples/test_bitmask_compression_ipynb.py
+++ b/tests/test_examples/test_bitmask_compression_ipynb.py
@@ -17,6 +17,9 @@
 from nbconvert.preprocessors import ExecutePreprocessor
 
 
+@pytest.mark.skip(
+    reason="GHA is not set up yet to run these tests. The test should work locally."
+)
 @pytest.mark.parametrize("notebook", ["examples/bitmask_compression.ipynb"])
 def test_notebook_exec(notebook):
     with open(notebook) as f:
         nb = nbformat.read(f, as_version=4)
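For reference, the bitmask format that these patches describe (non-zero values, a packed bitmask, the uncompressed shape, and row offsets) can be illustrated with a small, self-contained round trip in plain `torch` + `numpy`. This is only a sketch of the idea: the `bitmask_compress`/`bitmask_decompress` helper names and the little-endian bit order are assumptions made for the example, not the library's actual `BitmaskCompressor`/`BitmaskTensor` implementation.

```python
import numpy
import torch


def bitmask_compress(tensor: torch.Tensor):
    # non-zero entries, flattened in row-major order -> the 1d "compressed tensor"
    mask = tensor != 0
    values = tensor[mask]
    # pack 8 mask bits per byte: (nrows, ncols) -> (nrows, ceil(ncols / 8))
    bitmask = torch.from_numpy(
        numpy.packbits(mask.numpy(), axis=-1, bitorder="little")
    )
    # start index of each row inside `values`, enabling per-row access
    counts = mask.sum(dim=1)
    row_offsets = torch.cumsum(counts, dim=0) - counts
    return values, bitmask, torch.tensor(tensor.shape), row_offsets


def bitmask_decompress(values, bitmask, shape, row_offsets):
    nrows, ncols = shape.tolist()
    # unpack the bitmask back into a boolean (nrows, ncols) mask
    mask = torch.from_numpy(
        numpy.unpackbits(bitmask.numpy(), axis=-1, count=ncols, bitorder="little")
    ).to(torch.bool)
    # scatter the stored values into a dense zero tensor;
    # boolean indexing restores the original row-major order
    dense = torch.zeros(nrows, ncols, dtype=values.dtype)
    dense[mask] = values
    return dense


dense = torch.tensor([[0.0, 0.0, 3.0], [4.0, 0.0, 5.0]])
assert torch.equal(dense, bitmask_decompress(*bitmask_compress(dense)))
```

Note that `row_offsets` is not strictly needed for the full decompression above, since the boolean mask alone recovers the layout; storing it allows an implementation to locate the values of a single row directly, without scanning the rest of the tensor.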