From 6bd6207e27f820453f38a5f1e5d7e8b5add939d1 Mon Sep 17 00:00:00 2001
From: dbogunowicz <damian@neuralmagic.com>
Date: Fri, 26 Apr 2024 13:46:41 +0000
Subject: [PATCH] initial commit

---
 src/compressed_tensors/__init__.py            |   1 +
 .../compressors/sparse_bitmask.py             |   5 +-
 src/compressed_tensors/model/__init__.py      |  16 ++
 src/compressed_tensors/model/sparse_model.py  |  42 +++++
 src/compressed_tensors/model/utils.py         | 155 ++++++++++++++++++
 tests/test_model/test_sparse_model.py         | 118 +++++++++++++
 6 files changed, 335 insertions(+), 2 deletions(-)
 create mode 100644 src/compressed_tensors/model/__init__.py
 create mode 100644 src/compressed_tensors/model/sparse_model.py
 create mode 100644 src/compressed_tensors/model/utils.py
 create mode 100644 tests/test_model/test_sparse_model.py

diff --git a/src/compressed_tensors/__init__.py b/src/compressed_tensors/__init__.py
index 0833dd42..ac0c5df4 100644
--- a/src/compressed_tensors/__init__.py
+++ b/src/compressed_tensors/__init__.py
@@ -17,5 +17,6 @@
 # flake8: noqa
 from .compressors import *
 from .config import *
+from .model import *
 from .quantization import QuantizationConfig, QuantizationStatus
 from .utils import *
diff --git a/src/compressed_tensors/compressors/sparse_bitmask.py b/src/compressed_tensors/compressors/sparse_bitmask.py
index dec359c3..09386f42 100644
--- a/src/compressed_tensors/compressors/sparse_bitmask.py
+++ b/src/compressed_tensors/compressors/sparse_bitmask.py
@@ -75,8 +75,9 @@ def decompress(
         self, path_to_model_or_tensors: str, device: str = "cpu"
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
-        Reads a bitmask compressed state dict located at path_to_model_or_tensors
-        and returns a generator for sequentially decompressing back to a dense state dict
+        Reads a bitmask compressed state dict located at
+        path_to_model_or_tensors and returns a generator for
+        sequentially decompressing back to a dense state dict
 
         :param model_path: path to compressed safetensors model (directory with
             one or more safetensors files) or compressed tensors file
diff --git a/src/compressed_tensors/model/__init__.py b/src/compressed_tensors/model/__init__.py
new file mode 100644
index 00000000..c30b8d98
--- /dev/null
+++ b/src/compressed_tensors/model/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# flake8: noqa
+from .sparse_model import SparseAutoModelForCausalLM
diff --git a/src/compressed_tensors/model/sparse_model.py b/src/compressed_tensors/model/sparse_model.py
new file mode 100644
index 00000000..046da9db
--- /dev/null
+++ b/src/compressed_tensors/model/sparse_model.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from compressed_tensors import infer_compressor_from_model_config
+from compressed_tensors.model.utils import SparseAutoModelMixin
+from transformers import AutoModelForCausalLM, PreTrainedModel
+
+
+__all__ = ["SparseAutoModelForCausalLM"]
+
+
+class SparseAutoModelForCausalLM(AutoModelForCausalLM, SparseAutoModelMixin):
+    """
+    AutoModelForCausalLM wrapper that saves and loads compressed-tensors weights
+    """
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path, *model_args, **kwargs
+    ) -> PreTrainedModel:
+        """Load the model, wrap its save_pretrained(), decompress weights if needed"""
+        compressor = infer_compressor_from_model_config(pretrained_model_name_or_path)
+        load_model = super(AutoModelForCausalLM, cls).from_pretrained
+        model = load_model(pretrained_model_name_or_path, *model_args, **kwargs)
+        cls.modify_save_pretrained(model)
+        if not compressor:
+            return model
+        cls.decompress_weights_on_load(
+            model=model, compressor=compressor, cache_dir=kwargs.get("cache_dir")
+        )
+        return model
diff --git a/src/compressed_tensors/model/utils.py b/src/compressed_tensors/model/utils.py
new file mode 100644
index 00000000..e1668b95
--- /dev/null
+++ b/src/compressed_tensors/model/utils.py
@@ -0,0 +1,155 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import weakref
+from functools import wraps
+from typing import Any, Callable, Dict, Optional, Union
+
+from compressed_tensors import SPARSITY_CONFIG_NAME, ModelCompressor
+from compressed_tensors.config import CompressionConfig
+from compressed_tensors.utils import get_safetensors_folder
+from transformers import CONFIG_NAME, PreTrainedModel
+
+
+__all__ = ["SparseAutoModelMixin"]
+
+
+class SparseAutoModelMixin:
+    """
+    Static helpers that add compressed-tensors save/load support to AutoModel classes
+    """
+
+    @staticmethod
+    def decompress_weights_on_load(
+        model: PreTrainedModel,
+        compressor: ModelCompressor,
+        cache_dir: Union[str, os.PathLike, None] = None,
+    ):
+        """
+        Dynamically decompresses a model's weights on load using the provided compressor
+
+        :param model: the model to decompress
+        :param compressor: the compressor to use for decompression
+        :param cache_dir: optional cache directory to use when loading the model
+        """
+        model_path = get_safetensors_folder(model.name_or_path, cache_dir)
+        compressor.overwrite_weights(model_path=model_path, model=model)
+
+    @staticmethod
+    def modify_save_pretrained(model: PreTrainedModel):
+        """
+        Overrides a PreTrainedModel's save_pretrained() method with a wrapped
+        version that supports compression
+
+        :param model: the model to modify
+        """
+        model.save_pretrained = save_pretrained_compressed(model.save_pretrained)
+
+
+def save_pretrained_compressed(save_pretrained_method: Callable) -> Callable:
+    """
+    Wraps a PreTrainedModel's save_pretrained() method with a version that supports
+    compression. Passing an already wrapped method returns it unchanged.
+
+    :param save_pretrained_method: the original bound save_pretrained method to wrap
+    :return: the wrapped save_pretrained method
+    """
+    if getattr(save_pretrained_method, "_overridden", False):
+        # `model.save_pretrained` has already been replaced, return.
+        return save_pretrained_method
+
+    # weakly reference the model (the wrapper is stored on it, so a strong ref
+    # would form a cycle) and keep the unbound original method to call later
+    model_ref = weakref.ref(save_pretrained_method.__self__)
+    original_save_pretrained = save_pretrained_method.__func__
+    model_class = model_ref().__class__
+    # remove the reference to the original method
+    del save_pretrained_method
+
+    @wraps(original_save_pretrained)
+    def save_pretrained_wrapper(
+        save_directory: Union[str, os.PathLike],
+        compression_config: Optional[CompressionConfig] = None,
+        **kwargs,
+    ):
+        """
+        Wrapper around PreTrainedModel.save_pretrained(), adds functionality for
+        saving models in a compressed format on disk. The compression format is
+        saved to the model's config file.
+
+        :param save_directory: directory where the model should be saved
+        :param compression_config: the compression config to use when saving
+            the model
+        :param kwargs: additional keyword arguments to pass to the original
+            save_pretrained method
+        """
+        model = model_ref()
+        state_dict = model.state_dict()
+
+        if not compression_config:
+            compression_config = infer_compression_config_from_kwargs(kwargs)
+
+        if compression_config is None:
+            # model is not sparse, save as dense
+            return original_save_pretrained.__get__(model, model_class)(
+                save_directory, **kwargs
+            )
+
+        # save compressed weights and add compression config to model config
+        compressor = ModelCompressor.load_from_registry(
+            compression_config.format, config=compression_config
+        )
+        compressed_state_dict = compressor.compress(state_dict)
+        kwargs.update(dict(state_dict=compressed_state_dict, safe_serialization=True))
+        original_save_pretrained.__get__(model, model_class)(save_directory, **kwargs)
+        add_compression_config_to_model_config(save_directory, compression_config)
+
+    # the attribute name must match the one checked by the guard at the top
+    save_pretrained_wrapper._overridden = True
+    return save_pretrained_wrapper
+
+
+def infer_compression_config_from_kwargs(
+    config_args: Optional[Dict[str, Any]] = None
+) -> Optional[CompressionConfig]:
+    """
+    Placeholder for inferring a CompressionConfig from the provided arguments.
+    Inference is not implemented yet, so the result is always None.
+    """
+    inferred_config = None  # TODO: build a config when config_args match
+    return inferred_config
+
+
+def add_compression_config_to_model_config(
+    save_directory: Union[str, os.PathLike], compression_config: CompressionConfig
+):
+    """
+    Record the compression config in the config file of a saved model.
+    The dumped config is stored under the `SPARSITY_CONFIG_NAME` key and
+    the config file is rewritten in place.
+
+    :param save_directory: directory where the model's config file is saved
+    :param compression_config: the compression config to add to the model's config file
+    """
+    # dump first, so the parameter itself is never rebound to a plain dict
+    sparsity_entry = compression_config.model_dump(exclude_unset=True)
+    config_path = os.path.join(save_directory, CONFIG_NAME)
+    # read-modify-write the existing config file
+    with open(config_path, "r") as reader:
+        model_config = json.load(reader)
+    model_config[SPARSITY_CONFIG_NAME] = sparsity_entry
+    with open(config_path, "w") as writer:
+        json.dump(model_config, writer, indent=2, sort_keys=True)
diff --git a/tests/test_model/test_sparse_model.py b/tests/test_model/test_sparse_model.py
new file mode 100644
index 00000000..ad16527e
--- /dev/null
+++ b/tests/test_model/test_sparse_model.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from compressed_tensors import SparseAutoModelForCausalLM
+from compressed_tensors.config import CompressionConfig
+from huggingface_hub import snapshot_download
+from transformers import AutoConfig
+
+
+@pytest.fixture()
+def size_bytes_uncompressed():
+    return 438_131_648  # expected size in bytes of the dense model.safetensors
+
+
+@pytest.fixture()
+def size_bytes_compressed():
+    return 384_641_204  # expected size in bytes of the compressed model.safetensors
+
+
+@pytest.mark.parametrize("model_name", ["neuralmagic/llama2.c-stories110M-pruned50"])
+class TestSparseAutoModelSave:
+    """
+    Saving a model that initially does not have compressed weights
+    """
+
+    @pytest.fixture
+    def setup(self, model_name, size_bytes_uncompressed, size_bytes_compressed):
+        yield model_name, size_bytes_uncompressed, size_bytes_compressed
+
+    def test_save_pretrained_dense(self, tmp_path, setup):
+        model_name, size_bytes, _ = setup
+        model = SparseAutoModelForCausalLM.from_pretrained(model_name)
+        hf_config = AutoConfig.from_pretrained(model_name)
+
+        # no compression config is passed, so the model is saved dense
+        model.save_pretrained(tmp_path)
+
+        # check if the model is saved in the correct format
+        assert (tmp_path / "model.safetensors").exists()
+        size_bytes_ = (tmp_path / "model.safetensors").stat().st_size
+        assert pytest.approx(size_bytes, rel=0.1) == size_bytes_
+
+        # check that the saved config has the same keys as the original config
+        assert (
+            hf_config.to_dict().keys()
+            == AutoConfig.from_pretrained(tmp_path).to_dict().keys()
+        )
+
+        # check that the saved model (not the hub one) can be loaded back
+        assert SparseAutoModelForCausalLM.from_pretrained(tmp_path)
+
+    def test_save_pretrained_sparse(self, tmp_path, setup):
+        model_name, _, size_bytes = setup
+        model = SparseAutoModelForCausalLM.from_pretrained(model_name)
+        hf_config = AutoConfig.from_pretrained(model_name)
+
+        compression_config = CompressionConfig.load_from_registry(
+            "sparse-bitmask",
+            **dict(global_sparsity=4.20, sparsity_structure="dummy_sparsity"),
+        )
+
+        model.save_pretrained(tmp_path, compression_config)
+
+        # check if the model is saved in the correct format
+        assert (tmp_path / "model.safetensors").exists()
+        size_bytes_ = (tmp_path / "model.safetensors").stat().st_size
+        assert pytest.approx(size_bytes, rel=0.1) == size_bytes_
+
+        # check that the original hf_config carries no sparsity config
+        # TODO: Add better test here
+        assert "sparsity_config" not in hf_config.to_dict()
+
+        saved_config = AutoConfig.from_pretrained(tmp_path)
+        expected_config = compression_config.model_dump(exclude_unset=True)
+        assert saved_config.sparsity_config == expected_config
+
+        # check that the saved model (not the hub one) can be loaded back
+        assert SparseAutoModelForCausalLM.from_pretrained(tmp_path)
+
+
+@pytest.mark.parametrize(
+    "model_name", ["nm-testing/llama2.c-stories110M-pruned50-compressed-tensors"]
+)
+class TestSparseAutoModelLoad:
+    """
+    Loading a model whose checkpoint is expected to hold compressed weights
+    """
+
+    @pytest.fixture
+    def setup(self, model_name):
+        yield model_name
+
+    def test_from_pretrained(self, setup):
+        # load directly from the parametrized model name
+        assert SparseAutoModelForCausalLM.from_pretrained(setup)
+
+    def test_from_pretrained_local(self, tmp_path, setup):
+        # load from a snapshot that was first downloaded into tmp_path
+        local_dir = snapshot_download(setup, local_dir=tmp_path)
+        assert SparseAutoModelForCausalLM.from_pretrained(local_dir)
+
+    def test_from_pretrained_cache(self, tmp_path, setup):
+        # load by model name while using tmp_path as the cache directory
+        assert SparseAutoModelForCausalLM.from_pretrained(
+            setup, cache_dir=tmp_path
+        )