Review comments from @mgoin @kylesayrs and tms
rahul-tuli committed Jan 22, 2025
1 parent c31219b commit 62e0108
Showing 3 changed files with 26 additions and 13 deletions.
4 changes: 2 additions & 2 deletions examples/sparse_2of4_quantization_fp8/README.md
@@ -111,11 +111,11 @@ Output Directories:
To save the model on disk without sparse compression:

```python
-model.save_pretrained(save_dir, save_compressed=True, no_sparse_compression=True)
+model.save_pretrained(save_dir, save_compressed=True, disable_sparse_compression=True)
tokenizer.save_pretrained(save_dir)
```

-> **Note:** This will compress the model using the quantization compressor; however, instead of using the optimal sparsity compressor, the dense sparsity compressor will be used. This affects only how the model is saved on disk and does not change the actual pruning/quantization process.
+> **Note:** Saving a model with both the `save_compressed` and `disable_sparse_compression` options will compress the model using the quantization compressor; however, instead of using the more disk-efficient sparsity compressor(s), the dense sparsity compressor will be used. The `dense` sparsity compressor saves model params as-is and does not leverage sparsity for disk-efficient storage. These options only affect how the model(s) are saved on disk and do not impact the actual pruning or quantization processes.
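
For comparison, the default save path keeps the sparsity compressor enabled. A minimal sketch, assuming `model`, `tokenizer`, and `save_dir` are defined as elsewhere in this example:

```python
# Default behavior: the quantization compressor and the disk-efficient
# sparsity compressor are both applied when writing to disk.
model.save_pretrained(save_dir, save_compressed=True)
tokenizer.save_pretrained(save_dir)
```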
### Validation

21 changes: 17 additions & 4 deletions src/llmcompressor/transformers/compression/sparsity_config.py
@@ -8,6 +8,7 @@
is_module_quantized,
iter_named_leaf_modules,
)
+from loguru import logger
from torch import Tensor
from torch.nn import Module

@@ -81,7 +82,7 @@ def from_pretrained(
state_dict: Optional[Dict[str, Tensor]] = None,
compress: bool = False,
quantization_format: Optional[CompressionFormat] = None,
-no_sparse_compression: bool = False,
+disable_sparse_compression: bool = False,
) -> Optional["SparsityCompressionConfig"]:
"""
Determines compression type and informational parameters for a given model
@@ -92,7 +93,7 @@
:param compress: whether or not to compress the model on disk
:param quantization_format: the quantization compression format being used
for the model
-:param no_sparse_compression: whether or not to compress the model with
+:param disable_sparse_compression: whether or not to compress the model with
sparse compressors. If True, the sparse compression format will
be dense; default is False.
:return: compression config inferred from the model
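
To illustrate the renamed flag at this layer, a minimal sketch, assuming `from_pretrained` is exposed on `SparsityConfigMetadata` with `model` as its first argument (the import path follows this file's location in the repo):

```python
from llmcompressor.transformers.compression.sparsity_config import (
    SparsityConfigMetadata,
)

# With disable_sparse_compression=True, any inferred config falls back to
# the dense sparsity compressor, so weights are stored unpacked on disk.
sparsity_config = SparsityConfigMetadata.from_pretrained(
    model,
    compress=True,
    disable_sparse_compression=True,
)
```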
@@ -108,7 +109,10 @@
sparsity_structure = SparsityConfigMetadata.infer_sparsity_structure(
model=model
)
-if no_sparse_compression or quantization_format == CompressionFormat.marlin_24:
+if (
+    disable_sparse_compression
+    or quantization_format == CompressionFormat.marlin_24
+):
# sparse compressor should be dense
# when disable_sparse_compression is True
# or when marlin_24 is used
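
Restated as a standalone predicate, for clarity only; this helper is hypothetical and not part of the diff:

```python
from compressed_tensors.config import CompressionFormat  # assumed import path


def use_dense_sparsity_compressor(
    disable_sparse_compression: bool,
    quantization_format: CompressionFormat,
) -> bool:
    """Hypothetical restatement of the dense-fallback rule above."""
    # Dense is forced when the caller opts out of sparse compression,
    # or when the marlin_24 quantization format is in use.
    return (
        disable_sparse_compression
        or quantization_format == CompressionFormat.marlin_24
    )
```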
@@ -160,7 +164,7 @@ def is_sparse24_bitmask_supported(
) -> bool:
"""
Determines if sparse 24 bitmask sparse compressor is supported for a given model
-and it's sparsity structure in vLLM
+and its sparsity structure in vLLM
:param model: pytorch model to check for sparse 24 bit sparsity support
:param sparsity_structure: sparsity structure of the model, if
Expand Down Expand Up @@ -201,10 +205,19 @@ def is_sparse24_bitmask_supported(
and scheme.type in supported_scheme_types
)
if not scheme_supported:
+logger.info(
+    "Quantization scheme not supported,"
+    " turning off sparse 24 compression."
+    f" Invalid Scheme: {scheme}"
+)
return False

elif weight_scheme or input_scheme:
# weight only quantization
+logger.info(
+    "Weight only quantization detected, "
+    "turning off sparse 24 compression."
+)
return False

return True
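
As a usage sketch, assuming the method is exposed on `SparsityConfigMetadata` and that a `"2:4"` structure string is accepted by `sparsity_structure`:

```python
# Probe support before committing to the sparse-24 bitmask compressor.
# With the logger calls above, an unsupported quantization scheme (or a
# weight-only scheme) now reports why compression was turned off.
if SparsityConfigMetadata.is_sparse24_bitmask_supported(
    model=model,
    sparsity_structure="2:4",
):
    print("sparse 24 bitmask compression is available")
else:
    print("falling back; see the logger output for the reason")
```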
14 changes: 7 additions & 7 deletions
@@ -125,7 +125,7 @@ def save_pretrained_wrapper(
quantization_format: Optional[str] = None,
save_compressed: bool = True,
skip_compression_stats: bool = False,
-no_sparse_compression: bool = False,
+disable_sparse_compression: bool = False,
**kwargs,
):
"""
@@ -142,8 +142,8 @@
:param skip_compression_stats: whether to skip the calculation of
compression statistics (such as global sparsity and sparsity structure)
when saving a model in dense format
-:param no_sparse_compression: whether to skip sparse compression and save,
-    default is False
+:param disable_sparse_compression: whether to skip sparse compression
+    during save, default is False
:param kwargs: additional kwargs to pass on to model.save_pretrained
"""

@@ -173,7 +173,7 @@ def skip(*args, **kwargs):
save_compressed=save_compressed,
skip_compression_stats=skip_compression_stats,
state_dict=state_dict,
-no_sparse_compression=no_sparse_compression,
+disable_sparse_compression=disable_sparse_compression,
)

if compressor is None:
@@ -265,7 +265,7 @@ def get_model_compressor(
save_compressed: bool = True,
skip_compression_stats: bool = False,
state_dict: Optional[Dict] = None,
-no_sparse_compression: bool = False,
+disable_sparse_compression: bool = False,
):
"""
Obtain the compressor based on the config and the
@@ -279,7 +279,7 @@
format
:param skip_compression_stats: bool allowing compression stats on std out
:param state_dict: state_dict of the model
-:param no_sparse_compression: bool to skip sparse compression
+:param disable_sparse_compression: bool to skip sparse compression
"""

# find offloaded state dict if none is provided
@@ -312,7 +312,7 @@
state_dict=state_dict,
compress=save_compressed,
quantization_format=quantization_format,
-no_sparse_compression=no_sparse_compression,
+disable_sparse_compression=disable_sparse_compression,
)

return ModelCompressor.from_pretrained_model(
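
For completeness, a hedged sketch of calling `get_model_compressor` directly, mirroring what `save_pretrained_wrapper` does internally. The import path is an assumption (this diff view does not show the third file's path), and `model` is assumed to be the first argument:

```python
# Assumed module path; the diff view above omits this file's location.
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
    get_model_compressor,
)

# Resolve the compressor up front from the model's config and save options.
compressor = get_model_compressor(
    model,
    save_compressed=True,
    disable_sparse_compression=True,
)
if compressor is None:
    # Nothing to compress: the model is saved in its uncompressed form,
    # matching the `if compressor is None` branch in the wrapper above.
    pass
```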
