diff --git a/examples/sparse_2of4_quantization_fp8/README.md b/examples/sparse_2of4_quantization_fp8/README.md
index dd6f13f5b..99fc3c545 100644
--- a/examples/sparse_2of4_quantization_fp8/README.md
+++ b/examples/sparse_2of4_quantization_fp8/README.md
@@ -111,11 +111,11 @@ Output Directories:
 To save the model on disk without sparse compression:
 
 ```python
-model.save_pretrained(save_dir, save_compressed=True, no_sparse_compression=True)
+model.save_pretrained(save_dir, save_compressed=True, disable_sparse_compression=True)
 tokenizer.save_pretrained(save_dir)
 ```
 
-> **Note:** This will compress the model using the quantization compressor; however, instead of using the optimal sparsity compressor, the dense sparsity compressor will be used. This affects only how the model is saved on disk and does not change the actual pruning/quantization process.
+> **Note:** Saving a model with both `save_compressed=True` and `disable_sparse_compression=True` will compress the model using the quantization compressor; however, instead of the more disk-efficient sparsity compressor(s), the `dense` sparsity compressor will be used. The `dense` compressor stores model parameters as-is and does not leverage sparsity for disk-efficient storage. These options affect only how the model is saved on disk and do not impact the actual pruning or quantization processes.
 
 ### Validation
 
diff --git a/src/llmcompressor/transformers/compression/sparsity_config.py b/src/llmcompressor/transformers/compression/sparsity_config.py
index 0efbadaf4..1183023b3 100644
--- a/src/llmcompressor/transformers/compression/sparsity_config.py
+++ b/src/llmcompressor/transformers/compression/sparsity_config.py
@@ -8,6 +8,7 @@
     is_module_quantized,
     iter_named_leaf_modules,
 )
+from loguru import logger
 from torch import Tensor
 from torch.nn import Module
 
@@ -81,7 +82,7 @@ def from_pretrained(
         state_dict: Optional[Dict[str, Tensor]] = None,
         compress: bool = False,
         quantization_format: Optional[CompressionFormat] = None,
-        no_sparse_compression: bool = False,
+        disable_sparse_compression: bool = False,
     ) -> Optional["SparsityCompressionConfig"]:
         """
         Determines compression type and informational parameters for a given model
@@ -92,7 +93,7 @@
         :param compress: whether or not to compress the model on disk
         :param quantization_format: the quantization compression format being used
             for the model
-        :param no_sparse_compression: whether or not to compress the model with
+        :param disable_sparse_compression: whether or not to compress the model with
            sparse compressors, If True, the sparse compression format will
            be dense, default is False.
         :return: compression config inferred from the model
@@ -108,7 +109,10 @@
         sparsity_structure = SparsityConfigMetadata.infer_sparsity_structure(
             model=model
         )
-        if no_sparse_compression or quantization_format == CompressionFormat.marlin_24:
+        if (
+            disable_sparse_compression
+            or quantization_format == CompressionFormat.marlin_24
+        ):
             # sparse compressor should be dense
             # when no_sparse_compression is True
             # or when marlin_24 is used
@@ -160,7 +164,7 @@ def is_sparse24_bitmask_supported(
     ) -> bool:
         """
         Determines if sparse 24 bitmask sparse compressor is supported for a given model
-        and it's sparsity structure in vLLM
+        and its sparsity structure in vLLM
 
         :param model: pytorch model to check for sparse 24 bit sparsity support
         :param sparsity_structure: sparsity structure of the model, if
@@ -201,10 +205,19 @@
                             and scheme.type in supported_scheme_types
                         )
                         if not scheme_supported:
+                            logger.info(
+                                "Quantization scheme not supported,"
+                                " turning off sparse 24 compression."
+                                f" Invalid Scheme: {scheme}"
+                            )
                             return False
 
                 elif weight_scheme or input_scheme:
                     # weight only quantization
+                    logger.info(
+                        "Weight only quantization detected, "
+                        "turning off sparse 24 compression."
+                    )
                     return False
 
         return True
diff --git a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
index 584464507..ec9951f6a 100644
--- a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
+++ b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
@@ -125,7 +125,7 @@ def save_pretrained_wrapper(
         quantization_format: Optional[str] = None,
         save_compressed: bool = True,
         skip_compression_stats: bool = False,
-        no_sparse_compression: bool = False,
+        disable_sparse_compression: bool = False,
         **kwargs,
     ):
         """
@@ -142,8 +142,8 @@
         :param skip_compression_stats: whether to skip the calculation of
             compression statistics (such as global sparsity and
             sparsity structure) when saving a model in dense format
-        :param no_sparse_compression: whether to skip sparse compression and save,
-            default is False
+        :param disable_sparse_compression: whether to skip sparse compression
+            during save, default is False
         :param kwargs: additional kwargs to pass on to model.save_pretrained
         """
 
@@ -173,7 +173,7 @@ def skip(*args, **kwargs):
             save_compressed=save_compressed,
             skip_compression_stats=skip_compression_stats,
             state_dict=state_dict,
-            no_sparse_compression=no_sparse_compression,
+            disable_sparse_compression=disable_sparse_compression,
         )
 
         if compressor is None:
@@ -265,7 +265,7 @@ def get_model_compressor(
     save_compressed: bool = True,
     skip_compression_stats: bool = False,
     state_dict: Optional[Dict] = None,
-    no_sparse_compression: bool = False,
+    disable_sparse_compression: bool = False,
 ):
     """
     Obtain the compressor based on the config and the
@@ -279,7 +279,7 @@
         format
     :param skip_compression_stats: bool allowing compression stats on std out
     :param state_dict: state_dict of the model
-    :param no_sparse_compression: bool to skip sparse compression
+    :param disable_sparse_compression: bool to skip sparse compression
     """
 
     # find offloaded state dict if none is provided
@@ -312,7 +312,7 @@
         state_dict=state_dict,
         compress=save_compressed,
         quantization_format=quantization_format,
-        no_sparse_compression=no_sparse_compression,
+        disable_sparse_compression=disable_sparse_compression,
     )
 
     return ModelCompressor.from_pretrained_model(
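
For reviewers: a minimal before/after sketch of the renamed flag in use, mirroring the README example changed above. It assumes `model` and `tokenizer` are a 2:4-sparse, FP8-quantized pair already produced by llm-compressor; the save directory name is illustrative:

```python
# Hypothetical save directory; any path works.
save_dir = "Meta-Llama-3-8B-Instruct-2of4-W8A8-FP8"

# Default: quantization compressor plus the disk-efficient sparsity
# compressor inferred by SparsityConfigMetadata.from_pretrained.
model.save_pretrained(save_dir, save_compressed=True)

# Renamed flag (formerly `no_sparse_compression`): the model is still
# quantization-compressed, but the `dense` sparsity compressor stores
# sparse weights as-is, trading disk footprint for a simpler format.
model.save_pretrained(
    save_dir,
    save_compressed=True,
    disable_sparse_compression=True,
)
tokenizer.save_pretrained(save_dir)
```

Either call leaves the pruning and quantization results untouched; only the on-disk representation differs.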