Review comments from @mgoin @kylesayrs and tms
rahul-tuli committed Jan 22, 2025
1 parent c31219b commit 62e0108
Showing 3 changed files with 26 additions and 13 deletions.
4 changes: 2 additions & 2 deletions examples/sparse_2of4_quantization_fp8/README.md
@@ -111,11 +111,11 @@ Output Directories:
To save the model on disk without sparse compression:

```python
-model.save_pretrained(save_dir, save_compressed=True, no_sparse_compression=True)
+model.save_pretrained(save_dir, save_compressed=True, disable_sparse_compression=True)
tokenizer.save_pretrained(save_dir)
```

-> **Note:** This will compress the model using the quantization compressor; however, instead of using the optimal sparsity compressor, the dense sparsity compressor will be used. This affects only how the model is saved on disk and does not change the actual pruning/quantization process.
+> **Note:** Saving a model with both the `save_compressed` and `disable_sparse_compression` options will compress the model using the quantization compressor; however, instead of using the more disk-efficient sparsity compressor(s), the dense sparsity compressor will be used. The `dense` sparsity compressor saves model params as-is and does not leverage sparsity for disk-efficient storage. These options only affect how the model(s) are saved on disk and do not impact the actual pruning or quantization processes.
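
For comparison, the default save path keeps the sparsity compressor enabled. A minimal sketch, assuming `model`, `tokenizer`, and `save_dir` are defined as elsewhere in this example:

```python
# Default behavior: the quantization compressor and the disk-efficient
# sparsity compressor are both applied when writing to disk.
model.save_pretrained(save_dir, save_compressed=True)
tokenizer.save_pretrained(save_dir)
```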
### Validation

21 changes: 17 additions & 4 deletions src/llmcompressor/transformers/compression/sparsity_config.py
@@ -8,6 +8,7 @@
is_module_quantized,
iter_named_leaf_modules,
)
+from loguru import logger
from torch import Tensor
from torch.nn import Module

@@ -81,7 +82,7 @@ def from_pretrained(
state_dict: Optional[Dict[str, Tensor]] = None,
compress: bool = False,
quantization_format: Optional[CompressionFormat] = None,
-no_sparse_compression: bool = False,
+disable_sparse_compression: bool = False,
) -> Optional["SparsityCompressionConfig"]:
"""
Determines compression type and informational parameters for a given model
@@ -92,7 +93,7 @@
:param compress: whether or not to compress the model on disk
:param quantization_format: the quantization compression format being used
for the model
-:param no_sparse_compression: whether or not to compress the model with
+:param disable_sparse_compression: whether or not to compress the model with
sparse compressors. If True, the sparse compression format will
be dense; default is False.
:return: compression config inferred from the model
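
To illustrate the renamed flag at this layer, a minimal sketch, assuming `from_pretrained` is exposed on `SparsityConfigMetadata` with `model` as its first argument (the import path follows this file's location in the repo):

```python
from llmcompressor.transformers.compression.sparsity_config import (
    SparsityConfigMetadata,
)

# With disable_sparse_compression=True, any inferred config falls back to
# the dense sparsity compressor, so weights are stored unpacked on disk.
sparsity_config = SparsityConfigMetadata.from_pretrained(
    model,
    compress=True,
    disable_sparse_compression=True,
)
```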
@@ -108,7 +109,10 @@
sparsity_structure = SparsityConfigMetadata.infer_sparsity_structure(
model=model
)
-if no_sparse_compression or quantization_format == CompressionFormat.marlin_24:
+if (
+    disable_sparse_compression
+    or quantization_format == CompressionFormat.marlin_24
+):
# sparse compressor should be dense
# when disable_sparse_compression is True
# or when marlin_24 is used
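
Restated as a standalone predicate, for clarity only; this helper is hypothetical and not part of the diff:

```python
from compressed_tensors.config import CompressionFormat  # assumed import path


def use_dense_sparsity_compressor(
    disable_sparse_compression: bool,
    quantization_format: CompressionFormat,
) -> bool:
    """Hypothetical restatement of the dense-fallback rule above."""
    # Dense is forced when the caller opts out of sparse compression,
    # or when the marlin_24 quantization format is in use.
    return (
        disable_sparse_compression
        or quantization_format == CompressionFormat.marlin_24
    )
```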
@@ -160,7 +164,7 @@ def is_sparse24_bitmask_supported(
) -> bool:
"""
Determines if sparse 24 bitmask sparse compressor is supported for a given model
-and it's sparsity structure in vLLM
+and its sparsity structure in vLLM
:param model: pytorch model to check for sparse 24 bit sparsity support
:param sparsity_structure: sparsity structure of the model, if
Expand Down Expand Up @@ -201,10 +205,19 @@ def is_sparse24_bitmask_supported(
and scheme.type in supported_scheme_types
)
if not scheme_supported:
+logger.info(
+    "Quantization scheme not supported,"
+    " turning off sparse 24 compression."
+    f" Invalid Scheme: {scheme}"
+)
return False

elif weight_scheme or input_scheme:
# weight only quantization
+logger.info(
+    "Weight only quantization detected, "
+    "turning off sparse 24 compression."
+)
return False

return True
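
As a usage sketch, assuming the method is exposed on `SparsityConfigMetadata` and that a `"2:4"` structure string is accepted by `sparsity_structure`:

```python
# Probe support before committing to the sparse-24 bitmask compressor.
# With the logger calls above, an unsupported quantization scheme (or a
# weight-only scheme) now reports why compression was turned off.
if SparsityConfigMetadata.is_sparse24_bitmask_supported(
    model=model,
    sparsity_structure="2:4",
):
    print("sparse 24 bitmask compression is available")
else:
    print("falling back; see the logger output for the reason")
```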
14 changes: 7 additions & 7 deletions
@@ -125,7 +125,7 @@ def save_pretrained_wrapper(
quantization_format: Optional[str] = None,
save_compressed: bool = True,
skip_compression_stats: bool = False,
-no_sparse_compression: bool = False,
+disable_sparse_compression: bool = False,
**kwargs,
):
"""
@@ -142,8 +142,8 @@
:param skip_compression_stats: whether to skip the calculation of
compression statistics (such as global sparsity and sparsity structure)
when saving a model in dense format
-:param no_sparse_compression: whether to skip sparse compression and save,
-    default is False
+:param disable_sparse_compression: whether to skip sparse compression
+    during save, default is False
:param kwargs: additional kwargs to pass on to model.save_pretrained
"""

@@ -173,7 +173,7 @@ def skip(*args, **kwargs):
save_compressed=save_compressed,
skip_compression_stats=skip_compression_stats,
state_dict=state_dict,
-no_sparse_compression=no_sparse_compression,
+disable_sparse_compression=disable_sparse_compression,
)

if compressor is None:
@@ -265,7 +265,7 @@ def get_model_compressor(
save_compressed: bool = True,
skip_compression_stats: bool = False,
state_dict: Optional[Dict] = None,
-no_sparse_compression: bool = False,
+disable_sparse_compression: bool = False,
):
"""
Obtain the compressor based on the config and the
@@ -279,7 +279,7 @@
format
:param skip_compression_stats: bool allowing compression stats on std out
:param state_dict: state_dict of the model
-:param no_sparse_compression: bool to skip sparse compression
+:param disable_sparse_compression: bool to skip sparse compression
"""

# find offloaded state dict if none is provided
@@ -312,7 +312,7 @@
state_dict=state_dict,
compress=save_compressed,
quantization_format=quantization_format,
-no_sparse_compression=no_sparse_compression,
+disable_sparse_compression=disable_sparse_compression,
)

return ModelCompressor.from_pretrained_model(
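
For completeness, a hedged sketch of calling `get_model_compressor` directly, mirroring what `save_pretrained_wrapper` does internally. The import path is an assumption (this diff view does not show the third file's path), and `model` is assumed to be the first argument:

```python
# Assumed module path; the diff view above omits this file's location.
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
    get_model_compressor,
)

# Resolve the compressor up front from the model's config and save options.
compressor = get_model_compressor(
    model,
    save_compressed=True,
    disable_sparse_compression=True,
)
if compressor is None:
    # Nothing to compress: the model is saved in its uncompressed form,
    # matching the `if compressor is None` branch in the wrapper above.
    pass
```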
