diff --git a/src/compressed_tensors/quantization/lifecycle/forward.py b/src/compressed_tensors/quantization/lifecycle/forward.py index 5b614f35..d685c0c0 100644 --- a/src/compressed_tensors/quantization/lifecycle/forward.py +++ b/src/compressed_tensors/quantization/lifecycle/forward.py @@ -20,7 +20,7 @@ from compressed_tensors.quantization.cache import QuantizedKVParameterCache from compressed_tensors.quantization.observers.helpers import ( calculate_range, - compute_memoryless_zp_and_scales, + compute_dynamic_scales_and_zp, ) from compressed_tensors.quantization.quant_args import ( QuantizationArgs, @@ -380,7 +380,7 @@ def maybe_calibrate_or_quantize( if args.dynamic: # dynamic quantization - no need to invoke observer - scale, zero_point = compute_memoryless_zp_and_scales(value=value, args=args) + scale, zero_point = compute_dynamic_scales_and_zp(value=value, args=args) else: # static quantization - get previous scale and zero point from layer scale = getattr(module, f"{base_name}_scale") diff --git a/src/compressed_tensors/quantization/observers/helpers.py b/src/compressed_tensors/quantization/observers/helpers.py index b7302510..875a05b3 100644 --- a/src/compressed_tensors/quantization/observers/helpers.py +++ b/src/compressed_tensors/quantization/observers/helpers.py @@ -29,13 +29,14 @@ "calculate_qparams", "get_observer_token_count", "calculate_range", - "compute_memoryless_zp_and_scales", + "compute_dynamic_scales_and_zp", ] -def compute_memoryless_zp_and_scales(value: Tensor, args: QuantizationArgs): +def compute_dynamic_scales_and_zp(value: Tensor, args: QuantizationArgs): """ - Returns the min and max values of observed tensor + Returns the computed scales and zero points for dynamic activation + quantization. 
:param value: tensor to calculate quantization parameters for :param args: quantization args diff --git a/src/compressed_tensors/quantization/quant_args.py b/src/compressed_tensors/quantization/quant_args.py index 920cd99b..339a2a66 100644 --- a/src/compressed_tensors/quantization/quant_args.py +++ b/src/compressed_tensors/quantization/quant_args.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings from enum import Enum from typing import Any, Dict, Optional, Union @@ -171,6 +172,8 @@ def validate_model_after(model: "QuantizationArgs") -> Dict[str, Any]: strategy = model.strategy group_size = model.group_size actorder = model.actorder + dynamic = model.dynamic + observer = model.observer # infer strategy if strategy is None: @@ -207,6 +210,27 @@ def validate_model_after(model: "QuantizationArgs") -> Dict[str, Any]: "activation ordering" ) + # if we have not set an observer and we + # are running static quantization, use minmax + if not observer and not dynamic: + model.observer = "minmax" + + if dynamic: + if strategy not in ( + QuantizationStrategy.TOKEN, + QuantizationStrategy.TENSOR, + ): + raise ValueError( + f"One of {QuantizationStrategy.TOKEN} or " + f"{QuantizationStrategy.TENSOR} must be used for dynamic " + "quantization", + ) + if observer is not None: + warnings.warn( + "No observer is used for dynamic quantization, setting to None" + ) + model.observer = None + # write back modified values model.strategy = strategy return model