Fix the condition for asymmetric quantization #225

Draft · wants to merge 2 commits into base: main
@@ -387,7 +387,15 @@ def map_modules_to_quant_args(model: Module) -> Dict[str, QuantizationArgs]:
         if is_module_quantized(submodule):
             if submodule.quantization_scheme.weights is not None:
                 name = fix_fsdp_module_name(name)
-                quantized_modules_to_args[name] = submodule.quantization_scheme.weights
+                quantized_modules_to_args[name] = (
+                    submodule.quantization_scheme.weights,
+                )
+                if submodule.quantization_scheme.input_activations is not None:
+                    weight_args = quantized_modules_to_args.get(name)[0]
+                    quantized_modules_to_args[name] = (
+                        weight_args,
+                        submodule.quantization_scheme.input_activations,
+                    )
 
     return quantized_modules_to_args

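In effect, `quantized_modules_to_args` now maps each quantized module name to a tuple instead of a bare `QuantizationArgs`: `(weight_args,)` for weight-only schemes, and `(weight_args, input_activation_args)` when input activations are quantized as well. This is why the compressor below indexes `[0]` and `[1]` into the scheme entry. A minimal sketch of the new shape (the module names and the `DummyArgs` stand-in are illustrative, not from the PR):

import
from typing import NamedTuple

# Hypothetical stand-in for compressed-tensors' QuantizationArgs, used only
# to illustrate the tuple layout this PR introduces.
class DummyArgs(NamedTuple):
    symmetric: bool

weight_args = DummyArgs(symmetric=True)
input_args = DummyArgs(symmetric=False)

# Weight-only scheme: the value is now a one-element tuple, not a bare args object.
quantized_modules_to_args = {"model.layers.0.q_proj": (weight_args,)}

# Weight + input-activation scheme: a two-element tuple.
quantized_modules_to_args["model.layers.0.k_proj"] = (weight_args, input_args)

# Consumers index into the tuple instead of using the value directly.
for name, args in quantized_modules_to_args.items():
    w_args = args[0]                             # always present
    a_args = args[1] if len(args) > 1 else None  # only with activation quantization
    print(name, w_args.symmetric, getattr(a_args, "symmetric", None))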
17 changes: 15 additions & 2 deletions src/compressed_tensors/compressors/quantized_compressors/base.py
@@ -77,19 +77,24 @@ def compress(
         """
         compressed_dict = {}
         weight_suffix = ".weight"
+        input_zp_suffix = ".input_zero_point"
+        weight_zp_suffix = ".weight_zero_point"
         _LOGGER.debug(
             f"Compressing model with {len(model_state)} parameterized layers..."
         )
 
         for name, value in tqdm(model_state.items(), desc="Quantized Compression"):
+            weight_zp = name.endswith(weight_zp_suffix)
+            input_zp = name.endswith(input_zp_suffix)
+
             if name.endswith(weight_suffix):
                 prefix = name[: -(len(weight_suffix))]
                 scale = model_state.get(merge_names(prefix, "weight_scale"), None)
                 zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
                 g_idx = model_state.get(merge_names(prefix, "weight_g_idx"), None)
                 if scale is not None:
                     # weight is quantized, compress it
-                    quant_args = names_to_scheme[prefix]
+                    quant_args = names_to_scheme[prefix][0]
                     compressed_data = self.compress_weight(
                         weight=value,
                         scale=scale,
@@ -102,7 +107,15 @@ def compress(
                         compressed_dict[merge_names(prefix, key)] = value
                 else:
                     compressed_dict[name] = value.to("cpu")
-            elif name.endswith("zero_point") and torch.all(value == 0):
+            elif (
+                weight_zp
+                and names_to_scheme.get(name[: -(len(weight_zp_suffix))])[0].symmetric
+            ):
+                continue
+            elif (
+                input_zp
+                and names_to_scheme.get(name[: -(len(input_zp_suffix))])[1].symmetric
+            ):
                 continue
             elif name.endswith("g_idx") and torch.any(value <= -1):
                 continue
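The fixed condition decides whether a zero point can be omitted from the checkpoint based on the scheme's declared `symmetric` flag rather than on `torch.all(value == 0)`. Under the old test, an asymmetric zero point that happened to be all zeros was silently dropped, and weight and input-activation zero points were not distinguished. A self-contained sketch of the new decision logic (the helper name is hypothetical, and `DummyArgs` again stands in for `QuantizationArgs`):

from collections import namedtuple
import torch

# Hypothetical stand-in for QuantizationArgs, as in the earlier sketch.
DummyArgs = namedtuple("DummyArgs", "symmetric")

def should_skip_zero_point(name, value, names_to_scheme) -> bool:
    """Return True when a zero point need not be serialized.

    Mirrors the fixed condition: a weight zero point consults index 0 of the
    (weight_args, input_activation_args) tuple, an input zero point index 1;
    the tensor's runtime values (`value`) are deliberately not consulted.
    """
    for suffix, idx in ((".weight_zero_point", 0), (".input_zero_point", 1)):
        if name.endswith(suffix):
            scheme = names_to_scheme.get(name[: -len(suffix)])
            # Guard against missing or weight-only schemes before indexing.
            return scheme is not None and len(scheme) > idx and scheme[idx].symmetric
    return False

# A symmetric weight zero point is skipped; an asymmetric input zero point is
# kept even when it is currently all zeros.
names_to_scheme = {"layer": (DummyArgs(symmetric=True), DummyArgs(symmetric=False))}
print(should_skip_zero_point("layer.weight_zero_point", torch.zeros(1), names_to_scheme))  # True
print(should_skip_zero_point("layer.input_zero_point", torch.zeros(1), names_to_scheme))   # False

Note that the sketch guards the tuple lookup before indexing, whereas the diff's `names_to_scheme.get(...)[...]` assumes the prefix is always present; the guard is a defensive choice in the sketch, not part of the PR.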