
Commit

Format
mgoin committed Jul 18, 2024
1 parent 7546f76 commit b428604
Showing 1 changed file with 21 additions and 142 deletions.
163 changes: 21 additions & 142 deletions auto_fp8/modeling.py
@@ -7,6 +7,7 @@
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier


class BaseQuantizeConfig:
"""Configuration for model quantization.
@@ -24,6 +25,7 @@ class BaseQuantizeConfig:
By default, "lm_head" is included to ignore the embedding
Linear layer usually at the end of decoder LLMs
"""

def __init__(
self,
quant_method: str = "fp8",
@@ -36,108 +38,41 @@ def __init__(


class AutoFP8ForCausalLM:
    def __init__(
        self, model: SparseAutoModelForCausalLM, quantize_config: BaseQuantizeConfig
    ):
        self.model = model
        self.model_type = self.model.config.model_type
        self.config = self.model.config
        self.quantize_config = quantize_config

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        quantize_config: BaseQuantizeConfig,
        **kwargs,
    ):
        config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
        model = SparseAutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path,
            config=config,
            device_map="auto",
            torch_dtype="auto",
            **kwargs,
        )
        return cls(model, quantize_config)

    def quantize(self, dataset: Optional[Dataset] = None):
        assert (
            self.quantize_config.activation_scheme == "static"
        ), "Dynamic isn't supported yet"
        assert (
            dataset is not None
        ), "Calibration tokens required for static activation quantization"

        recipe = QuantizationModifier(
            targets="Linear", scheme="FP8", ignore=self.quantize_config.ignore_patterns
        )

        oneshot(
@@ -149,64 +84,8 @@ def quantize(self, dataset: Optional[Dataset] = None):
    def save_quantized(self, save_directory: str):
        self.save_pretrained(save_directory, save_compressed=True)

    def save_pretrained(self, save_directory: str, save_compressed: bool = True):
        self.model.save_pretrained(save_directory, save_compressed=save_compressed)
        tokenizer = AutoTokenizer.from_pretrained(self.model.config._name_or_path)
        tokenizer.save_pretrained(save_directory)
        print(f"Saved final checkpoint to {os.path.abspath(save_directory)}")
