diff --git a/auto_fp8/modeling.py b/auto_fp8/modeling.py
index 2a4637a..eb4d2ba 100644
--- a/auto_fp8/modeling.py
+++ b/auto_fp8/modeling.py
@@ -7,6 +7,7 @@
 from llmcompressor.transformers import oneshot
 from llmcompressor.modifiers.quantization import QuantizationModifier
 
+
 class BaseQuantizeConfig:
     """Configuration for model quantization.
 
@@ -24,6 +25,7 @@ class BaseQuantizeConfig:
             By default, "lm_head" is included to ignore the embedding
             Linear layer usually at the end of decoder LLMs
     """
+
     def __init__(
         self,
         quant_method: str = "fp8",
@@ -36,108 +38,41 @@ def __init__(
 
 
 class AutoFP8ForCausalLM:
-    def __init__(self, model: SparseAutoModelForCausalLM, quantize_config: BaseQuantizeConfig):
+    def __init__(
+        self, model: SparseAutoModelForCausalLM, quantize_config: BaseQuantizeConfig
+    ):
         self.model = model
         self.model_type = self.model.config.model_type
         self.config = self.model.config
-<<<<<<< HEAD
-
-        # Gather the Linear module names that we want to ignore
-        quantize_config.ignored_layers = get_layers_to_ignore(
-            self.model, quantize_config.ignore_patterns
-        )
-
-        if quantize_config.kv_cache_quant_targets:
-<<<<<<< HEAD
-<<<<<<< HEAD
-            kv_cache_quant_layers = get_kv_cache_quant_layers(
-=======
-            kv_cache_quant_layers = get_kv_cache_quant_layer(
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
-            kv_cache_quant_layers = get_kv_cache_quant_layers(
->>>>>>> c3acdee (Switch from output_scale to kv_scale)
-                self.model, quantize_config.kv_cache_quant_targets
-            )
-            if len(kv_cache_quant_layers) == 0:
-                raise ValueError(
-                    f"Could not find any kv cache layers using kv_cache_quant_targets={quantize_config.kv_cache_quant_targets}, please fix your argument."
-                )
-            quantize_config.kv_cache_quant_layers = kv_cache_quant_layers
-
-=======
->>>>>>> ba7d420 (Switch backend to use llm-compressor)
         self.quantize_config = quantize_config
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: str, quantize_config: BaseQuantizeConfig, **kwargs):
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: str,
+        quantize_config: BaseQuantizeConfig,
+        **kwargs,
+    ):
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
         model = SparseAutoModelForCausalLM.from_pretrained(
             pretrained_model_name_or_path,
             config=config,
             device_map="auto",
             torch_dtype="auto",
-            **kwargs
+            **kwargs,
         )
         return cls(model, quantize_config)
 
-<<<<<<< HEAD
-    def quantize(self, calibration_tokens: Optional[torch.Tensor] = None):
-<<<<<<< HEAD
-<<<<<<< HEAD
-=======
-        def _prepare_calibration_data(calibration_tokens):
-            if hasattr(calibration_tokens, "input_ids"):
-                return calibration_tokens.input_ids
-            return calibration_tokens
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
->>>>>>> 2739d61 (Add Qwen test)
-
-        # Always quantize the weights as they do not require calibration data
-        quantize_weights(self.model, self.quantize_config)
-
-        if self.quantize_config.activation_scheme == "static":
-            assert (
-                calibration_tokens is not None
-            ), "Calibration tokens required for activation quantization"
-<<<<<<< HEAD
-<<<<<<< HEAD
-=======
->>>>>>> 2739d61 (Add Qwen test)
-
-
-            def _prepare_calibration_data(calibration_tokens):
-                if hasattr(calibration_tokens, "input_ids"):
-                    return calibration_tokens.input_ids
-                return calibration_tokens
-
-<<<<<<< HEAD
-=======
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
->>>>>>> 2739d61 (Add Qwen test)
-            quantize_activations(
-                self.model,
-                self.quantize_config,
-                _prepare_calibration_data(calibration_tokens),
-            )
-
-    def save_quantized(self, save_dir):
-        save_quantized_model(
-            self.model,
-            quant_config=self.quantize_config,
-            save_dir=save_dir,
-=======
     def quantize(self, dataset: Optional[Dataset] = None):
-        assert self.quantize_config.activation_scheme == "static"
-        assert dataset is not None, "Calibration tokens required for static activation quantization"
+        assert (
+            self.quantize_config.activation_scheme == "static"
+        ), "Dynamic isn't supported yet"
+        assert (
+            dataset is not None
+        ), "Calibration tokens required for static activation quantization"
 
         recipe = QuantizationModifier(
-            targets="Linear",
-            scheme="FP8",
-            ignore=self.quantize_config.ignore_patterns
->>>>>>> ba7d420 (Switch backend to use llm-compressor)
+            targets="Linear", scheme="FP8", ignore=self.quantize_config.ignore_patterns
        )
 
         oneshot(
@@ -149,64 +84,8 @@ def quantize(self, dataset: Optional[Dataset] = None):
 
     def save_quantized(self, save_directory: str):
         self.save_pretrained(save_directory, save_compressed=True)
-<<<<<<< HEAD
-    for name, linear in model.named_modules():
-        if not isinstance(linear, torch.nn.Linear):
-            continue
-
-        for ignore_pattern in ignore_patterns:
-            regex_prefix = "re:"
-            if ignore_pattern.startswith(regex_prefix):
-                # check if name matches regex and add to set if true
-                regex_pattern = ignore_pattern[len(regex_prefix) :]
-                if re.search(regex_pattern, name):
-                    ignored_layers.add(name)
-            else:
-                # else, exact match
-                if ignore_pattern == name:
-                    ignored_layers.add(name)
-
-    return list(ignored_layers)
-
-
-<<<<<<< HEAD
-<<<<<<< HEAD
-def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
-    kv_cache_quant_layers = []
-=======
-def get_kv_cache_quant_layer(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
-    kv_cache_quant_layers = set()
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
-def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
-    kv_cache_quant_layers = []
->>>>>>> c3acdee (Switch from output_scale to kv_scale)
-
-    for name, linear in model.named_modules():
-        if not isinstance(linear, torch.nn.Linear):
-            continue
-
-        for output_quant_target in kv_cache_quant_targets:
-            if name.endswith(output_quant_target):
-<<<<<<< HEAD
-<<<<<<< HEAD
-                kv_cache_quant_layers.append(name)
-
-    return kv_cache_quant_layers
-=======
-                kv_cache_quant_layers.add(name)
-
-    return list(kv_cache_quant_layers)
->>>>>>> 3ee9283 (Support calibrating kv cache scales)
-=======
-                kv_cache_quant_layers.append(name)
-
-    return kv_cache_quant_layers
->>>>>>> c3acdee (Switch from output_scale to kv_scale)
-=======
-    def save_pretrained(self, save_directory: str, save_compressed: bool = True):
+    def save_pretrained(self, save_directory: str, save_compressed: bool = True):
         self.model.save_pretrained(save_directory, save_compressed=save_compressed)
         tokenizer = AutoTokenizer.from_pretrained(self.model.config._name_or_path)
         tokenizer.save_pretrained(save_directory)
-        print(f"Saved final checkpoint to {os.path.abspath(save_directory)}")
->>>>>>> ba7d420 (Switch backend to use llm-compressor)
+        print(f"Saved final checkpoint to {os.path.abspath(save_directory)}")
\ No newline at end of file
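
Usage sketch (not part of the patch above): with the llm-compressor backend, quantize() now takes a Hugging Face datasets.Dataset of calibration samples instead of raw token tensors, and save_quantized() delegates to save_pretrained(save_compressed=True). The snippet below shows how the new from_pretrained / quantize / save_quantized flow would be driven; the model id, calibration corpus, and tokenization settings are illustrative assumptions, not values taken from this change.

    from datasets import load_dataset
    from transformers import AutoTokenizer

    from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

    # Assumed example model and calibration corpus; substitute your own.
    model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

    quantize_config = BaseQuantizeConfig(
        quant_method="fp8",
        activation_scheme="static",  # quantize() currently asserts "static"
        ignore_patterns=["lm_head"],
    )

    # Build a small tokenized calibration dataset; quantize() forwards it to oneshot().
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft[:32]")
    ds = ds.map(
        lambda sample: tokenizer(
            tokenizer.apply_chat_template(sample["messages"], tokenize=False),
            max_length=2048,
            truncation=True,
        ),
        remove_columns=ds.column_names,
    )

    model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config=quantize_config)
    model.quantize(ds)  # applies the FP8 QuantizationModifier recipe via oneshot()
    model.save_quantized("Meta-Llama-3-8B-Instruct-FP8")  # compressed checkpoint + tokenizer

The dataset preparation mirrors typical llm-compressor calibration examples and is only one way to feed oneshot(); any pre-tokenized Dataset with input_ids should work under the same assumptions.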