
Commit

Format
mgoin committed Jul 18, 2024
1 parent 7546f76 commit b428604
Showing 1 changed file with 21 additions and 142 deletions.
163 changes: 21 additions & 142 deletions auto_fp8/modeling.py
@@ -7,6 +7,7 @@
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier


class BaseQuantizeConfig:
"""Configuration for model quantization.
@@ -24,6 +25,7 @@ class BaseQuantizeConfig:
By default, "lm_head" is included to ignore the embedding
Linear layer usually at the end of decoder LLMs
"""

def __init__(
self,
quant_method: str = "fp8",
@@ -36,108 +38,41 @@ def __init__(


class AutoFP8ForCausalLM:
    def __init__(
        self, model: SparseAutoModelForCausalLM, quantize_config: BaseQuantizeConfig
    ):
        self.model = model
        self.model_type = self.model.config.model_type
        self.config = self.model.config
        self.quantize_config = quantize_config

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        quantize_config: BaseQuantizeConfig,
        **kwargs,
    ):
        config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
        model = SparseAutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path,
            config=config,
            device_map="auto",
            torch_dtype="auto",
            **kwargs,
        )
        return cls(model, quantize_config)

    def quantize(self, dataset: Optional[Dataset] = None):
        assert (
            self.quantize_config.activation_scheme == "static"
        ), "Dynamic isn't supported yet"
        assert (
            dataset is not None
        ), "Calibration tokens required for static activation quantization"

        recipe = QuantizationModifier(
            targets="Linear", scheme="FP8", ignore=self.quantize_config.ignore_patterns
        )

        oneshot(
@@ -149,64 +84,8 @@ def quantize(self, dataset: Optional[Dataset] = None):
    def save_quantized(self, save_directory: str):
        self.save_pretrained(save_directory, save_compressed=True)

    def save_pretrained(self, save_directory: str, save_compressed: bool = True):
        self.model.save_pretrained(save_directory, save_compressed=save_compressed)
        tokenizer = AutoTokenizer.from_pretrained(self.model.config._name_or_path)
        tokenizer.save_pretrained(save_directory)
        print(f"Saved final checkpoint to {os.path.abspath(save_directory)}")
