From a28f2317a1ade51798c359ed311be669adc01ae1 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 15 Jan 2025 16:59:56 +0000
Subject: [PATCH] apply style

Signed-off-by: Kyle Sayers
---
 examples/quantization_w4a16/gptj_example.py   | 55 +++++++++++++++++++
 examples/quantizing_moe/deepseek_moe_w4a16.py | 10 ++--
 .../transformers/tracing/__init__.py          |  4 +-
 .../deepseek_v2/configuration_deepseek.py     |  3 +-
 .../tracing/deepseek_v2/modeling_deepseek.py  |  3 +-
 5 files changed, 68 insertions(+), 7 deletions(-)
 create mode 100644 examples/quantization_w4a16/gptj_example.py

diff --git a/examples/quantization_w4a16/gptj_example.py b/examples/quantization_w4a16/gptj_example.py
new file mode 100644
index 000000000..b0fb59af8
--- /dev/null
+++ b/examples/quantization_w4a16/gptj_example.py
@@ -0,0 +1,55 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+from llmcompressor.transformers import oneshot
+
+# Select model and load it.
+MODEL_ID = "EleutherAI/gpt-j-6B"
+
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Select calibration dataset.
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+
+# Select number of samples. 512 samples is a good place to start.
+# Increasing the number of samples can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+# Configure the quantization algorithm to run.
+#   * smooth activation outliers into the weights with SmoothQuant
+#   * quantize the weights to 4 bit with GPTQ, using a group size of 128
+recipe = [
+    SmoothQuantModifier(smoothing_strength=0.8),
+    GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
+]
+
+# Apply algorithms.
+oneshot(
+    model=model,
+    dataset="ultrachat-200k",
+    splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"},
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Confirm generations of the quantized model look sane.
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+output = model.generate(input_ids, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+print("==========================================\n\n")
+
+# Save the compressed model to disk.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py
index 31e08fb81..8169c0541 100644
--- a/examples/quantizing_moe/deepseek_moe_w4a16.py
+++ b/examples/quantizing_moe/deepseek_moe_w4a16.py
@@ -1,11 +1,13 @@
-from llmcompressor.transformers.tracing.deepseek_v2.configuration_deepseek import DeepseekV2Config
 import torch
 from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoTokenizer
 
 from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
 from llmcompressor.transformers.tracing import TraceableDeepseekV2ForCausalLM
+from llmcompressor.transformers.tracing.deepseek_v2.configuration_deepseek import (
+    DeepseekV2Config,
+)
 
 # NOTE: transformers 4.48.0 has an import error with DeepSeek.
 # Please consider either downgrading your transformers version to a
@@ -24,7 +26,7 @@
     trust_remote_code=True,
 )
 
-#model = AutoModelForCausalLM.from_pretrained(
+# model = AutoModelForCausalLM.from_pretrained(
 config = DeepseekV2Config.from_pretrained(MODEL_ID)
 config.moe_top_k_activation = True
 model = TraceableDeepseekV2ForCausalLM.from_pretrained(
@@ -32,7 +34,7 @@
     device_map=device_map,
     torch_dtype=torch.bfloat16,
     trust_remote_code=True,
-    config=config
+    config=config,
 )
 
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
diff --git a/src/llmcompressor/transformers/tracing/__init__.py b/src/llmcompressor/transformers/tracing/__init__.py
index e096f41a6..9728e9939 100644
--- a/src/llmcompressor/transformers/tracing/__init__.py
+++ b/src/llmcompressor/transformers/tracing/__init__.py
@@ -5,7 +5,9 @@
 from .mllama import (
     MllamaForConditionalGeneration as TraceableMllamaForConditionalGeneration,
 )
-from .deepseek_v2.modeling_deepseek import DeepseekV2ForCausalLM as TraceableDeepseekV2ForCausalLM
+from .deepseek_v2.modeling_deepseek import (
+    DeepseekV2ForCausalLM as TraceableDeepseekV2ForCausalLM,
+)
 
 __all__ = [
     "TraceableLlavaForConditionalGeneration",
diff --git a/src/llmcompressor/transformers/tracing/deepseek_v2/configuration_deepseek.py b/src/llmcompressor/transformers/tracing/deepseek_v2/configuration_deepseek.py
index 833d86ba5..555ce2417 100644
--- a/src/llmcompressor/transformers/tracing/deepseek_v2/configuration_deepseek.py
+++ b/src/llmcompressor/transformers/tracing/deepseek_v2/configuration_deepseek.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 
@@ -206,4 +207,4 @@ def __init__(
         eos_token_id=eos_token_id,
         tie_word_embeddings=tie_word_embeddings,
         **kwargs,
-    )
\ No newline at end of file
+    )
diff --git a/src/llmcompressor/transformers/tracing/deepseek_v2/modeling_deepseek.py b/src/llmcompressor/transformers/tracing/deepseek_v2/modeling_deepseek.py
index d79f6c2de..d6272cb79 100644
--- a/src/llmcompressor/transformers/tracing/deepseek_v2/modeling_deepseek.py
+++ b/src/llmcompressor/transformers/tracing/deepseek_v2/modeling_deepseek.py
@@ -1,3 +1,4 @@
+# flake8: noqa
# coding=utf-8
 # Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
 #
@@ -1925,4 +1926,4 @@ def forward(
         past_key_values=transformer_outputs.past_key_values,
         hidden_states=transformer_outputs.hidden_states,
         attentions=transformer_outputs.attentions,
-    )
\ No newline at end of file
+    )
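
--
A quick way to sanity-check the checkpoint that the new gptj_example.py
writes out is to reload it for inference. A minimal sketch, assuming a vLLM
build with compressed-tensors support; the "gpt-j-6B-W4A16-G128" path is the
SAVE_DIR the example produces when run from its own directory, and is
otherwise hypothetical:

    from vllm import LLM, SamplingParams

    # Point vLLM at the directory written by model.save_pretrained(...).
    llm = LLM(model="gpt-j-6B-W4A16-G128")

    # Reuse the example's sanity-check prompt.
    outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=100))
    print(outputs[0].outputs[0].text)

If the compressed save succeeded, this should produce coherent text
comparable to the in-script SAMPLE GENERATION output.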