diff --git a/examples/multimodal_audio/qwen2_audio_example.py b/examples/multimodal_audio/qwen2_audio_example.py
index 70db24523..ff3c9be25 100644
--- a/examples/multimodal_audio/qwen2_audio_example.py
+++ b/examples/multimodal_audio/qwen2_audio_example.py
@@ -1,6 +1,9 @@
 import torch
 from datasets import load_dataset
-from transformers import AutoProcessor
+from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
+import librosa
+from io import BytesIO
+from urllib.request import urlopen
 
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
@@ -11,98 +14,238 @@
 # Select model and load it.
 MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
 
-model = TraceableQwen2AudioForConditionalGeneration.from_pretrained(
+# Use the stock model class while debugging tracing; restore the traceable
+# class once sequential tracing of Qwen2Audio works end-to-end.
+# model = TraceableQwen2AudioForConditionalGeneration.from_pretrained(
+model = Qwen2AudioForConditionalGeneration.from_pretrained(
     MODEL_ID,
     device_map="auto",
     torch_dtype="auto",
 )
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 
-# Select calibration dataset.
-DATASET_ID = "MLCommons/peoples_speech"
-DATASET_SUBSET = "test"
-DATASET_SPLIT = "test"
-
-# Select number of samples. 512 samples is a good place to start.
-# Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 512
-MAX_SEQUENCE_LENGTH = 2048
-
-# Load dataset and preprocess.
-ds = load_dataset(
-    DATASET_ID,
-    DATASET_SUBSET,
-    split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
-    trust_remote_code=True,
-)
+# The calibration and quantization pipeline below is temporarily commented out
+# while the single-sample generation path is being debugged.
+
+# # Select calibration dataset.
+# DATASET_ID = "MLCommons/peoples_speech"
+# DATASET_SUBSET = "test"
+# DATASET_SPLIT = "test"
+
+# # Select number of samples. 512 samples is a good place to start.
+# # Increasing the number of samples can improve accuracy.
+# NUM_CALIBRATION_SAMPLES = 512
+# MAX_SEQUENCE_LENGTH = 2048
+
+# # Load dataset and preprocess.
+# ds = load_dataset(
+#     DATASET_ID,
+#     DATASET_SUBSET,
+#     split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
+#     trust_remote_code=True,
+# )
 
 
-def preprocess(example):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "audio", "audio": None},
-                {"type": "text", "text": "What does the person say?"},
-            ],
-        },
-    ]
+# def preprocess(example):
+#     messages = [
+#         {
+#             "role": "user",
+#             "content": [
+#                 {"type": "audio", "audio": None},
+#                 {"type": "text", "text": "What does the person say?"},
+#             ],
+#         },
+#     ]
+#
+#     return {
+#         "text": processor.apply_chat_template(
+#             messages, add_generation_prompt=True, tokenize=False
+#         ),
+#         "audios": [example["audio"]["array"]],
+#         "sampling_rate": example["audio"]["sampling_rate"],
+#     }
+
 
-    return {
-        "text": processor.apply_chat_template(
-            messages, add_generation_prompt=True, tokenize=False
-        ),
-        "audios": [example["audio"]["array"]],
-        "sampling_rate": example["audio"]["sampling_rate"],
-    }
+# ds = ds.map(preprocess, remove_columns=ds.column_names)
 
 
-ds = ds.map(preprocess, remove_columns=ds.column_names)
+# # Tokenize inputs.
+# def tokenize(sample):
+#     return processor(**sample, return_tensors="pt")
 
 
-# Tokenize inputs.
-def tokenize(sample):
-    return processor(**sample, return_tensors="pt")
+# ds = ds.map(tokenize, remove_columns=ds.column_names)
 
 
-ds = ds.map(tokenize, remove_columns=ds.column_names)
+# Build a single debug sample from a hosted audio file, resampled to the
+# feature extractor's sampling rate.
+AUDIO_URL = (
+    "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"
+)
+messages = [
+    {"role": "user", "content": [{"type": "audio", "audio_url": AUDIO_URL}]}
+]
+
+audio_data = librosa.load(
+    BytesIO(urlopen(AUDIO_URL).read()),
+    sr=processor.feature_extractor.sampling_rate,
+)[0]
+
+text = processor.apply_chat_template(
+    messages, add_generation_prompt=True, tokenize=False
+)
+sample_input = processor(
+    text=text,
+    audios=[audio_data],
+    return_tensors="pt",
+    padding=True,
+)
 
 # Define a oneshot data collator for multimodal inputs.
-def data_collator(batch):
-    assert len(batch) == 1
-    return {key: torch.tensor(value) for key, value in batch[0].items()}
+# def data_collator(batch):
+#     assert len(batch) == 1
+#     return {key: torch.tensor(value) for key, value in batch[0].items()}
 
 # Configure the quantization algorithm to run.
-# * quantize the weights to 4 bit with GPTQ with a group size 128
-recipe = GPTQModifier(
-    targets="Linear",
-    scheme="W4A16",
-    ignore=[
-        # "re:audio_tower.*",
-        #"re:multi_modal_projector.*",
-        "lm_head",
-    ],  # TODO: honestly, there's a decent number of parameters in the audio tower worth quantizing
-)
+# # * quantize the weights to 4 bit with GPTQ with a group size 128
+# recipe = GPTQModifier(
+#     targets="Linear",
+#     scheme="W4A16",
+#     ignore=[
+#         # "re:audio_tower.*",
+#         # "re:multi_modal_projector.*",
+#         "lm_head",
+#     ],  # TODO: a decent number of parameters in the audio tower are worth quantizing
+# )
 
 # Apply algorithms.
-oneshot(
-    model=model,
-    dataset=ds,
-    recipe=recipe,
-    max_seq_length=MAX_SEQUENCE_LENGTH,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    data_collator=data_collator,
-)
+# oneshot(
+#     model=model,
+#     dataset=ds,
+#     recipe=recipe,
+#     max_seq_length=MAX_SEQUENCE_LENGTH,
+#     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+#     data_collator=data_collator,
+# )
 
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
-breakpoint()
-sample_input = data_collator([next(iter(ds))])
+# sample_input = data_collator([next(iter(ds))])
 sample_input = {k: v.to(model.device) for k, v in sample_input.items()}
-output = model.generate(**sample_input)
+output = model.generate(**sample_input, max_new_tokens=256)
 print(processor.batch_decode(output, skip_special_tokens=True)[0])
 print("==========================================\n\n")
 # that's where you have a lot of windows in the south no actually that's passive solar
@@ -110,6 +253,6 @@ def data_collator(batch):
 # and it was a great thing for what it was at the time but it's not a passive house
 
 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
+# SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+# model.save_pretrained(SAVE_DIR, save_compressed=True)
+# processor.save_pretrained(SAVE_DIR)
diff --git a/examples/multimodal_vision/janus_example.py b/examples/multimodal_vision/janus_example.py
new file mode 100644
index 000000000..59f800eb1
--- /dev/null
+++ b/examples/multimodal_vision/janus_example.py
@@ -0,0 +1,76 @@
+import requests
+import torch
+from PIL import Image
+from transformers import AutoModelForCausalLM, AutoProcessor
+
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.transformers import oneshot
+
+# Load model.
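+# NOTE (assumption): there is currently no Janus-specific traceable class in
+# llmcompressor.transformers.tracing, so the stock auto classes are used here.
+# If tracing the model graph fails, GPTQ is expected to fall back to the
+# layer_sequential pipeline (see llmcompressor.modifiers.quantization.gptq.base).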
+model_id = "deepseek-ai/Janus-Pro-7B" +model = AutoModelForCausalLM.from_pretrained( + model_id, device_map="auto", torch_dtype="auto" +) +processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) + +# Oneshot arguments +DATASET_ID = "flickr30k" +DATASET_SPLIT = {"calibration": "test[:512]"} +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Define a oneshot data collator for multimodal inputs. +def data_collator(batch): + assert len(batch) == 1 + return {key: torch.tensor(value) for key, value in batch[0].items()} + + +# Recipe +recipe = [ + GPTQModifier( + targets="Linear", + scheme="W4A16", + sequential_targets=["LlamaDecoderLayer"], + ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"], + ), +] + +# Perform oneshot +oneshot( + model=model, + tokenizer=model_id, + dataset=DATASET_ID, + splits=DATASET_SPLIT, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, + data_collator=data_collator, +) + +# Confirm generations of the quantized model look sane. +print("========== SAMPLE GENERATION ==============") +messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Please describe the animal in this image\n"}, + {"type": "image"}, + ], + }, +] +prompt = processor.apply_chat_template(messages, add_generation_prompt=True) +image_url = "http://images.cocodataset.org/train2017/000000231895.jpg" +raw_image = Image.open(requests.get(image_url, stream=True).raw) + +inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda") +output = model.generate(**inputs, max_new_tokens=100) +print(processor.decode(output[0], skip_special_tokens=True)) +print("==========================================") + +# Save to disk compressed. 
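+# save_compressed=True stores the quantized weights in the compressed-tensors
+# format, which runtimes such as vLLM can load directly.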
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py
index 5e8a6b47e..42492e363 100644
--- a/src/llmcompressor/modifiers/quantization/gptq/base.py
+++ b/src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -251,6 +251,8 @@ def on_initialize(self, state: State, **kwargs) -> bool:
             )
             if isinstance(exception, unfixable_errors):
                 raise exception
+
+            # TEMP(debug): re-raise tracing failures instead of falling back to
+            # the layer_sequential pipeline; remove once tracing is fixed
+            raise exception
 
             warnings.warn("Falling back to layer_sequential pipeline")
             try:
diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py
index 4945ba01e..743b2b138 100644
--- a/src/llmcompressor/pipelines/sequential/helpers.py
+++ b/src/llmcompressor/pipelines/sequential/helpers.py
@@ -70,11 +70,15 @@ def trace_subgraphs(
     tracer = get_tracer(model, sequential_targets, ignore)
     concrete_args = populate_concrete_args(model, sample_input)
 
+    # trace the model under the calibration forward context, with hooks
+    # disabled so that hook calls are not captured in the traced graph
     with (
         calibration_forward_context(model),
         HooksMixin.disable_hooks(),
     ):
+        # TEMP(debug): run one eager forward pass first so that input or model
+        # errors surface with a normal traceback before tracing begins
+        model(**sample_input, **concrete_args)
         graph = GraphModule(
             model,
             tracer.trace(
diff --git a/src/llmcompressor/transformers/finetune/data/peoples_speech.py b/src/llmcompressor/transformers/finetune/data/peoples_speech.py
new file mode 100644
index 000000000..2e00b14a2
--- /dev/null
+++ b/src/llmcompressor/transformers/finetune/data/peoples_speech.py
@@ -0,0 +1,31 @@
+from copy import deepcopy
+from typing import TYPE_CHECKING
+
+from llmcompressor.transformers.finetune.data import TextGenerationDataset
+from llmcompressor.typing import Processor
+
+if TYPE_CHECKING:
+    from llmcompressor.transformers import DataTrainingArguments as DataArgs
+
+
+@TextGenerationDataset.register(name="peoples_speech")
+class PeoplesSpeech(TextGenerationDataset):
+    """
+    :param data_args: configuration settings for dataset loading
+    :param split: split from dataset to load, for instance `test` or `train[:5%]`
+    :param processor: processor or tokenizer to use on dataset
+    """
+
+    def __init__(self, data_args: "DataArgs", split: str, processor: Processor):
+        data_args = deepcopy(data_args)
+        data_args.dataset = "MLCommons/peoples_speech"
+        data_args.dataset_config_name = "test"
+
+        super().__init__(data_args=data_args, split=split, processor=processor)
+
+    def dataset_template(self, example):
+        return {
+            "audio": example["audio"]["array"],
+            "sampling_rate": example["audio"]["sampling_rate"],
+            "text": " " + example["text"].capitalize(),
+        }
diff --git a/src/llmcompressor/transformers/tracing/qwen2_audio.py b/src/llmcompressor/transformers/tracing/qwen2_audio.py
index 06dc1ac8e..ee89e44ef 100644
--- a/src/llmcompressor/transformers/tracing/qwen2_audio.py
+++ b/src/llmcompressor/transformers/tracing/qwen2_audio.py
@@ -1,4 +1,3 @@
-# flake8: noqa
 # coding=utf-8
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
 #
@@ -13,7 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# vllm-project: no copyright
 """PyTorch Qwen2Audio model."""
 
 import math
@@ -225,7 +223,6 @@ class Qwen2AudioFlashAttention2(Qwen2AudioAttention):
     flash attention and deal with padding tokens in case the input contains any of them.
""" - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -859,6 +856,9 @@ def __init__(self, config: Qwen2AudioConfig): self.multi_modal_projector = Qwen2AudioMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self._padding_side = "left" # set it to left by default, user can use setter to change padding_sides self.post_init() @@ -897,18 +897,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.tie_weights - def tie_weights(self): - return self.language_model.tie_weights() - - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.resize_token_embeddings - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: - model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) - # update vocab size - self.config.text_config.vocab_size = model_embeds.num_embeddings - self.vocab_size = model_embeds.num_embeddings - return model_embeds - def _merge_input_ids_with_audio_features( self, audio_features, num_audio_tokens, inputs_embeds, input_ids, attention_mask, labels ): @@ -1092,9 +1080,7 @@ def _merge_input_ids_with_audio_features( audio_to_overwrite &= val - # TRACING - #if audio_to_overwrite.sum() != num_audio_tokens.sum(): - if False: + if audio_to_overwrite.sum() != num_audio_tokens.sum(): raise ValueError( f"The input provided to the model are wrong. The number of audio tokens is {num_special_audio_tokens} while" f" the number of audio given to the model is {num_audios}. This prevents correct indexing and breaks batch generation." @@ -1202,9 +1188,34 @@ def forward( selected_audio_feature = audio_outputs.last_hidden_state audio_features = self.multi_modal_projector(selected_audio_feature) - inputs_embeds, attention_mask, labels, position_ids, _ = self._merge_input_ids_with_audio_features( - audio_features, audio_output_lengths, inputs_embeds, input_ids, attention_mask, labels - ) + # if we have consecutive audio tokens, then it means we expanded input_ids in processing + audio_tokens = input_ids == self.config.audio_token_index + legacy_processing = (audio_tokens[:, :-1] & audio_tokens[:, 1:]).sum() == 0 + + if legacy_processing: + logger.warning_once( + "Expanding inputs for audio tokens in Qwen2Audio should be done in processing." 
+                )
+                inputs_embeds, attention_mask, labels, position_ids, _ = self._merge_input_ids_with_audio_features(
+                    audio_features, audio_output_lengths, inputs_embeds, input_ids, attention_mask, labels
+                )
+            else:
+                num_audios, max_audio_tokens, embed_dim = audio_features.shape
+                audio_features_mask = torch.arange(max_audio_tokens, device=audio_output_lengths.device)[None, :]
+                audio_features_mask = audio_features_mask < audio_output_lengths[:, None]
+                audio_features = audio_features[audio_features_mask]
+
+                n_audio_tokens = (input_ids == self.config.audio_token_index).sum().item()
+                n_audio_features = audio_features.shape[0]
+
+                if n_audio_tokens != n_audio_features:
+                    raise ValueError(
+                        f"Audio features and audio tokens do not match: tokens: {n_audio_tokens}, features {n_audio_features}"
+                    )
+                special_audio_mask = (input_ids == self.config.audio_token_index).to(inputs_embeds.device)
+                special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds)
+                audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
+                inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features)
 
         outputs = self.language_model(
             attention_mask=attention_mask,
@@ -1368,3 +1379,6 @@ def _update_model_kwargs_for_generation(
 
     def _reorder_cache(self, *args, **kwargs):
         return self.language_model._reorder_cache(*args, **kwargs)
+
+
+__all__ = ["Qwen2AudioForConditionalGeneration", "Qwen2AudioPreTrainedModel", "Qwen2AudioEncoder"]