gibberish is produced, even when the model is exactly copied
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
kylesayrs committed Jan 28, 2025
1 parent f8ebc5c commit 8c40a65
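
The commit title records the symptom under investigation: generations degrade into gibberish even when the traceable model definition is an exact copy of the stock class. One minimal way to check the "exactly copied" premise (a hypothetical debugging helper, not part of this commit) is to diff the weights of the two instantiations directly:

import torch

def models_match(a: torch.nn.Module, b: torch.nn.Module) -> bool:
    # Same parameter/buffer names and bitwise-equal tensors mean the copy
    # really is exact, so any gibberish must come from the forward pass or
    # the input pipeline rather than from the weights.
    sd_a, sd_b = a.state_dict(), b.state_dict()
    return sd_a.keys() == sd_b.keys() and all(
        torch.equal(sd_a[k], sd_b[k]) for k in sd_a
    )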
Showing 6 changed files with 358 additions and 88 deletions.
277 changes: 210 additions & 67 deletions examples/multimodal_audio/qwen2_audio_example.py
@@ -1,6 +1,9 @@
 import torch
 from datasets import load_dataset
-from transformers import AutoProcessor
+from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
+import soundfile as sf
+from io import BytesIO
+from urllib.request import urlopen
 
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
@@ -11,105 +14,245 @@
 # Select model and load it.
 MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
 
-model = TraceableQwen2AudioForConditionalGeneration.from_pretrained(
+#model = TraceableQwen2AudioForConditionalGeneration.from_pretrained(
+model = Qwen2AudioForConditionalGeneration.from_pretrained(
     MODEL_ID,
     device_map="auto",
     torch_dtype="auto",
 )
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 
-# Select calibration dataset.
-DATASET_ID = "MLCommons/peoples_speech"
-DATASET_SUBSET = "test"
-DATASET_SPLIT = "test"
-
-# Select number of samples. 512 samples is a good place to start.
-# Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 512
-MAX_SEQUENCE_LENGTH = 2048
-
-# Load dataset and preprocess.
-ds = load_dataset(
-    DATASET_ID,
-    DATASET_SUBSET,
-    split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
-    trust_remote_code=True,
-)
+# # Select calibration dataset.
+# DATASET_ID = "MLCommons/peoples_speech"
+# DATASET_SUBSET = "test"
+# DATASET_SPLIT = "test"
+
+# # Select number of samples. 512 samples is a good place to start.
+# # Increasing the number of samples can improve accuracy.
+# NUM_CALIBRATION_SAMPLES = 1 #512
+# MAX_SEQUENCE_LENGTH = 2048
+
+# # Load dataset and preprocess.
+# ds = load_dataset(
+#     DATASET_ID,
+#     DATASET_SUBSET,
+#     split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
+#     trust_remote_code=True,
+# )


+# def preprocess(example):
+# messages = [
+# # {"role": "system", "content": "You are a helpful assistant."},
+# {"role": "user", "content": [{"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
+# # {"role": "user", "content": [{"type": "text", "text": "What does the person say?"}]},
+# ]}
+# ]
+
+# audio_data = example["audio"]["array"]
+# sample_rate = example["audio"]["sampling_rate"]
+
+# # import librosa
+# # new_sr = processor.feature_extractor.sampling_rate
+# # audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=new_sr)
+# # sample_rate = new_sr
+
+# #processor.feature_extractor.sampling_rate
+
+# # # Create an in-memory buffer
+# # import io
+# # buffer = io.BytesIO()
+
+# # # Write the audio data to the in-memory buffer in WAV format
+# # sf.write(buffer, audio_data, sample_rate, format='WAV')
+
+# # import librosa
+# # audio_data, sample_rate = librosa.load(buffer, sr=sample_rate)
+
+# import librosa
+# audio_data = librosa.load(
+# BytesIO(urlopen("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav").read()),
+# sr=processor.feature_extractor.sampling_rate
+# )[0]
+
+# return {
+# "text": processor.apply_chat_template(
+# messages, add_generation_prompt=True, tokenize=False
+# ),
+# #"audios": [example["audio"]["array"]],
+# "audios": [audio_data],
+# #"array": example["audio"]["array"],
+# #"sampling_rate": example["audio"]["sampling_rate"],
+# "sampling_rate": sample_rate,
+# #"sampling_rate": processor.feature_extractor.sampling_rate
+# }
+
+
+# ds = ds.map(preprocess, remove_columns=ds.column_names)
+
+
+# # Tokenize inputs.
+# def tokenize(sample):
+# return processor(**sample, return_tensors="pt")
+
+# # Process inputs.
+# def process(sample):
+
+# messages = [
+# {"role": "user", "content": [{"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"}]}
+# ]
+
+# # import librosa
+# # new_sr = processor.feature_extractor.sampling_rate
+# # audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=new_sr)
+# # sample_rate = new_sr
+
+# #processor.feature_extractor.sampling_rate

-def preprocess(example):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "audio", "audio": None},
-                {"type": "text", "text": "What does the person say?"},
-            ],
-        },
-    ]
+# # # Create an in-memory buffer
+# # import io
+# # buffer = io.BytesIO()
+
-    return {
-        "text": processor.apply_chat_template(
-            messages, add_generation_prompt=True, tokenize=False
-        ),
-        "audios": [example["audio"]["array"]],
-        "sampling_rate": example["audio"]["sampling_rate"],
-    }
+# # # Write the audio data to the in-memory buffer in WAV format
+# # sf.write(buffer, audio_data, sample_rate, format='WAV')
+
+# # import librosa
+# # audio_data, sample_rate = librosa.load(buffer, sr=sample_rate)
+
-ds = ds.map(preprocess, remove_columns=ds.column_names)
+# import librosa
+# audio_data = librosa.load(
+# BytesIO(urlopen("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav").read()),
+# sr=processor.feature_extractor.sampling_rate
+# )[0]

+# return processor(
+# text=processor.apply_chat_template(
+# messages, add_generation_prompt=True, tokenize=False
+# ),
+# #audio=sample["array"],
+# audios=[audio_data],
+# #sampling_rate=sample["sampling_rate"],
+# #sampling_rate=sample["sampling_rate"],
+# #add_special_tokens=True,
+# return_tensors="pt",
+# padding=True
+# )

-# Tokenize inputs.
-def tokenize(sample):
-    return processor(**sample, return_tensors="pt")


-ds = ds.map(tokenize, remove_columns=ds.column_names)

+# audio_inputs = processor(
+# text=sample["text"],
+# #audio=sample["array"],
+# audios=sample["audios"],
+# #sampling_rate=sample["sampling_rate"],
+# #sampling_rate=sample["sampling_rate"],
+# #add_special_tokens=True,
+# return_tensors="pt",
+# padding=True
+# )
+# return audio_inputs
+
+# text_inputs = processor(
+# text=sample["text"], add_special_tokens=True, return_tensors="pt"
+# )
+# text_inputs["decoder_input_ids"] = text_inputs["input_ids"]
+# del text_inputs["input_ids"]
+
+# return dict(**audio_inputs, **text_inputs)


+# #ds = ds.map(tokenize, remove_columns=ds.column_names)
+# ds = ds.map(process, remove_columns=ds.column_names)

+messages = [
+    {"role": "user", "content": [{"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"}]}
+]
+
+# import librosa
+# new_sr = processor.feature_extractor.sampling_rate
+# audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=new_sr)
+# sample_rate = new_sr
+
+#processor.feature_extractor.sampling_rate
+
+# # Create an in-memory buffer
+# import io
+# buffer = io.BytesIO()
+
+# # Write the audio data to the in-memory buffer in WAV format
+# sf.write(buffer, audio_data, sample_rate, format='WAV')
+
+# import librosa
+# audio_data, sample_rate = librosa.load(buffer, sr=sample_rate)
+
+import librosa
+audio_data = librosa.load(
+    BytesIO(urlopen("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav").read()),
+    sr=processor.feature_extractor.sampling_rate
+)[0]
+
+text = processor.apply_chat_template(
+    messages, add_generation_prompt=True, tokenize=False
+)
+
+breakpoint()
+sample_input = processor(
+    text=text,
+    #audio=sample["array"],
+    audios=[audio_data],
+    #sampling_rate=sample["sampling_rate"],
+    #sampling_rate=sample["sampling_rate"],
+    #add_special_tokens=True,
+    return_tensors="pt",
+    padding=True
+)
+breakpoint()


 # Define a oneshot data collator for multimodal inputs.
-def data_collator(batch):
-    assert len(batch) == 1
-    return {key: torch.tensor(value) for key, value in batch[0].items()}
+# def data_collator(batch):
+# assert len(batch) == 1
+# return {key: torch.tensor(value) for key, value in batch[0].items()}


 # Configure the quantization algorithm to run.
-# * quantize the weights to 4 bit with GPTQ with a group size 128
-recipe = GPTQModifier(
-    targets="Linear",
-    scheme="W4A16",
-    ignore=[
-        # "re:audio_tower.*",
-        #"re:multi_modal_projector.*",
-        "lm_head",
-    ], # TODO: honestly, there's a decent number of parameters in the audio tower worth quantizing
-)
+# # * quantize the weights to 4 bit with GPTQ with a group size 128
+# recipe = GPTQModifier(
+# targets="Linear",
+# scheme="W4A16",
+# ignore=[
+# # "re:audio_tower.*",
+# # "re:multi_modal_projector.*",
+# "lm_head",
+# ], # TODO: honestly, there's a decent number of parameters in the audio tower worth quantizing
+# )

 # Apply algorithms.
-oneshot(
-    model=model,
-    dataset=ds,
-    recipe=recipe,
-    max_seq_length=MAX_SEQUENCE_LENGTH,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    data_collator=data_collator,
-)
+# oneshot(
+# model=model,
+# dataset=ds,
+# recipe=recipe,
+# max_seq_length=MAX_SEQUENCE_LENGTH,
+# num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+# data_collator=data_collator,
+# )

 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
+breakpoint()
-sample_input = data_collator([next(iter(ds))])
+#sample_input = data_collator([next(iter(ds))])
+#sample_input = ds[0]
 sample_input = {k: v.to(model.device) for k, v in sample_input.items()}
-output = model.generate(**sample_input)
+output = model.generate(**sample_input, max_new_tokens=256)
 print(processor.batch_decode(output, skip_special_tokens=True)[0])
 print("==========================================\n\n")
+# that's where you have a lot of windows in the south no actually that's passive solar
+# and passive solar is something that was developed and designed in the 1960s and 70s
+# and it was a great thing for what it was at the time but it's not a passive house

 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
+# SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+# model.save_pretrained(SAVE_DIR, save_compressed=True)
+# processor.save_pretrained(SAVE_DIR)
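
For reference, the lines this commit removes or comments out still compose into the calibration flow the example is converging toward. Re-assembled into one runnable sketch (taken entirely from the removed lines above, with the model class left as this commit sets it, so it reflects the file's own pieces rather than anything new):

import torch
from datasets import load_dataset
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Calibration data: short speech clips paired with a transcription prompt.
ds = load_dataset(
    "MLCommons/peoples_speech",
    "test",
    split=f"test[:{NUM_CALIBRATION_SAMPLES}]",
    trust_remote_code=True,
)


def preprocess(example):
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": None},
                {"type": "text", "text": "What does the person say?"},
            ],
        },
    ]
    return {
        "text": processor.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=False
        ),
        "audios": [example["audio"]["array"]],
        "sampling_rate": example["audio"]["sampling_rate"],
    }


def tokenize(sample):
    return processor(**sample, return_tensors="pt")


ds = ds.map(preprocess, remove_columns=ds.column_names)
ds = ds.map(tokenize, remove_columns=ds.column_names)


def data_collator(batch):
    # oneshot feeds batches of size one; unwrap and convert to tensors
    assert len(batch) == 1
    return {key: torch.tensor(value) for key, value in batch[0].items()}


recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
)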
76 changes: 76 additions & 0 deletions examples/multimodal_vision/janus_example.py
@@ -0,0 +1,76 @@
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration

# Load model.
model_id = "deepseek-ai/Janus-Pro-7B"
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Oneshot arguments
DATASET_ID = "flickr30k"
DATASET_SPLIT = {"calibration": "test[:512]"}
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048


# Define a oneshot data collator for multimodal inputs.
def data_collator(batch):
    assert len(batch) == 1
    return {key: torch.tensor(value) for key, value in batch[0].items()}


# Recipe
recipe = [
    GPTQModifier(
        targets="Linear",
        scheme="W4A16",
        sequential_targets=["LlamaDecoderLayer"],
        ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
    ),
]

# Perform oneshot
oneshot(
    model=model,
    tokenizer=model_id,
    dataset=DATASET_ID,
    splits=DATASET_SPLIT,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True,
    data_collator=data_collator,
)

# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Please describe the animal in this image\n"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
raw_image = Image.open(requests.get(image_url, stream=True).raw)

inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
print("==========================================")

# Save to disk compressed.
SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
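
A note on the ignore entries used in both recipes: llm-compressor treats strings prefixed with "re:" as regular expressions matched against submodule names, while bare strings like "lm_head" must match exactly. A simplified, self-contained approximation of that matching (the real resolution lives in compressed-tensors and also handles cases such as matching on module class names):

import re

def matches_target(name: str, targets: list[str]) -> bool:
    # Entries with an "re:" prefix are regexes; anything else must match
    # the submodule name exactly.
    for t in targets:
        if t.startswith("re:"):
            if re.match(t[3:], name):
                return True
        elif name == t:
            return True
    return False

ignore = ["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"]
print(matches_target("vision_tower.blocks.0.attn.qkv", ignore))         # True
print(matches_target("language_model.lm_head", ignore))                 # True
print(matches_target("language_model.layers.0.mlp.gate_proj", ignore))  # False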
