diff --git a/examples/multimodal_audio/qwen2_audio_example.py b/examples/multimodal_audio/qwen2_audio_example.py
index 70db24523..ff3c9be25 100644
--- a/examples/multimodal_audio/qwen2_audio_example.py
+++ b/examples/multimodal_audio/qwen2_audio_example.py
@@ -1,6 +1,9 @@
 import torch
 from datasets import load_dataset
-from transformers import AutoProcessor
+from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
+import librosa
+from io import BytesIO
+from urllib.request import urlopen
 
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
@@ -11,98 +14,238 @@
 # Select model and load it.
 MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
 
-model = TraceableQwen2AudioForConditionalGeneration.from_pretrained(
+# Use the stock model class while debugging tracing; restore the traceable
+# class once sequential tracing of Qwen2Audio works end-to-end.
+# model = TraceableQwen2AudioForConditionalGeneration.from_pretrained(
+model = Qwen2AudioForConditionalGeneration.from_pretrained(
     MODEL_ID,
     device_map="auto",
     torch_dtype="auto",
 )
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 
-# Select calibration dataset.
-DATASET_ID = "MLCommons/peoples_speech"
-DATASET_SUBSET = "test"
-DATASET_SPLIT = "test"
-
-# Select number of samples. 512 samples is a good place to start.
-# Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 512
-MAX_SEQUENCE_LENGTH = 2048
-
-# Load dataset and preprocess.
-ds = load_dataset(
-    DATASET_ID,
-    DATASET_SUBSET,
-    split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
-    trust_remote_code=True,
-)
+# The calibration and quantization pipeline below is temporarily commented out
+# while the single-sample generation path is being debugged.
+
+# # Select calibration dataset.
+# DATASET_ID = "MLCommons/peoples_speech"
+# DATASET_SUBSET = "test"
+# DATASET_SPLIT = "test"
+
+# # Select number of samples. 512 samples is a good place to start.
+# # Increasing the number of samples can improve accuracy.
+# NUM_CALIBRATION_SAMPLES = 512
+# MAX_SEQUENCE_LENGTH = 2048
+
+# # Load dataset and preprocess.
+# ds = load_dataset(
+#     DATASET_ID,
+#     DATASET_SUBSET,
+#     split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
+#     trust_remote_code=True,
+# )
 
 
-def preprocess(example):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "audio", "audio": None},
-                {"type": "text", "text": "What does the person say?"},
-            ],
-        },
-    ]
+# def preprocess(example):
+#     messages = [
+#         {
+#             "role": "user",
+#             "content": [
+#                 {"type": "audio", "audio": None},
+#                 {"type": "text", "text": "What does the person say?"},
+#             ],
+#         },
+#     ]
+#
+#     return {
+#         "text": processor.apply_chat_template(
+#             messages, add_generation_prompt=True, tokenize=False
+#         ),
+#         "audios": [example["audio"]["array"]],
+#         "sampling_rate": example["audio"]["sampling_rate"],
+#     }
+
 
-    return {
-        "text": processor.apply_chat_template(
-            messages, add_generation_prompt=True, tokenize=False
-        ),
-        "audios": [example["audio"]["array"]],
-        "sampling_rate": example["audio"]["sampling_rate"],
-    }
+# ds = ds.map(preprocess, remove_columns=ds.column_names)
 
 
-ds = ds.map(preprocess, remove_columns=ds.column_names)
+# # Tokenize inputs.
+# def tokenize(sample):
+#     return processor(**sample, return_tensors="pt")
 
 
-# Tokenize inputs.
-def tokenize(sample):
-    return processor(**sample, return_tensors="pt")
+# ds = ds.map(tokenize, remove_columns=ds.column_names)
 
 
-ds = ds.map(tokenize, remove_columns=ds.column_names)
+# Build a single debug sample from a hosted audio file, resampled to the
+# feature extractor's sampling rate.
+AUDIO_URL = (
+    "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"
+)
+messages = [
+    {"role": "user", "content": [{"type": "audio", "audio_url": AUDIO_URL}]}
+]
+
+audio_data = librosa.load(
+    BytesIO(urlopen(AUDIO_URL).read()),
+    sr=processor.feature_extractor.sampling_rate,
+)[0]
+
+text = processor.apply_chat_template(
+    messages, add_generation_prompt=True, tokenize=False
+)
+sample_input = processor(
+    text=text,
+    audios=[audio_data],
+    return_tensors="pt",
+    padding=True,
+)
 
 # Define a oneshot data collator for multimodal inputs.
-def data_collator(batch):
-    assert len(batch) == 1
-    return {key: torch.tensor(value) for key, value in batch[0].items()}
+# def data_collator(batch):
+#     assert len(batch) == 1
+#     return {key: torch.tensor(value) for key, value in batch[0].items()}
 
 # Configure the quantization algorithm to run.
-# * quantize the weights to 4 bit with GPTQ with a group size 128
-recipe = GPTQModifier(
-    targets="Linear",
-    scheme="W4A16",
-    ignore=[
-        # "re:audio_tower.*",
-        #"re:multi_modal_projector.*",
-        "lm_head",
-    ],  # TODO: honestly, there's a decent number of parameters in the audio tower worth quantizing
-)
+# # * quantize the weights to 4 bit with GPTQ with a group size 128
+# recipe = GPTQModifier(
+#     targets="Linear",
+#     scheme="W4A16",
+#     ignore=[
+#         # "re:audio_tower.*",
+#         # "re:multi_modal_projector.*",
+#         "lm_head",
+#     ],  # TODO: a decent number of parameters in the audio tower are worth quantizing
+# )
 
 # Apply algorithms.
-oneshot(
-    model=model,
-    dataset=ds,
-    recipe=recipe,
-    max_seq_length=MAX_SEQUENCE_LENGTH,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    data_collator=data_collator,
-)
+# oneshot(
+#     model=model,
+#     dataset=ds,
+#     recipe=recipe,
+#     max_seq_length=MAX_SEQUENCE_LENGTH,
+#     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+#     data_collator=data_collator,
+# )
 
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
-breakpoint()
-sample_input = data_collator([next(iter(ds))])
+# sample_input = data_collator([next(iter(ds))])
 sample_input = {k: v.to(model.device) for k, v in sample_input.items()}
-output = model.generate(**sample_input)
+output = model.generate(**sample_input, max_new_tokens=256)
 print(processor.batch_decode(output, skip_special_tokens=True)[0])
 print("==========================================\n\n")
 # that's where you have a lot of windows in the south no actually that's passive solar
@@ -110,6 +253,6 @@ def data_collator(batch):
 # and it was a great thing for what it was at the time but it's not a passive house
 
 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
+# SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+# model.save_pretrained(SAVE_DIR, save_compressed=True)
+# processor.save_pretrained(SAVE_DIR)
diff --git a/examples/multimodal_vision/janus_example.py b/examples/multimodal_vision/janus_example.py
new file mode 100644
index 000000000..59f800eb1
--- /dev/null
+++ b/examples/multimodal_vision/janus_example.py
@@ -0,0 +1,76 @@
+import requests
+import torch
+from PIL import Image
+from transformers import AutoModelForCausalLM, AutoProcessor
+
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.transformers import oneshot
+
+# Load model.
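+# NOTE (assumption): there is currently no Janus-specific traceable class in
+# llmcompressor.transformers.tracing, so the stock auto classes are used here.
+# If tracing the model graph fails, GPTQ is expected to fall back to the
+# layer_sequential pipeline (see llmcompressor.modifiers.quantization.gptq.base).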
+model_id = "deepseek-ai/Janus-Pro-7B" +model = AutoModelForCausalLM.from_pretrained( + model_id, device_map="auto", torch_dtype="auto" +) +processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) + +# Oneshot arguments +DATASET_ID = "flickr30k" +DATASET_SPLIT = {"calibration": "test[:512]"} +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Define a oneshot data collator for multimodal inputs. +def data_collator(batch): + assert len(batch) == 1 + return {key: torch.tensor(value) for key, value in batch[0].items()} + + +# Recipe +recipe = [ + GPTQModifier( + targets="Linear", + scheme="W4A16", + sequential_targets=["LlamaDecoderLayer"], + ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"], + ), +] + +# Perform oneshot +oneshot( + model=model, + tokenizer=model_id, + dataset=DATASET_ID, + splits=DATASET_SPLIT, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, + data_collator=data_collator, +) + +# Confirm generations of the quantized model look sane. +print("========== SAMPLE GENERATION ==============") +messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Please describe the animal in this image\n"}, + {"type": "image"}, + ], + }, +] +prompt = processor.apply_chat_template(messages, add_generation_prompt=True) +image_url = "http://images.cocodataset.org/train2017/000000231895.jpg" +raw_image = Image.open(requests.get(image_url, stream=True).raw) + +inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda") +output = model.generate(**inputs, max_new_tokens=100) +print(processor.decode(output[0], skip_special_tokens=True)) +print("==========================================") + +# Save to disk compressed. 
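+# save_compressed=True stores the quantized weights in the compressed-tensors
+# format, which runtimes such as vLLM can load directly.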
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py
index 5e8a6b47e..42492e363 100644
--- a/src/llmcompressor/modifiers/quantization/gptq/base.py
+++ b/src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -251,6 +251,8 @@ def on_initialize(self, state: State, **kwargs) -> bool:
             )
             if isinstance(exception, unfixable_errors):
                 raise exception
+
+            # TEMP(debug): re-raise tracing failures instead of falling back to
+            # the layer_sequential pipeline; remove once tracing is fixed
+            raise exception
 
             warnings.warn("Falling back to layer_sequential pipeline")
             try:
diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py
index 4945ba01e..743b2b138 100644
--- a/src/llmcompressor/pipelines/sequential/helpers.py
+++ b/src/llmcompressor/pipelines/sequential/helpers.py
@@ -70,11 +70,15 @@ def trace_subgraphs(
     tracer = get_tracer(model, sequential_targets, ignore)
     concrete_args = populate_concrete_args(model, sample_input)
 
+    # trace the model under the calibration forward context, with hooks
+    # disabled so that hook calls are not captured in the traced graph
     with (
         calibration_forward_context(model),
         HooksMixin.disable_hooks(),
     ):
+        # TEMP(debug): run one eager forward pass first so that input or model
+        # errors surface with a normal traceback before tracing begins
+        model(**sample_input, **concrete_args)
         graph = GraphModule(
             model,
             tracer.trace(
diff --git a/src/llmcompressor/transformers/finetune/data/peoples_speech.py b/src/llmcompressor/transformers/finetune/data/peoples_speech.py
new file mode 100644
index 000000000..2e00b14a2
--- /dev/null
+++ b/src/llmcompressor/transformers/finetune/data/peoples_speech.py
@@ -0,0 +1,31 @@
+from copy import deepcopy
+from typing import TYPE_CHECKING
+
+from llmcompressor.transformers.finetune.data import TextGenerationDataset
+from llmcompressor.typing import Processor
+
+if TYPE_CHECKING:
+    from llmcompressor.transformers import DataTrainingArguments as DataArgs
+
+
+@TextGenerationDataset.register(name="peoples_speech")
+class PeoplesSpeech(TextGenerationDataset):
+    """
+    :param data_args: configuration settings for dataset loading
+    :param split: split from dataset to load, for instance `test` or `train[:5%]`
+    :param processor: processor or tokenizer to use on dataset
+    """
+
+    def __init__(self, data_args: "DataArgs", split: str, processor: Processor):
+        data_args = deepcopy(data_args)
+        data_args.dataset = "MLCommons/peoples_speech"
+        data_args.dataset_config_name = "test"
+
+        super().__init__(data_args=data_args, split=split, processor=processor)
+
+    def dataset_template(self, example):
+        return {
+            "audio": example["audio"]["array"],
+            "sampling_rate": example["audio"]["sampling_rate"],
+            "text": " " + example["text"].capitalize(),
+        }
diff --git a/src/llmcompressor/transformers/tracing/qwen2_audio.py b/src/llmcompressor/transformers/tracing/qwen2_audio.py
index 06dc1ac8e..ee89e44ef 100644
--- a/src/llmcompressor/transformers/tracing/qwen2_audio.py
+++ b/src/llmcompressor/transformers/tracing/qwen2_audio.py
@@ -1,4 +1,3 @@
-# flake8: noqa
 # coding=utf-8
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
 #
@@ -13,7 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# vllm-project: no copyright
 """PyTorch Qwen2Audio model."""
 
 import math
@@ -225,7 +223,6 @@ class Qwen2AudioFlashAttention2(Qwen2AudioAttention):
     flash attention and deal with padding tokens in case the input contains any of them.
""" - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -859,6 +856,9 @@ def __init__(self, config: Qwen2AudioConfig): self.multi_modal_projector = Qwen2AudioMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self._padding_side = "left" # set it to left by default, user can use setter to change padding_sides self.post_init() @@ -897,18 +897,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.tie_weights - def tie_weights(self): - return self.language_model.tie_weights() - - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.resize_token_embeddings - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: - model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) - # update vocab size - self.config.text_config.vocab_size = model_embeds.num_embeddings - self.vocab_size = model_embeds.num_embeddings - return model_embeds - def _merge_input_ids_with_audio_features( self, audio_features, num_audio_tokens, inputs_embeds, input_ids, attention_mask, labels ): @@ -1092,9 +1080,7 @@ def _merge_input_ids_with_audio_features( audio_to_overwrite &= val - # TRACING - #if audio_to_overwrite.sum() != num_audio_tokens.sum(): - if False: + if audio_to_overwrite.sum() != num_audio_tokens.sum(): raise ValueError( f"The input provided to the model are wrong. The number of audio tokens is {num_special_audio_tokens} while" f" the number of audio given to the model is {num_audios}. This prevents correct indexing and breaks batch generation." @@ -1202,9 +1188,34 @@ def forward( selected_audio_feature = audio_outputs.last_hidden_state audio_features = self.multi_modal_projector(selected_audio_feature) - inputs_embeds, attention_mask, labels, position_ids, _ = self._merge_input_ids_with_audio_features( - audio_features, audio_output_lengths, inputs_embeds, input_ids, attention_mask, labels - ) + # if we have consecutive audio tokens, then it means we expanded input_ids in processing + audio_tokens = input_ids == self.config.audio_token_index + legacy_processing = (audio_tokens[:, :-1] & audio_tokens[:, 1:]).sum() == 0 + + if legacy_processing: + logger.warning_once( + "Expanding inputs for audio tokens in Qwen2Audio should be done in processing." 
+                )
+                inputs_embeds, attention_mask, labels, position_ids, _ = self._merge_input_ids_with_audio_features(
+                    audio_features, audio_output_lengths, inputs_embeds, input_ids, attention_mask, labels
+                )
+            else:
+                num_audios, max_audio_tokens, embed_dim = audio_features.shape
+                audio_features_mask = torch.arange(max_audio_tokens, device=audio_output_lengths.device)[None, :]
+                audio_features_mask = audio_features_mask < audio_output_lengths[:, None]
+                audio_features = audio_features[audio_features_mask]
+
+                n_audio_tokens = (input_ids == self.config.audio_token_index).sum().item()
+                n_audio_features = audio_features.shape[0]
+
+                if n_audio_tokens != n_audio_features:
+                    raise ValueError(
+                        f"Audio features and audio tokens do not match: tokens: {n_audio_tokens}, features {n_audio_features}"
+                    )
+                special_audio_mask = (input_ids == self.config.audio_token_index).to(inputs_embeds.device)
+                special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds)
+                audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
+                inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features)
 
         outputs = self.language_model(
             attention_mask=attention_mask,
@@ -1368,3 +1379,6 @@ def _update_model_kwargs_for_generation(
 
     def _reorder_cache(self, *args, **kwargs):
         return self.language_model._reorder_cache(*args, **kwargs)
+
+
+__all__ = ["Qwen2AudioForConditionalGeneration", "Qwen2AudioPreTrainedModel", "Qwen2AudioEncoder"]