From be6eef2ea7fd37b2d189fd832cc825bcb661f594 Mon Sep 17 00:00:00 2001
From: mgoin <michael@neuralmagic.com>
Date: Thu, 18 Jul 2024 17:40:21 -0400
Subject: [PATCH] Simplify tokenizer call in example_dataset.py

---
 example_dataset.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/example_dataset.py b/example_dataset.py
index 82d336e..bf6b6fd 100644
--- a/example_dataset.py
+++ b/example_dataset.py
@@ -9,17 +9,10 @@
 tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
 tokenizer.pad_token = tokenizer.eos_token
 
-MAX_SEQUENCE_LENGTH = 2048
 ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512))
 def preprocess(example):
     example = tokenizer.apply_chat_template(example["messages"], tokenize=False)
-    return tokenizer(
-        example,
-        padding=False,
-        max_length=MAX_SEQUENCE_LENGTH,
-        truncation=True,
-        add_special_tokens=False,
-    )
+    return tokenizer(example, max_length=2048, truncation=True, add_special_tokens=False)
 ds = ds.map(preprocess, remove_columns=ds.column_names)
 
 quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static")