From be6eef2ea7fd37b2d189fd832cc825bcb661f594 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 18 Jul 2024 17:40:21 -0400 Subject: [PATCH] Update example --- example_dataset.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/example_dataset.py b/example_dataset.py index 82d336e..bf6b6fd 100644 --- a/example_dataset.py +++ b/example_dataset.py @@ -9,17 +9,10 @@ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) tokenizer.pad_token = tokenizer.eos_token -MAX_SEQUENCE_LENGTH = 2048 ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512)) def preprocess(example): example = tokenizer.apply_chat_template(example["messages"], tokenize=False) - return tokenizer( - example, - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) + return tokenizer(example, max_length=2048, truncation=True, add_special_tokens=False) ds = ds.map(preprocess, remove_columns=ds.column_names) quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static")