From 4b2092c67d6d4785c29d0c7df8359d912c3ddfd5 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 1 Jul 2024 09:35:27 -0600 Subject: [PATCH] Update example_dataset.py --- example_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example_dataset.py b/example_dataset.py index e2a7265..204345f 100644 --- a/example_dataset.py +++ b/example_dataset.py @@ -9,7 +9,7 @@ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) tokenizer.pad_token = tokenizer.eos_token -ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512)) +ds = load_dataset("mgoin/ultrachat_2k", split="train_sft") examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds] examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda")