diff --git a/src/llmcompressor/datasets/utils.py b/src/llmcompressor/datasets/utils.py index 4e7a55116..7de472492 100644 --- a/src/llmcompressor/datasets/utils.py +++ b/src/llmcompressor/datasets/utils.py @@ -20,9 +20,12 @@ def get_raw_dataset( local file directory in csv, json, parquet, etc. If local path is provided, it must be 1. Download path where HF dataset was downloaded to - 2. Path containing (train, test, validation) with the same extention. - Supported extentions are json, jsonl, csv, arrow, parquet, text, - and xlsx, + 2. File path containing any of train, test, validation in its name + with the supported extensions: json, jsonl, csv, arrow, parquet, text, + and xlsx. Ex. foo-train.csv, foo-test.csv + + If a custom name is to be used, its mapping can be specified using + the `data_files` input argument. :return: the requested dataset diff --git a/src/llmcompressor/transformers/finetune/data/base.py b/src/llmcompressor/transformers/finetune/data/base.py index 54aa31c98..6c45a6cfa 100644 --- a/src/llmcompressor/transformers/finetune/data/base.py +++ b/src/llmcompressor/transformers/finetune/data/base.py @@ -184,7 +184,6 @@ def load_dataset(self): name=self.data_args.dataset_config_name, split=self.split, streaming=self.data_args.streaming, - trust_remove_code=self.data_args.trust_remote_code_data, **self.data_args.raw_kwargs, ) diff --git a/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py b/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py index 64514b252..284edcfec 100644 --- a/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py +++ b/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py @@ -69,6 +69,7 @@ def test_no_padding_tokenization(self): split="train[5%:10%]", processor=self.tiny_llama_tokenizer, ) + dataset = op_manager.load_dataset() # load dataset = op_manager.map( # preprocess dataset,