
Commit

fix test
horheynm committed Feb 4, 2025
1 parent 201bbbf commit 3cec5cd
Showing 3 changed files with 7 additions and 4 deletions.
9 changes: 6 additions & 3 deletions src/llmcompressor/datasets/utils.py
@@ -20,9 +20,12 @@ def get_raw_dataset(
local file directory in csv, json, parquet, etc.
If local path is provided, it must be
1. Download path where HF dataset was downloaded to
2. Path containing (train, test, validation) with the same extension.
Supported extensions are json, jsonl, csv, arrow, parquet, text,
and xlsx.
2. File path containing any of train, test, validation in its name
with the supported extensions: json, jsonl, csv, arrow, parquet, text,
and xlsx, e.g. foo-train.csv, foo-test.csv
To use a custom file name, specify its mapping via the
`data_files` input arg.
:return: the requested dataset
1 change: 0 additions & 1 deletion src/llmcompressor/transformers/finetune/data/base.py
@@ -184,7 +184,6 @@ def load_dataset(self):
name=self.data_args.dataset_config_name,
split=self.split,
streaming=self.data_args.streaming,
trust_remove_code=self.data_args.trust_remote_code_data,
**self.data_args.raw_kwargs,
)

@@ -69,6 +69,7 @@ def test_no_padding_tokenization(self):
split="train[5%:10%]",
processor=self.tiny_llama_tokenizer,
)

dataset = op_manager.load_dataset() # load
dataset = op_manager.map( # preprocess
dataset,
