-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest_load_dataset.py
122 lines (95 loc) · 4.85 KB
/
test_load_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from typing import Optional

from datasets import Dataset, DatasetDict
def assert_dataset_equal(ds, hidden_text: Optional[str] = None):
    """Check the invariants shared by every dataset built in these tests.

    The dataset must hold exactly two rows, the first with ``is_masked``
    set to ``True`` and the second with it set to ``False``.  When
    ``hidden_text`` is supplied it must contain the literal ``_____``.

    Args:
        ds: Sized, indexable collection of row mappings that carry an
            ``is_masked`` key.
        hidden_text: Text expected to contain ``_____``.  ``None`` (the
            default) skips that check.

    Raises:
        AssertionError: If any of the invariants does not hold.
    """
    # Check the size first so a wrong row count fails as a readable
    # assertion instead of an IndexError from the lookups below.
    assert len(ds) == 2
    first_row, second_row = ds[0], ds[1]
    assert first_row["is_masked"] is True
    assert second_row["is_masked"] is False
    if hidden_text is not None:
        # Guarding on None keeps the default argument usable; a bare
        # ``"_____" in None`` raises TypeError rather than failing the test.
        assert "_____" in hidden_text
def test_attr(template_instance, data, config):
    """Build the template attributes for the first sample and check each field."""
    expected_system_prompt = "You are a multilingual professional writer."
    first_sample = data[0]

    rendered = template_instance.get_template_attr(**first_sample, **config)

    assert rendered.system_prompt == expected_system_prompt
    assert "_____" in rendered.input
    assert "# Role" in rendered.instruction
    assert "Custom Title" in rendered.prompt_structure
    assert "Custom Title" in rendered.output
def test_load_dataset_from_dict(template_instance, data, config):
    """Load the ``data`` fixture directly in every output format and check row 0."""
    # --- "text" format -----------------------------------------------------
    text_ds = template_instance.load_dataset(
        data, output_format="text", **config
    )
    text_row = text_ds[0]
    assert_dataset_equal(text_ds, text_row["text"])
    assert "Custom Title" in text_row["text"]

    # --- "alpaca" format ---------------------------------------------------
    alpaca_ds = template_instance.load_dataset(
        data, output_format="alpaca", **config
    )
    alpaca_row = alpaca_ds[0]
    assert_dataset_equal(alpaca_ds, alpaca_row["input"])
    assert "# Role" in alpaca_row["instruction"]
    assert "Custom Title" in alpaca_row["input"]
    assert "Custom Title" in alpaca_row["output"]

    # --- "openai" format ---------------------------------------------------
    openai_ds = template_instance.load_dataset(
        data, output_format="openai", **config
    )
    messages = openai_ds[0]["messages"]
    assert_dataset_equal(openai_ds, messages[1]["content"])
    assert "Custom Title" in messages[-1]["content"]
    assert "# Role" in messages[0]["content"]
    assert "Custom Title" in messages[1]["content"]
    assert "Custom Title" in messages[2]["content"]
def test_load_dataset_from_Dataset(template_instance, data, config):
    """Wrap ``data`` in a ``Dataset`` and check row 0 of every output format."""
    source = Dataset.from_list(data)

    # --- "text" format -----------------------------------------------------
    text_ds = template_instance.load_dataset(
        source, output_format="text", **config
    )
    text_row = text_ds[0]
    assert_dataset_equal(text_ds, text_row["text"])
    assert "# Role" in text_row["text"]
    assert "Custom Title" in text_row["text"]

    # --- "alpaca" format ---------------------------------------------------
    alpaca_ds = template_instance.load_dataset(
        source, output_format="alpaca", **config
    )
    alpaca_row = alpaca_ds[0]
    assert_dataset_equal(alpaca_ds, alpaca_row["input"])
    assert "# Role" in alpaca_row["instruction"]
    assert "Custom Title" in alpaca_row["input"]

    # --- "openai" format ---------------------------------------------------
    openai_ds = template_instance.load_dataset(
        source, output_format="openai", **config
    )
    messages = openai_ds[0]["messages"]
    assert_dataset_equal(openai_ds, messages[1]["content"])
    assert "Custom Title" in messages[-1]["content"]
    assert "# Role" in messages[0]["content"]
    assert "Custom Title" in messages[1]["content"]
    assert "Custom Title" in messages[2]["content"]
def test_load_dataset_from_DatasetDict(template_instance, data, config):
    """Load a train/test ``DatasetDict`` in every format and check both splits."""

    def split_datadict(ds):
        """Return the ``(train, test)`` splits after checking the container type."""
        assert isinstance(ds, DatasetDict)
        return ds["train"], ds["test"]

    # ``train_test_split`` shuffles by default with no fixed seed, while
    # ``assert_dataset_equal`` depends on the order of the rows.  Splitting
    # without shuffling puts the first copy of ``data`` in "train" and the
    # second in "test", so the outcome is the same on every run.
    dataset = Dataset.from_list(data * 2).train_test_split(
        test_size=0.5, shuffle=False
    )

    # --- "text" format -----------------------------------------------------
    text_ds = template_instance.load_dataset(
        dataset, output_format="text", **config
    )
    for split_ds in split_datadict(text_ds):
        text_row = split_ds[0]
        assert_dataset_equal(split_ds, text_row["text"])
        assert "# Role" in text_row["text"]
        assert "Custom Title" in text_row["text"]

    # --- "alpaca" format ---------------------------------------------------
    alpaca_ds = template_instance.load_dataset(
        dataset, output_format="alpaca", **config
    )
    for split_ds in split_datadict(alpaca_ds):
        alpaca_row = split_ds[0]
        assert_dataset_equal(split_ds, alpaca_row["input"])
        assert "# Role" in alpaca_row["instruction"]
        assert "Custom Title" in alpaca_row["input"]

    # --- "openai" format ---------------------------------------------------
    openai_ds = template_instance.load_dataset(
        dataset, output_format="openai", **config
    )
    for split_ds in split_datadict(openai_ds):
        messages = split_ds[0]["messages"]
        assert_dataset_equal(split_ds, messages[1]["content"])
        assert "Custom Title" in messages[-1]["content"]
        assert "# Role" in messages[0]["content"]
        assert "Custom Title" in messages[1]["content"]
        assert "Custom Title" in messages[2]["content"]