# train_tokenizer.py
import os

import pandas as pd
import sentencepiece as spm
import tokenizers
from tokenizers import Tokenizer, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Punctuation, Digits, Metaspace
from tokenizers.normalizers import NFKC
from transformers import PreTrainedTokenizerFast

from config import PROJECT_ROOT


def check_dir_exits(dir: str) -> None:
    '''
    Check whether the directory exists and create it if it does not.
    '''
    if not os.path.exists(dir):
        os.makedirs(dir)


def train_my_huggingface_wiki_tokenizer(cropus_file: str, max_train_line: int=None, vocab_size: int=40960, token_type: str='char') -> None:
    '''
    Train a tokenizer with huggingface. Requires at least 32GB of RAM and takes roughly half an hour to run.
    '''

    tokenizer_slow_save_path = PROJECT_ROOT + '/model_save/hf_tokenizer_slow/hf_bpe_tokenizer.json'
    tokenizer_fast_save_path = PROJECT_ROOT + '/model_save/hf_tokenizer'

    check_dir_exits(PROJECT_ROOT + '/model_save/hf_tokenizer_slow')
    check_dir_exits(tokenizer_fast_save_path)

    def get_training_corpus(buffer_size: int=1000, chunk_len: int=2048) -> list:
        '''
        Yield the corpus as buffers of text chunks, each chunk about 2048 characters long.
        '''
        line_cnt = 0
        buffer = []
        with open(cropus_file, 'r', encoding='utf-8') as f_read:
            cur_chunk_txt, txt_len = [], 0
            for line in f_read:
                cur_chunk_txt.append(line)
                txt_len += len(line)
                line_cnt += 1

                if txt_len >= chunk_len:
                    buffer.append(
                        ''.join(cur_chunk_txt)
                    )
                    cur_chunk_txt, txt_len = [], 0

                if len(buffer) >= buffer_size:
                    yield buffer
                    buffer = []

                if isinstance(max_train_line, int) and line_cnt > max_train_line:
                    break

        # yield the last, possibly incomplete, buffer
        if len(buffer) > 0:
            yield buffer
    special_tokens = ["[PAD]", "[EOS]", "[SEP]", "[BOS]", "[CLS]", "[MASK]", "[UNK]"]

    if token_type == 'char':
        model = BPE(unk_token="[UNK]")
        tokenizer = Tokenizer(model)

        # NFKC normalization merges compatibility-equivalent Unicode code points,
        # e.g. a full-width 'A' is converted to a half-width 'A'.
        tokenizer.normalizer = tokenizers.normalizers.Sequence([NFKC()])

        # Pre-tokenize on punctuation, individual digits, and Metaspace
        # (without Metaspace the decoded text would have no spaces).
        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
            [Punctuation(), Digits(individual_digits=True), Metaspace()]
        )

        tokenizer.add_special_tokens(special_tokens)
        tokenizer.decoder = decoders.Metaspace()
    elif token_type == 'byte':
        # Byte-level BPE does not need an unk_token.
        model = BPE()
        tokenizer = Tokenizer(model)
        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=True)

        tokenizer.add_special_tokens(special_tokens)
        tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, use_regex=True)
        tokenizer.post_processor = tokenizers.processors.ByteLevel(trim_offsets=False)
    else:
        raise Exception(f'token type must be `char` or `byte`, but got {token_type}')

    trainer = BpeTrainer(vocab_size=vocab_size, min_frequency=100, show_progress=True, special_tokens=special_tokens)
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
    # Make sure '\t' and '\n' are in the vocabulary.
    if '\t' not in tokenizer.get_vocab():
        tokenizer.add_tokens(['\t'])
    if '\n' not in tokenizer.get_vocab():
        tokenizer.add_tokens(['\n'])

    tokenizer.save(tokenizer_slow_save_path)

    # Wrap the trained tokenizer in a PreTrainedTokenizerFast and save it.
    # The conversion makes it usable as an `AutoTokenizer` by other `huggingface` components.
    # The special tokens (`pad_token`, `eos_token`, etc.) must be given explicitly here,
    # because the wrapper cannot know which tokens of the original tokenizer play these roles.
    slow_tokenizer = tokenizer
    fast_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=slow_tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        bos_token='[BOS]',
        eos_token='[EOS]',
    )

    fast_tokenizer.save_pretrained(tokenizer_fast_save_path)

    print(f'slow tokenizer saved in path: {tokenizer_slow_save_path}')
    print(f'fast tokenizer saved in path: {tokenizer_fast_save_path}')

    print(f"\ntrain tokenizer finished. you can use `AutoTokenizer.from_pretrained('{tokenizer_fast_save_path}')` to load and test your tokenizer.")


def train_my_BPE_tokenizer() -> None:
    '''
    Train a BPE tokenizer with sentencepiece. Drawback: it can only load about 3 million lines;
    with 16GB of RAM it will run out of memory (OOM).
    '''
    txt_corpus_file = PROJECT_ROOT + '/data/my_corpus.txt'
    special_tokens = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]

    tokenizer = spm.SentencePieceTrainer.train(
        input=txt_corpus_file,
        model_prefix='my_tokenizer',
        vocab_size=40960,
        user_defined_symbols=special_tokens,
        max_sentence_length=1024,
        shuffle_input_sentence=True,
        # character_coverage=1.0,
        model_type='bpe',
    )

    # The model files (my_tokenizer.model, my_tokenizer.vocab) are saved in the current working directory.
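

# An illustrative sketch (not part of the original script): load the sentencepiece model
# produced above (the file name follows from `model_prefix='my_tokenizer'`) and tokenize a
# sample sentence. The function name and sample text are assumptions for demonstration only.
def test_sentencepiece_tokenizer(model_file: str = 'my_tokenizer.model') -> None:
    sp = spm.SentencePieceProcessor(model_file=model_file)

    text = '这是一个测试句子。'
    pieces = sp.encode(text, out_type=str)
    ids = sp.encode(text, out_type=int)
    print('pieces :', pieces)
    print('ids    :', ids)
    print('decoded:', sp.decode(ids))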


if __name__ == '__main__':

    cropus_file = PROJECT_ROOT + '/data/wiki.simple.txt'

    train_my_huggingface_wiki_tokenizer(cropus_file=cropus_file, token_type='char')  # token_type must be 'char' or 'byte'
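    # A sketch of the byte-level variant (not run by default); `max_train_line=1_000_000` is an
    # illustrative value that caps how many corpus lines are read, to reduce memory use.
    # train_my_huggingface_wiki_tokenizer(cropus_file=cropus_file, max_train_line=1_000_000, token_type='byte')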