# tokenizer.py: trains and wraps a SentencePiece BPE tokenizer.
import glob
import os

from sentencepiece import SentencePieceProcessor, SentencePieceTrainer


# TODO: refactor
class Tokenizer:
    # Trains a BPE model from `input_file` on first use; subsequent runs
    # reuse the tokenizer.model/tokenizer.vocab files found in the working
    # directory unless retrain=True.
    def __init__(self, input_file, vocab_size=4096, retrain=False):
        model = glob.glob("*.model")
        vocab = glob.glob("*.vocab")
        self.tokenizer_options = dict(
            input=input_file,
            input_format="text",
            model_prefix="tokenizer",
            model_type="bpe",
            vocab_size=vocab_size,
            # Keep text as-is: no normalization, preserve whitespace.
            normalization_rule_name="identity",
            remove_extra_whitespaces=False,
            input_sentence_size=200_000_000,
            max_sentence_length=4192,
            seed_sentencepiece_size=1_000_000,
            shuffle_input_sentence=True,
            character_coverage=0.99995,
            # Fall back to byte pieces so no input is ever unencodable.
            byte_fallback=True,
            split_digits=True,
            split_by_unicode_script=True,
            split_by_whitespace=True,
            split_by_number=True,
            max_sentencepiece_length=16,
            add_dummy_prefix=True,
            allow_whitespace_only_pieces=True,
            # Special token ids: unk=0, bos=1, eos=2, pad=3.
            unk_id=0,
            bos_id=1,
            eos_id=2,
            pad_id=3,
            num_threads=os.cpu_count(),
            minloglevel=10,
        )
        if retrain:
            # Drop any stale model/vocab files so training starts fresh.
            for m in model:
                os.remove(m)
            for v in vocab:
                os.remove(v)
        if model and vocab and not retrain:
            print("Initializing tokenizer from file")
        else:
            print("Training tokenizer")
            SentencePieceTrainer.train(**self.tokenizer_options)
        self.sp = SentencePieceProcessor(model_file=glob.glob("*.model")[0])
    def encode(self, text):
        return self.sp.encode(text)

    def decode(self, tokens):
        return self.sp.decode(tokens)

    @property
    def pad_id(self):
        return self.sp.pad_id()

    @property
    def eos_id(self):
        return self.sp.eos_id()
    @classmethod
    def from_file(cls, file_path):
        # TODO: load an existing tokenizer model without retraining.
        pass
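

# A minimal usage sketch ("corpus.txt" is a hypothetical plain-text training
# corpus, not part of this repo). The first run trains the model; later runs
# pick up tokenizer.model from the working directory.
if __name__ == "__main__":
    tok = Tokenizer("corpus.txt", vocab_size=4096)
    ids = tok.encode("Hello, world!")  # list of int token ids
    print(ids)
    print(tok.decode(ids))             # round-trips back to the input text
    print(tok.pad_id, tok.eos_id)      # 3 and 2, per the trainer options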