tokenizers.py

import re

from jieba import tokenize as jieba_tokenize
from hanziconv import HanziConv as hanzi


WORD_RE_STR = r"""
    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\*{1,})                     # Asterisk runs.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """

WORD_RE = re.compile(r"(%s)" % WORD_RE_STR, re.VERBOSE | re.I | re.UNICODE)


def basic_unigram_tokenizer(s, lower=True):
    """Split `s` into tokens matched by WORD_RE, lowercasing them by default."""
    words = WORD_RE.findall(s)
    if lower:
        words = [w.lower() for w in words]
    return words
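
# Illustrative example (the sample string below is an assumption, not part of
# the original module):
#   basic_unigram_tokenizer("It's 3.5 stars... **wow**")
#   -> ["it's", '3.5', 'stars', '...', '**', 'wow', '**']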


def heuristic_ending_tokenizer(s, lower=True):
    """Tokenize `s`, then split known suffixes off each token."""
    words = basic_unigram_tokenizer(s, lower=lower)
    return [seg for w in words for seg in heuristic_segmenter(w)]


ENDINGS = ['er', 'est', 'ish']


def heuristic_segmenter(word):
    """Split `word` into [stem, '+suffix'] if it ends with a known suffix."""
    for ending in ENDINGS:
        if word.endswith(ending):
            return [word[:-len(ending)], '+' + ending]
    return [word]
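
# Illustrative example (the sample string below is an assumption, not part of
# the original module):
#   heuristic_ending_tokenizer("the reddish one is tallest")
#   -> ['the', 'redd', '+ish', 'one', 'is', 'tall', '+est']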


def whitespace_tokenizer(s, lower=True):
    """Split `s` on whitespace, lowercasing it first by default."""
    if lower:
        s = s.lower()
    return s.split()


def chinese_tokenizer(s, lower=True):
    """Segment Chinese text with jieba; `lower` converts traditional to simplified."""
    if lower:
        s = hanzi.toSimplified(s)
    # jieba.tokenize yields (word, start, end) tuples; keep only the word.
    return [t[0] for t in jieba_tokenize(s)]


TOKENIZER_MAP = {
    'en': heuristic_ending_tokenizer,
    'zh': chinese_tokenizer,
}


def multilingual_tokenizer(s, lower=True):
    """Dispatch on a 'lang:utterance' prefix, e.g. 'en:...' or 'zh:...'."""
    assert isinstance(s, str) and ':' in s, repr(s)
    lang, utt = s.split(':', 1)
    return TOKENIZER_MAP[lang](utt, lower=lower)
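
# Illustrative example (the sample string below is an assumption, not part of
# the original module):
#   multilingual_tokenizer("en:The tallest one") -> ['the', 'tall', '+est', 'one']
#   Inputs prefixed with "zh:" are routed to chinese_tokenizer instead.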


NOENDING_TOKENIZER_MAP = {
    'en': basic_unigram_tokenizer,
    'zh': chinese_tokenizer,
}


def multilingual_noending_tokenizer(s, lower=True):
    """Like multilingual_tokenizer, but without suffix splitting for English."""
    assert isinstance(s, str) and ':' in s, repr(s)
    lang, utt = s.split(':', 1)
    return NOENDING_TOKENIZER_MAP[lang](utt, lower=lower)


TOKENIZERS = {
    'unigram': basic_unigram_tokenizer,
    'ending': heuristic_ending_tokenizer,
    'whitespace': whitespace_tokenizer,
    'chinese': chinese_tokenizer,
    'multilingual': multilingual_tokenizer,
    'multilingual_noending': multilingual_noending_tokenizer,
}
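

if __name__ == '__main__':
    # Minimal smoke test (illustrative only; the sample string here is an
    # assumption, not part of the original module).
    sample = "It's the tallest one... **really**"
    print(TOKENIZERS['unigram'](sample))
    # -> ["it's", 'the', 'tallest', 'one', '...', '**', 'really', '**']
    print(TOKENIZERS['ending'](sample))
    # -> ["it's", 'the', 'tall', '+est', 'one', '...', '**', 'really', '**']
    print(TOKENIZERS['multilingual']('en:' + sample))
    # -> ["it's", 'the', 'tall', '+est', 'one', '...', '**', 'really', '**']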