-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvocab.py
141 lines (106 loc) · 4.12 KB
/
vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import numpy as np
class Vocab:
    """Character and part-of-speech vocabularies built from a data reader.

    The data reader passed to the constructor must provide three methods:

    * ``get_uniq_chars()`` - iterable of characters seen in the data;
    * ``get_uniq_speech_parts()`` - iterable of part-of-speech codes;
    * ``get_chars_freq()`` - sequence of ``(char, freq)`` pairs.  This class
      reads the LAST pair as the most frequent character and stops scanning
      at the first pair above the threshold, i.e. it assumes the sequence is
      sorted by ascending frequency (TODO confirm against the reader).

    Call :meth:`load` once before using any of the lookup methods.
    """

    # Character returned (via its index) for any character missing from the
    # vocabulary, including characters skipped as low-frequency.
    EMPTY_CHAR = ' '

    # Characters whose frequency is at most this fraction of the most
    # frequent character's frequency are left out of the vocabulary.
    CHAR_FREQ_THRESHOLD_RATIO = 0.01

    # Human-readable (Russian) display names for part-of-speech codes.
    POS_CODE2HUMAN = {
        'COMP': 'наречие',
        'GRND': 'деепричастие',
        'PRED': 'предикатив',
        'INTJ': 'междометие',
        'PRTF': 'причастие',
        'ADJS': 'краткое прилагательное',
        'PRTS': 'глагол (сов. форма)',
        'INFN': 'инфинитив (глагол)',
        'CONJ': 'союз',
        'PRCL': 'частицы',
        'ADVB': 'наречие',
        'NPRO': 'местоимение',
        'ADJF': 'прилагательное',
        'PREP': 'предлог',
        'VERB': 'глагол',
        'NOUN': 'существительное'
    }

    def __init__(self, data_reader):
        """Create an empty vocabulary bound to *data_reader*.

        Nothing is read from the reader until :meth:`load` is called.
        """
        self._data_reader = data_reader
        self._char2index = {}
        self._index2char = []
        self._part2index = {}
        self._index2part = []
        self._char_freq_threshold = 0
        self._skipped_chars = []
        self._loaded = False

    def _require_loaded(self):
        """Raise ``RuntimeError`` unless :meth:`load` has completed."""
        if not self._loaded:
            # RuntimeError (not bare BaseException) so that ordinary
            # ``except Exception`` handlers can see the failure.
            raise RuntimeError('chars not loaded')

    def _feed_char(self, char):
        """Assign the next free index to *char* unless it is skipped or known."""
        if char in self._skipped_chars:
            return
        if char not in self._char2index:
            self._char2index[char] = len(self._char2index)
            self._index2char.append(char)

    def _feed_speech_part(self, speech_part):
        """Assign the next free index to *speech_part* unless already known."""
        if speech_part not in self._part2index:
            self._part2index[speech_part] = len(self._part2index)
            self._index2part.append(speech_part)

    def _load_initial_chars(self):
        """Register the reserved characters ahead of the data characters."""
        self._feed_char(self.EMPTY_CHAR)
        self._feed_char('{')
        self._feed_char('}')

    def _load_initial_parts(self):
        """Register the reserved ``'UNKNOWN'`` part of speech (index 0)."""
        self._feed_speech_part('UNKNOWN')

    def _load_chars(self):
        """Fill the character vocabulary: reserved chars, then sorted data chars."""
        uniq_chars = self._data_reader.get_uniq_chars()
        self._load_initial_chars()
        for c in sorted(uniq_chars):
            self._feed_char(c)

    def _load_speech_parts(self):
        """Fill the part-of-speech vocabulary: ``'UNKNOWN'``, then sorted codes."""
        uniq_speech_parts = self._data_reader.get_uniq_speech_parts()
        self._load_initial_parts()
        for part in sorted(uniq_speech_parts):
            self._feed_speech_part(part)

    def _calculate_char_freq_threshold(self, chars_freq=None):
        """Set the skip threshold from the last (assumed largest) frequency.

        Args:
            chars_freq: optional ``(char, freq)`` sequence to use instead of
                querying the data reader again.

        With no frequency data the threshold is left at 0 instead of failing
        on the ``[-1]`` lookup.
        """
        if chars_freq is None:
            chars_freq = self._data_reader.get_chars_freq()
        if not chars_freq:
            self._char_freq_threshold = 0
            return
        _char, max_freq = chars_freq[-1]
        self._char_freq_threshold = np.ceil(
            max_freq * self.CHAR_FREQ_THRESHOLD_RATIO)

    def _find_low_freq_chars(self):
        """Collect the characters whose frequency does not exceed the threshold."""
        # Fetch once and reuse for both the threshold and the scan.
        chars_freq = self._data_reader.get_chars_freq()
        self._calculate_char_freq_threshold(chars_freq)
        for char, freq in chars_freq:
            if freq <= self._char_freq_threshold:
                self._skipped_chars.append(char)
            else:
                # Stops at the first entry above the threshold; relies on the
                # entries being ordered by ascending frequency.
                break

    def load(self):
        """Build both vocabularies from the data reader and mark as loaded."""
        # Skipped chars must be known before chars are fed to the vocabulary.
        self._find_low_freq_chars()
        self._load_chars()
        self._load_speech_parts()
        self._loaded = True

    def char_to_index(self, char):
        """Return the index of *char*, or the ``EMPTY_CHAR`` index if unknown.

        Raises:
            RuntimeError: if :meth:`load` has not been called.
            KeyError: if *char* is unknown and ``EMPTY_CHAR`` itself was
                skipped as a low-frequency character.
        """
        self._require_loaded()
        index = self._char2index.get(char)
        # Explicit None check: index 0 is a valid result and must not be
        # mistaken for "missing" (as ``get(char) or fallback`` would do).
        if index is None:
            index = self._char2index[self.EMPTY_CHAR]
        return index

    def part_to_index(self, speech_part):
        """Return the index of *speech_part*.

        Raises:
            RuntimeError: if :meth:`load` has not been called.
            KeyError: if *speech_part* is not in the vocabulary.
        """
        self._require_loaded()
        return self._part2index[speech_part]

    def index_to_speech_part(self, index):
        """Return the part-of-speech code stored at *index*.

        Raises:
            RuntimeError: if :meth:`load` has not been called.
            IndexError: if *index* is out of range.
        """
        self._require_loaded()
        return self._index2part[index]

    def indices_to_speech_part(self, indices):
        """Return a NumPy array of part-of-speech codes selected by *indices*.

        *indices* is used to index a NumPy array of all codes, so anything
        NumPy accepts as an index (int, list, integer array) works.

        Raises:
            RuntimeError: if :meth:`load` has not been called.
        """
        self._require_loaded()
        return np.array(self._index2part)[indices]

    def index_to_speech_part_human(self, index):
        """Return the display name of the part of speech at *index*.

        Codes without an entry in ``POS_CODE2HUMAN`` (for example the
        reserved ``'UNKNOWN'`` at index 0) are returned unchanged instead of
        raising ``KeyError``.
        """
        pos_code = self.index_to_speech_part(index)
        return self.POS_CODE2HUMAN.get(pos_code, pos_code)

    def char_vocab_size(self):
        """Return the number of characters in the vocabulary."""
        return len(self._index2char)

    def part_vocab_size(self):
        """Return the number of part-of-speech codes in the vocabulary."""
        return len(self._index2part)
if __name__ == '__main__':
    # Manual smoke run: build the vocabulary from the OpenCorpora data file
    # and print what was skipped and what was indexed.
    from data_reader import OpenCorporaReader
    from download_data import OPEN_CORPORA_DEST_FILE

    data_reader = OpenCorporaReader(OPEN_CORPORA_DEST_FILE)
    data_reader.load()

    vocab = Vocab(data_reader)
    vocab.load()

    # Label/value pairs, printed one per line in this order.
    report = (
        ('skipped chars threshold', vocab._char_freq_threshold),
        ('skipped chars', vocab._skipped_chars),
        ('vocab', vocab._char2index),
    )
    for label, value in report:
        print(label, value)