-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathperse_tokens.py
456 lines (388 loc) · 15.3 KB
/
perse_tokens.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
"""
extract tokens from bangla text
"""
import os
import sys
from collections import OrderedDict
SHOR_BORNO_CHARS = 'অআইঈউঊঋএঐওঔ'
BANJON_BORNO_CHARS = 'কখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহড়ঢ়য়'
BANJON_BORNO_SPATIAL = 'ং ঃ ৎ'
ENDING_SYMBOLS = 'ো ৌ ি ী'
AKAR_EKAR_OIKAR = 'া ে ৈ'
ENDING_SYMBOLS_SPATIAL = 'ু ূ ৃ ্ ঁ'
CHONDRO_BRINDO = 'ঁ'
# NORMAL_SYMBOLS = 'ক ষ ম র'
NORMAL_SYMBOLS = SHOR_BORNO_CHARS + BANJON_BORNO_CHARS
DUMMY_SYMBOLS = '1234567890' # this is added for handle index out of error
connector = '্'
# define colors
CRED = '\033[91m'
CEND = '\033[0m'
BN_MAX_CLASSES = 904
def get_en_unique_combination():
# unique_chars_set = [c for c in EN_CHARS_SET]
unique_chars_set = []
cls_cnt = 0
for c in EN_CHARS_SET:
for ch in EN_CHARS_SET:
unique_chars_set.append(c + ch)
cls_cnt += 1
if cls_cnt == BN_MAX_CLASSES:
return unique_chars_set
def get_bn_class():
bn_class = []
with open(BN_CLASS_FREQUENCY) as f:
for line in f:
line = line.strip(' \n')
_class_name, _ = line.split(' ')
bn_class.append(_class_name)
return bn_class
def init_en_bn_maping():
en_class = get_en_unique_combination()
bn_class = get_bn_class()
f_bn_to_en = open('bn_to_en_maping.txt', 'w')
f_en_to_bn = open('en_to_bn_maping.txt', 'w')
for en, bn in zip(en_class, bn_class):
f_en_to_bn.write(en + ' ' + bn + '\n')
f_bn_to_en.write(bn + ' ' + en + '\n')
EN_TO_BN_MAP[en] = bn
BN_TO_EN_MAP[bn] = en
def print_c(log_msg):
print(CRED + log_msg + CEND)
def read_bangla_corpus(dictionary_path):
with open(dictionary_path, 'r') as corpus:
big_line = ''
big_line_set = set()
for line in corpus:
line = line.strip('\n')
line = line.strip(' ')
# print(line)
big_line += line
big_line_set.add(line)
print('len big_line', len(big_line))
print('len big_line_set', len(big_line_set))
return big_line
def is_normal_char(char):
if char in NORMAL_SYMBOLS:
return True
return False
def is_ending_char(char):
if char in ENDING_SYMBOLS:
return True
return False
def is_ending_char_spatial(char):
if char in ENDING_SYMBOLS_SPATIAL:
return True
return False
def is_spatial_type_banjn_borno(char):
if char in BANJON_BORNO_SPATIAL:
return True
return False
def is_akar_ekar_oikar(char):
if char in AKAR_EKAR_OIKAR:
return True
return False
def is_jukto_borno(chars, start_index, log=False):
if log:
print('///////// In is_jukto_borno method ////////')
c = chars[start_index]
c_next = chars[start_index + 1]
c_next_next = chars[start_index + 2]
# variables for 2 connected jukto_borno
c_next_next_next = chars[start_index + 3]
c_next_next_next_next = chars[start_index + 4]
c_next_next_next_next_next = chars[start_index + 5]
jukto_borno = ''
is_found_jukto_borno = False
new_start_index = start_index
if log:
print('------> Start index ', start_index)
print('1', c, ' 2', c_next, ' 3', c_next_next, ' 4', c_next_next_next, ' 5',\
c_next_next_next_next, ' 5', c_next_next_next_next_next)
if is_normal_char(c) and is_ending_char_spatial(c_next) and \
c_next_next == connector and is_normal_char(c_next_next_next):
# example : গণঅভ্যুত্থানের = গ+ণ+অ+ভ+ু+্+য+ত+্+থ+া+ন+ে+র
is_found_jukto_borno = True
jukto_borno = c + c_next + c_next_next + c_next_next_next
new_start_index = start_index + 4
# check two connector jukto borno
# example: ষ + ্ + ক + ্ + র = ষ্ক্র
elif is_normal_char(c) and c_next == connector and is_normal_char(c_next_next) \
and c_next_next_next == connector and is_normal_char(c_next_next_next_next): \
# this is 2 connected juktorborno
if log:
print('...In two connector jukto-borno')
is_found_jukto_borno = True
# check two connector jukto-borno with 'ENDING SYMBOLS'
# example: ষ + ্ + ক + ্ + র + া = ষ্ক্রা
if is_ending_char(c_next_next_next_next_next):
if log:
print('In two connector jukto-borno with ending symbols')
jukto_borno = c + c_next + c_next_next + c_next_next_next + c_next_next_next_next \
+ c_next_next_next_next_next
new_start_index = start_index + 6
else:
# example: ষ + ্ + ক + ্ + র + া = ষ্ক্র
if log:
print('In two connector jukto-borno without ending symbols')
# this is two connector jukto-borno without 'ENDING SYMBOLS'
jukto_borno = c + c_next + c_next_next + c_next_next_next + c_next_next_next_next
new_start_index = start_index + 5
# check one connector jukto borno
# example: ক + ্ + ষ = ক্ষ
elif is_normal_char(c) and c_next == connector and is_normal_char(c_next_next):
if log:
print('...In one connector juktoborno')
is_found_jukto_borno = True
# chekc jukto borno is with 'ENDING SYMBOLS'
# example: ক + ্ + ষ + া = ক্ষা
if is_ending_char(c_next_next_next):
if log:
print("This is jukto borno with 'ENDING_SYMBOLS' ")
jukto_borno = c + c_next + c_next_next + c_next_next_next
# update start index
new_start_index = start_index + 4
else:
# example: ক + ্ + ষ + া = ক্ষ
if log:
print("This is jukto borno without any 'ENDING_SYMBOLS' ")
jukto_borno = c + c_next + c_next_next
# update start index
new_start_index = start_index + 3
# chech jukto-borno has 'ENDING_SYMBOLS_SPATIAL'
# example: ক + ্ + ক + ু = ক্কু
temp_spatial_char = chars[new_start_index]
if log:
print('temp_spatial_char', temp_spatial_char)
if is_ending_char_spatial(temp_spatial_char):
if log:
print('This is juktoborno with spatial ending char')
jukto_borno += temp_spatial_char
new_start_index = new_start_index + 1
if log:
print('////// End of is_jukto_borno method ///////// ', new_start_index)
return is_found_jukto_borno, jukto_borno, new_start_index
def class_extractor(chars, log=True):
class_list = []
len_chars = len(chars)
chars = chars + DUMMY_SYMBOLS
i = 0
while i < len_chars:
# print('Number of symbols processed', i)
if log:
print('i', i)
if chars[i] == ' ':
print_c('space found in position: ' + str(i))
i += 1
continue
c = chars[i]
c_next = chars[i + 1]
c_next_next = chars[i + 2]
c_next_next_next = chars[i + 3]
if log:
print('1:', c, '2:', c_next, '3:', c_next_next, '4:', c_next_next_next)
is_found_jukto_borno, jukto_borno, new_start_index = is_jukto_borno(chars, i)
# print('jukto_borno_found:', is_found_jukto_borno)
# init
chunk = ''
start = i
if is_spatial_type_banjn_borno(c):
chunk = c
start = i + 1
elif is_found_jukto_borno:
if log:
print('jukto-borno', jukto_borno)
chunk = jukto_borno
# set start positon for next iteration of i
start = new_start_index
if log:
print('start in is_found_jukto_borno', start)
elif is_normal_char(c) and is_normal_char(c_next):
chunk = c
# set start positon for next iteration of i
start = i + 1
elif is_normal_char(c) and is_ending_char(c_next) and is_ending_char(c_next_next):
chunk = c + c_next + c_next_next
start = i + 3
elif is_normal_char(c) and is_spatial_type_banjn_borno(c_next):
# print('in')
# example: অংস
chunk = c
start = i + 1
elif is_normal_char(c) and is_ending_char_spatial(c_next) and c_next_next == CHONDRO_BRINDO:
# example: 'অগ্নিঝুঁকিতে'
chunk = c + c_next + c_next_next
start = i + 3
elif is_spatial_type_banjn_borno(c) and is_normal_char(c_next):
# example: অংস
# print('in2')
chunk = c
start = i + 1
elif is_spatial_type_banjn_borno(c) and c_next == CHONDRO_BRINDO:
# example : হুঁশিয়ার
chunk = c + c_next
start = i + 2
elif is_normal_char(c) and is_ending_char(c_next):
# chekc it has ENDING_SYMBOLS_SPATIAL
# example : ক ু া
if is_ending_char_spatial(c_next_next):
if log:
print('normal + ending + spatial + ending')
chunk = c + c_next + c_next_next
start = i + 3
else:
if log:
print('normal + ending')
# ending chars with out spatial ending symbols
# example : কা = ক + া
chunk = c + c_next
# print('chunk is ', chunk)
# set start positon for next iteration of i
start = i + 2
elif is_normal_char(c) and is_ending_char_spatial(c_next):
# print('ami in ?')
# check has ending char
if is_ending_char(c_next_next):
chunk = c + c_next + c_next_next
start = i + 3
else:
# print('in else')
chunk = c + c_next
# print(chunk)
start = i + 2
elif is_normal_char(c) and c_next == connector and is_normal_char(c_next_next):
if is_ending_char(c_next_next_next) or is_ending_char_spatial(c_next_next_next):
chunk = c + c_next + c_next_next + c_next_next_next
start = i + 4
else:
chunk = c + c_next + c_next_next
start = i + 3
elif is_normal_char(c) and c_next == connector:
# example: অপ্
chunk = c + c_next
start = i + 2
elif is_normal_char(c) and c_next == '1':
# this is the last chars of the line
chunk = c
start = i + 1
elif is_normal_char(c) and is_akar_ekar_oikar(c_next) and c_next_next == CHONDRO_BRINDO:
chunk = [c + c_next_next, c_next]
start = i + 3
elif is_akar_ekar_oikar(c):
chunk = c
start = i + 1
elif is_normal_char(c):
chunk = c
start = i + 1
elif str(c) in DUMMY_SYMBOLS:
# print('dummy')
# we reach to end
# example = হল্
break
if type(chunk) is list:
print('chunk is list')
for item in chunk:
class_list.append(item)
else:
# write data to text file
# print('chunk is 2', chunk)
if chunk != '':
class_list.append(chunk)
else:
print_c('chunk is empty for: ' + chars[i])
# skip this word
return []
# updae the start position of i
i = start
return class_list
def max_squence_lenght():
max_len = 0
min_len = 1000
avg_len = 0
with open('corpus/bangla_golpo_monogram.txt', 'r') as corpus:
total_words = 0
for line in corpus:
line = line.strip(' \n')
word = line.split(' ')[0]
en_word = convert_bn_to_en(word)
_len = len(en_word) * 2
print('bn word', word, ' en word', en_word, ' en len', _len)
if _len > max_len:
max_len = _len # each symbol is represented by 2 english chars
if _len < min_len:
min_len = _len
avg_len += _len
total_words += 1
print('max squence len:', max_len, ' min len:', min_len, ' avg_len:', avg_len / total_words)
def go(corpus_text):
f_not_passed = open('data/bn_not-passed.txt', 'w')
bangla_unique_class = set()
bangla_unique_word = set()
bangla_cls_frequency = {}
with open(corpus_text, 'r') as f:
# with open('bn-corpus/utest.txt', 'r') as f:
line_counter = 0
for line in f:
print('line is', line)
line = line.strip('\n')
line = line.strip(' ')
bangla_unique_word.add(line)
class_list = class_extractor(line)
# print('class_list', class_list)
if len(class_list) >= 1:
for chunk in class_list:
# print('chunk is', chunk)
bangla_unique_class.add(chunk)
# count frequency of a chunk
if chunk not in bangla_cls_frequency:
bangla_cls_frequency[chunk] = 1
else:
bangla_cls_frequency[chunk] += 1
else:
print_c('can not process word: ' + line + " lc " + str(line_counter))
f_not_passed.write(line)
f_not_passed.write('\n')
line_counter += 1
f_not_passed.close()
# sort dictionary by frequency count value
bangla_cls_frequency = OrderedDict(sorted(bangla_cls_frequency.items(), key=lambda x: x[1], reverse = True))
print('*' * 35)
print("Total unique word found :", len(bangla_unique_word))
print('Number of unique class found:', len(bangla_unique_class))
# write unique class to a file
with open('data/bn_unique_class.txt', 'w') as f_unique:
for chunk in bangla_unique_class:
f_unique.write(chunk)
f_unique.write('\n')
# write class presence frequency
with open('data/bn_class-frequency_count.txt', 'w') as f_cls_freqency:
for key, value in bangla_cls_frequency.items():
f_cls_freqency.write(key)
f_cls_freqency.write(' ')
f_cls_freqency.write(str(value))
f_cls_freqency.write('\n')
bangla_unique_word = list(bangla_unique_word)
bangla_unique_word.sort(key=len)
with open('data/bn_unique_word.txt', 'w') as w:
for word in bangla_unique_word:
w.write(word + '\n')
def debug():
chars = 'হ্যাঁ'
class_extractor(chars)
with open('single_chars.txt', 'w') as f:
for c in chars:
print(c)
f.write(c)
f.write('\n')
class_list = class_extractor(chars)
print(class_list)
def test_words():
words = ["হ্যান্ডব্যাগেই", "বিশ্ববিদ্যালয়গুলো", "ইন্টেলিজেন্সের","হিষ্টিরিয়াগ্রস্তের","মুক্তিযুদ্ধের"]
for word in words:
class_list = class_extractor(word)
print(f"{word} => {class_list}")
if __name__ == "__main__":
corpus_path = 'corpus/BengaliWordList_439.txt'
go(corpus_path)
# debug()
# test_words()