-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathngrams_format_dic.py
120 lines (110 loc) · 2.4 KB
/
ngrams_format_dic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import re
import sys
# Regex patterns for contractions that appear in the input split into two
# space-separated pieces (e.g. "don 't", "i 'm").  Each pattern has two
# capture groups -- the stem and the apostrophe suffix -- so substituting
# r'\1\2' glues them back together ("don't", "i'm").
# Raw strings are used so the apostrophe needs no escaping.
contractions = [
    r"(ain) ('t)",
    r"(aren) ('t)",
    r"(can) ('t)",
    r"(couldn) ('t)",
    r"(didn) ('t)",
    r"(doesn) ('t)",
    r"(don) ('t)",
    r"(hadn) ('t)",
    r"(hasn) ('t)",
    r"(haven) ('t)",
    r"(isn) ('t)",
    r"(mightn) ('t)",
    r"(mustn) ('t)",
    r"(needn) ('t)",
    r"(shan) ('t)",
    r"(shouldn) ('t)",
    r"(wasn) ('t)",
    r"(weren) ('t)",
    r"(won) ('t)",
    r"(wouldn) ('t)",
    r"(he) ('d)",
    r"(he) ('ll)",
    r"(he) ('s)",
    r"(how) ('d)",
    r"(how) ('ll)",
    r"(how) ('s)",
    r"(i) ('d)",
    r"(i) ('ll)",
    r"(i) ('m)",
    r"(i) ('ve)",
    r"(it) ('d)",
    r"(it) ('ll)",
    r"(it) ('s)",
    r"(let) ('s)",
    r"(ma) ('am)",
    r"(might) ('ve)",
    r"(must) ('ve)",
    r"(o) ('clock)",
    r"(she) ('d)",
    r"(she) ('ll)",
    r"(she) ('s)",
    r"(should) ('ve)",
    r"(somebody) ('s)",
    r"(someone) ('s)",
    r"(something) ('s)",
    r"(that) ('d)",
    r"(that) ('ll)",
    r"(that) ('s)",
    r"(there) ('d)",
    r"(there) ('ll)",
    r"(there) ('s)",
    r"(they) ('d)",
    r"(they) ('ll)",
    r"(they) ('re)",
    r"(they) ('ve)",
    r"(we) ('d)",
    r"(we) ('ll)",
    r"(we) ('re)",
    r"(we) ('ve)",
    r"(what) ('d)",
    r"(what) ('ll)",
    r"(what) ('re)",
    r"(what) ('s)",
    r"(when) ('d)",
    r"(when) ('ll)",
    r"(when) ('s)",
    r"(where) ('d)",
    r"(where) ('ll)",
    r"(where) ('s)",
    r"(who) ('d)",
    r"(who) ('ll)",
    r"(who) ('re)",
    r"(who) ('s)",
    r"(who) ('ve)",
    r"(why) ('d)",
    r"(why) ('ll)",
    r"(why) ('s)",
    r"(would) ('ve)",
    r"(you) ('d)",
    r"(you) ('ll)",
    r"(you) ('re)",
    r"(you) ('ve)",
]

# Compiled once up front instead of handing the pattern strings to the
# ``re`` module again for every input line.
_CONTRACTION_PATTERNS = [re.compile(pattern) for pattern in contractions]

# Trailing ",<digits>" field at the end of a line.  The newline is optional
# so that a final line without a line terminator is still stripped.
_COUNT_SUFFIX = re.compile(r',\d+\n?\Z')


def format_line(line):
    """Return the cleaned form of one raw input line, or None to skip it.

    Steps, in order:
      1. lower-case the line;
      2. drop a trailing ``,<digits>`` field together with the newline;
      3. skip the line (return None) if it contains two consecutive
         double-quote characters (presumably a CSV-escaped literal quote
         -- TODO confirm against the input format);
      4. remove every remaining double-quote character;
      5. re-join split contractions listed in ``contractions``
         ("don 't" -> "don't").

    A line that has no ``,<digits>`` suffix is returned with its original
    line ending intact.
    """
    line = _COUNT_SUFFIX.sub('', line.lower())
    if '""' in line:
        return None
    line = line.replace('"', '')
    # Cheap guard: without an apostrophe no contraction pattern can match.
    if "'" in line:
        for pattern in _CONTRACTION_PATTERNS:
            line = pattern.sub(r'\1\2', line)
    return line


def main(argv=None):
    """Read the file named by argv[1] and write cleaned lines to argv[2].

    ``argv`` defaults to ``sys.argv``.  Returns 0 on success, or 2 (after
    printing a usage message to stderr) when fewer than two file arguments
    are supplied.  Extra arguments are ignored.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        prog = argv[0] if argv else "ngrams_format_dic.py"
        print("usage: %s NGRAMS_FILE OUTPUT_FILE" % prog, file=sys.stderr)
        return 2

    # ``with`` guarantees both files are closed even if processing fails.
    # The input is opened first, so a missing input does not create output.
    with open(argv[1], "r") as ngramsfile, \
            open(argv[2], "wt") as outngramsfile:
        for raw_line in ngramsfile:
            formatted = format_line(raw_line)
            if formatted is None:
                continue
            outngramsfile.write(formatted)
            outngramsfile.write("\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())