-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathngram-en.py
139 lines (117 loc) · 3.5 KB
/
ngram-en.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# coding:utf-8
"""
N-Grams for Shakespeare corpus.
"""
import random
import operator
def text_clean(text_file):
    """
    Read a corpus file, pad punctuation with spaces, and lowercase it.

    Padding the marks (',', '.', '!', '?', '"') with spaces makes each
    mark its own token after ``str.split()``, so the delimiters can be
    used for prediction.

    :param text_file: path of the corpus file to read.
    :return: the whole corpus as one lowercased string.
    """
    pieces = []
    with open(text_file, 'r') as f:
        for line in f:
            # Pad punctuation so split() treats each mark as a token.
            for mark in (',', '.', '!', '?', '"'):
                line = line.replace(mark, ' ' + mark + ' ')
            pieces.append(line.lower())
    # Join once at the end instead of quadratic `+=` concatenation.
    return ''.join(pieces)
def ngram(text, grams):
    """
    Generate the list of n-grams of a whitespace-separated text.

    :param text: corpus as a single string; tokens are split on whitespace.
    :param grams: n, the number of tokens per gram.
    :return: list of space-joined n-gram strings, in corpus order
        (duplicates kept).
    """
    tokens = text.split()
    # range() is empty when grams > len(tokens), so no truncated partial
    # grams are produced (the original manual counter emitted some when
    # the slice end went negative).
    return [' '.join(tokens[i:i + grams])
            for i in range(len(tokens) - grams + 1)]
def count_gram(text_file, grams):
    """
    Build an add-one-smoothed n-gram probability table for a corpus.

    For each n-gram w1..wn the table stores
        P(wn | w1..wn-1) = (count(w1..wn) + 1) / (count(w1..wn-1) + V)
    where V is the number of unique tokens (add-one / Laplace smoothing).

    :param text_file: path of the corpus file.
    :param grams: n, the n-gram order (expected >= 2 so a non-empty
        history exists).
    :return: dict mapping each n-gram string to its smoothed probability.
    """
    text = text_clean(text_file)
    model = ngram(text, grams)
    lower_model = ngram(text, grams - 1)
    # Frequency of each n-gram.
    mdict = {}
    for item in model:
        mdict[item] = mdict.get(item, 0) + 1
    # Frequency of each (n-1)-gram history.
    lower_dict = {}
    for item in lower_model:
        lower_dict[item] = lower_dict.get(item, 0) + 1
    # Vocabulary size for the smoothing denominator.
    voca_set = set(text.split())
    # Add-One Smoothing.  Iterate the unique grams (mdict keys) instead of
    # the duplicated `model` list: the original recomputed the identical
    # probability once per occurrence.
    voca_prob_dict = {}
    for item in mdict:
        back_str = ' '.join(item.split()[:-1])
        prob = float(mdict[item] + 1) / (lower_dict[back_str] + len(voca_set))
        voca_prob_dict[item] = prob
    return voca_prob_dict
def generate_word(voca_prob_dict, pre, grams, word_length):
    """
    Greedily extend a seed sequence using the n-gram probability table.

    At each step the last (grams-1) tokens form the history; candidate
    next words are ranked by smoothed probability and one of the (up to)
    three most probable is picked at random.  Generation stops early
    when the current history never occurred in the corpus.

    :param voca_prob_dict: n-gram -> probability table from count_gram().
    :param pre: the seed word sequence provided (space-separated).
    :param grams: n, the n-gram order the table was built with.
    :param word_length: maximum number of words to append.
    :return: the full generated sequence as one string (also printed).
    """
    # Parenthesized single-argument print behaves identically in
    # Python 2 and is valid Python 3.
    print("The pre is: " + pre + '\n')
    pre_list = pre.split()
    for _ in range(word_length):
        # The last (grams-1) tokens are the prediction history.
        back_voca = ' '.join(pre_list[len(pre_list) - grams + 1:])
        predict_voca_dict = {}
        for item in voca_prob_dict:
            item_list = item.split()
            if ' '.join(item_list[:-1]) == back_voca:
                predict_voca_dict[item_list[-1]] = voca_prob_dict[item]
        if not predict_voca_dict:
            # Unseen history: nothing to predict, stop early.
            break
        ranked = sorted(predict_voca_dict.items(), key=operator.itemgetter(1))
        # ranked[-3:] already handles fewer than 3 candidates, so the
        # original length check was redundant.
        pre_list.append(random.choice(ranked[-3:])[0])
    s = ' '.join(pre_list)
    print(s)
    return s
if __name__ == '__main__':
    # Parenthesized single-argument prints behave identically in
    # Python 2 and are valid Python 3.
    # Game Of Thrones corpus: order-3 model, 50 generated words.
    print("********")
    print("Generate text with 'Game Of Thrones 01.txt'.")
    print("grams = 3")
    print("word length = 50")
    f1 = "novel/GameOfThrones01.txt"
    grams = 3
    word_length = 50
    vab = count_gram(f1, grams)
    pre1 = "There was an edge to this"
    generate_word(vab, pre1, grams, word_length)
    print("********\n")
    # Shakespeare corpus: order-4 model, 100 generated words.
    print("Generate text with 'shakespeare.txt'.")
    f2 = "novel/shakespeare.txt"
    print("grams = 4")
    print("word length = 100")
    grams = 4
    word_length = 100
    vab = count_gram(f2, grams)
    pre2 = "track them , and we"
    generate_word(vab, pre2, grams, word_length)
    print("********\n")