-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_generator.py
91 lines (75 loc) · 3.01 KB
/
text_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import corpus
import random
import time
import logging
import sys
# Token pattern made of three alternatives, tried in this order:
#   1. [mMdD][rs]s?\. ?[\w,]+  - an abbreviation such as "Mr.", "Mrs.",
#      "Ms." or "Dr." (either case of the first letter), an optional
#      space, and the following run of word characters/commas, all kept
#      together as a single match;
#   2. [\w]+'?[\w,]+           - a run of word characters, optionally
#      containing one apostrophe, possibly followed by commas; it needs
#      at least two characters, so one-letter words do not match;
#   3. [\.!\?:]                - a single '.', '!', '?' or ':' character.
REGEX = r"[mMdD][rs]s?\. ?[\w,]+|[\w]+'?[\w,]+|[\.!\?:]"
class TextGenerator(object):
    """Generate random text from token-pair statistics.

    The generator walks a table that maps a pair of consecutive tokens to
    the tokens that may follow that pair, each with a weight, and picks
    every next token at random in proportion to its weight.
    """

    def __init__(self, corpus):
        """Snapshot the statistics of *corpus*.

        Args:
            corpus: object exposing ``statements`` (a mapping from
                ``(token0, token1)`` to a mapping ``{next_token: weight}``)
                and ``BEGIN_CHAR`` (the marker this class uses both as the
                starting pair and as the end-of-sentence signal).
        """
        # copy() is whatever the table provides; for a plain dict it is a
        # shallow copy, so the inner weight mappings are still shared.
        self.statements = corpus.statements.copy()
        self.BEGIN_CHAR = corpus.BEGIN_CHAR

    def next_token(self, token0, token1):
        """Pick a token to follow the pair ``(token0, token1)``.

        Every candidate in ``self.statements[token0, token1]`` is chosen
        with probability proportional to its weight. Weights are expected
        to be positive integers: ``random.randint`` raises ``ValueError``
        when they sum to less than 1. A failed table lookup (``KeyError``
        for a plain dict) propagates to the caller.

        Returns:
            The selected token.
        """
        extensions = self.statements[token0, token1]
        # Draw a 1-based position inside the total weight, then find the
        # candidate whose cumulative weight range contains it.
        choice = random.randint(1, sum(extensions.values()))
        for extension, weight in extensions.items():
            if choice <= weight:
                return extension
            choice -= weight

    def gen_sentence(self):
        """Generate one sentence and return it as a string.

        Starting from the pair ``(BEGIN_CHAR, BEGIN_CHAR)``, tokens are
        drawn with :meth:`next_token` until ``BEGIN_CHAR`` is drawn again.
        Tokens are separated by single spaces, except that the first token
        and punctuation tokens are attached without a leading space. The
        result goes through ``str.capitalize()``, i.e. the first character
        is upper-cased and the rest is lower-cased.
        """
        pieces = []
        token0 = token1 = self.BEGIN_CHAR
        while True:
            token0, token1 = token1, self.next_token(token0, token1)
            if token1 == self.BEGIN_CHAR:
                break
            # Substring test kept on purpose: it is what decides whether a
            # token is glued to the previous one.
            if token1 in '.!?,;:' or token0 == self.BEGIN_CHAR:
                pieces.append(token1)
            else:
                pieces.append(' ' + token1)
        # Joining once avoids repeated string concatenation in the loop.
        return ''.join(pieces).capitalize()

    def gen_text_line(self, sentences_count, min_paragraph_size,
                      max_paragraph_size):
        """Lazily yield the pieces of a text split into paragraphs.

        The first piece is a tab. Each following piece is one generated
        sentence plus a separator: a space inside a paragraph, or
        ``'\\n\\t'`` after the last sentence of a paragraph. The length of
        every paragraph is drawn uniformly from
        ``[min_paragraph_size, max_paragraph_size]``; a drawn length of 0
        is treated as a one-sentence paragraph.

        Args:
            sentences_count: total number of sentences to produce (>= 0).
            min_paragraph_size: smallest paragraph length (>= 0).
            max_paragraph_size: largest paragraph length
                (>= ``min_paragraph_size``).

        If the arguments are out of range, 'Bad Values of Arguments.' is
        printed and nothing is yielded. Because this is a generator, the
        check runs when iteration starts, not when the method is called.
        Errors raised while generating a sentence propagate to the caller
        instead of being reported as bad arguments.
        """
        if (min_paragraph_size < 0 or max_paragraph_size < 0 or
                max_paragraph_size < min_paragraph_size or
                sentences_count < 0):
            # Parenthesised single argument prints identically on
            # Python 2 and Python 3.
            print('Bad Values of Arguments.')
            return

        paragraph_left = random.randint(min_paragraph_size,
                                        max_paragraph_size)
        yield '\t'
        # Plain counter instead of xrange/range: same behaviour on
        # Python 2 and 3 without building a list on Python 2.
        produced = 0
        while produced < sentences_count:
            produced += 1
            paragraph_left -= 1
            if paragraph_left > 0:
                ending = ' '
            else:
                # "<= 0" rather than "== 0": a drawn size of 0 would
                # otherwise count down into negative numbers and the
                # paragraph would never end.
                ending = '\n\t'
                paragraph_left = random.randint(min_paragraph_size,
                                                max_paragraph_size)
            yield self.gen_sentence() + ending
if __name__ == "__main__":
    # Command line: INPUT_FILE OUTPUT_FILE SENTENCES_COUNT.
    # All arguments are read and validated before any work starts, so a
    # missing or malformed argument is reported immediately rather than
    # after the corpus has already been loaded.
    if len(sys.argv) < 4:
        sys.exit('Usage: {} INPUT_FILE OUTPUT_FILE SENTENCES_COUNT'.format(
            sys.argv[0]))
    INPUT_FILE_NAME = sys.argv[1]
    OUTPUT_FILE_NAME = sys.argv[2]
    try:
        SENTENCES_COUNT = int(sys.argv[3])
    except ValueError:
        sys.exit('SENTENCES_COUNT must be an integer, got {!r}'.format(
            sys.argv[3]))

    logging.basicConfig(format='%(message)s', level=logging.INFO)
    # Centres each log message in a 50-character line padded with '-'.
    template = '{:-^50}'
    END_CHARS = '.?!'

    # REGEX is the module-level token pattern. The instance gets its own
    # name so the imported ``corpus`` module is not rebound.
    text_corpus = corpus.Corpus(REGEX, END_CHARS)

    log = 'Reading from {}'.format(INPUT_FILE_NAME)
    logging.info(template.format(log))
    start_time = time.time()
    text_corpus.load(INPUT_FILE_NAME)
    log = 'Time: {} s'.format(time.time() - start_time)
    logging.info(template.format(log))

    log = 'Writing to {}'.format(OUTPUT_FILE_NAME)
    logging.info(template.format(log))
    # The timed section covers building the generator as well as writing.
    start_time = time.time()
    text_generator = TextGenerator(text_corpus)
    with open(OUTPUT_FILE_NAME, 'w') as f:
        # Paragraphs of 1 to 10 sentences; pieces are written as they are
        # generated.
        f.writelines(text_generator.gen_text_line(SENTENCES_COUNT, 1, 10))
    log = 'Time: {} s'.format(time.time() - start_time)
    logging.info(template.format(log))