-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfinal_tests.py
338 lines (291 loc) · 16.2 KB
/
final_tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
# -*- coding: utf-8 -*-
"""Tests for CS109 Python Final Project.
This does not test for all the requirements of the assignment! So make
sure you test it yourself.
This entire set of tests should not take more than 30 seconds to run.
My implementation takes 9s (on an i5 at 3.3 GHz). My implementation is
not particularly optimized. Just make sure you are not iterating over any
sequences repeatedly when you don't need to, or anything like that.
(HINT: Use dicts when you need to perform lookups)
Run this script using:
python3.5 final_tests.py
It should work if you are in the same directory as your final.py and
graph.py files. If this does not work you may want to try:
PYTHONPATH=[directory containing final.py and graph.py] python3.5 final_tests.py
NOTE: These tests use the internet to test train_url. So those tests
will fail if you do not have internet access.
"""
# DO NOT CHANGE THIS FILE. Grading will be done with an official
# version, so make sure your code works with this exact version.
import itertools
import os
import sys
import tempfile
import unittest
from collections import deque
from contextlib import contextmanager
from pathlib import Path

import final
# You may find some of the functions or code here useful in either
# your tests or your implementation. Feel free to use it, but of
# course cite and credit your source. (This is a hint, but I'm not
# telling you which function you need.)
@contextmanager
def nonexistant_filename(suffix=""):
    """Context manager yielding a path at which no file currently exists.

    A named temporary file is created only to reserve a unique name and is
    deleted again before the path is handed to the caller. On exit, whatever
    the caller created at that path is removed; if nothing was created, the
    missing file is silently ignored.

    Args:
        suffix: Suffix for the generated file name.

    Yields:
        The reserved path as a ``str``.
    """
    reserved = tempfile.NamedTemporaryFile(mode="w", suffix=suffix, delete=False)
    with reserved:
        path = reserved.name
    # The placeholder only served to pick a name; the caller expects no file.
    os.remove(path)
    try:
        yield str(path)
    finally:
        try:
            os.remove(path)
        except FileNotFoundError:
            # The caller never created the file, so there is nothing to clean up.
            pass
@contextmanager
def filled_filename(content, suffix=""):
    """Context manager yielding the path of a temporary file holding ``content``.

    Args:
        content: Data to write. A ``str`` is written in text mode; anything
            else is written in binary mode.
        suffix: Suffix for the generated file name.

    Yields:
        The path of the closed, fully written file, as a ``str``.

    The file is removed when the context exits. It is also removed if writing
    ``content`` fails, so a bad argument does not leave a stray temporary file
    behind.
    """
    mode = "w" if isinstance(content, str) else "wb"
    fi = tempfile.NamedTemporaryFile(mode=mode, suffix=suffix, delete=False)
    filename = fi.name
    try:
        # Close (and thereby flush) the file before handing the path out.
        with fi:
            fi.write(content)
        yield str(filename)
    finally:
        # Runs for a failed write as well as for normal exit of the context.
        os.remove(filename)
def windowed(iterable, size):
    """Yield every run of ``size`` consecutive items of ``iterable`` as a tuple.

    The windows are produced by sliding a window over the input one item at a
    time, so consecutive windows overlap by ``size - 1`` items. Nothing is
    yielded if the input has fewer than ``size`` items.

    Args:
        iterable: Any iterable; it is consumed lazily, in a single pass.
        size: Number of items per window; must be at least 1.

    Yields:
        Tuples of length ``size``, in input order.

    Raises:
        ValueError: If ``size`` is less than 1. Because this is a generator,
            the error is raised when iteration starts, not at call time.
    """
    if size < 1:
        raise ValueError("size must be at least 1, got {!r}".format(size))
    # A bounded deque discards its oldest item on append once it is full,
    # which slides the window without the O(size) cost of list.pop(0).
    window = deque(maxlen=size)
    for v in iterable:
        window.append(v)
        # Only emit once the window has filled up for the first time.
        if len(window) == size:
            yield tuple(window)
def contains_sequence(iteratable, sequence, length=10000, require_length=True, times=1):
    """Report whether ``sequence`` occurs as a contiguous run in ``iteratable``.

    Only the first ``length`` windows of ``iteratable`` (each as long as
    ``sequence``) are examined.

    Args:
        iteratable: The items to search.
        sequence: The run of items to look for; compared as a tuple.
        length: Maximum number of windows to inspect.
        require_length: Accepted for compatibility; currently has no effect.
        times: Number of matching windows required for a positive result.

    Returns:
        True as soon as ``times`` matching windows have been seen, otherwise
        False once the inspected windows are exhausted.
    """
    target = tuple(sequence)
    candidates = itertools.islice(windowed(iteratable, len(target)), length)
    remaining = times
    for candidate in candidates:
        if candidate != target:
            continue
        remaining -= 1
        if remaining <= 0:
            return True
    return False
class RandomWriterTests(unittest.TestCase):
    """Some simple tests for RandomWriter.

    This is not an exhaustive test suite.
    """

    # Default number of windows inspected by the sequence assertions; also the
    # amount of output requested from generate_file in the file tests.
    DEFAULT_LENGTH = 10090

    # Training text shared by several tests. test_train_twice feeds the two
    # parts separately; the other tests train on the concatenation.
    _SPEECH_PART1 = (
        "What a piece of work is man! how noble in reason! "
        "how infinite in faculty! in form and moving how express and admirable! "
    )
    _SPEECH_PART2 = (
        "in action how like an angel! in apprehension how like a god! "
        "the beauty of the world, the paragon of animals!"
    )
    _SPEECH = _SPEECH_PART1 + _SPEECH_PART2

    def assertContainsSequence(self, iteratable, sequence, length=None, times=1):
        """Fail unless ``sequence`` occurs at least ``times`` times.

        The first ``length + 2 * len(sequence)`` items of ``iteratable`` are
        materialized and the first ``length`` windows of them are searched.
        A falsy ``length`` (``None`` or 0) means ``DEFAULT_LENGTH``. On
        failure the message includes the first 100 items as a sample.
        """
        length = length or self.DEFAULT_LENGTH
        lst = list(itertools.islice(iteratable, length + len(sequence) * 2))
        if not contains_sequence(lst, sequence, length, times=times):
            self.fail("The given iterable must contain the sequence: {} at least {} times "
                      "(in the first {} elements)\nSample: {}".format(
                          list(sequence), times, length,
                          ", ".join(repr(x) for x in lst[:100])))

    def assertNotContainsSequence(self, iteratable, sequence, length=None):
        """Fail if ``sequence`` occurs in the inspected part of ``iteratable``.

        Uses the same materialization and window limit as
        ``assertContainsSequence``; a single occurrence is enough to fail.
        """
        length = length or self.DEFAULT_LENGTH
        lst = list(itertools.islice(iteratable, length + len(sequence) * 2))
        if contains_sequence(lst, sequence, length):
            self.fail("The given iterable must NOT contain the sequence: {} "
                      "(in the first {} elements)\nSample: {}".format(
                          list(sequence), length,
                          ", ".join(repr(x) for x in lst[:100])))

    def test_numeric_sequence(self):
        # Untokenized numeric input: runs absent from the training data must
        # not be generated, while a run present in it must show up repeatedly.
        rw = final.RandomWriter(2)
        rw.train_iterable((1, 2, 3, 4, 5, 5, 4, 3, 2, 1))
        self.assertNotContainsSequence(rw.generate(), [5, 5, 3])
        self.assertNotContainsSequence(rw.generate(), [1, 2, 5])
        self.assertNotContainsSequence(rw.generate(), [2, 4])
        self.assertContainsSequence(rw.generate(), [3, 4, 5, 5, 4, 3, 2], times=10)

    def test_words(self):
        # Word tokenization on a short sentence.
        rw = final.RandomWriter(1, final.Tokenization.word)
        rw.train_iterable("the given iterable must contain the sequence the")
        self.assertNotContainsSequence(rw.generate(), "the the".split(" "))
        self.assertNotContainsSequence(rw.generate(), "the iterable".split(" "))
        self.assertContainsSequence(rw.generate(), "iterable must contain".split(" "), times=10)
        self.assertContainsSequence(rw.generate(), "the sequence".split(" "), times=200)

    def test_save_load_pickle(self):
        # Round-trip a trained writer through save_pickle/load_pickle and
        # check the *loaded* writer, so that a broken load is detected.
        rw = final.RandomWriter(1, final.Tokenization.character)
        rw.train_iterable("abcaea")
        with nonexistant_filename() as fn:
            rw.save_pickle(fn)
            rw2 = final.RandomWriter.load_pickle(fn)
            self.assertNotContainsSequence(rw2.generate(), "ac")
            self.assertNotContainsSequence(rw2.generate(), "aa")
            self.assertNotContainsSequence(rw2.generate(), "ce")
            self.assertContainsSequence(rw2.generate(), "abc", times=100)
            self.assertContainsSequence(rw2.generate(), "aeaeab", times=100)

    def test_generate_file1(self):
        # Character tokenization written to a file and read back as text.
        rw = final.RandomWriter(1, final.Tokenization.character)
        rw.train_iterable("abcaea")
        with nonexistant_filename() as fn:
            rw.generate_file(fn, self.DEFAULT_LENGTH)
            with open(fn, "rt") as fi:
                content = fi.read()
            self.assertNotContainsSequence(content, "ac")
            self.assertNotContainsSequence(content, "aa")
            self.assertNotContainsSequence(content, "ce")
            self.assertContainsSequence(content, "abc", times=100)
            self.assertContainsSequence(content, "aeaeab", times=100)

    def test_generate_file4(self):
        # Byte tokenization written to a file and read back in binary mode.
        rw = final.RandomWriter(1, final.Tokenization.byte)
        # Same shape as "abcaea" in test_generate_file1:
        #                    a   b   c   a   e   a
        rw.train_iterable(b"\xfe\xff\x02\xfe\x03\xfe")
        with nonexistant_filename() as fn:
            rw.generate_file(fn, self.DEFAULT_LENGTH)
            with open(fn, "rb") as fi:
                content = fi.read()
            self.assertNotContainsSequence(content, b"\xfe\x02")
            self.assertNotContainsSequence(content, b"\xfe\xfe")
            self.assertNotContainsSequence(content, b"\x02\x03")
            self.assertContainsSequence(content, b"\xfe\xff\x02", times=100)
            self.assertContainsSequence(content, b"\xfe\x03\xfe\x03\xfe\xff", times=100)

    def test_generate_file_size(self):
        # The generated file must hold the requested amount, with at most two
        # extra characters tolerated.
        rw = final.RandomWriter(1, final.Tokenization.character)
        rw.train_iterable("abcaea")
        with nonexistant_filename() as fn:
            rw.generate_file(fn, self.DEFAULT_LENGTH)
            with open(fn, "rt") as fi:
                content = fi.read()
            self.assertGreaterEqual(len(content), self.DEFAULT_LENGTH)
            self.assertLessEqual(len(content), self.DEFAULT_LENGTH + 2)

    def test_generate_file2(self):
        # Word tokenization written to a file; the file content is searched
        # character by character, so the expected runs include the space.
        rw = final.RandomWriter(1, final.Tokenization.word)
        rw.train_iterable("a the word the")
        with nonexistant_filename() as fn:
            rw.generate_file(fn, self.DEFAULT_LENGTH)
            with open(fn, "rt") as fi:
                content = fi.read()
            self.assertNotContainsSequence(content, "the a")
            self.assertContainsSequence(content, "the word", times=100)

    def test_generate_file3(self):
        # Untokenized numeric input written to a file; the expected strings
        # have the numbers separated by single spaces.
        rw = final.RandomWriter(2, final.Tokenization.none)
        rw.train_iterable((1, 2, 3, 4, 5, 5, 4, 3, 2, 1))
        with nonexistant_filename() as fn:
            rw.generate_file(fn, self.DEFAULT_LENGTH)
            with open(fn, "rt") as fi:
                content = fi.read()
            self.assertNotContainsSequence(content, "5 5 3")
            self.assertNotContainsSequence(content, "1 2 5")
            self.assertContainsSequence(content, "3 4 5 5 4 3 2", times=100)

    def test_numeric_sequence_in(self):
        # Generated items keep their original type and expected runs appear.
        rw = final.RandomWriter(2)
        rw.train_iterable((1, 2, 3, 4, 5, 5, 5, 4, 3, 2, 1, 2, 4, 5))
        self.assertIsInstance(next(iter(rw.generate())), int)
        self.assertContainsSequence(rw.generate(), [3, 4, 5, 5, 4, 3, 2], times=10)
        self.assertContainsSequence(rw.generate(), [3, 4, 5, 5, 5, 5, 4, 3, 2])
        self.assertContainsSequence(rw.generate(), [5, 5, 5, 5, 5])
        self.assertContainsSequence(rw.generate(), [3, 2, 1, 2, 4, 5, 5, 4])
        self.assertContainsSequence(rw.generate(), [3, 2, 1, 2, 3, 4, 5, 5, 4])

    def test_numeric_sequence_notin(self):
        # Runs that cannot follow from the training data never appear, and
        # ints are not turned into strings.
        rw = final.RandomWriter(2)
        rw.train_iterable((1, 2, 3, 4, 5, 5, 5, 4, 3, 2, 1, 2, 4, 5))
        self.assertNotContainsSequence(rw.generate(), [5, 5, 3])
        self.assertNotContainsSequence(rw.generate(), [1, 2, 5])
        self.assertNotContainsSequence(rw.generate(), [4, 2])
        self.assertNotContainsSequence(rw.generate(), ["5"])

    def test_generate_count(self):
        # generate() must be able to supply at least 10000 items.
        rw = final.RandomWriter(2, final.Tokenization.character)
        rw.train_iterable(self._SPEECH)
        generated = len(list(itertools.islice(rw.generate(), 10000)))
        self.assertEqual(generated, 10000)

    def test_characters(self):
        # Character tokenization yields str items.
        rw = final.RandomWriter(2, final.Tokenization.character)
        rw.train_iterable(self._SPEECH)
        self.assertIsInstance(next(iter(rw.generate())), str)
        self.assertContainsSequence(rw.generate(), "worm")
        self.assertNotContainsSequence(rw.generate(), "mals ")

    def test_train_iterator(self):
        # Training must also work from a one-shot iterator, not only from a
        # re-iterable container.
        rw = final.RandomWriter(1)
        rw.train_iterable(iter((1, 2, 3, 4, 5, 5, 5, 4, 3, 2, 1, 2, 4, 5)))
        self.assertIsInstance(next(iter(rw.generate())), int)
        self.assertContainsSequence(rw.generate(), [3, 4, 5, 5, 4, 3, 2], times=10)
        self.assertContainsSequence(rw.generate(), [3, 4, 5, 5, 5, 5, 4, 3, 2])
        self.assertContainsSequence(rw.generate(), [5, 5, 5, 5, 5])
        self.assertContainsSequence(rw.generate(), [3, 2, 1, 2, 4, 5, 5, 4])
        self.assertContainsSequence(rw.generate(), [3, 2, 1, 2, 3, 4, 5, 5, 4])

    def test_characters_level3(self):
        # Same text as test_characters but with level 3: "worm" must no
        # longer be produced.
        rw = final.RandomWriter(3, final.Tokenization.character)
        rw.train_iterable(self._SPEECH)
        self.assertIsInstance(next(iter(rw.generate())), str)
        self.assertNotContainsSequence(rw.generate(), "worm")
        self.assertNotContainsSequence(rw.generate(), "mals ")
        self.assertContainsSequence(rw.generate(), "n how n")

    def test_bytes_nonutf8(self):
        # Byte tokenization must cope with bytes that are not valid UTF-8.
        rw = final.RandomWriter(2, final.Tokenization.byte)
        rw.train_iterable(
            b"What a piece of work is man! how noble in reason! "
            b"how infinite in faculty! in form and moving how express and admirable! "
            b"in action how like an angel! in apprehension how like a god!\xff\xfe "
            b"the beauty of the world, the paragon of animals!")
        self.assertIsInstance(next(iter(rw.generate())), (int, bytes))
        self.assertNotContainsSequence(rw.generate(), b"mals ")
        self.assertContainsSequence(rw.generate(), b"worm")
        self.assertContainsSequence(rw.generate(), b"!\xff\xfe")

    def test_bytes_nonutf8_file(self):
        # Requires internet access: trains on a remote non-UTF-8 file.
        rw = final.RandomWriter(1, final.Tokenization.byte)
        rw.train_url("http://www.singingwizard.org/stuff/nonutf8.txt")
        self.assertIsInstance(next(iter(rw.generate())), (int, bytes))
        self.assertContainsSequence(rw.generate(), b"\xfe\xff\xfe")
        self.assertNotContainsSequence(rw.generate(), b"\x02\xfe")

    def test_train_twice(self):
        # Training in two calls must satisfy the same expectations as
        # test_characters_level3, which trains on the whole text at once.
        rw = final.RandomWriter(3, final.Tokenization.character)
        rw.train_iterable(self._SPEECH_PART1)
        rw.train_iterable(self._SPEECH_PART2)
        self.assertIsInstance(next(iter(rw.generate())), str)
        self.assertNotContainsSequence(rw.generate(), "worm")
        self.assertNotContainsSequence(rw.generate(), "mals ")
        self.assertContainsSequence(rw.generate(), "n how n")

    def test_words2(self):
        # Word tokenization at level 2 over the longer text.
        rw = final.RandomWriter(2, final.Tokenization.word)
        rw.train_iterable(self._SPEECH)
        self.assertIsInstance(next(iter(rw.generate())), str)
        self.assertNotContainsSequence(rw.generate(), "man angel".split(" "), length=50000)
        self.assertNotContainsSequence(rw.generate(), "infinite in reason".split(" "), length=50000)
        self.assertNotContainsSequence(rw.generate(), ("worm",))
        self.assertContainsSequence(rw.generate(), "action how like a god!".split(" "), length=50000)
        self.assertContainsSequence(rw.generate(), "infinite in faculty!".split(" "), length=50000)

    def test_multiple_generators(self):
        # Two generators from one writer are advanced in lockstep; each
        # stream on its own must still satisfy the expectations.
        rw = final.RandomWriter(2, final.Tokenization.character)
        rw.train_iterable(self._SPEECH)
        self.assertIsInstance(next(iter(rw.generate())), str)
        g1 = rw.generate()
        g2 = rw.generate()
        # Interleave the reads, then transpose back into one tuple per generator.
        ss = zip(*[(next(g1), next(g2)) for _ in range(self.DEFAULT_LENGTH)])
        for s in ss:
            self.assertContainsSequence(s, "worm")
            self.assertNotContainsSequence(s, "mals ")

    def test_train_url_characters(self):
        # Requires internet access.
        rw = final.RandomWriter(3, final.Tokenization.character)
        rw.train_url("http://www.singingwizard.org/stuff/pg24132.txt")
        self.assertContainsSequence(rw.generate(), "ad di", length=200000)

    def test_train_url_bytes(self):
        # Requires internet access.
        rw = final.RandomWriter(4, final.Tokenization.byte)
        rw.train_url("http://www.singingwizard.org/stuff/pg24132.txt")
        self.assertContainsSequence(rw.generate(), b"ad di", length=300000)

    def test_train_url_word(self):
        # Requires internet access.
        rw = final.RandomWriter(1, final.Tokenization.word)
        rw.train_url("http://www.singingwizard.org/stuff/pg24132.txt")
        self.assertContainsSequence(rw.generate(), "she had".split(), length=100000)

    def test_train_url_utf8(self):
        # Requires internet access: the expected run contains a non-ASCII
        # character.
        rw = final.RandomWriter(5, final.Tokenization.character)
        rw.train_url("http://www.singingwizard.org/stuff/utf8test.txt")
        self.assertContainsSequence(rw.generate(), "ajtób", length=100000)

    def test_graph_module(self):
        # Only checks that a module named graph can be imported.
        import graph
        print("Remember to make sure your graph module is general enough to be used in other applications.")
# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()