-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest_a4.py
147 lines (112 loc) · 5.86 KB
/
test_a4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""test_a4.py
Tests for the first two parts of assignment 4.
"""
import sys
import unittest
import warnings
from main_a4 import Text
import brown
def ignore_warnings(test_func):
    """Decorate *test_func* so that it runs with all warnings suppressed.

    The returned wrapper calls ``test_func(self, *args, **kwargs)`` inside a
    ``warnings.catch_warnings()`` context with the "ignore" filter installed,
    so the previous warning filters are restored when the call finishes.

    Args:
        test_func: The callable to wrap; it is called with the wrapper's
            first positional argument followed by any remaining arguments.

    Returns:
        A wrapper with the same name and docstring as *test_func* that
        returns whatever *test_func* returns.
    """
    # Function-scope import so the decorator does not rely on a new
    # file-level import.
    import functools

    @functools.wraps(test_func)
    def do_test(self, *args, **kwargs):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # Propagate the result instead of silently dropping it.
            return test_func(self, *args, **kwargs)

    return do_test
class ExploreTaggedCorpusTests(unittest.TestCase):
    """Tests for the corpus-exploration functions of the ``brown`` module.

    Note the use of setUpClass, which builds the corpus and calls each
    exploration function once, storing the results as class attributes that
    the individual tests inspect. Each method that starts with 'test' will
    run when you do unittest.main().
    """

    @classmethod
    @ignore_warnings
    def setUpClass(cls):
        """Build the corpus and cache every result the tests look at."""
        # this takes a while...
        cls.bc = brown.BrownCorpus()
        cls.nouns_more_common_in_plural = brown.nouns_more_common_in_plural_form(cls.bc)
        cls.most_tags = brown.which_word_has_greatest_number_of_distinct_tags(cls.bc)
        cls.frequent_tags = brown.tags_in_order_of_decreasing_frequency(cls.bc)
        cls.tags_after_nouns = brown.tags_that_nouns_are_most_commonly_found_after(cls.bc)
        cls.ambiguous_types = brown.proportion_ambiguous_word_types(cls.bc)
        cls.ambiguous_tokens = brown.proportion_ambiguous_word_tokens(cls.bc)

    def test_nouns_more_common_in_plural(self):
        """There are about 3400 of them."""
        self.assertTrue(3000 < len(self.nouns_more_common_in_plural) < 3800)

    def test_most_tags(self):
        """The word with the most tags is 'that'."""
        self.assertEqual(self.most_tags[0][0], 'that')

    def test_frequent_tags1(self):
        """The most frequent Brown tag is NN and it occurs 152470 times."""
        self.assertEqual(self.frequent_tags[0][0], 'NN')
        # Chained comparison so the upper bound is really enforced; a second
        # positional argument to assertTrue is only the failure message.
        self.assertTrue(150000 < self.frequent_tags[0][1] < 155000)

    def test_frequent_tags2(self):
        """Get the 20 most frequent tags and make sure the overlap is greater than 18."""
        most_frequent_tags = {'TO', 'NNS', 'RB', ',', 'NP', 'CD', 'VBD', 'CS', 'VBG', 'JJ',
                              'VBN', 'NN', 'PPSS', 'VB', 'IN', 'PP$', 'CC', 'AT', 'PPS', '.'}
        most_frequent_tags_found = {t[0] for t in self.frequent_tags[:20]}
        self.assertGreater(len(most_frequent_tags & most_frequent_tags_found), 18)

    def test_noun_tags1(self):
        """Overlap of found set and example set is greater than 8."""
        tags = [('AT', 59656), ('JJ', 40864), ('IN', 24012), ('NN', 17789), ('PP$', 12241),
                ('CC', 6610), ('CD', 5264), ('AP', 5112), ('DT', 4540), ('VBG', 4407)]
        expected = {t[0] for t in tags}
        found = {t[0] for t in self.tags_after_nouns}
        self.assertGreater(len(expected & found), 8)

    def test_noun_tags2(self):
        """Most frequent tag before a noun occurs more than 50k times."""
        self.assertGreater(self.tags_after_nouns[0][1], 50000)

    def test_ambiguous_types(self):
        """The proportion of ambiguous word types is between 0.15 and 0.25."""
        self.assertTrue(0.15 < self.ambiguous_types < 0.25)

    def test_ambiguous_tokens(self):
        """The proportion of ambiguous word tokens is between 0.78 and 0.88."""
        self.assertTrue(0.78 < self.ambiguous_tokens < 0.88)
class ExploreTextTests(unittest.TestCase):
    """Tests for the exploration methods of ``Text`` on 'data/grail.txt'.

    setUpClass loads the text once and stores the result of each exploration
    method as a class attribute; the individual tests only inspect those
    cached values.
    """

    @classmethod
    @ignore_warnings
    def setUpClass(cls):
        """Load the text and cache every result the tests look at."""
        cls.grail = Text('data/grail.txt')
        cls.nouns_more_in_plural = cls.grail.nouns_more_common_in_plural_form()
        cls.distinct_tags = cls.grail.which_word_has_greatest_number_of_distinct_tags()
        cls.tags_in_decreasing_order = cls.grail.tags_in_order_of_decreasing_frequency()
        cls.tags_after_nouns = cls.grail.tags_that_nouns_are_most_commonly_found_after()
        cls.ambiguous_types = cls.grail.proportion_ambiguous_word_types()
        cls.ambiguous_tokens = cls.grail.proportion_ambiguous_word_tokens()

    def test_if_empty(self):
        """Testing if the text is properly read."""
        self.assertGreater(len(self.grail.text), 0)

    def test_plural_form1(self):
        """The words are recorded."""
        self.assertGreater(len(self.nouns_more_in_plural), 0)

    def test_plural_form2(self):
        """'Knights' is in the list."""
        self.assertIn('Knights', self.nouns_more_in_plural)

    def test_plural_form3(self):
        """Testing if all words end with 's'."""
        self.assertTrue(all(i.endswith('s') for i in self.nouns_more_in_plural))

    def test_distinct_tag1(self):
        """'arthur' is in the list."""
        self.assertTrue(any(i[0] == 'arthur' for i in self.distinct_tags))

    def test_distinct_tag2(self):
        """The result has 3 or 4 entries (about 4 words share the same number of distinct tags)."""
        # Chained comparison: with `&`, which binds tighter than `>` and `<`,
        # the expression would compare against `2 & len(...)` and accept
        # almost any length.
        self.assertTrue(2 < len(self.distinct_tags) < 5)

    def test_tags_frequency(self):
        """'NN' is the most frequent tag."""
        self.assertEqual(self.tags_in_decreasing_order[0][0], 'NN')

    def test_tags_frequency_contents(self):
        """Testing if the list contains several expected tags."""
        tags = ['NN', 'VB', 'DT', 'JJ', '.']
        matching = [i[0] for i in self.tags_in_decreasing_order if i[0] in tags]
        self.assertGreater(len(matching), 4)

    def test_noun_tags1(self):
        """Overlap of found set and example set is greater than 6."""
        # Here the target was lowered from 8 since potentially there was a
        # different tag set used.
        tags = [('.', 1198), ('NNP', 588), (':', 576), ('DT', 527), ('JJ', 443),
                ('NN', 426), ('IN', 167), ('PRP$', 125), (',', 115), ('CC', 49)]
        expected = {t[0] for t in tags}
        found = {t[0] for t in self.tags_after_nouns}
        self.assertGreater(len(expected & found), 6)

    def test_noun_tags2(self):
        """Most frequent tag before a noun occurs more than 1000 times."""
        self.assertGreater(self.tags_after_nouns[0][1], 1000)

    def test_ambiguous_types(self):
        """The proportion of ambiguous word types is between 0.14 and 0.25."""
        self.assertTrue(0.14 < self.ambiguous_types < 0.25)

    def test_ambiguous_tokens(self):
        """The proportion of ambiguous word tokens is between 0.35 and 0.50."""
        self.assertTrue(0.35 < self.ambiguous_tokens < 0.50)
# Script entry point: hand control to unittest's runner, but only when this
# file is executed directly and not when it is imported.
if __name__ == "__main__":
    unittest.main()