-
Notifications
You must be signed in to change notification settings - Fork 64
/
Copy pathinverse_index_lab.py
86 lines (71 loc) · 3.1 KB
/
inverse_index_lab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from random import randint # version code d345910f07ae
# NOTE(review): not referenced by any code visible in this file; presumably a
# flag read by the course's submission tooling -- confirm before changing it.
coursera = 1
# Please fill out this stencil and submit using the provided submission script.
## 1: (Task 1) Movie Review
## Task 1
def movie_review(name):
    """
    Input: the name of a movie
    Output: a string (one of the review options), selected at random using randint

    The name is accepted as part of the interface but does not influence
    which review is returned.
    """
    movie_reviews = ['See it!', 'A gem!', 'Ideological claptrap!']
    # randint is inclusive at both ends, so the upper bound must be the last
    # valid index rather than the length of the list.
    return movie_reviews[randint(0, len(movie_reviews) - 1)]
## 2: (Task 2) Make Inverse Index
def makeInverseIndex(strlist):
    """
    Input: a list of documents as strings
    Output: a dictionary that maps each word in any document to the set consisting of the
    document ids (ie, the index in the strlist) for all documents containing the word.

    Distinguish between an occurrence of a string (e.g. "use") in the document as a word
    (delimited by whitespace), and an occurrence of the string as a substring of a word
    (e.g. "because"). Only the former is represented in the inverse index.

    Empty or whitespace-only documents contribute no entries; an empty list
    yields an empty dictionary.

    Example:
    >>> makeInverseIndex(['hello world','hello','hello cat','hellolot of cats']) == {'hello': {0, 1, 2}, 'cat': {2}, 'of': {3}, 'world': {0}, 'cats': {3}, 'hellolot': {3}}
    True
    """
    inverse_index = {}
    # Single pass: each document is split exactly once and every word it
    # contains is recorded against that document's position in strlist.
    for doc_id, document in enumerate(strlist):
        # split() with no argument breaks on runs of whitespace, so only whole
        # words are indexed and stray/repeated spaces never produce '' entries.
        for word in document.split():
            inverse_index.setdefault(word, set()).add(doc_id)
    return inverse_index
## 3: (Task 3) Or Search
def orSearch(inverseIndex, query):
    """
    Input: an inverse index, as created by makeInverseIndex, and a list of words to query
    Output: the set of document ids that contain _any_ of the specified words

    Words that do not appear in the index match no document and are ignored.
    An empty query returns an empty set. The returned set is a new object;
    the sets stored in inverseIndex are never modified.

    >>> idx = makeInverseIndex(['Johann Sebastian Bach', 'Johannes Brahms', 'Johann Strauss the Younger', 'Johann Strauss the Elder', ' Johann Christian Bach', 'Carl Philipp Emanuel Bach'])
    >>> orSearch(idx, ['Bach','the'])
    {0, 2, 3, 4, 5}
    >>> orSearch(idx, ['Johann', 'Carl'])
    {0, 2, 3, 4, 5}
    """
    matches = set()
    for word in query:
        # dict.get is a single hash lookup; the empty-tuple default makes an
        # unknown word contribute nothing to the union.
        matches.update(inverseIndex.get(word, ()))
    return matches
## 4: (Task 4) And Search
def andSearch(inverseIndex, query):
    """
    Input: an inverse index, as created by makeInverseIndex, and a list of words to query
    Output: the set of all document ids that contain _all_ of the specified words

    If any queried word is absent from the index, no document can contain all
    of the words, so the result is the empty set. An empty query also returns
    the empty set. The returned set is a new object; the sets stored in
    inverseIndex are never modified.

    >>> idx = makeInverseIndex(['Johann Sebastian Bach', 'Johannes Brahms', 'Johann Strauss the Younger', 'Johann Strauss the Elder', ' Johann Christian Bach', 'Carl Philipp Emanuel Bach'])
    >>> andSearch(idx, ['Johann', 'the'])
    {2, 3}
    >>> andSearch(idx, ['Johann', 'Bach'])
    {0, 4}
    """
    common = None
    for word in query:
        doc_ids = inverseIndex.get(word)
        if not doc_ids:
            # A word found in no document rules out every document.
            return set()
        if common is None:
            # Seed from the first word's documents (copied so the index is not
            # aliased). The number of index entries counts distinct words, not
            # documents, so it cannot be used to enumerate candidate doc ids.
            common = set(doc_ids)
        else:
            common = common & doc_ids
    # common is still None only when the query held no words at all.
    return common if common is not None else set()