-
Notifications
You must be signed in to change notification settings - Fork 0
/
inverted_index.py
296 lines (243 loc) · 11.5 KB
/
inverted_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
"""
Copyright 2017, University of Freiburg
Chair of Algorithms and Data Structures.
Hannah Bast <bast@cs.uni-freiburg.de>
Claudius Korzen <korzen@cs.uni-freiburg.de>
Theresa Klumpp <klumppt@cs.uni-freiburg.de>
"""
import math
import re
import argparse
import pickle
DEFAULT_B = 0.75
DEFAULT_K = 1.75
class InvertedIndex:
"""
A simple inverted index that uses BM25 scores.
"""
def __init__(self):
"""
Creates an empty inverted index.
"""
self.inverted_lists = {} # The inverted lists.
self.docs = [] # The docs, each in form (title, description).
self.doc_lengths = [] # The document lengths (= number of words).
def read_from_file(self, file_name, b=None, k=None, verbose=True):
"""
Construct the inverted index from the given file. The expected format
of the file is one document per line, in the format
<title>TAB<description>. Each entry in the inverted list associated to
a word should contain a document id and a BM25 score. Compute the BM25
scores as follows:
(1) In a first pass, compute the inverted lists with tf scores (that
is the number of occurrences of the word within the <title> and the
<description> of a document). Further, compute the document length
(DL) for each document (that is the number of words in the <title>
and the <description> of a document). Afterwards, compute the
average document length (AVDL).
(2) In a second pass, iterate each inverted list and replace the tf
scores by BM25 scores, defined as:
BM25 = tf * (k+1) / (k * (1 - b + b * DL/AVDL) + tf) * log2(N/df),
where N is the total number of documents and df is the number of
documents that contains the word.
On reading the file, use UTF-8 as the standard encoding. To split the
texts into words, use the method introduced in the lecture. Make sure
that you ignore empty words.
>>> ii = InvertedIndex()
>>> ii.read_from_file("example.tsv",
... b=0,
... k=float("inf"),
... verbose=False)
>>> inv_lists = sorted(ii.inverted_lists.items())
>>> [(w, [(i, '%.3f' % tf) for i, tf in l]) for w, l in inv_lists]
... # doctest: +NORMALIZE_WHITESPACE
[('animated', [(1, '0.415'), (2, '0.415'), (4, '0.415')]),
('animation', [(3, '2.000')]),
('film', [(2, '1.000'), (4, '1.000')]),
('movie', [(1, '0.000'), (2, '0.000'), (3, '0.000'), (4, '0.000')]),
('non', [(2, '2.000')]),
('short', [(3, '1.000'), (4, '2.000')])]
>>> ii = InvertedIndex()
>>> ii.read_from_file("example.tsv", b=0.75, k=1.75, verbose=False)
>>> inv_lists = sorted(ii.inverted_lists.items())
>>> [(w, [(i, '%.3f' % tf) for i, tf in l]) for w, l in inv_lists]
... # doctest: +NORMALIZE_WHITESPACE
[('animated', [(1, '0.459'), (2, '0.402'), (4, '0.358')]),
('animation', [(3, '2.211')]),
('film', [(2, '0.969'), (4, '0.863')]),
('movie', [(1, '0.000'), (2, '0.000'), (3, '0.000'), (4, '0.000')]),
('non', [(2, '1.938')]),
('short', [(3, '1.106'), (4, '1.313')])]
"""
b = DEFAULT_B if b is None else b
k = DEFAULT_K if k is None else k
# First pass: Compute (1) the inverted lists with tf scores and (2) the
# document lengths.
with open(file_name, "r", encoding="utf-8") as f:
doc_id = 0
for line in f:
line = line.strip()
dl = 0 # Compute the document length (number of words).
doc_id += 1
for word in re.split("[^A-Za-z]+", line):
word = word.lower().strip()
# Ignore the word if it is empty.
if len(word) == 0:
continue
dl += 1
if word not in self.inverted_lists:
# The word is seen for first time, create new list.
self.inverted_lists[word] = [(doc_id, 1)]
continue
# Get last posting to check if the doc was already seen.
last = self.inverted_lists[word][-1]
if last[0] == doc_id:
# The doc was already seen, increment tf by 1.
self.inverted_lists[word][-1] = (doc_id, last[1] + 1)
else:
# The doc was not already seen, set tf to 1.
self.inverted_lists[word].append((doc_id, 1))
# Store the doc as a tuple (title, description).
self.docs.append(tuple(line.split("\t")))
# Register the document length.
self.doc_lengths.append(dl)
if verbose:
if doc_id % 1000 == 0:
print(f"Progress: Read {doc_id:6} documents.",
end="\r")
# Compute N (the total number of documents).
n = len(self.docs)
if verbose:
print(f"Progress: Read {n:6} documents.")
# Compute AVDL (the average document length).
avdl = sum(self.doc_lengths) / n
# Second pass: Iterate the inverted lists and replace the tf scores by
# BM25 scores, defined as follows:
# BM25 = tf * (k + 1) / (k * (1 - b + b * DL / AVDL) + tf) * log2(N/df)
for word, inverted_list in self.inverted_lists.items():
for i, (doc_id, tf) in enumerate(inverted_list):
# Obtain the document length (dl) of the document.
dl = self.doc_lengths[doc_id - 1] # doc_id is 1-based.
# Compute alpha = (1 - b + b * DL / AVDL).
alpha = 1 - b + (b * dl / avdl)
# Compute tf2 = tf * (k + 1) / (k * alpha + tf).
tf2 = tf * (1 + (1 / k)) / (alpha + (tf / k)) if k > 0 else 1
# Compute df (that is the length of the inverted list).
df = len(self.inverted_lists[word])
# Compute the BM25 score = tf' * log2(N/df).
inverted_list[i] = (doc_id, tf2 * math.log(n / df, 2))
def merge(self, list1, list2):
"""
Compute the union of the two given inverted lists in linear time
(linear in the total number of entries in the two lists), where the
entries in the inverted lists are postings of form (doc_id, bm25_score)
and are expected to be sorted by doc_id, in ascending order.
>>> ii = InvertedIndex()
>>> l1 = ii.merge([(1, 2.1), (5, 3.2)], [(1, 1.7), (2, 1.3), (6, 3.3)])
>>> [(id, "%.1f" % tf) for id, tf in l1]
[(1, '3.8'), (2, '1.3'), (5, '3.2'), (6, '3.3')]
>>> l2 = ii.merge([(3, 1.7), (5, 3.2), (7, 4.1)], [(1, 2.3), (5, 1.3)])
>>> [(id, "%.1f" % tf) for id, tf in l2]
[(1, '2.3'), (3, '1.7'), (5, '4.5'), (7, '4.1')]
"""
i = 0 # The pointer in the first list.
j = 0 # The pointer in the second list.
result = []
# Iterate the lists in an interleaving order and aggregate the scores.
while i < len(list1) and j < len(list2):
if i < list1[i][0] == list2[j][0]:
result.append((list1[i][0], list1[i][1] + list2[j][1]))
i += 1
j += 1
elif list1[i][0] < list2[j][0]:
result.append(list1[i])
i += 1
else:
result.append(list2[j])
j += 1
# Append the rest of the first list.
while i < len(list1):
result.append(list1[i])
i += 1
# Append the rest of the second list.
while j < len(list2):
result.append(list2[j])
j += 1
return result
def process_query(self, keywords):
"""
Process the given keyword query as follows: Fetch the inverted list for
each of the keywords in the query and compute the union of all lists.
Sort the resulting list by BM25 scores in descending order.
>>> ii = InvertedIndex()
>>> ii.inverted_lists = {
... "foo": [(1, 0.2), (3, 0.6)],
... "bar": [(1, 0.4), (2, 0.7), (3, 0.5)],
... "baz": [(2, 0.1)]}
>>> result = ii.process_query(["foo", "bar"])
>>> [(id, "%.1f" % tf) for id, tf in result]
[(3, '1.1'), (2, '0.7'), (1, '0.6')]
"""
if not keywords:
return []
# Fetch the inverted lists for each of the given keywords.
lists = []
for keyword in keywords:
if keyword in self.inverted_lists:
lists.append(self.inverted_lists[keyword])
# Compute the union of all inverted lists.
if len(lists) == 0:
return []
union = lists[0]
for i in range(1, len(lists)):
union = self.merge(union, lists[i])
# Filter all postings with BM25 = 0.
union = [x for x in union if x[1] != 0]
# Sort the postings by BM25 scores, in descending order.
return sorted(union, key=lambda x: x[1], reverse=True)
def render_output(self, postings, keywords, k=3):
"""
Render the output for the top-k of the given postings. Fetch the
the titles and descriptions of the related docs and highlight the
occurences of the given keywords in the output, using ANSI escape
codes.
"""
# Compile a pattern to identify the given keywords in a string.
p = re.compile('\\b(' + '|'.join(keywords) + ')\\b', re.IGNORECASE)
# Output at most k matching docs.
for i in range(min(len(postings), k)):
doc_id, tf = postings[i]
title, desc = self.docs[doc_id - 1] # doc_id is 1-based.
# Highlight the keywords in the title in bold and red.
title = re.sub(p, "\033[0m\033[1;31m\\1\033[0m\033[1m", title)
# Print the rest of the title in bold.
title = "\033[1m%s\033[0m" % title
# Highlight the keywords in the description in red.
desc = re.sub(p, "\033[31m\\1\033[0m", desc)
print("\n%s\n%s" % (title, desc))
print("\n# total hits: %s." % len(postings))
def main(file_name, b, k):
# Create a new inverted index from the given file.
print("Creating index with BM25 scores from file '%s'." % file_name)
ii = InvertedIndex()
ii.read_from_file(file_name, b=b, k=k)
new_name = (file_name.replace("input", "output")
.replace(".tsv", "_")) + "precomputed_ii.pkl"
print("Saving index as '%s'." % new_name)
pickle.dump(ii, open(new_name, "wb"))
if __name__ == "__main__":
# Parse the command line arguments.
parser = argparse.ArgumentParser(description="""Construct the inverted
index with BM25 scores from the given file. Save the inverted index
using pickle.""")
# Positional arguments
parser.add_argument("doc_file", type=str, help="""File to read from. The
expected format of the file is one document per line, in the format
<title>TAB<description>.""")
# Optional arguments
parser.add_argument("-b", "--b", type=float, default=0.75, help="""b value
for the BM25 scores (default: %(default)s)""")
parser.add_argument("-k", "--k", type=float, default=1.75, help="""k value
for the BM25 scores (default: %(default)s)""")
args = parser.parse_args()
main(args.doc_file, args.b, args.k)