-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathinvdx.py
75 lines (62 loc) · 2.09 KB
/
invdx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# -*- coding: utf-8 -*-
# Inverted index and document length table used to index a tokenized corpus.
__author__ = 'jrlimingyang@jd.com'
class InvertedIndex:
    """Map each word to a ``{docid: occurrence count}`` dictionary.

    ``word in idx`` and ``idx[word]`` delegate to the underlying
    ``self.index`` dictionary.
    """

    def __init__(self):
        # word -> {docid -> number of times word was added for that docid}
        self.index = dict()

    def __contains__(self, item):
        """Return True if *item* has been added to the index."""
        return item in self.index

    def __getitem__(self, item):
        """Return the ``{docid: count}`` dictionary stored for *item*.

        Raises KeyError if *item* is not in the index.
        """
        return self.index[item]

    def add(self, word, docid):
        """Record one more occurrence of *word* in document *docid*."""
        postings = self.index.setdefault(word, dict())
        postings[docid] = postings.get(docid, 0) + 1

    def get_document_frequency(self, word, docid):
        """Return how many times *word* was added for document *docid*.

        Raises LookupError if *word* is not in the index, or if it is in
        the index but was never added for *docid*.
        """
        if word not in self.index:
            raise LookupError('%s not in index' % str(word))
        postings = self.index[word]
        if docid not in postings:
            raise LookupError('%s not in document %s' % (str(word), str(docid)))
        return postings[docid]

    def get_index_frequency(self, word):
        """Return the number of distinct documents that contain *word*.

        Raises LookupError if *word* is not in the index.
        """
        if word not in self.index:
            # str() keeps a tuple-valued word from being unpacked as
            # multiple format arguments (which would raise TypeError).
            raise LookupError('%s not in index' % str(word))
        return len(self.index[word])
class DocumentLengthTable:
    """Store a length value per document id and report the average."""

    def __init__(self):
        # docid -> length as supplied to add()
        self.table = dict()

    def __len__(self):
        """Return the number of documents in the table."""
        return len(self.table)

    def add(self, docid, length):
        """Set the length for *docid*, replacing any previous value."""
        self.table[docid] = length

    def get_length(self, docid):
        """Return the stored length for *docid*.

        Raises LookupError if *docid* has not been added.
        """
        if docid not in self.table:
            raise LookupError('%s not found in table' % str(docid))
        return self.table[docid]

    def get_average_length(self):
        """Return the mean of all stored lengths as a float.

        Raises ZeroDivisionError if the table is empty.
        """
        # values() works on Python 2 and 3; itervalues() is Python 2 only.
        total = sum(self.table.values())
        return float(total) / float(len(self.table))
def build_data_structures(corpus):
    """Build an inverted index and a document length table from *corpus*.

    *corpus* is indexed as a mapping: iterating it yields document ids and
    ``corpus[docid]`` yields that document's sequence of words.

    Returns a ``(InvertedIndex, DocumentLengthTable)`` tuple. Both
    structures are keyed by ``str(docid)`` so that ids read back from the
    index can be used directly to look up document lengths.
    """
    idx = InvertedIndex()
    dlt = DocumentLengthTable()
    for docid in corpus:
        # Look the document up with its real key; converting the key to
        # str first raises KeyError for any non-string docid.
        words = corpus[docid]
        key = str(docid)

        # build inverted index
        for word in words:
            idx.add(str(word), key)

        # build document length table
        dlt.add(key, len(words))
    return idx, dlt