-
Notifications
You must be signed in to change notification settings - Fork 69
/
Copy pathsearchengine.py
359 lines (293 loc) · 12 KB
/
searchengine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
import os
import re
import urllib2
import urlparse
from pysqlite2 import dbapi2 as sqlite
from BeautifulSoup import BeautifulSoup
import nn
net = nn.searchnet('nn.db') # XXX: somehow train this from user clicks
ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])
# XXX: the root page (amnoid.de) is indexed twice for some reason (e.g.
# select * from links where toid = 2;
# shows the link 1->2 two times.
class crawler:
def __init__(self, dbname):
self.con = sqlite.connect(dbname)
def __del__(self):
self.con.close()
def dbcommit(self):
self.con.commit()
def getentryid(self, table, field, value, createnew=True):
"""Returns an entry id and creates it if it is not present."""
cur = self.con.execute('select rowid from %s where %s="%s"'
% (table, field, value))
result = cur.fetchone()
if not result:
cur = self.con.execute('insert into %s (%s) values("%s")'
% (table, field, value))
return cur.lastrowid
else:
return result[0]
def addtoindex(self, url, soup):
"""Indexes a given page."""
if self.isindexed(url): return
print 'Indexing', url
# Extract words
text = self.gettextonly(soup)
words = self.separatewords(text)
# Get url id from db
urlid = self.getentryid('urllist', 'url', url)
# Link each word to that url
for i in range(len(words)):
word = words[i]
if word in ignorewords: continue
wordid = self.getentryid('wordlist', 'word', word)
self.con.execute('insert into wordlocation(urlid, wordid, location) \
values (%d, %d, %d)' % (urlid, wordid, i))
def gettextonly(self, soup):
"""Extracts all text from a html page, i.e. strips the tags."""
v = soup.string
if v == None:
return '\n'.join([self.gettextonly(t) for t in soup.contents])
else:
return v.strip()
def separatewords(self, text):
"""Splits words by non-whitespace characters."""
splitter = re.compile(r'\W*')
return [s.lower() for s in splitter.split(text) if s != '']
def isindexed(self, url):
cur = self.con.execute('select rowid from urllist where url="%s"' % url)
u = cur.fetchone()
if not u: return False
# check if it has been crawled (XXX: why?)
v = self.con.execute('select * from wordlocation where urlid="%d"'
% u[0]).fetchone()
return v != None
def addlinkref(self, urlfrom, urlto, linktext):
"""Add a link between two pages."""
fromid = self.getentryid('urllist', 'url', urlfrom)
toid = self.getentryid('urllist', 'url', urlto)
if fromid == toid: return
cur = self.con.execute('insert into link (fromid, toid) values (%d, %d)'
% (fromid, toid))
linkid = cur.lastrowid
# Remember each word in link text
for word in self.separatewords(linktext):
if word in ignorewords: continue
wordid = self.getentryid('wordlist', 'word', word)
self.con.execute('insert into linkwords (wordid, linkid) \
values (%d, %d)' % (wordid, linkid))
def crawl(self, pages, depth=2):
"""Find pages linked from a root set in BFS order, up to a given depth."""
for i in range(depth):
newpages = set()
for page in pages:
try:
print page
c = urllib2.urlopen(page)
except urllib2.URLError:
print 'Could not load', page
continue
if c.headers.type not in set(['text/html', 'text/plain']):
print 'Skipping', page, c.headers.type
continue
soup = BeautifulSoup(c.read())
self.addtoindex(page, soup)
links = soup.findAll('a')
for link in links:
if 'href' in dict(link.attrs):
url = urlparse.urljoin(page, link['href'])
if url.find("'") != -1:
print 'IGNORING', url
continue
url = url.split('#')[0]
if url[0:4] == 'http' and not self.isindexed(url):
newpages.add(url)
linktext = self.gettextonly(link)
self.addlinkref(page, url, linktext)
self.dbcommit()
pages = newpages
def createindextables(self):
"""Create the database tables."""
self.con.execute('create table urllist(url)')
self.con.execute('create table wordlist(word)')
self.con.execute('create table wordlocation(urlid, wordid, location)')
self.con.execute('create table link(fromid integer, toid integer)')
self.con.execute('create table linkwords(wordid, linkid)')
self.con.execute('create index urlidx on urllist(url)')
self.con.execute('create index wordidx on wordlist(word)')
self.con.execute('create index wordurlidx on wordlocation(wordid)')
self.con.execute('create index urlfromidx on link(fromid)')
self.con.execute('create index urltoidx on link(toid)')
self.dbcommit()
# XXX: If a page with a highish pagerank links 16 other pages, and one
# of those links to just one page C, then C has a higher pagerank than
# the original high-pr page. Weird.
def calculatepagerank(self, iterations=20):
self.con.execute('drop table if exists pagerank')
self.con.execute('create table pagerank(urlid primary key, score)')
# initialize all pageranks with 1.0 (actual value does not matter)
self.con.execute('insert into pagerank select rowid, 1.0 from urllist')
self.dbcommit()
for i in range(iterations):
print 'Iteration', i
for urlid, in self.con.execute('select rowid from urllist'):
pr = 0.15
# Loop through all pages that link to this one
for linker, in self.con.execute(
'select distinct fromid from link where toid = %d' % urlid):
# Get pagerank of linker
linkingrank = self.con.execute('select score from pagerank '
+ 'where urlid = %d' % linker).fetchone()[0]
# Get total number of links on page 'linker'
linkingcount = self.con.execute('select count(*) '
+ 'from link where fromid = %d' % linker).fetchone()[0]
pr += 0.85 * (linkingrank / linkingcount)
# Store pagerank
self.con.execute(
'update pagerank set score = %f where urlid = %d' % (pr, urlid))
self.dbcommit()
class searcher:
def __init__(self, dbname):
self.con = sqlite.connect(dbname)
def __del__(self):
self.con.close()
def getmatchquery(self, q):
# Example query:
# SELECT w0.urlid, w0.location, w1.location, w2.location
# FROM wordlocation w0, wordlocation w1, wordlocation w2
# WHERE w0.wordid = 255
# and w0.urlid = w1.urlid and w1.wordid = 1192
# and w1.urlid = w2.urlid and w2.wordid = 73
# XXX: This returns O((n/k)^k) many results for a query with k known words
# (n is number of words on a page) for each page.
# XXX: Break this into pieces, test them
fieldlist = 'w0.urlid'
tablelist = ''
clauselist = ''
wordids = []
words = q.split(' ')
tablecount = 0
for word in words:
wordrow = self.con.execute('select rowid from wordlist where word = "%s"'
% word).fetchone()
if not wordrow: continue
wordid = wordrow[0]
wordids.append(wordid)
if tablecount > 0:
tablelist += ', '
clauselist += ' and '
clauselist += 'w%d.urlid = w%d.urlid and ' % (
tablecount - 1, tablecount)
fieldlist += ', w%d.location' % tablecount
tablelist += 'wordlocation w%d' % tablecount
clauselist += 'w%d.wordid = %d' % (tablecount, wordid)
tablecount += 1
if tablecount == 0:
return '', wordids
fullquery = 'select %s from %s where %s' % (
fieldlist, tablelist, clauselist)
return fullquery, wordids
def getmatchrows(self, q):
"""Returns list of tuples. 1st element of each tuple is urlid, the others
are the positions of each query word in that document."""
sql, wordids = self.getmatchquery(q)
print sql
cur = self.con.execute(sql)
rows = list(cur)
return rows, wordids
def getscoredlist(self, rows, wordids):
totalscores = dict([(row[0], 0) for row in rows])
if not rows: return totalscores
weightedScores = [(2.0, self.frequencyscore(rows)),
(1.0, self.locationscore(rows)),
(1.0, self.distancescore(rows)),
(1.0, self.inboundlinkscore(rows)),
(1.0, self.pagerankscore(rows)),
(1.0, self.linktextscore(rows, wordids)),
(0.0, self.nnscore(rows, wordids)), # not trained yet...
]
for weight, scores in weightedScores:
for url in totalscores:
totalscores[url] += weight * scores[url]
return totalscores
def geturlname(self, id):
return self.con.execute(
'select url from urllist where rowid = %d' % id).fetchone()[0]
def query(self, q):
rows, wordids = self.getmatchrows(q)
scores = self.getscoredlist(rows, wordids)
rankedscores = sorted([(score, url) for url, score in scores.items()],
reverse=True)
return [(score, self.geturlname(urlid))
for (score, urlid) in rankedscores[0:10]]
# Scoring functions
def normalizescores(self, scores, smallIsBetter=False):
vsmall = 0.00001 # smoothen out division by zero
if smallIsBetter:
minscore = min(scores.values())
return dict([(u, float(minscore)/max(vsmall, l))
for (u, l) in scores.items()])
else:
maxscore = max(scores.values())
if maxscore == 0: maxscore = vsmall
return dict([(u, float(c)/maxscore) for (u, c) in scores.items()])
def frequencyscore(self, rows):
counts = dict([(row[0], 0) for row in rows])
# disproportionally high, see the "XXX" above
for row in rows: counts[row[0]] += 1
return self.normalizescores(counts, smallIsBetter=False)
def locationscore(self, rows):
locations = dict([(row[0], 1000000) for row in rows])
for row in rows:
loc = sum(row[1:])
if loc < locations[row[0]]: locations[row[0]] = loc
return self.normalizescores(locations, smallIsBetter=True)
def distancescore(self, rows):
# only one word in query?
if len(rows[0]) <= 2: return dict([(row[0], 1.0) for row in rows])
mindistance = dict([(row[0], 1000000) for row in rows])
for row in rows:
dist = sum([abs(row[i] - row[i - 1]) for i in range(2, len(row))])
if dist < mindistance[row[0]]: mindistance[row[0]] = dist
return self.normalizescores(mindistance, smallIsBetter=True)
def inboundlinkscore(self, rows):
uniqueurls = set([row[0] for row in rows]) # XXX: why is this needed?
inboundcount = dict([(u, self.con.execute(
'select count(*) from link where toid = %d' % u).fetchone()[0])
for u in uniqueurls])
return self.normalizescores(inboundcount, smallIsBetter=False)
def pagerankscore(self, rows):
pageranks = dict([(row[0], self.con.execute('select score from pagerank '
+ 'where urlid = %d' % row[0]).fetchone()[0]) for row in rows])
return self.normalizescores(pageranks, smallIsBetter=False)
def linktextscore(self, rows, wordids):
linkscores = dict([(row[0], 0) for row in rows])
for wordid in wordids:
cur = self.con.execute(
'select link.fromid, link.toid from linkwords, link '
+ 'where wordid = %d and linkwords.linkid = link.rowid' % wordid)
for fromid, toid in cur:
if toid in linkscores:
rank = self.con.execute('select score from pagerank where urlid = %d'
% fromid).fetchone()[0]
linkscores[toid] += rank
return self.normalizescores(linkscores, smallIsBetter=False)
def nnscore(self, rows, wordids):
# Get unique url ids as ordered list
urlids = list(set([row[0] for row in rows]))
#urlids = list(set([row[0] for row in rows]))
#assert urlids == sorted(urlids)
nnres = net.getresult(wordids, urlids)
scores = dict([(urlids[i], nnres[i]) for i in range(len(urlids))])
return self.normalizescores(scores, smallIsBetter=False)
if __name__ == '__main__':
create = not os.path.exists('searchindex.db')
crawl = crawler('searchindex.db')
if create:
crawl.createindextables()
if True:
crawl.crawl(['http://amnoid.de/'], depth=3)
crawl.calculatepagerank()
s = searcher('searchindex.db')
print s.query('ddsview is great')