-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathalgLogic.py
134 lines (95 loc) · 3.63 KB
/
algLogic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from bs4 import BeautifulSoup
import urllib.request
import requests
import gensim
from difflib import SequenceMatcher
from rake_nltk import Rake
import index
from google import search
import random
import numpy as np
import time
def findAllLinks(soup):
    """Return every absolute hyperlink (href starting with 'http') on the page.

    Relative links and non-anchor resources are ignored for now.
    """
    #TODO: include the relative links
    #TODO: what to do if the link is an image?
    return [anchor['href']
            for anchor in soup.find_all('a', href=True)
            if anchor['href'].startswith('http')]
def findRelevantLinks(soup):
    """Return the hrefs of anchors that sit inside paragraph text (`p a[href]`)."""
    relevant = []
    for anchor in soup.select('p a[href]'):
        relevant.append(anchor['href'])
    return relevant
def checkURL(url, setOfPages):
    """Return True when *url* is already present in the collection *setOfPages*."""
    return any(url == page for page in setOfPages)
def findMetaData(soup):
    """Return the `content` attribute of every meta tag on the page.

    Fix: meta tags without a `content` attribute (e.g. `<meta charset="utf-8">`)
    previously raised KeyError via `tag['content']`; they are now skipped.
    """
    contents = []
    for tag in soup.find_all('meta'):
        content = tag.get('content')
        if content is not None:
            contents.append(content)
    return contents
def similar(a, b):
    """Return a similarity ratio in [0.0, 1.0] between strings *a* and *b*."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
def simpleContentSimilarity(soupPage1, soupPage2):
    """Return a similarity score in [0.0, 1.0] between two parsed pages.

    Compares the concatenated `content` attributes of each page's meta tags
    using difflib's SequenceMatcher ratio.

    Fix: removed leftover debug prints (`print(type(meta1))`, `print(len(meta2))`).
    """
    meta1 = ''.join(findMetaData(soupPage1))
    meta2 = ''.join(findMetaData(soupPage2))
    return similar(meta1, meta2)
def extractKeywords(text):
    """Return RAKE keyword phrases for *text*, ranked highest to lowest."""
    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    return extractor.get_ranked_phrases()
def contentSimilarity(mainVector, url, m):
    """Return the cosine similarity between *mainVector* and the inferred
    doc2vec vector of the page at *url*.

    Args:
        mainVector: reference embedding (numpy array).
        url: page to fetch and embed.
        m: gensim doc2vec model providing `infer_vector`.

    Returns 0 when the page cannot be fetched or parses to empty markup.

    Fixes: the bare `except:` (which also trapped KeyboardInterrupt/SystemExit)
    is narrowed to Exception, and the cosine similarity is computed once
    instead of twice (the original recomputed it for the print and the return).
    """
    try:
        resp = urllib.request.urlopen(url)
    except Exception:  # fetch failures are expected during a crawl: treat as dissimilar
        return 0
    soup = BeautifulSoup(resp, 'lxml')
    soup_string = str(soup)
    if not soup_string:
        return 0
    page = index.parse_html_string(soup_string)
    split = page.content.strip().split()
    vector = m.infer_vector(split, alpha=0.01, steps=1000)
    # Cosine similarity of the L2-normalised vectors, computed once.
    similarity = np.dot(mainVector / np.linalg.norm(mainVector),
                        vector / np.linalg.norm(vector))
    print("Dot product: %s" % str(similarity))
    return similarity
def selectedWeightedKeyWords(content):
    """Return a rank-weighted selection of keyword phrases from *content*:
    the top 5 phrases, plus random samples biased toward the front of the
    ranking (5 from the first third, 2 from the middle third, 1 from the last).

    Fix: `random.sample(pool, k)` raises ValueError when a pool holds fewer
    than k phrases (any page with few keywords crashed); sample sizes are now
    clamped to the pool size.
    """
    keywords = extractKeywords(content)
    third = len(keywords) // 3
    top = keywords[:5]
    beginning_pool = keywords[5:third]
    middle_pool = keywords[third:2 * third]
    end_pool = keywords[2 * third:]
    beginning = random.sample(beginning_pool, min(5, len(beginning_pool)))
    middle = random.sample(middle_pool, min(2, len(middle_pool)))
    end = random.sample(end_pool, min(1, len(end_pool)))
    return top + beginning + middle + end
def selectedKeyWords(content):
    """Return 7 random pairs of distinct keyword phrases drawn from *content*."""
    keywords = extractKeywords(content)
    pairs = []
    for _ in range(7):
        pairs.append(np.random.choice(keywords, 2, replace=False))
    return pairs
def googleSearch(page, q):
    """Seed queue *q* with Google results for the page title and for each
    randomly selected keyword-phrase pair from the page content."""
    queries = [page.title]
    queries.extend(' '.join(pair) for pair in selectedKeyWords(page.content))
    for term in queries:
        for hit in search(term, stop=5):
            q.append(hit)
def main(url, access_time, query, model, q, threshold=0.4):
    """Crawl entry point: seed queue *q* with search results and with links
    from the seed page whose content is similar to the seed page itself.

    Args:
        url: seed page URL.
        access_time: unused here; kept for caller compatibility.
        query: optional search string; when truthy its top-10 Google results
            are appended to the queue first.
        model: gensim doc2vec model used to embed page content.
        q: queue-like object (supports append) collecting candidate URLs.
        threshold: minimum cosine similarity for a sampled link to be
            enqueued; defaults to 0.4, the previously hard-coded value.

    Fixes: the bare `except:` (which also swallowed KeyboardInterrupt and
    SystemExit) is narrowed to Exception, and sampling 10 links no longer
    raises ValueError on pages with fewer than 10 absolute links.
    """
    if query:
        for result in search(query, stop=10):
            q.append(result)
    try:
        mainResponse = urllib.request.urlopen(url)
    except Exception:  # unreachable seed page: nothing to crawl
        return
    mainSoup = BeautifulSoup(mainResponse, 'lxml')
    mainPage = index.parse_html_string(str(mainSoup))
    googleSearch(mainPage, q)
    splitDocument = mainPage.content.strip().split()
    mainVector = model.infer_vector(splitDocument, alpha=0.01, steps=1000)
    allLinks = findAllLinks(mainSoup)
    firstLevel = random.sample(allLinks, min(10, len(allLinks)))
    for link in firstLevel:
        if contentSimilarity(mainVector, link, model) >= threshold:
            q.append(link)