-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTask1.py
78 lines (62 loc) · 2.06 KB
/
Task1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import urllib.request as ul
import re
from bs4 import BeautifulSoup
# Scheme and host that getLinks prepends to the site-relative hrefs it collects.
BASE_URL = "https://en.wikipedia.org"
# Article URL prefix; extractName drops this many leading characters from a URL.
BASE_URL_WIKI = "https://en.wikipedia.org/wiki/"
def getLinks(page):
    """Download a page and return the article links found inside its <div> elements.

    Args:
        page: URL of the page to fetch.

    Returns:
        A list of absolute URLs (BASE_URL + href), one per matching anchor.
        An anchor matches when its href starts with "/wiki/" and the link
        contains neither '#' nor ':'. Anchors are gathered div by div, so an
        anchor nested inside several <div> elements may appear once for each
        of them; anchors outside every <div> are not collected.
    """
    wikiHref = re.compile(r'^/wiki/')

    # The parser consumes the response while constructing the soup, so the
    # connection can be released right after instead of being left open.
    with ul.urlopen(page) as webPage:
        soup = BeautifulSoup(webPage, "html.parser")

    listOfLinks = []
    for div in soup.find_all('div'):
        for link in div.find_all('a', {'href': wikiHref}):
            href = link.get('href')
            fullURL = BASE_URL + href
            # '#' marks a section anchor; ':' appears in hrefs such as
            # "/wiki/File:..." -- both kinds are skipped.
            if '#' not in fullURL and ':' not in href:
                listOfLinks.append(fullURL)
    return listOfLinks
def getAllLinks(fileName):
    """Map every URL listed in a file to the links found on that URL's page.

    Args:
        fileName: Path of a text file containing one URL per line.

    Returns:
        A dict whose keys are the lines of the file and whose values are the
        lists returned by getLinks for each line. A running count of
        processed URLs is printed as progress.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(fileName, "r") as urlFile:
        links = urlFile.read().splitlines()

    linkDictionary = {}
    for count, link in enumerate(links, start=1):
        linkDictionary[link] = getLinks(link)
        print(count)
    return linkDictionary
def extractName(page):
    """Return `page` with its first len(BASE_URL_WIKI) characters removed.

    No check is made that `page` actually begins with BASE_URL_WIKI; the
    slice is purely positional.
    """
    prefixLength = len(BASE_URL_WIKI)
    return page[prefixLength:]
def getInLinks(page, linkDictionary):
    """List the names of the pages in `linkDictionary` that link to `page`.

    Args:
        page: URL whose in-links are wanted.
        linkDictionary: Mapping of source URL -> collection of URLs it links to.

    Returns:
        A list with one entry per source (other than `page` itself) whose
        links contain `page`. Each entry is a single space followed by
        extractName(source), in the dictionary's iteration order.
    """
    return [
        " " + str(extractName(source))
        for source, outLinks in linkDictionary.items()
        if page != source and page in outLinks
    ]
def getGraphs(linkDictionary, fileName):
    """Build the in-link graph lines for every URL listed in a file.

    Args:
        linkDictionary: Mapping of source URL -> collection of URLs it links to.
        fileName: Path of a text file containing one URL per line.

    Returns:
        A list of strings, one per line of the file, in file order. Each
        string is extractName(url) followed by the concatenated entries
        returned by getInLinks(url, linkDictionary).
    """
    # Read inside a context manager so the file handle is always closed.
    with open(fileName, "r") as urlFile:
        links = urlFile.read().splitlines()

    graph = []
    for link in links:
        inLinks = getInLinks(link, linkDictionary)
        # join() builds the suffix in one pass instead of repeated concatenation.
        inLinkString = "".join(str(inLink) for inLink in inLinks)
        graph.append(str(extractName(link)) + inLinkString)
    return graph
def writeFile(graph, fileName):
    """Write each entry of `graph` to `fileName`, one entry per line.

    Args:
        graph: Iterable of strings to write.
        fileName: Path of the output file; it is created or truncated.
    """
    # The context manager closes the file even if a write raises.
    with open(fileName, "w") as outFile:
        for inLink in graph:
            outFile.write(inLink + "\n")
def main():
    """Build and write the in-link graph for the BFS crawl, then the DFS crawl.

    For each crawl, the URL list file is passed to getAllLinks and getGraphs,
    and the resulting graph is written with writeFile:
    "BFSCrawledURLs.txt" -> "G1.txt", then "DFSCrawledURLs.txt" -> "G2.txt".
    """
    crawlOutputs = (
        ("BFSCrawledURLs.txt", "G1.txt"),
        ("DFSCrawledURLs.txt", "G2.txt"),
    )
    for crawledFile, graphFile in crawlOutputs:
        linkDictionary = getAllLinks(crawledFile)
        graph = getGraphs(linkDictionary, crawledFile)
        writeFile(graph, graphFile)


# Run the crawl only when executed as a script, so that importing this module
# does not trigger network requests and file writes.
if __name__ == "__main__":
    main()