crawl.py
"""
Author-Divyansh Sharma
TASK ::
Write a program that takes a URL as an input, and saves the response of that URL in a file,
and then does the same for all the pages linked inside that HTML page.
LIBRARIES USED :: urllib,re,urlparse,Queue
All libraries used are already built-in utility libraries come packaged with Python 2.7
Please look in the comment aside the library import for more details.
PS : The links file generated are written into text files for debugging purposes,we can comment them but i have not commented as it will
give a better view,understanding and debugging of the program.
"""
import urllib            #used for fetching content over the internet
import re                #regular-expression library for Python
from urlparse import urlparse
#urlparse splits a URL into parts: 1) scheme: the protocol, e.g. 'http' or 'https'
#2) netloc: the network location of the resource, e.g. 'www.google.com'
#3) path: the URL path, e.g. '/en/resources.html'
from Queue import Queue  #built-in Queue ADT for storing links
import os                #os module for making directories, changing the working directory, etc.
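#A quick illustration of the urlparse decomposition (example URL only, not taken from this program's input):
#  urlparse('http://www.example.com/en/resources.html')
#  -> scheme='http', netloc='www.example.com', path='/en/resources.html'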
count=0                  #counter used to give each saved link file a unique name
prgrm_path=os.getcwd()   #the current working directory of the program at start-up
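#Assumed contents of input.txt for illustration (a single seed URL on one line):
#  http://www.example.com/en/index.html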
"""
init function fetches the seed URL html page and content and finds all links associated with it using regex.
Using a queue for storing links in the seed URL.I assume the seed URL is parent and the links contained inside is child.
After i recognise all the child links in the parent page i insert them into the queue,and pop the parent link from the queue.
This is done till the queue is not empty and all the child have not been visited.Basically I am doing a Breadth First Search.
"""
def init():
    inp=open('input.txt','r')           #takes the seed URL from an input text file
    req=inp.read().strip()              #strip surrounding whitespace/newlines from the URL
    inp.close()
    base_url_obj=urlparse(req)          #parse the URL into scheme, netloc and path
    base_url=base_url_obj.scheme+"://"+base_url_obj.netloc
    #proxies={'http':''}
    #urllib.urlopen(req,proxies=proxies)
    #The commented lines above can be used with static proxies to download content over the internet; by default the connection is DIRECT
    response=urllib.urlopen(req)        #open the URL
    the_page=response.read()            #read the response body as a string
    urls=re.findall(r'href=[\'"]?([^\'" >]+)', the_page)  #regex that extracts every href link from the downloaded HTML page
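    #Illustrative matches for the regex above (sample HTML, not from any particular page):
    #  <a href="/reference/index.html">          yields '/reference/index.html'
    #  <a href='http://other.example.org/page'>  yields 'http://other.example.org/page'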
    q=Queue(maxsize=0)
    #Some preprocessing of the links: external links are left unchanged, but links that are relative to the
    #parent page are resolved against the parent's base URL. For example, a link to another child resource
    #may appear as <a href='/reference/index.html'>; the regex extracts '/reference/index.html', which is
    #relative to the parent, so it is rewritten to something like 'http://www.google.com/reference/index.html'.
    for x in range(len(urls)):
        if urls[x].startswith('/'):                   #relative to the site root: make it absolute
            urls[x]=base_url+urls[x]
        elif urls[x].startswith(base_url_obj.scheme): #an absolute (possibly third-party) link: leave it untouched
            pass
        else:                                         #relative to the seed page: resolve against the seed URL's directory
            urls[x]=base_url+base_url_obj.path[:base_url_obj.path.rfind('/')]+'/'+urls[x]
        q.put(urls[x])                                #put the link in the queue
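    #Illustrative outcomes of the loop above, assuming the seed URL is 'http://www.example.com/en/index.html':
    #  '/reference/index.html'      -> 'http://www.example.com/reference/index.html'
    #  'http://other.example.org/x' -> left unchanged (external/absolute link)
    #  'faq.html'                   -> 'http://www.example.com/en/faq.html'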
    #write the seed page's HTML and its extracted links to files
    target=open('index.html','w')
    target1=open('links.txt','w')
    target.write(the_page)
    links='\n'.join(urls)
    target1.write(links)
    target.close()
    target1.close()
    download_content(q)  #recursive function that repeats the same steps for every link, starting from the initial queue
"""
This a recursive function which does all the magic of fetching html for all the child links of the seed URL.
"""
def download_content(q):
    global count
    global prgrm_path
    while not q.empty():                    #process links while the queue is not empty
        reqst=q.get()                       #get the link at the front of the queue
        #the steps below mirror those in init()
        bs_url_obj=urlparse(reqst)
        bs_url=bs_url_obj.scheme+"://"+bs_url_obj.netloc
        bs_path=bs_url_obj.path
        if bs_path.endswith('/'):           #drop a trailing slash so file and directory names stay consistent
            bs_path=bs_path[:-1]
        resp=urllib.urlopen(reqst)
        html_pg=resp.read()
        links=re.findall(r'href=[\'"]?([^\'" >]+)', html_pg)  #extract every href link from this page
        for x in range(len(links)):
            if links[x].endswith('/'):                   #drop any trailing slash
                links[x]=links[x][:-1]
            if links[x].startswith('/'):                 #relative to the site root: make it absolute
                links[x]=bs_url+links[x]
            elif links[x].startswith(bs_url_obj.scheme): #an absolute (possibly third-party) link: leave it untouched
                pass
            else:                                        #relative to this page: resolve against its directory
                links[x]=bs_url+bs_url_obj.path[:bs_url_obj.path.rfind('/')]+'/'+links[x]
            q.put(links[x])                              #put each child link in the queue
        q.task_done()                                    #mark the parent link as processed
        filenm=bs_path[bs_path.rfind('/'):]  #derive the file name from the last component of the resource path
        if filenm.startswith('/'):           #resource names start with '/', which is not allowed in file names,
            filenm=filenm[1:]                #so '/reference' is saved as 'reference'
        path=bs_url_obj.netloc+bs_path
        if not os.path.exists(path):         #create the directory for this resource if it does not exist yet
            os.makedirs(path)
        os.chdir(path)                       #move into that directory so the HTML files mirror the site hierarchy
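        #Illustrative outcome (assumed link 'http://www.example.com/docs/reference/', not from the source):
        #  bs_path='/docs/reference', filenm='reference', and the files are written under
        #  'www.example.com/docs/reference/' relative to the program's starting directory.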
exec "target%d=open('%s.html','w')"%(count,filenm)
exec "target_again%d=open('link_file%d.txt','w')"%(count,count)
exec "target%d.write(html_pg)"%count
tp_link='\n'.join(links)
exec "target_again%d.write(tp_link)"%count
exec "target%d.close()"%count
exec "target_again%d.close()"%count
os.chdir(prgrm_path+'\\'+bs_url_obj.netloc)
count=count+1
download_content(q) #call again for other links in the queue
init() #call the init function