-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch.py
93 lines (79 loc) · 2.47 KB
/
fetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
'Provides routines to fetch urls'
'''
LICENSE:
Copyright (C) 2011, Pankaj Kumar Garg
This program is distributed under GNU General Public License
'''
__author__ = "Pankaj Kumar Garg"
__email__ = "pankajn17@gmail.com"
__copyright__ = "Copyright (c) 2011, Pankaj Kumar Garg"
__license__ = "GPLv3"
import socket
import random, subprocess, shlex, cStringIO, gzip
import urllib, urllib2
from pprint import pprint
def fetch(url, postData=None, headers=None, timeout=120, debug=False, unarchive=False):
    '''
    Fetch `url` with urllib2 and return a dict describing the outcome.

    Notes carried over from the original author:
      * Proxy setup is done automatically by the Python library.
      * Behind a proxy, "https" links cannot be opened; for a workaround
        see http://code.activestate.com/recipes/456195/

    Parameters:
      url       -- address to fetch.
      postData  -- optional dictionary; when given it is url-encoded and
                   handed to urllib2.Request as the request data.
      headers   -- optional dict of request headers. It is copied, never
                   modified. A default "User-Agent" is added when absent.
      timeout   -- seconds, installed with socket.setdefaulttimeout().
      debug     -- when true, print failure details (on error) or the
                   response headers (on success).
      unarchive -- when true, gunzip the body if the response headers say
                   "content-encoding: gzip". Off by default.

    Returns a dict with these keys:
      "url"          -- the url that was requested.
      "html"         -- the response body, "" on failure.
      "effectiveUrl" -- response.geturl() on success, "" on failure.
      "headers"      -- response headers as a dict, {} on failure.
      "responseCode" -- int: 200 on success, the HTTP error code when the
                        raised error carries one, otherwise 444.
    '''
    # Start from the failure shape; the success path overwrites it.
    data = {
        "url": url,
        "html": "",
        "effectiveUrl": "",
        "headers": {},
        "responseCode": 444,
    }

    # Work on a copy so neither the caller's dict nor a shared default
    # object ever picks up the User-Agent we add below.
    headers = dict(headers) if headers else {}
    if "User-Agent" not in headers:
        headers["User-Agent"] = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'

    # NOTE: this changes the process-wide default socket timeout, not just
    # the timeout of this request.
    socket.setdefaulttimeout(timeout)

    post = urllib.urlencode(postData) if postData is not None else None
    req = urllib2.Request(url, post, headers)

    response = None
    try:
        response = urllib2.urlopen(req)
        body = response.read()
    except urllib2.URLError as e:
        if debug:
            # Test 'code' first: HTTPError is a URLError subclass and may
            # expose 'reason' as well, which would hide this branch.
            if hasattr(e, 'code'):
                print('The server couldn\'t fulfill the request.')
                print('Error code:  %s' % (e.code,))
            elif hasattr(e, 'reason'):
                print('We failed to reach a server.')
                print('Reason:  %s' % (e.reason,))
        data["responseCode"] = getattr(e, "code", 444)
    except Exception:
        # e.g. a socket timeout while connecting or reading; the failure
        # values set above are returned unchanged.
        pass
    else:
        # everything is fine
        data["html"] = body
        data["effectiveUrl"] = response.geturl()
        data["headers"] = dict(response.info())
        # Any response urlopen returns without raising is reported as 200.
        data["responseCode"] = 200
        if debug:
            pprint(data["headers"])
    finally:
        if response is not None:
            response.close()

    # decoding
    if unarchive:
        encoding = data["headers"].get("content-encoding", "")
        if encoding.strip().lower() == "gzip":
            try:
                htmlStream = cStringIO.StringIO(data["html"])
                try:
                    gzipper = gzip.GzipFile(fileobj=htmlStream, mode="rb")
                    try:
                        unpacked = gzipper.read()
                    finally:
                        gzipper.close()
                finally:
                    htmlStream.close()
            except Exception:
                # Best effort: if the body is not valid gzip, hand back the
                # raw bytes exactly as they were received.
                pass
            else:
                data["html"] = unpacked

    return data