-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathabsrel.py
162 lines (146 loc) · 5.99 KB
/
absrel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import html5lib
import html5lib.serializer
import html5lib.treewalkers
import urlparse
import os.path
# (ELEMENT, ATTRIBUTE) pairs naming the HTML5 attributes whose values
# are URLs.  absolutify() and relativize() rewrite exactly these.
# Based on the list at
# http://www.feedparser.org/docs/resolving-relative-links.html
url_attributes = [
    ('a', 'href'),
    ('applet', 'codebase'),
    ('area', 'href'),
    ('audio', 'src'),
    ('blockquote', 'cite'),
    ('body', 'background'),
    ('del', 'cite'),
    ('form', 'action'),
    ('frame', 'longdesc'),
    ('frame', 'src'),
    ('iframe', 'longdesc'),
    ('iframe', 'src'),
    ('head', 'profile'),
    ('img', 'longdesc'),
    ('img', 'src'),
    ('img', 'usemap'),
    ('input', 'src'),
    ('input', 'usemap'),
    ('ins', 'cite'),
    ('link', 'href'),
    ('object', 'classid'),
    ('object', 'codebase'),
    ('object', 'data'),
    ('object', 'usemap'),
    ('q', 'cite'),
    ('script', 'src'),
    ('source', 'src'),
    ('video', 'poster'),
    ('video', 'src'),
]
# (ELEMENT, ATTRIBUTE) pairs whose URLs geturls() collects: the
# attributes that link to or cite another document.
linkurl_attributes = [
    ('a', 'href'),
    ('area', 'href'),
    ('blockquote', 'cite'),
    ('iframe', 'src'),
    ('q', 'cite'),
]
def absolutify(src, base_url):
    """absolutify(SRC, BASE_URL): Resolve relative URLs in SRC.

    SRC is a string containing HTML.  Every attribute listed in
    url_attributes that has a non-empty value is resolved relative to
    BASE_URL.  If the <head> of SRC contains a <base> element with an
    href, that href (itself resolved against BASE_URL) is used as the
    base instead.

    Return the HTML5 serialization of the resulting <html> element as
    a unicode string."""
    # Parse SRC as HTML.
    tree_builder = html5lib.treebuilders.getTreeBuilder('dom')
    parser = html5lib.html5parser.HTMLParser(tree=tree_builder)
    dom = parser.parse(src)

    # Handle <BASE> if any.
    head = dom.getElementsByTagName('head')[0]
    for b in head.getElementsByTagName('base'):
        u = b.getAttribute('href')
        if u:
            base_url = urlparse.urljoin(base_url, u)
            # HTML5 4.2.3 "if there are multiple base elements with href
            # attributes, all but the first are ignored."
            break

    # Change all relative URLs to absolute URLs by resolving them
    # relative to BASE_URL.  Note that we need to do this even for URLs
    # that consist only of a fragment identifier, because Google Reader
    # changes href=#foo to href=http://site/#foo
    for tag, attr in url_attributes:
        for e in dom.getElementsByTagName(tag):
            u = e.getAttribute(attr)
            if u:
                e.setAttribute(attr, urlparse.urljoin(base_url, u))

    # Return the HTML5 serialization of the result.
    root = dom.getElementsByTagName('html')[0]
    tree_walker = html5lib.treewalkers.getTreeWalker('dom')
    # Reach HTMLSerializer through the html5lib.serializer namespace
    # rather than the htmlserializer submodule: the submodule only
    # exists in old html5lib releases, whereas this name is exported
    # by both old and new ones.
    html_serializer = html5lib.serializer.HTMLSerializer()
    return u''.join(html_serializer.serialize(tree_walker(root)))
def relativize(src, base_url):
    """relativize(SRC, BASE_URL): Make URLs in SRC relative to BASE_URL.

    SRC is a string containing HTML.  BASE_URL is treated as a
    directory: a trailing '/' is appended if it lacks one.  If the
    <head> of SRC contains a <base> element with an href, that href
    (resolved against BASE_URL) is used as the base instead.

    Every attribute listed in url_attributes that has a non-empty
    value is first resolved against the base; if the result has the
    same scheme and host as the base, the attribute is replaced by a
    path relative to the base's path (query and fragment are kept).
    URLs on a different scheme or host are left untouched.

    Return the HTML5 serialization of the resulting <html> element as
    a unicode string."""
    # URL paths are always '/'-separated, so use posixpath rather than
    # os.path, whose separator depends on the host platform.
    import posixpath

    # Parse SRC as HTML.
    tree_builder = html5lib.treebuilders.getTreeBuilder('dom')
    parser = html5lib.html5parser.HTMLParser(tree=tree_builder)
    dom = parser.parse(src)

    # Handle <BASE> if any.
    head = dom.getElementsByTagName('head')[0]
    for b in head.getElementsByTagName('base'):
        u = b.getAttribute('href')
        if u:
            base_url = urlparse.urljoin(base_url, u)
            # HTML5 4.2.3 "if there are multiple base elements with href
            # attributes, all but the first are ignored."
            break

    if not base_url.endswith('/'):
        # make urlparse.urljoin handle nested dirs right
        base_url = base_url + '/'
    rel_basebits = urlparse.urlsplit(base_url)
    basepath = rel_basebits.path or '/'
    base_origin = (rel_basebits.scheme, rel_basebits.netloc)

    # Change all absolute URLs to relative URLs by resolving them
    # relative to BASE_URL, then removing BASE_URL.
    for tag, attr in url_attributes:
        for e in dom.getElementsByTagName(tag):
            u = e.getAttribute(attr)
            if not u:
                continue
            ubits = urlparse.urlsplit(urlparse.urljoin(base_url, u))
            # Only URLs on the same scheme *and* host can be expressed
            # relative to the base.  Comparing the host alone would
            # also rewrite e.g. an https: link under an http: base, or
            # a mailto: link when the base has no host.
            if (ubits.scheme, ubits.netloc) != base_origin:
                continue
            path = ubits.path or '/'
            newpath = posixpath.relpath(path, basepath)
            if newpath == '.':
                # Same directory as the base: an empty path lets the
                # query/fragment stand alone ("#foo", not ".#foo").
                newpath = ''
            elif path.endswith('/'):
                # relpath() drops the trailing slash; put it back so a
                # directory URL stays a directory URL.
                newpath += '/'
            newu = urlparse.urlunsplit(
                ('', '', newpath, ubits.query, ubits.fragment))
            e.setAttribute(attr, newu)

    root = dom.getElementsByTagName('html')[0]
    tree_walker = html5lib.treewalkers.getTreeWalker('dom')
    # Reach HTMLSerializer through the html5lib.serializer namespace
    # rather than the htmlserializer submodule: the submodule only
    # exists in old html5lib releases, whereas this name is exported
    # by both old and new ones.
    html_serializer = html5lib.serializer.HTMLSerializer()
    return u''.join(html_serializer.serialize(tree_walker(root)))
def geturls(src, base_url):
    """geturls(SRC, BASE_URL): List the URLs that SRC links to.

    SRC is a string containing HTML.  BASE_URL is treated as a
    directory: a trailing '/' is appended if it lacks one.  If the
    <head> of SRC contains a <base> element with an href, that href
    (resolved against BASE_URL) is used as the base instead.

    Return a list with the absolute form of every non-empty attribute
    named in linkurl_attributes, so you can webmention them.  URLs are
    grouped in linkurl_attributes order; duplicates are not removed."""
    # Parse SRC as HTML.
    tree_builder = html5lib.treebuilders.getTreeBuilder('dom')
    parser = html5lib.html5parser.HTMLParser(tree=tree_builder)
    dom = parser.parse(src)

    # Handle <BASE> if any.
    head = dom.getElementsByTagName('head')[0]
    for b in head.getElementsByTagName('base'):
        u = b.getAttribute('href')
        if u:
            base_url = urlparse.urljoin(base_url, u)
            # HTML5 4.2.3 "if there are multiple base elements with href
            # attributes, all but the first are ignored."
            break

    if not base_url.endswith('/'):
        # make urlparse.urljoin handle nested dirs right
        base_url = base_url + '/'

    # Collect every link URL, resolved to absolute form against
    # BASE_URL.  The document itself is not modified.
    urls = []
    for tag, attr in linkurl_attributes:
        for e in dom.getElementsByTagName(tag):
            u = e.getAttribute(attr)
            if u:
                urls.append(urlparse.urljoin(base_url, u))
    return urls