-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathxkcd-scraper.py
247 lines (235 loc) · 9.31 KB
/
xkcd-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
from requests import get
from random import randrange
from json import loads
from re import search
from PIL import Image, ImageDraw, ImageFont
from textwrap import wrap
import argparse
import os
# Let's make a class to hold everything
class xkcd_scraper:
    """Download xkcd comics into a directory, optionally captioning them.

    Attributes (plain public attributes, set by the caller after creation):
        download_dir: directory the comic images are written to.
        append_title: when true, file names are "<num> - <title><ext>"
            instead of "<num><ext>".
        embed: when true, the title and alt text are drawn onto every
            downloaded image whose URL does not contain ".gif".
        title_fontsize / alt_fontsize: point sizes used by add_text().
        line_offset: kept for callers that set it; no method of this class
            reads it.
    """

    def __init__(self, download_dir):
        """Validate *download_dir* and set the default options.

        Raises:
            SystemExit: with status 1, after printing an error, when
                *download_dir* is not a directory or is not writable.
        """
        # Download_dir doesn't exist -> get out of there (may be a trap)
        if not os.path.isdir(download_dir):
            print("ERROR: ", "'" + download_dir + "', no such directory")
            # A non-zero status lets shells and scripts notice the failure.
            raise SystemExit(1)
        # If we don't have access to download_dir -> get out of there
        if not os.access(download_dir, os.W_OK):
            print("ERROR:", "'" + download_dir + "', permission denied")
            raise SystemExit(1)
        # Set the variables if everything is good
        self.download_dir = download_dir
        self.append_title = False
        self.embed = False
        # Settings that control the embedded text appearance
        self.title_fontsize = 28
        self.alt_fontsize = 18
        self.line_offset = 10

    @staticmethod
    def _report_json_failure():
        """Print the two-line message used whenever a comic JSON is missing."""
        print("ERROR:\nWhat we want you to think happenned -> The URL could not be reached!")
        print("What actually happenned -> The JSON associated with the comic could not be retrieved!")

    @staticmethod
    def _text_size(font, text):
        """Return (width, height) of *text* rendered with *font*.

        Uses font.getsize() when the font object offers it and falls back to
        font.getbbox() otherwise, because getsize() was removed in Pillow 10.
        The fallback returns the right/bottom edges of the bounding box.
        NOTE(review): this is believed to match what getsize() used to
        report (size plus offset) - confirm against the Pillow docs.
        """
        if hasattr(font, "getsize"):
            return font.getsize(text)
        _left, _top, right, bottom = font.getbbox(text)
        return right, bottom

    def _fitting_prefix(self, font, word, limit):
        """Return the longest prefix of *word* no wider than *limit*.

        At least one character is always returned so that callers splitting
        a word make progress even when a single glyph exceeds *limit*.
        """
        end = 1
        while end < len(word) \
                and self._text_size(font, word[:end + 1])[0] <= limit:
            end += 1
        return word[:end]

    def download_json(self, comic_number):
        """Fetch the metadata JSON of a comic.

        Args:
            comic_number: comic to look up; 0 means the latest comic.

        Returns:
            The decoded JSON, or None when *comic_number* is negative, the
            request fails, or the response body is not valid JSON.
        """
        # Imported here because the file only imports `get` from requests;
        # the original `requests.exceptions...` reference was a NameError.
        from requests.exceptions import RequestException

        # Can this even happen
        if comic_number < 0:
            return None
        # Currently the JSONs are available at links like:
        # xkcd.com/comic_number/info.0.json and the latest one at
        # xkcd.com/info.0.json
        if comic_number == 0:
            url = "http://xkcd.com/info.0.json"
        else:
            url = "http://xkcd.com/{0}/info.0.json".format(comic_number)
        try:
            return get(url).json()
        except (RequestException, ValueError):
            # Network trouble, or a body that is not JSON (e.g. an error page)
            return None

    def add_text(self, image, title, alt, tfont='xkcd.ttf', afont='xkcd.ttf'):
        """Draw *title* above and *alt* below the picture stored at *image*.

        The image is enlarged at the top and bottom to make room for the
        wrapped, horizontally centred text and is then saved back to the
        same path.

        Args:
            image: path of the image file to modify in place.
            title: text drawn above the picture.
            alt: text drawn below the picture.
            tfont: TrueType font file used for the title.
            afont: TrueType font file used for the alt text.

        Returns:
            None. Returns early without touching the file when the image
            cannot be opened, or (after printing an error) when a font
            cannot be loaded.
        """
        try:
            source = Image.open(image)
        except OSError:
            return
        try:
            # Honour the font arguments; the defaults keep the old behaviour.
            title_font = ImageFont.truetype(tfont, self.title_fontsize)
            alt_font = ImageFont.truetype(afont, self.alt_fontsize)
        except OSError:
            source.close()
            print("ERROR: the caption fonts could not be loaded, '"
                  + image + "' was left untouched")
            return
        line_padding = 5

        with source:
            title_lines = [" ".join(words) for words
                           in self.text_wrap(title_font, title, source.size[0])]
            title_height = max((self._text_size(title_font, line)[1]
                                for line in title_lines), default=0)
            banner = (title_height + line_padding) * len(title_lines) \
                + line_padding * 4
            # A negative upper bound makes crop() grow the canvas upwards.
            img = source.crop((0, -banner, source.size[0], source.size[1]))
        w, h = img.size

        # NOTE(review): fill=0xffffff is passed exactly as before; how it
        # renders depends on the image mode - confirm for non-RGB comics.
        draw = ImageDraw.Draw(img)
        y = line_padding
        for line in title_lines:
            x = w / 2 - self._text_size(title_font, line)[0] / 2
            draw.text((x, y), line, font=title_font, fill=0xffffff)
            y += title_height + line_padding

        alt_lines = [" ".join(words) for words
                     in self.text_wrap(alt_font, alt, w)]
        alt_height = max((self._text_size(alt_font, line)[1]
                          for line in alt_lines), default=0)
        footer = alt_height * len(alt_lines) \
            + (len(alt_lines) + 3) * line_padding
        # A lower bound past the image edge grows the canvas downwards.
        img = img.crop((0, 0, w, h + footer))
        draw = ImageDraw.Draw(img)
        y = h + line_padding
        for line in alt_lines:
            x = w / 2 - self._text_size(alt_font, line)[0] / 2
            draw.text((x, y), line, font=alt_font, fill=0xffffff)
            y += alt_height + line_padding
        # Save all of the stuff we did
        img.save(image)

    def text_wrap(self, font, text, image_width, i=0):
        """Greedily wrap *text* so every line fits in 95% of *image_width*.

        Words are separated on single spaces. A word that is too wide for a
        line of its own is cut into the longest pieces that fit, each piece
        on its own line; a single character wider than the limit is still
        emitted so the function always terminates.

        Args:
            font: object offering getsize(text) or getbbox(text).
            text: the text to wrap.
            image_width: available width in pixels.
            i: ignored; kept only so existing calls keep working.

        Returns:
            A list of lines, each line being a non-empty list of words
            (join them with " " to get the text of the line).
        """
        limit = image_width * 0.95
        lines = []
        current = []
        for word in text.split(" "):
            candidate = " ".join(current + [word])
            if self._text_size(font, candidate)[0] <= limit:
                current.append(word)
                continue
            if current:
                lines.append(current)
            # The word opens a new line; cut it down until the rest fits.
            while self._text_size(font, word)[0] > limit:
                head = self._fitting_prefix(font, word, limit)
                if head == word:
                    # One glyph wider than the limit: cannot split further.
                    break
                lines.append([head])
                word = word[len(head):]
            current = [word]
        if current:
            lines.append(current)
        return lines

    def download_images(self, comic_number):
        """Download one comic into download_dir, captioning it if requested.

        Args:
            comic_number: comic to fetch; 0 means the latest comic. Comic
                404 is never downloaded, a message is printed instead.

        Returns:
            None. Failures (missing JSON, unusable image URL, download or
            write error) are reported on standard output.
        """
        # requests' exceptions are not imported at file level.
        from requests.exceptions import RequestException

        # Let's be careful about the 404 comic
        # You HAVE to see it though... http://xkcd.com/404
        if comic_number == 404:
            print("You should try heading out to http://xkcd.com/404 yourself!")
            print("It's such a good one we couldn't manage to download it!")
            return
        if comic_number == 0:
            print("Fetching comic -> Latest")
        else:
            print("Fetching comic -> {0}".format(comic_number))
        # Retrieve the JSON of the comic
        info = self.download_json(comic_number)
        # In case we failed to retrieve the JSON, abort mission
        if not info:
            self._report_json_failure()
            return
        # If we have got the JSON, lets get to work and extract relevant information
        title = info['safe_title']
        alt = info['alt']
        num = str(info['num'])
        url = info['img']
        # The file extension is whatever lowercase suffix ends the image URL
        extension = search(r"\.([a-z])+$", url)
        if extension is None:
            print("ERROR: comic {0} has no downloadable image ('{1}')".format(num, url))
            return
        # Let's decide the name we want to give to our downloaded images
        if self.append_title:
            # A path separator inside the title would otherwise point into a
            # sub-directory that does not exist.
            safe_title = title
            for separator in {"/", os.sep, os.altsep} - {None}:
                safe_title = safe_title.replace(separator, "-")
            image = num + " - " + safe_title + extension.group()
        else:
            image = num + extension.group()
        path = os.path.join(self.download_dir, image)
        try:
            # Get the image from the website; the file is only created once
            # the server has answered successfully.
            srcimg = get(url, stream=True)
            srcimg.raise_for_status()
            with open(path, 'wb') as image_file:
                for block in srcimg.iter_content(1024):
                    image_file.write(block)
        except (RequestException, OSError) as error:
            print("ERROR: comic {0} could not be saved -> {1}".format(num, error))
            return
        if self.embed and ".gif" not in url:
            print("Processing comic -> {0}".format(comic_number))
            self.add_text(path, title, alt)

    def download_all(self):
        """Download every comic from number 1 up to the latest one."""
        info = self.download_json(0)
        if not info:
            # Without the latest comic's JSON the upper bound is unknown.
            self._report_json_failure()
            return
        # We get the latest comic number from the JSON and add 1
        # because range function is not inclusive
        for number in range(1, info['num'] + 1):
            self.download_images(number)

    def download_random(self, iterations=1):
        """Download *iterations* comics picked at random (repeats possible)."""
        # Check if we have a connection by getting the JSON for the latest comic
        info = self.download_json(0)
        if not info:
            self._report_json_failure()
            return
        # Download as many random comics as requested
        for _ in range(iterations):
            self.download_images(randrange(1, info['num'] + 1))
def main():
    """Parse the command line and download the requested comics.

    Exactly one selection mode is honoured per run: --range, --all,
    --random, or a list of comic numbers. Mixing modes is reported through
    parser.error(), which prints the usage text and exits with status 2.
    With no selection at all the help text is printed.
    """
    # Let's add some command line arguments
    parser = argparse.ArgumentParser(description='Retrieve xkcd comics.', prefix_chars='-+')
    # Output directory argument
    parser.add_argument('-o', '--output-dir', metavar='DIRECTORY', action='store', default='./', help='Change the output directory. Default is current directory')
    # The comic number argument
    parser.add_argument('N', type=int, nargs='*', help='An integer or set of integers greater than or equal to zero. Use 0 for latest comic.')
    # The range argument to help download multiple comics
    parser.add_argument('-r', '--range', action='store', metavar='N', type=int, nargs=2, help='Fetch comics within a certain range')
    # The all argument to download all comics
    parser.add_argument('-a', '--all', action='store_true', help='Fetch all comics')
    # The random argument
    parser.add_argument('-x', '--random', metavar='ITERATIONS', type=int, help='Fetch random comics', nargs='?', const=1)
    # The append comic title argument
    parser.add_argument('-t', '--title', action='store_true', help='Appends the comic title to the filename')
    # The embed comic title and alt text in the image argument
    parser.add_argument('-e', '--embed', action='store_true', help='Embeds the comic title and alt text to the comic image (unless its a gif)')
    args = parser.parse_args()

    x = xkcd_scraper(args.output_dir)
    # Let us try and make sense of the arguments passed
    if args.title:
        x.append_title = True
    if args.embed:
        x.embed = True

    # `--random 0` parses to 0, which is falsy; compare against None so the
    # option is still recognised as "given".
    random_requested = args.random is not None

    # Range is pretty stand-alone
    if args.range:
        if args.N or random_requested or args.all:
            parser.error("You are confusing me... Please avoid using random, all or specific comic numbers with the range option.")
        # Download all the comics in the specified range (both ends included)
        first, last = args.range
        for number in range(first, last + 1):
            x.download_images(number)
        return

    # They want to download all comics
    if args.all:
        if args.N or random_requested:
            parser.error("You are confusing me... Random and individual comics numbers confuse me when you tell me to download all comics.")
        x.download_all()
        return

    # They want a random one (they think it's random but we know better)
    if random_requested:
        if args.N:
            parser.error("You are confusing me... How can I get a random comic for you if you keep telling me which one to download.")
        x.download_random(args.random)
        return

    # They obviously failed to provide an argument, let's help them
    if not args.N:
        parser.print_help()
    # They have specified specific comics, lets go get them
    for number in args.N:
        x.download_images(number)
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C: leave without a traceback, but with the conventional
        # 128 + SIGINT status so an interrupted run is not reported as
        # a success (a bare SystemExit exits with status 0).
        raise SystemExit(130)