import datetime
import glob
import json
import math
import sys
import time

import requests
from PIL import Image
# resampling scale
scale = 6

# this function downscales a pixel dimension according to the scale defined above
def rescale(dimension):
    return math.floor(dimension / float(scale))
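# e.g., with scale = 6, rescale(6000) == 1000 and rescale(6001) == 1000
# (dimensions are floored, so rescaled coordinates can be off by at most a pixel)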
# this script creates a COCO-formatted dataset out of the Beyond Words data
# it adheres to this schema: http://cocodataset.org/#format-data
data = {}
today = str(datetime.date.today().month) + "/" + str(datetime.date.today().day) + "/" + str(datetime.date.today().year)
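# e.g., "2/14/2020" (the month and day are not zero-padded)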
data["info"] = {
"description": "Beyond Words Dataset (verified)",
"URL": "http://beyondwords.labs.loc.gov/#/",
"version":"1.0",
"year": datetime.date.today().year,
"contributor": "LC Labs",
"date_created": today
}
data["licenses"] = [{
"url": "https://creativecommons.org/publicdomain/zero/1.0/",
"id": 1,
"name": "CC0 1.0"
}]
data["categories"] = [
{
"id": 0,
"name": "Photograph",
"supercategory": "Content",
},
{
"id": 1,
"name": "Illustration",
"supercategory": "Content",
},
{
"id": 2,
"name": "Map",
"supercategory": "Content"
},
{
"id": 3,
"name": "Comics/Cartoon",
"supercategory": "Content",
},
{
"id": 4,
"name": "Editorial Cartoon",
"supercategory": "Content"
}
]
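# the category ids 0-4 above are what add_annotation is called with below;
# note that the official COCO dataset numbers its categories from 1, and some
# downstream tools treat id 0 as background, so a remapping may be needed there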
data["images"] = []
data["annotations"] = []
# helper that appends a COCO-style image record to the dataset
def add_image(data, filename, url, height, width, date_captured, id):
    image = {
        "license": 1,
        "file_name": filename,
        "url": url,
        "height": height,
        "width": width,
        "date_captured": date_captured,
        "id": id
    }
    data["images"].append(image)
# helper that appends a COCO-style annotation record to the dataset
def add_annotation(data, id, bw_id, image_id, category_id, bbox):
    annotation = {
        "id": id,
        "bw_id": bw_id,
        "image_id": image_id,
        "category_id": category_id,
        "bbox": bbox,
        "iscrowd": 0,
        "area": bbox[2] * bbox[3]
    }
    data["annotations"].append(annotation)
# first we open the Beyond Words data (cached in beyond_words_data for reproducibility,
# but it can be found here: http://beyondwords.labs.loc.gov/data)
with open('beyond_words_data/beyond_words.txt') as f:
    bw = json.load(f)
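# each entry of bw["data"] is one crowdsourced annotation; judging from the
# fields accessed below, the records look roughly like (illustrative, not the
# full Beyond Words schema):
#   {"id": ..., "width": ..., "height": ...,
#    "location": {"standard": "<image URL>"},
#    "region": {"x": ..., "y": ..., "width": ..., "height": ...},
#    "data": {"category": "..."}}   # or "data": {"values": [{"category": "..."}, ...]}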
# grabs the data containing the list of annotated images
contents = bw["data"]
# quick print of stats
print("Total # of annotations: " + str(len(contents)))
# create log for storing pages that don't download
log = open("build_manifest_log.txt", "a")
# find the number of unique images
paths = []
for annotation in contents:
    paths.append(annotation["location"]["standard"])
unique_paths = list(set(paths))
print("Number of unique images: " + str(len(unique_paths)))
# constructing a dictionary for accessing the width and height of each image
# (the first annotation seen for a path sets that image's dimensions)
image_dim_dict = {}
for annotation in contents:
    path = annotation["location"]["standard"]
    if path not in image_dim_dict:
        image_dim_dict[path] = {"width": annotation["width"], "height": annotation["height"]}
# sets count for observing progress in outer loop
ct = 1

# booleans flagging which annotations have already been matched to an image;
# kept outside the loop so the flags persist across images
processed_list = [False] * len(contents)
# now, we iterate through each unique path and grab the image using requests
# we also find all corresponding annotations
for path in unique_paths:  # can truncate (e.g., [:10]) for testing here
    # destination filepath of image
    destination = "beyond_words_data/images/" + str(ct) + ".jpg"
    # here, we try to pull down the image (if the request isn't stale)
    try:
        r = requests.get(path, stream=True)
        # makes sure the request passed:
        if r.status_code == 200:
            with open(destination, 'wb') as f:
                f.write(r.content)
        # resize image for ease of use (LANCZOS is the successor of the deprecated ANTIALIAS filter)
        time.sleep(0.1)
        im = Image.open(destination)
        im = im.resize((rescale(im.size[0]), rescale(im.size[1])), Image.LANCZOS)
        im.save(destination)
        sys.stdout.write("\rProcessed Image " + str(ct) + "/" + str(len(unique_paths)) + " ")
        sys.stdout.flush()
    except Exception:
        log.write("Download failed: " + str(path) + "\n")
        continue
    im_width = rescale(image_dim_dict[path]["width"])
    im_height = rescale(image_dim_dict[path]["height"])
    # add the image record to the COCO JSON
    add_image(data, str(ct) + ".jpg", path, im_height, im_width, today, ct)
    # counts the number of annotations for this image
    n_annotations = 0
    # we now find all of the annotations corresponding to this image
    for i in range(0, len(contents)):
        # if this annotation has already been processed for another image, we skip it and move on
        if processed_list[i]:
            continue
        # pulls off the annotation
        annotation = contents[i]
        # pulls off the filepath for the annotation
        location = annotation["location"]["standard"]
        # if the annotation corresponds to this image, we record the annotation on the label image
        if location == path:
            # pull off the other values we need
            bw_id = annotation["id"]
            annotation_region = annotation["region"]
            category = ''
            # pulling off the annotation category requires conditional parsing based on the structure
            # of the dictionary (some annotations have "values" defined, and the annotation data
            # lives inside it as the 0th element)
            if 'category' in annotation["data"]:
                category = annotation["data"]["category"]
            elif 'values' in annotation["data"]:
                if 'category' in annotation["data"]["values"][0]:
                    category = annotation["data"]["values"][0]["category"]
            else:
                log.write("Annotation failed: " + str(annotation) + "\n")
            # if the category wasn't found for whatever reason, we skip
            if category == '':
                continue
            # sets the (rescaled) top-left corner and extent of the annotation region
            x1 = rescale(annotation_region["x"])
            y1 = rescale(annotation_region["y"])
            bbox = [x1, y1, rescale(annotation_region["width"]), rescale(annotation_region["height"])]
            # add annotation to the label image based on category type
            if category == 'Photograph':
                add_annotation(data, i, bw_id, ct, 0, bbox)
            elif category == 'Illustration':
                add_annotation(data, i, bw_id, ct, 1, bbox)
            elif category == 'Map':
                add_annotation(data, i, bw_id, ct, 2, bbox)
            elif category == 'Comics/Cartoon':
                add_annotation(data, i, bw_id, ct, 3, bbox)
            elif category == 'Editorial Cartoon':
                add_annotation(data, i, bw_id, ct, 4, bbox)
            # increment the number of annotations for this image
            n_annotations += 1
            # flag that this annotation has been processed and doesn't need to be re-processed
            processed_list[i] = True
    print("Number of annotations for this image: " + str(n_annotations))
    # increment the image id / filename counter
    ct += 1
# close the download log now that all images have been attempted
log.close()

# dumps json containing all annotation & image data in COCO format
with open('beyond_words_data/trainval.json', 'w') as f:
    json.dump(data, f)
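# at this point trainval.json holds the five top-level keys built above:
# "info", "licenses", "categories", "images", and "annotations"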
# this next chunk of code removes stale downloads (files that didn't download properly)
with open('beyond_words_data/trainval.json') as json_file:
    data = json.load(json_file)
filenames = glob.glob('./beyond_words_data/images/*.jpg')

# total number of expected images in the dataset (image ids run from 1 through ct, inclusive)
ct = len(data["images"])
print("Expected number of images: " + str(ct))
stale_filenames = []
stale_indices = []
for i in range(1, ct + 1):
    if ("./beyond_words_data/images/" + str(i) + ".jpg") not in filenames:
        stale_filenames.append(str(i) + ".jpg")
        print("Missing image: " + str(i) + ".jpg")
        stale_indices.append(i)

# drop the image and annotation records that point at missing files
data["images"] = [k for k in data["images"] if k["file_name"] not in stale_filenames]
data["annotations"] = [k for k in data["annotations"] if k["image_id"] not in stale_indices]

# dumps json containing all annotation & image data in COCO format, with stale entries removed
with open('./beyond_words_data/trainval.json', 'w') as f:
    json.dump(data, f)
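# a quick sanity check of the output (optional; assumes pycocotools is installed):
#   from pycocotools.coco import COCO
#   coco = COCO('beyond_words_data/trainval.json')
#   print(len(coco.getImgIds()), "images,", len(coco.getAnnIds()), "annotations")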