forked from SampannaKahu/ScanBank
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfigure_json_transformer.py
36 lines (29 loc) · 1.31 KB
/
figure_json_transformer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from pathlib import Path
import json
import os
# import tensorboxresnet.utils.annolist.AnnotationLib as al
# Flatten the per-document figure JSON files into two annotation lists
# (figure rectangles and caption rectangles) and write each list to its own
# JSON file.

# Directory searched recursively for input '*.json' files.
figure_json_path = '/work/host-input/arxiv_data_output/figure-jsons'
# Destination files for the two annotation lists built below.
output_figure_boundaries_path = '/work/host-input/figure_boundaries.json'
output_caption_boundaries_path = '/work/host-input/caption_boundaries.json'

# Each entry has the form {"image_path": <str>, "rects": [<boundary>, ...]}.
figure_boundaries = []
caption_boundaries = []

for filename in Path(figure_json_path).rglob('*.json'):
    # Use a context manager so the handle is closed right after parsing
    # instead of leaking one open file per input document.
    with open(str(filename)) as json_file:
        contents = json.load(json_file)

    # NOTE(review): each key looks like an image path and each value a list
    # of dicts carrying 'figure_boundary' and 'caption_boundary' -- confirm
    # against the producer of these files.
    for key, value in contents.items():
        # Entries with no annotations (empty or null) contribute nothing.
        if not value:
            continue

        # Re-root the image under the rendered-pages sub-directory by
        # inserting it between the key's directory part and its file name.
        dir_name, file = os.path.split(key)
        correct_path = os.path.join(dir_name, 'black.pdf-images/ghostscript/dpi100', file)

        figure_annotation = {
            "image_path": correct_path,
            "rects": [ann['figure_boundary'] for ann in value],
        }
        caption_annotation = {
            "image_path": correct_path,
            "rects": [ann['caption_boundary'] for ann in value],
        }
        figure_boundaries.append(figure_annotation)
        caption_boundaries.append(caption_annotation)

# Write through context managers so both outputs are flushed and closed
# deterministically rather than whenever the file objects get collected.
with open(output_figure_boundaries_path, mode='w') as figure_out:
    json.dump(figure_boundaries, figure_out, indent=2)
with open(output_caption_boundaries_path, mode='w') as caption_out:
    json.dump(caption_boundaries, caption_out, indent=2)
# annolist = al.parse(output_figure_boundaries_path, abs_path=False)