-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcombining_hocr.py
34 lines (24 loc) · 1.14 KB
/
combining_hocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# -*- coding: utf-8 -*-
"""combining_hocr.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/18aFAo3c27m7yDneiaCTDPuZLDol0H6Z2
"""
from bs4 import BeautifulSoup
def combine_hocr_files(hocr_files):
    """Merge several hOCR files into a single document.

    Each file is read as UTF-8 text and parsed with ``html.parser``.  The
    top-level tags of the first document are moved into the result as-is.
    For every later document only the children of its ``<body>`` are moved
    into the ``<body>`` already present in the result, so the output keeps
    one root element instead of one root per input file.

    Args:
        hocr_files: Iterable of paths to the hOCR files, in the order their
            content should appear in the result.

    Returns:
        The ``BeautifulSoup`` object (created with ``features="xml"``) that
        holds the merged content.  It is empty when ``hocr_files`` is empty.

    Raises:
        OSError: If one of the files cannot be opened.
        UnicodeDecodeError: If one of the files is not valid UTF-8.

    Note:
        Element ``id`` attributes are not renumbered.  If the inputs reuse
        the same ids, the merged document will contain duplicates
        (NOTE(review): confirm whether downstream consumers rely on unique
        ids).
    """
    combined_hocr = BeautifulSoup(features="xml")

    for hocr_file in hocr_files:
        with open(hocr_file, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file.read(), "html.parser")

        target_body = combined_hocr.find("body")
        source_body = soup.find("body")

        if target_body is None or source_body is None:
            # Either nothing with a <body> has been merged yet (first
            # document) or this input has no <body> to take content from:
            # move its top-level tags over unchanged.
            for tag in soup.find_all(recursive=False):
                combined_hocr.append(tag)
            continue

        # Appending a node detaches it from its current parent, so iterate
        # over a snapshot of the children rather than the live list.
        for node in list(source_body.contents):
            target_body.append(node)

    return combined_hocr
# hOCR page files to merge, listed in processing order
hocr_files = [
    "/content/OCR-D-HOCR_142440_page_1.xml",
    "/content/OCR-D-HOCR_142440_page_2.xml",
    "/content/OCR-D-HOCR_142440_page_3.xml",
    "/content/OCR-D-HOCR_142440_page_4.xml",
]

# Build the merged document from all listed pages
combined_hocr = combine_hocr_files(hocr_files)

# Serialize the merged document and store it as UTF-8 text
with open("combined_hocr.hocr", mode="w", encoding="utf-8") as out_file:
    out_file.write(str(combined_hocr))