Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Manual Segmentation of PDF characters into Spans, Lines and Blocks #19

Merged
merged 17 commits on
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,8 @@
from rapidfuzz import fuzz
import tabulate
from tqdm import tqdm
import pypdfium2 as pdfium

from pdftext.extraction import paginated_plain_text_output
from pdftext.model import get_model
from pdftext.settings import settings


Expand Down
Binary file removed models/dt.joblib
Binary file not shown.
Binary file removed models/dt.onnx
Binary file not shown.
34 changes: 15 additions & 19 deletions pdftext/extraction.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
import atexit
import math
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from itertools import repeat
from typing import List
from concurrent.futures import ProcessPoolExecutor
import math

import pypdfium2 as pdfium

from pdftext.inference import inference
from pdftext.model import get_model
from pdftext.pdf.chars import get_pdfium_chars
from pdftext.pdf.utils import unnormalize_bbox
from pdftext.postprocessing import merge_text, sort_blocks, postprocess_text, handle_hyphens
from pdftext.pdf.pages import get_pages
from pdftext.postprocessing import handle_hyphens, merge_text, postprocess_text, sort_blocks
from pdftext.settings import settings


Expand All @@ -25,21 +23,17 @@ def _load_pdf(pdf, flatten_pdf):


def _get_page_range(page_range, flatten_pdf=False, quote_loosebox=True):
text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf, quote_loosebox)
pages = inference(text_chars, model)
return pages
return get_pages(pdf_doc, page_range, flatten_pdf, quote_loosebox)


def worker_shutdown(pdf_doc):
    """Close the PDF document held by a worker process.

    ``worker_init`` registers this function with ``atexit`` (bound to the
    worker's document via ``functools.partial``).

    Args:
        pdf_doc: The document object to close; only its ``close()`` method
            is used here.
    """
    pdf_doc.close()


def worker_init(pdf_path, flatten_pdf):
global model
global pdf_doc

pdf_doc = _load_pdf(pdf_path, flatten_pdf)
model = get_model()

atexit.register(partial(worker_shutdown, pdf_doc))

Expand All @@ -53,10 +47,9 @@ def _get_pages(pdf_path, page_range=None, flatten_pdf=False, quote_loosebox=True
workers = min(workers, len(page_range) // settings.WORKER_PAGE_THRESHOLD) # It's inefficient to have too many workers, since we batch in inference

if workers is None or workers <= 1:
model = get_model()
text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf, quote_loosebox)
pages = get_pages(pdf_doc, page_range, flatten_pdf, quote_loosebox)
pdf_doc.close()
return inference(text_chars, model)
return pages

pdf_doc.close()
page_range = list(page_range)
Expand Down Expand Up @@ -85,13 +78,13 @@ def paginated_plain_text_output(pdf_path, sort=False, hyphens=False, page_range=


def _process_span(span, page_width, page_height, keep_chars):
span["bbox"] = unnormalize_bbox(span["bbox"], page_width, page_height)
span["bbox"] = span["bbox"].unnormalize(page_width, page_height).bbox
span["text"] = handle_hyphens(postprocess_text(span["text"]), keep_hyphens=True)
if not keep_chars:
del span["chars"]
else:
for char in span["chars"]:
char["bbox"] = unnormalize_bbox(char["bbox"], page_width, page_height)
char["bbox"] = char["bbox"].unnormalize(page_width, page_height).bbox


def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, quote_loosebox=True, workers=None):
Expand All @@ -102,16 +95,19 @@ def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, f
for k in list(block.keys()):
if k not in ["lines", "bbox"]:
del block[k]
block["bbox"] = unnormalize_bbox(block["bbox"], page_width, page_height)
block["bbox"] = block["bbox"].unnormalize(page_width, page_height).bbox
for line in block["lines"]:
for k in list(line.keys()):
if k not in ["spans", "bbox"]:
del line[k]
line["bbox"] = unnormalize_bbox(line["bbox"], page_width, page_height)
line["bbox"] = line["bbox"].unnormalize(page_width, page_height).bbox
for span in line["spans"]:
_process_span(span, page_width, page_height, keep_chars)

if sort:
page["blocks"] = sort_blocks(page["blocks"])

if page["rotation"] == 90 or page["rotation"] == 270:
page["width"], page["height"] = page["height"], page["width"]
page["bbox"] = [page["bbox"][2], page["bbox"][3], page["bbox"][0], page["bbox"][1]]
return pages
252 changes: 0 additions & 252 deletions pdftext/inference.py

This file was deleted.

7 changes: 0 additions & 7 deletions pdftext/model.py

This file was deleted.

Loading