Merge pull request #22 from VikParuchuri/dev
Misc improvements
iammosespaulr authored Dec 12, 2024
2 parents f26428a + 31e33e5 commit cd9d41d
Showing 14 changed files with 1,176 additions and 1,353 deletions.
13 changes: 9 additions & 4 deletions .github/workflows/tests.yml
@@ -7,19 +7,24 @@ env:

 jobs:
   build:
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest]
     steps:
       - uses: actions/checkout@v3
+
       - name: Set up Python 3.11
         uses: actions/setup-python@v4
         with:
           python-version: 3.11
-      - name: Install python dependencies
+
+      - name: Install Python dependencies
         run: |
           pip install poetry
           poetry install
       - name: Run detection benchmark test
         run: |
           poetry run python benchmark.py --max 5 --result_path results --pdftext_only
-          poetry run python scripts/verify_benchmark_scores.py results/results.json
+          poetry run python scripts/verify_benchmark_scores.py results/results.json
4 changes: 1 addition & 3 deletions benchmark.py
@@ -13,10 +13,8 @@
 from rapidfuzz import fuzz
 import tabulate
 from tqdm import tqdm
 import pypdfium2 as pdfium

 from pdftext.extraction import paginated_plain_text_output
-from pdftext.model import get_model
 from pdftext.settings import settings


@@ -82,7 +80,7 @@ def main():
         row = dataset[i]
         pdf = row["pdf"]
         tool_pages = {}
-        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
+        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
             f.write(pdf)
             f.seek(0)
             pdf_path = f.name
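The delete=False switch lines up with the new windows-latest runner in the test matrix above: on Windows, a NamedTemporaryFile created with the default delete=True stays locked by the creating handle, so a path-based reader cannot open the same file while the with block is active. A minimal sketch of the resulting pattern; run_tool is a stand-in for whichever extractor the benchmark calls, and the explicit os.remove cleanup is an assumption rather than part of this diff:

import os
import tempfile

def run_on_temp_pdf(pdf_bytes, run_tool):
    # delete=False lets another handle reopen the path on Windows, where the
    # default delete-on-close temp file cannot be opened a second time.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
        f.write(pdf_bytes)
        pdf_path = f.name
    try:
        return run_tool(pdf_path)  # e.g. paginated_plain_text_output(pdf_path)
    finally:
        os.remove(pdf_path)  # cleanup is now manual (assumed, not shown in the diff)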
Binary file removed models/dt.joblib
Binary file removed models/dt.onnx
34 changes: 15 additions & 19 deletions pdftext/extraction.py
@@ -1,16 +1,14 @@
 import atexit
+import math
+from concurrent.futures import ProcessPoolExecutor
 from functools import partial
 from itertools import repeat
 from typing import List
-from concurrent.futures import ProcessPoolExecutor
-import math

 import pypdfium2 as pdfium

-from pdftext.inference import inference
-from pdftext.model import get_model
-from pdftext.pdf.chars import get_pdfium_chars
-from pdftext.pdf.utils import unnormalize_bbox
-from pdftext.postprocessing import merge_text, sort_blocks, postprocess_text, handle_hyphens
+from pdftext.pdf.pages import get_pages
+from pdftext.postprocessing import handle_hyphens, merge_text, postprocess_text, sort_blocks
 from pdftext.settings import settings

+
@@ -25,21 +23,17 @@ def _load_pdf(pdf, flatten_pdf):


 def _get_page_range(page_range, flatten_pdf=False, quote_loosebox=True):
-    text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf, quote_loosebox)
-    pages = inference(text_chars, model)
-    return pages
+    return get_pages(pdf_doc, page_range, flatten_pdf, quote_loosebox)


 def worker_shutdown(pdf_doc):
     pdf_doc.close()


 def worker_init(pdf_path, flatten_pdf):
-    global model
     global pdf_doc

     pdf_doc = _load_pdf(pdf_path, flatten_pdf)
-    model = get_model()

     atexit.register(partial(worker_shutdown, pdf_doc))

@@ -53,10 +47,9 @@ def _get_pages(pdf_path, page_range=None, flatten_pdf=False, quote_loosebox=True
         workers = min(workers, len(page_range) // settings.WORKER_PAGE_THRESHOLD) # It's inefficient to have too many workers, since we batch in inference

     if workers is None or workers <= 1:
-        model = get_model()
-        text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf, quote_loosebox)
+        pages = get_pages(pdf_doc, page_range, flatten_pdf, quote_loosebox)
         pdf_doc.close()
-        return inference(text_chars, model)
+        return pages

     pdf_doc.close()
     page_range = list(page_range)
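With the decision-tree model gone (models/dt.joblib and dt.onnx above, plus pdftext/inference.py below), worker_init only has to open the pdfium document and _get_page_range delegates straight to get_pages. A rough sketch of how the worker_init and _get_page_range helpers shown above typically feed a ProcessPoolExecutor; the real dispatch code sits below the visible part of this hunk, so the exact chunking and arguments here are assumptions:

import math
from concurrent.futures import ProcessPoolExecutor
from itertools import repeat

def get_pages_parallel(pdf_path, page_range, flatten_pdf, quote_loosebox, workers):
    # Each worker process opens its own pdfium document in worker_init, since
    # pdfium document handles generally cannot be pickled and shared directly.
    chunk_size = math.ceil(len(page_range) / workers)
    chunks = [page_range[i:i + chunk_size] for i in range(0, len(page_range), chunk_size)]
    with ProcessPoolExecutor(
        max_workers=workers,
        initializer=worker_init,            # opens pdf_doc once per worker
        initargs=(pdf_path, flatten_pdf),
    ) as executor:
        results = executor.map(_get_page_range, chunks, repeat(flatten_pdf), repeat(quote_loosebox))
    return [page for chunk in results for page in chunk]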
@@ -85,13 +78,13 @@ def paginated_plain_text_output(pdf_path, sort=False, hyphens=False, page_range=


 def _process_span(span, page_width, page_height, keep_chars):
-    span["bbox"] = unnormalize_bbox(span["bbox"], page_width, page_height)
+    span["bbox"] = span["bbox"].bbox
     span["text"] = handle_hyphens(postprocess_text(span["text"]), keep_hyphens=True)
     if not keep_chars:
         del span["chars"]
     else:
         for char in span["chars"]:
-            char["bbox"] = unnormalize_bbox(char["bbox"], page_width, page_height)
+            char["bbox"] = char["bbox"].bbox


 def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, quote_loosebox=True, workers=None):
@@ -102,16 +95,19 @@ def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, f
             for k in list(block.keys()):
                 if k not in ["lines", "bbox"]:
                     del block[k]
-            block["bbox"] = unnormalize_bbox(block["bbox"], page_width, page_height)
+            block["bbox"] = block["bbox"].bbox
             for line in block["lines"]:
                 for k in list(line.keys()):
                     if k not in ["spans", "bbox"]:
                         del line[k]
-                line["bbox"] = unnormalize_bbox(line["bbox"], page_width, page_height)
+                line["bbox"] = line["bbox"].bbox
                 for span in line["spans"]:
                     _process_span(span, page_width, page_height, keep_chars)

         if sort:
             page["blocks"] = sort_blocks(page["blocks"])

+        if page["rotation"] == 90 or page["rotation"] == 270:
+            page["width"], page["height"] = page["height"], page["width"]
+            page["bbox"] = [page["bbox"][2], page["bbox"][3], page["bbox"][0], page["bbox"][1]]
     return pages
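Two behavioural notes fall out of this last hunk: bounding boxes now arrive as objects whose .bbox attribute is read directly where unnormalize_bbox(...) used to be called, and pages reported as rotated by 90 or 270 degrees get their width/height swapped and their bbox corners reordered in the same pass. A toy illustration of that rotation fix-up on a plain dict, not pdftext's actual page object:

# A portrait page reported with 90-degree rotation is effectively landscape,
# so the dictionary output swaps the reported dimensions and reorders the bbox.
page = {"rotation": 90, "width": 612, "height": 792, "bbox": [0, 0, 612, 792]}

if page["rotation"] == 90 or page["rotation"] == 270:
    page["width"], page["height"] = page["height"], page["width"]
    page["bbox"] = [page["bbox"][2], page["bbox"][3], page["bbox"][0], page["bbox"][1]]

print(page["width"], page["height"], page["bbox"])  # 792 612 [612, 792, 0, 0]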
252 changes: 0 additions & 252 deletions pdftext/inference.py

This file was deleted.
