Drop validation
VikParuchuri committed Apr 25, 2024
1 parent cc6a6e4 commit b2a4e8b
Showing 6 changed files with 14 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -20,6 +20,6 @@ jobs:
           poetry install
       - name: Run detection benchmark test
         run: |
-          poetry run python benchmark.py --max 5 --result_path results
+          poetry run python benchmark.py --max 5 --result_path results --pdftext_only
           poetry run python scripts/verify_benchmark_scores.py results/results.json
2 changes: 1 addition & 1 deletion README.md
@@ -84,7 +84,7 @@ Here are the scores:
 | Library    | Time (s per page) | Alignment Score (% accuracy vs pymupdf) |
 |------------|-------------------|-----------------------------------------|
 | pymupdf    | 0.32              | --                                      |
-| pdftext    | 1.79              | 96.22                                   |
+| pdftext    | 1.57              | 96.22                                   |
 | pdfplumber | 3.0               | 89.88                                   |
 
 pdftext is approximately 2x slower than using pypdfium2 alone (if you were to extract all the same information).
4 changes: 4 additions & 0 deletions benchmark.py
@@ -55,6 +55,7 @@ def main():
     parser = argparse.ArgumentParser(description="Benchmark pdf extraction.")
     parser.add_argument("--result_path", type=str, help="Path to the output text file, defaults to stdout", default=None)
     parser.add_argument("--max", type=int, help="Maximum number of pages to process.", default=None)
+    parser.add_argument("--pdftext_only", action="store_true", help="Only run pdftext inference", default=False)
     args = parser.parse_args()
 
     split = "train"
@@ -66,6 +67,9 @@
     alignments = defaultdict(list)
     times_tools = ["pymupdf", "pdftext", "pdfplumber"]
     alignment_tools = ["pdftext", "pdfplumber"]
+    if args.pdftext_only:
+        times_tools = ["pdftext", "pymupdf"]
+        alignment_tools = ["pdftext"]
     model = get_model()
     for i in tqdm(range(len(dataset)), desc="Benchmarking"):
         row = dataset[i]
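For reference, the new flag limits the benchmark to what CI needs: timing runs only for pdftext and pymupdf, and alignment is computed only for pdftext, so the slower pdfplumber comparison is skipped. The workflow change above invokes it as `poetry run python benchmark.py --max 5 --result_path results --pdftext_only`.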
6 changes: 5 additions & 1 deletion pdftext/inference.py
@@ -1,5 +1,7 @@
 from itertools import chain
 
+import sklearn
+
 from pdftext.pdf.utils import LINE_BREAKS, TABS, SPACES
 
 
@@ -152,7 +154,9 @@ def inference(text_chars, model):
         training_rows = [tl[1] for tl in training_list]
         training_idxs = [tl[0] for tl in training_list]
 
-        predictions = model.predict(training_rows)
+        # Disable nan, etc, validation for a small speedup
+        with sklearn.config_context(assume_finite=True):
+            predictions = model.predict(training_rows)
         for pred, page_idx in zip(predictions, training_idxs):
             next_prediction[page_idx] = pred
     page_blocks = sorted(page_blocks.items())
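For context, `sklearn.config_context(assume_finite=True)` tells scikit-learn to skip the NaN/inf validation it normally performs on input arrays, which is where the small speedup comes from. A minimal sketch of the pattern, using a toy model and random data that are purely illustrative (not pdftext's actual model):

```python
import numpy as np
import sklearn
from sklearn.tree import DecisionTreeClassifier

# Toy stand-in for pdftext's trained model, fit on random data.
X = np.random.rand(1000, 4)
y = np.random.randint(0, 2, size=1000)
model = DecisionTreeClassifier().fit(X, y)

# Inside this context, scikit-learn assumes inputs are finite and skips
# its usual NaN/inf checks, shaving a little overhead off each predict call.
with sklearn.config_context(assume_finite=True):
    predictions = model.predict(X)
```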
6 changes: 2 additions & 4 deletions pdftext/postprocessing.py
@@ -62,11 +62,9 @@ def merge_text(page: Dict, sort=False) -> str:
             for char in line["chars"]:
                 line_text += char["char"]
             line_text = postprocess_text(line_text)
-            if line_text.endswith("\n"):
-                line_text = line_text[:-1].strip() + " "
+            line_text = line_text.rstrip() + "\n"
 
             block_text += line_text
-        if not block_text.endswith("\n"):
-            block_text += "\n\n"
+        block_text = block_text.rstrip() + "\n\n"
         text += block_text
     return text
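A rough sketch of what the new normalization does: each line now always ends with exactly one newline and each block with a blank line, instead of the previous endswith checks. This is a simplified stand-in that drops `postprocess_text` and the `sort` handling, using a made-up `page` dict for illustration:

```python
def merge_text_sketch(page: dict) -> str:
    text = ""
    for block in page["blocks"]:
        block_text = ""
        for line in block["lines"]:
            line_text = "".join(char["char"] for char in line["chars"])
            # New behavior: trim trailing whitespace, then force a single newline.
            block_text += line_text.rstrip() + "\n"
        # New behavior: blocks are separated by exactly one blank line.
        text += block_text.rstrip() + "\n\n"
    return text


page = {"blocks": [{"lines": [{"chars": [{"char": c} for c in "Hello \n"]}]}]}
print(repr(merge_text_sketch(page)))  # 'Hello\n\n'
```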
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdftext"
-version = "0.1.1"
+version = "0.1.2"
 description = "Extract structured text from pdfs quickly"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 license = "Apache-2.0"