Enable parallel workers
VikParuchuri committed May 23, 2024
1 parent 50398d5 commit 37d1caf
Showing 7 changed files with 58 additions and 27 deletions.
10 changes: 4 additions & 6 deletions README.md
@@ -27,6 +27,7 @@ pdftext PDF_PATH --out_path output.txt
- `--sort` will attempt to sort in reading order if specified.
- `--keep_hyphens` will keep hyphens in the output (they will be stripped and words joined otherwise)
- `--pages` will specify pages (comma separated) to extract
- `--workers` specifies the number of parallel workers to use

## JSON

@@ -42,6 +43,7 @@ pdftext PDF_PATH --out_path output.txt --json
- `--sort` will attempt to sort in reading order if specified.
- `--pages` will specify pages (comma separated) to extract
- `--keep_chars` will keep individual characters in the json output
- `--workers` specifies the number of parallel workers to use

The output will be a json list, with each item in the list corresponding to a single page in the input pdf (in order). Each page will include the following keys:

@@ -71,21 +73,17 @@ If the pdf is rotated, the bboxes will be relative to the rotated page (they're
Extract plain text:

```python
import pypdfium2 as pdfium
from pdftext.extraction import plain_text_output

pdf = pdfium.PdfDocument(PDF_PATH)
text = plain_text_output(pdf, sort=False, hyphens=False, page_range=[1,2,3]) # Optional arguments explained above
text = plain_text_output(PDF_PATH, sort=False, hyphens=False, page_range=[1,2,3]) # Optional arguments explained above
```

Extract structured blocks and lines:

```python
import pypdfium2 as pdfium
from pdftext.extraction import dictionary_output

pdf = pdfium.PdfDocument(PDF_PATH)
text = dictionary_output(pdf, sort=False, page_range=[1,2,3], keep_chars=False) # Optional arguments explained above
text = dictionary_output(PDF_PATH, sort=False, page_range=[1,2,3], keep_chars=False) # Optional arguments explained above
```

If you want more customization, check out the `pdftext.extraction._get_pages` function for a starting point to dig deeper. pdftext is a pretty thin wrapper around [pypdfium2](https://pypdfium2.readthedocs.io/en/stable/), so you might want to look at the documentation for that as well.
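For illustration, a minimal sketch of the new `workers` argument from the Python API (`example.pdf` is a placeholder path and 4 is an arbitrary worker count):

```python
from pdftext.extraction import dictionary_output, plain_text_output

PDF_PATH = "example.pdf"  # placeholder path

# Plain text, splitting pages across up to 4 worker processes
text = plain_text_output(PDF_PATH, sort=False, hyphens=False, workers=4)

# Structured blocks and lines, also in parallel
pages = dictionary_output(PDF_PATH, sort=False, keep_chars=False, workers=4)
```

The effective worker count is capped by the page count (see `WORKER_PAGE_THRESHOLD` in `pdftext/settings.py` below), so short documents still run in a single process.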
8 changes: 4 additions & 4 deletions benchmark.py
@@ -50,9 +50,8 @@ def pdfplumber_inference(pdf_path):
return pages


def pdftext_inference(pdf_path, model):
pdf = pdfium.PdfDocument(pdf_path)
return paginated_plain_text_output(pdf, model=model)
def pdftext_inference(pdf_path, model=None, workers=None):
return paginated_plain_text_output(pdf_path, model=model, workers=workers)


def compare_docs(doc1: str, doc2: str):
@@ -64,6 +63,7 @@ def main():
parser.add_argument("--result_path", type=str, help="Path to the output text file, defaults to stdout", default=None)
parser.add_argument("--max", type=int, help="Maximum number of pages to process.", default=None)
parser.add_argument("--pdftext_only", action="store_true", help="Only run pdftext inference", default=False)
parser.add_argument("--pdftext_workers", type=int, help="Number of workers to use for pdftext inference", default=None)
args = parser.parse_args()

split = "train"
@@ -88,7 +88,7 @@ def main():
f.seek(0)
pdf_path = f.name

pdftext_inference_model = partial(pdftext_inference, model=model)
pdftext_inference_model = partial(pdftext_inference, model=model, workers=args.pdftext_workers)
inference_funcs = [pymupdf_inference, pdftext_inference_model, pdfplumber_inference]
for tool, inference_func in zip(times_tools, inference_funcs):
start = time.time()
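The `partial` call above pre-binds `model` and the new `workers` value so every tool in the timing loop can be invoked with just a path. A rough, self-contained sketch of that pattern (the stub stands in for the real inference functions):

```python
import time
from functools import partial


def stub_inference(pdf_path, model=None, workers=None):
    # Stand-in for pdftext_inference: the real benchmark passes model/workers the same way
    return f"parsed {pdf_path} with workers={workers}"


# Pre-bind the extra arguments so the timing loop only ever passes a path
tools = {"pdftext": partial(stub_inference, model=None, workers=4)}

timings = {}
for name, func in tools.items():
    start = time.time()
    func("example.pdf")  # placeholder path
    timings[name] = time.time() - start
print(timings)
```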
7 changes: 4 additions & 3 deletions extract_text.py
@@ -14,19 +14,20 @@ def main():
parser.add_argument("--keep_hyphens", action="store_true", help="Keep hyphens in words", default=False)
parser.add_argument("--pages", type=str, help="Comma separated pages to extract, like 1,2,3", default=None)
parser.add_argument("--keep_chars", action="store_true", help="Keep character level information", default=False)
parser.add_argument("--workers", type=int, help="Number of workers to use for parallel processing", default=None)
args = parser.parse_args()

pdf_doc = pdfium.PdfDocument(args.pdf_path)
pages = None
if args.pages is not None:
pdf_doc = pdfium.PdfDocument(args.pdf_path)
pages = [int(p) for p in args.pages.split(",")]
assert all(p <= len(pdf_doc) for p in pages), "Invalid page number(s) provided"

if args.json:
text = dictionary_output(pdf_doc, sort=args.sort, page_range=pages, keep_chars=args.keep_chars)
text = dictionary_output(args.pdf_path, sort=args.sort, page_range=pages, keep_chars=args.keep_chars, workers=args.workers)
text = json.dumps(text)
else:
text = plain_text_output(pdf_doc, sort=args.sort, hyphens=args.keep_hyphens, page_range=pages)
text = plain_text_output(args.pdf_path, sort=args.sort, hyphens=args.keep_hyphens, page_range=pages, workers=args.workers)

if args.out_path is None:
print(text)
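One consequence of the parallel path shows up here: the extraction functions now take a file path rather than an open `pdfium.PdfDocument`, and the CLI only opens a document itself to validate `--pages`. Presumably each worker re-opens the file from its path, since an open document handle is not something that can be shipped across process boundaries. A minimal sketch of that idea, with a hypothetical helper name:

```python
import pypdfium2 as pdfium


def count_pages_in_worker(pdf_path):
    # Hypothetical helper: a worker opens its own handle from the path
    # instead of receiving an already-open PdfDocument.
    pdf = pdfium.PdfDocument(pdf_path)
    return len(pdf)
```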
53 changes: 43 additions & 10 deletions pdftext/extraction.py
@@ -1,35 +1,68 @@
from functools import partial
from typing import List
from concurrent.futures import ProcessPoolExecutor
import math
import pypdfium2 as pdfium

from pdftext.inference import inference
from pdftext.model import get_model
from pdftext.pdf.chars import get_pdfium_chars
from pdftext.pdf.utils import unnormalize_bbox
from pdftext.postprocessing import merge_text, sort_blocks, postprocess_text, handle_hyphens
from pdftext.settings import settings


def _get_pages(pdf_doc, model=None, page_range=None):
if model is None:
model = get_model()
text_chars = get_pdfium_chars(pdf_doc, page_range=page_range)
def _get_page_range(pdf_path, model, page_range):
pdf_doc = pdfium.PdfDocument(pdf_path)
text_chars = get_pdfium_chars(pdf_doc, page_range)
pages = inference(text_chars, model)
return pages


def plain_text_output(pdf_doc, sort=False, model=None, hyphens=False, page_range=None) -> str:
text = paginated_plain_text_output(pdf_doc, sort=sort, model=model, hyphens=hyphens, page_range=page_range)
def _get_pages(pdf_path, model=None, page_range=None, workers=None):
if model is None:
model = get_model()

pdf_doc = pdfium.PdfDocument(pdf_path)
if page_range is None:
page_range = range(len(pdf_doc))

if workers is not None:
workers = min(workers, len(page_range) // settings.WORKER_PAGE_THRESHOLD) # It's inefficient to have too many workers, since we batch in inference

if workers is None or workers <= 1:
text_chars = get_pdfium_chars(pdf_doc, page_range)
return inference(text_chars, model)

func = partial(_get_page_range, pdf_path, model)
page_range = list(page_range)

pages_per_worker = math.ceil(len(page_range) / workers)
page_range_chunks = [page_range[i * pages_per_worker:(i + 1) * pages_per_worker] for i in range(workers)]

with ProcessPoolExecutor(max_workers=workers) as executor:
pages = list(executor.map(func, page_range_chunks))

ordered_pages = [page for sublist in pages for page in sublist]

return ordered_pages


def plain_text_output(pdf_path, sort=False, model=None, hyphens=False, page_range=None, workers=None) -> str:
text = paginated_plain_text_output(pdf_path, sort=sort, model=model, hyphens=hyphens, page_range=page_range, workers=workers)
return "\n".join(text)


def paginated_plain_text_output(pdf_doc, sort=False, model=None, hyphens=False, page_range=None) -> List[str]:
pages = _get_pages(pdf_doc, model, page_range)
def paginated_plain_text_output(pdf_path, sort=False, model=None, hyphens=False, page_range=None, workers=None) -> List[str]:
pages = _get_pages(pdf_path, model, page_range, workers=workers)
text = []
for page in pages:
text.append(merge_text(page, sort=sort, hyphens=hyphens).strip())
return text


def dictionary_output(pdf_doc, sort=False, model=None, page_range=None, keep_chars=False):
pages = _get_pages(pdf_doc, model, page_range)
def dictionary_output(pdf_path, sort=False, model=None, page_range=None, keep_chars=False, workers=None):
pages = _get_pages(pdf_path, model, page_range, workers=workers)
for page in pages:
for block in page["blocks"]:
bad_keys = [key for key in block.keys() if key not in ["lines", "bbox"]]
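The heart of the change is the new branch in `_get_pages`: the requested worker count is capped so each process gets at least `WORKER_PAGE_THRESHOLD` pages, the page range is split into contiguous chunks, each chunk is handled by `_get_page_range` in its own process, and the results are flattened back into page order. A self-contained sketch of that pattern, with a dummy chunk function standing in for the real PDF parsing and model inference:

```python
import math
from concurrent.futures import ProcessPoolExecutor
from functools import partial

WORKER_PAGE_THRESHOLD = 30  # mirrors the setting added in this commit


def process_chunk(doc_id, page_chunk):
    # Stand-in for _get_page_range: the real function opens the PDF from its
    # path and runs model inference over its chunk of pages.
    return [f"{doc_id}:page-{p}" for p in page_chunk]


def get_pages_parallel(page_range, workers):
    # Cap the worker count so each process handles a sensible number of pages
    workers = min(workers, len(page_range) // WORKER_PAGE_THRESHOLD)
    if workers <= 1:
        return process_chunk("doc", page_range)

    pages_per_worker = math.ceil(len(page_range) / workers)
    chunks = [page_range[i * pages_per_worker:(i + 1) * pages_per_worker]
              for i in range(workers)]

    func = partial(process_chunk, "doc")  # pre-bind shared arguments, as the commit does
    with ProcessPoolExecutor(max_workers=workers) as executor:
        results = list(executor.map(func, chunks))

    # executor.map yields chunks in submission order, so flattening preserves page order
    return [page for chunk in results for page in chunk]


if __name__ == "__main__":
    pages = get_pages_parallel(list(range(100)), workers=8)
    print(len(pages))  # 100 pages, capped to min(8, 100 // 30) = 3 workers
```

With the default threshold of 30, anything under 60 pages yields a cap of 1 or 0 and takes the serial path, so small documents never pay process start-up cost.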
4 changes: 1 addition & 3 deletions pdftext/pdf/chars.py
@@ -21,10 +21,8 @@ def update_previous_fonts(text_chars: Dict, i: int, prev_fontname: str, prev_fon
text_chars["chars"][j]["font"]["flags"] = fontflags


def get_pdfium_chars(pdf, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ, page_range=None):
def get_pdfium_chars(pdf, page_range, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ):
blocks = []
if page_range is None:
page_range = range(len(pdf))

for page_idx in page_range:
page = pdf.get_page(page_idx)
1 change: 1 addition & 0 deletions pdftext/settings.py
@@ -12,6 +12,7 @@ class Settings(BaseSettings):

# Inference
BLOCK_THRESHOLD: float = 0.8 # Confidence threshold for block detection
WORKER_PAGE_THRESHOLD: int = 30 # Min number of pages per worker in parallel

# Benchmark
RESULTS_FOLDER: str = "results"
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
version = "0.3.7"
version = "0.3.8"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
license = "Apache-2.0"
