diff --git a/README.md b/README.md index 8a15f85..1512920 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ pdftext PDF_PATH --out_path output.txt - `--sort` will attempt to sort in reading order if specified. - `--keep_hyphens` will keep hyphens in the output (they will be stripped and words joined otherwise) - `--pages` will specify pages (comma separated) to extract +- `--workers` specifies the number of parallel workers to use ## JSON @@ -42,6 +43,7 @@ pdftext PDF_PATH --out_path output.txt --json - `--sort` will attempt to sort in reading order if specified. - `--pages` will specify pages (comma separated) to extract - `--keep_chars` will keep individual characters in the json output +- `--workers` specifies the number of parallel workers to use The output will be a json list, with each item in the list corresponding to a single page in the input pdf (in order). Each page will include the following keys: @@ -71,21 +73,17 @@ If the pdf is rotated, the bboxes will be relative to the rotated page (they're Extract plain text: ```python -import pypdfium2 as pdfium from pdftext.extraction import plain_text_output -pdf = pdfium.PdfDocument(PDF_PATH) -text = plain_text_output(pdf, sort=False, hyphens=False, page_range=[1,2,3]) # Optional arguments explained above +text = plain_text_output(PDF_PATH, sort=False, hyphens=False, page_range=[1,2,3]) # Optional arguments explained above ``` Extract structured blocks and lines: ```python -import pypdfium2 as pdfium from pdftext.extraction import dictionary_output -pdf = pdfium.PdfDocument(PDF_PATH) -text = dictionary_output(pdf, sort=False, page_range=[1,2,3], keep_chars=False) # Optional arguments explained above +text = dictionary_output(PDF_PATH, sort=False, page_range=[1,2,3], keep_chars=False) # Optional arguments explained above ``` If you want more customization, check out the `pdftext.extraction._get_pages` function for a starting point to dig deeper. 
pdftext is a pretty thin wrapper around [pypdfium2](https://pypdfium2.readthedocs.io/en/stable/), so you might want to look at the documentation for that as well. diff --git a/benchmark.py b/benchmark.py index d56c458..218e344 100644 --- a/benchmark.py +++ b/benchmark.py @@ -50,9 +50,8 @@ def pdfplumber_inference(pdf_path): return pages -def pdftext_inference(pdf_path, model): - pdf = pdfium.PdfDocument(pdf_path) - return paginated_plain_text_output(pdf, model=model) +def pdftext_inference(pdf_path, model=None, workers=None): + return paginated_plain_text_output(pdf_path, model=model, workers=workers) def compare_docs(doc1: str, doc2: str): @@ -64,6 +63,7 @@ def main(): parser.add_argument("--result_path", type=str, help="Path to the output text file, defaults to stdout", default=None) parser.add_argument("--max", type=int, help="Maximum number of pages to process.", default=None) parser.add_argument("--pdftext_only", action="store_true", help="Only run pdftext inference", default=False) + parser.add_argument("--pdftext_workers", type=int, help="Number of workers to use for pdftext inference", default=None) args = parser.parse_args() split = "train" @@ -88,7 +88,7 @@ def main(): f.seek(0) pdf_path = f.name - pdftext_inference_model = partial(pdftext_inference, model=model) + pdftext_inference_model = partial(pdftext_inference, model=model, workers=args.pdftext_workers) inference_funcs = [pymupdf_inference, pdftext_inference_model, pdfplumber_inference] for tool, inference_func in zip(times_tools, inference_funcs): start = time.time() diff --git a/extract_text.py b/extract_text.py index af5a487..3cf3531 100644 --- a/extract_text.py +++ b/extract_text.py @@ -14,19 +14,20 @@ def main(): parser.add_argument("--keep_hyphens", action="store_true", help="Keep hyphens in words", default=False) parser.add_argument("--pages", type=str, help="Comma separated pages to extract, like 1,2,3", default=None) parser.add_argument("--keep_chars", action="store_true", help="Keep 
character level information", default=False) + parser.add_argument("--workers", type=int, help="Number of workers to use for parallel processing", default=None) args = parser.parse_args() - pdf_doc = pdfium.PdfDocument(args.pdf_path) pages = None if args.pages is not None: + pdf_doc = pdfium.PdfDocument(args.pdf_path) pages = [int(p) for p in args.pages.split(",")] assert all(p <= len(pdf_doc) for p in pages), "Invalid page number(s) provided" if args.json: - text = dictionary_output(pdf_doc, sort=args.sort, page_range=pages, keep_chars=args.keep_chars) + text = dictionary_output(args.pdf_path, sort=args.sort, page_range=pages, keep_chars=args.keep_chars, workers=args.workers) text = json.dumps(text) else: - text = plain_text_output(pdf_doc, sort=args.sort, hyphens=args.keep_hyphens, page_range=pages) + text = plain_text_output(args.pdf_path, sort=args.sort, hyphens=args.keep_hyphens, page_range=pages, workers=args.workers) if args.out_path is None: print(text) diff --git a/pdftext/extraction.py b/pdftext/extraction.py index 547598b..efda88f 100644 --- a/pdftext/extraction.py +++ b/pdftext/extraction.py @@ -1,35 +1,68 @@ +from functools import partial from typing import List +from concurrent.futures import ProcessPoolExecutor +import math +import pypdfium2 as pdfium from pdftext.inference import inference from pdftext.model import get_model from pdftext.pdf.chars import get_pdfium_chars from pdftext.pdf.utils import unnormalize_bbox from pdftext.postprocessing import merge_text, sort_blocks, postprocess_text, handle_hyphens +from pdftext.settings import settings -def _get_pages(pdf_doc, model=None, page_range=None): - if model is None: - model = get_model() - text_chars = get_pdfium_chars(pdf_doc, page_range=page_range) +def _get_page_range(pdf_path, model, page_range): + pdf_doc = pdfium.PdfDocument(pdf_path) + text_chars = get_pdfium_chars(pdf_doc, page_range) pages = inference(text_chars, model) return pages -def plain_text_output(pdf_doc, sort=False, 
model=None, hyphens=False, page_range=None) -> str: - text = paginated_plain_text_output(pdf_doc, sort=sort, model=model, hyphens=hyphens, page_range=page_range) +def _get_pages(pdf_path, model=None, page_range=None, workers=None): + if model is None: + model = get_model() + + pdf_doc = pdfium.PdfDocument(pdf_path) + if page_range is None: + page_range = range(len(pdf_doc)) + + if workers is not None: + workers = min(workers, len(page_range) // settings.WORKER_PAGE_THRESHOLD) # It's inefficient to have too many workers, since we batch in inference + + if workers is None or workers <= 1: + text_chars = get_pdfium_chars(pdf_doc, page_range) + return inference(text_chars, model) + + func = partial(_get_page_range, pdf_path, model) + page_range = list(page_range) + + pages_per_worker = math.ceil(len(page_range) / workers) + page_range_chunks = [page_range[i * pages_per_worker:(i + 1) * pages_per_worker] for i in range(workers)] + + with ProcessPoolExecutor(max_workers=workers) as executor: + pages = list(executor.map(func, page_range_chunks)) + + ordered_pages = [page for sublist in pages for page in sublist] + + return ordered_pages + + +def plain_text_output(pdf_path, sort=False, model=None, hyphens=False, page_range=None, workers=None) -> str: + text = paginated_plain_text_output(pdf_path, sort=sort, model=model, hyphens=hyphens, page_range=page_range, workers=workers) return "\n".join(text) -def paginated_plain_text_output(pdf_doc, sort=False, model=None, hyphens=False, page_range=None) -> List[str]: - pages = _get_pages(pdf_doc, model, page_range) +def paginated_plain_text_output(pdf_path, sort=False, model=None, hyphens=False, page_range=None, workers=None) -> List[str]: + pages = _get_pages(pdf_path, model, page_range, workers=workers) text = [] for page in pages: text.append(merge_text(page, sort=sort, hyphens=hyphens).strip()) return text -def dictionary_output(pdf_doc, sort=False, model=None, page_range=None, keep_chars=False): - pages = _get_pages(pdf_doc, 
model, page_range) +def dictionary_output(pdf_path, sort=False, model=None, page_range=None, keep_chars=False, workers=None): + pages = _get_pages(pdf_path, model, page_range, workers=workers) for page in pages: for block in page["blocks"]: bad_keys = [key for key in block.keys() if key not in ["lines", "bbox"]] diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py index 4209138..01c3ad7 100644 --- a/pdftext/pdf/chars.py +++ b/pdftext/pdf/chars.py @@ -21,10 +21,8 @@ def update_previous_fonts(text_chars: Dict, i: int, prev_fontname: str, prev_fon text_chars["chars"][j]["font"]["flags"] = fontflags -def get_pdfium_chars(pdf, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ, page_range=None): +def get_pdfium_chars(pdf, page_range, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ): blocks = [] - if page_range is None: - page_range = range(len(pdf)) for page_idx in page_range: page = pdf.get_page(page_idx) diff --git a/pdftext/settings.py b/pdftext/settings.py index 0df433a..65da221 100644 --- a/pdftext/settings.py +++ b/pdftext/settings.py @@ -12,6 +12,7 @@ class Settings(BaseSettings): # Inference BLOCK_THRESHOLD: float = 0.8 # Confidence threshold for block detection + WORKER_PAGE_THRESHOLD: int = 30 # Min number of pages per worker in parallel # Benchmark RESULTS_FOLDER: str = "results" diff --git a/pyproject.toml b/pyproject.toml index 167970f..d25b356 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdftext" -version = "0.3.7" +version = "0.3.8" description = "Extract structured text from pdfs quickly" authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"] license = "Apache-2.0"