Skip to content

Commit

Permalink
Merge pull request #34 from VikParuchuri/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
VikParuchuri authored Jan 28, 2025
2 parents 54914cb + fd881b7 commit 92fd696
Show file tree
Hide file tree
Showing 9 changed files with 95 additions and 70 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
pip install poetry
poetry install
- name: Run detection benchmark test
- name: Run benchmark
run: |
poetry run python benchmark.py --max 5 --result_path results --pdftext_only
poetry run python scripts/verify_benchmark_scores.py results/results.json
poetry run python benchmark/benchmark.py --max 5 --result_path results --pdftext_only
poetry run python benchmark/verify_benchmark_scores.py results/results.json
28 changes: 28 additions & 0 deletions .github/workflows/scripts.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Workflow: install the project with Poetry and run the `pdftext` command
# against a sample PDF on each OS in the matrix.
name: Script test

# Triggered on every push.
on: [push]

# Environment variables set for the workflow.
env:
TORCH_DEVICE: "cpu"
PYTHONIOENCODING: "utf-8"

jobs:
build:
# One job per entry in matrix.os below.
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, windows-latest]
steps:
- uses: actions/checkout@v3

- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
# NOTE(review): unquoted, so YAML reads this as a number; quote it if this is
# ever changed to a version with a trailing zero (e.g. "3.10").
python-version: 3.11
- name: Install Python dependencies
run: |
pip install poetry
poetry install
- name: Run script
run: |
# Runs the `pdftext` command on the sample PDF added under benchmark/.
poetry run pdftext benchmark/adversarial_short.pdf
Binary file added benchmark/adversarial_short.pdf
Binary file not shown.
File renamed without changes.
File renamed without changes.
59 changes: 2 additions & 57 deletions extract_text.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,4 @@
import json
from pathlib import Path
from typing import List

import click
import pypdfium2 as pdfium

from pdftext.extraction import plain_text_output, dictionary_output

def parse_range_str(range_str: str) -> List[int]:
    """Expand a page-range string such as ``"1,2-4,10"`` into page numbers.

    Args:
        range_str: Comma-separated items, each either a single integer
            (``"3"``) or an inclusive ``start-end`` range (``"2-4"``).

    Returns:
        The requested page numbers, de-duplicated and sorted ascending.

    Raises:
        ValueError: If an item is not an integer or a ``start-end`` pair of
            integers, or if a range's start is greater than its end.
    """
    pages = set()
    for item in range_str.split(","):
        token = item.strip()
        if "-" in token:
            start_str, _, end_str = token.partition("-")
            try:
                start, end = int(start_str), int(end_str)
            except ValueError:
                raise ValueError(
                    f"Invalid page range {token!r}: expected 'start-end', e.g. '2-4'"
                ) from None
            # A reversed range would otherwise expand to nothing and the
            # requested pages would be dropped without any warning.
            if start > end:
                raise ValueError(
                    f"Invalid page range {token!r}: start must not be greater than end"
                )
            pages.update(range(start, end + 1))
        else:
            try:
                pages.add(int(token))
            except ValueError:
                raise ValueError(
                    f"Invalid page number {token!r}: expected an integer"
                ) from None
    # The set removes duplicates; sorted() returns them in ascending order.
    return sorted(pages)

@click.command(help="Extract plain text or JSON from PDF.")
@click.argument("pdf_path", type=click.Path(exists=True))
@click.option("--out_path", type=click.Path(exists=False), help="Path to the output text file, defaults to stdout")
@click.option("--json", is_flag=True, help="Output json instead of plain text", default=False)
@click.option("--sort", is_flag=True, help="Attempt to sort the text by reading order", default=False)
@click.option("--keep_hyphens", is_flag=True, help="Keep hyphens in words", default=False)
@click.option("--page_range", type=str, help="Page numbers or ranges to extract, comma separated like 1,2-4,10", default=None)
@click.option("--flatten_pdf", is_flag=True, help="Flatten form fields and annotations into page contents", default=False)
@click.option("--keep_chars", is_flag=True, help="Keep character level information", default=False)
@click.option("--workers", type=int, help="Number of workers to use for parallel processing", default=None)
def main(
    pdf_path: Path,
    out_path: Path | None,
    **kwargs
):
    """Extract text from a PDF and print it or write it to ``out_path``.

    Args:
        pdf_path: Path to an existing PDF file.
        out_path: Destination file for the output; when ``None`` the output
            is printed to stdout.
        **kwargs: Remaining CLI options: ``json``, ``sort``, ``keep_hyphens``,
            ``page_range``, ``flatten_pdf``, ``keep_chars`` and ``workers``.

    Raises:
        click.BadParameter: If ``--page_range`` cannot be parsed or contains
            a page number outside the accepted bounds.
    """
    pages = None
    if kwargs["page_range"] is not None:
        try:
            pages = parse_range_str(kwargs["page_range"])
        except ValueError as err:
            raise click.BadParameter(str(err), param_hint="--page_range") from err

        # Only the page count is needed; always release the document handle.
        pdf_doc = pdfium.PdfDocument(pdf_path)
        try:
            doc_len = len(pdf_doc)
        finally:
            pdf_doc.close()

        # Raise explicitly rather than assert: asserts are stripped under
        # ``python -O``, which would silently disable this validation.
        # NOTE(review): the upper bound is inclusive of doc_len, as in the
        # original check; if pages are zero-based indexes this admits one
        # page too many -- confirm against pdftext.extraction.
        if not all(0 <= p <= doc_len for p in pages):
            raise click.BadParameter("Invalid page number(s) provided", param_hint="--page_range")

    if kwargs["json"]:
        text = dictionary_output(
            pdf_path,
            sort=kwargs["sort"],
            page_range=pages,
            flatten_pdf=kwargs["flatten_pdf"],
            keep_chars=kwargs["keep_chars"],
            workers=kwargs["workers"],
        )
        text = json.dumps(text)
    else:
        text = plain_text_output(
            pdf_path,
            sort=kwargs["sort"],
            hyphens=kwargs["keep_hyphens"],
            page_range=pages,
            flatten_pdf=kwargs["flatten_pdf"],
            workers=kwargs["workers"],
        )

    if out_path is None:
        print(text)
    else:
        # Write UTF-8 explicitly: extracted text can contain characters the
        # platform default encoding (e.g. cp1252 on Windows) cannot encode.
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(text)

from pdftext.scripts.extract_text import extract_text_cli

if __name__ == "__main__":
main()
extract_text_cli()
55 changes: 55 additions & 0 deletions pdftext/scripts/extract_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import json
from pathlib import Path
from typing import List

import click
import pypdfium2 as pdfium

from pdftext.extraction import plain_text_output, dictionary_output

def parse_range_str(range_str: str) -> List[int]:
    """Expand a page-range string such as ``"1,2-4,10"`` into page numbers.

    Args:
        range_str: Comma-separated items, each either a single integer
            (``"3"``) or an inclusive ``start-end`` range (``"2-4"``).

    Returns:
        The requested page numbers, de-duplicated and sorted ascending.

    Raises:
        ValueError: If an item is not an integer or a ``start-end`` pair of
            integers, or if a range's start is greater than its end.
    """
    pages = set()
    for item in range_str.split(","):
        token = item.strip()
        if "-" in token:
            start_str, _, end_str = token.partition("-")
            try:
                start, end = int(start_str), int(end_str)
            except ValueError:
                raise ValueError(
                    f"Invalid page range {token!r}: expected 'start-end', e.g. '2-4'"
                ) from None
            # A reversed range would otherwise expand to nothing and the
            # requested pages would be dropped without any warning.
            if start > end:
                raise ValueError(
                    f"Invalid page range {token!r}: start must not be greater than end"
                )
            pages.update(range(start, end + 1))
        else:
            try:
                pages.add(int(token))
            except ValueError:
                raise ValueError(
                    f"Invalid page number {token!r}: expected an integer"
                ) from None
    # The set removes duplicates; sorted() returns them in ascending order.
    return sorted(pages)

@click.command(help="Extract plain text or JSON from PDF.")
@click.argument("pdf_path", type=click.Path(exists=True))
@click.option("--out_path", type=click.Path(exists=False), help="Path to the output text file, defaults to stdout")
@click.option("--json", is_flag=True, help="Output json instead of plain text", default=False)
@click.option("--sort", is_flag=True, help="Attempt to sort the text by reading order", default=False)
@click.option("--keep_hyphens", is_flag=True, help="Keep hyphens in words", default=False)
@click.option("--page_range", type=str, help="Page numbers or ranges to extract, comma separated like 1,2-4,10", default=None)
@click.option("--flatten_pdf", is_flag=True, help="Flatten form fields and annotations into page contents", default=False)
@click.option("--keep_chars", is_flag=True, help="Keep character level information", default=False)
@click.option("--workers", type=int, help="Number of workers to use for parallel processing", default=None)
def extract_text_cli(
    pdf_path: Path,
    out_path: Path | None,
    **kwargs
):
    """Extract text from a PDF and print it or write it to ``out_path``.

    Args:
        pdf_path: Path to an existing PDF file.
        out_path: Destination file for the output; when ``None`` the output
            is printed to stdout.
        **kwargs: Remaining CLI options: ``json``, ``sort``, ``keep_hyphens``,
            ``page_range``, ``flatten_pdf``, ``keep_chars`` and ``workers``.

    Raises:
        click.BadParameter: If ``--page_range`` cannot be parsed or contains
            a page number outside the accepted bounds.
    """
    pages = None
    if kwargs["page_range"] is not None:
        try:
            pages = parse_range_str(kwargs["page_range"])
        except ValueError as err:
            raise click.BadParameter(str(err), param_hint="--page_range") from err

        # Only the page count is needed; always release the document handle.
        pdf_doc = pdfium.PdfDocument(pdf_path)
        try:
            doc_len = len(pdf_doc)
        finally:
            pdf_doc.close()

        # Raise explicitly rather than assert: asserts are stripped under
        # ``python -O``, which would silently disable this validation.
        # NOTE(review): the upper bound is inclusive of doc_len, as in the
        # original check; if pages are zero-based indexes this admits one
        # page too many -- confirm against pdftext.extraction.
        if not all(0 <= p <= doc_len for p in pages):
            raise click.BadParameter("Invalid page number(s) provided", param_hint="--page_range")

    if kwargs["json"]:
        text = dictionary_output(
            pdf_path,
            sort=kwargs["sort"],
            page_range=pages,
            flatten_pdf=kwargs["flatten_pdf"],
            keep_chars=kwargs["keep_chars"],
            workers=kwargs["workers"],
        )
        text = json.dumps(text)
    else:
        text = plain_text_output(
            pdf_path,
            sort=kwargs["sort"],
            hyphens=kwargs["keep_hyphens"],
            page_range=pages,
            flatten_pdf=kwargs["flatten_pdf"],
            workers=kwargs["workers"],
        )

    if out_path is None:
        print(text)
    else:
        # Write UTF-8 explicitly: extracted text can contain characters the
        # platform default encoding (e.g. cp1252 on Windows) cannot encode.
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(text)
12 changes: 6 additions & 6 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 1 addition & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@ keywords = ["pdf", "text", "extraction"]
packages = [
{include = "pdftext"}
]
include = [
"extract_text.py"
]

[tool.poetry.dependencies]
python = "^3.10"
Expand All @@ -35,4 +32,4 @@ requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.scripts]
pdftext = "extract_text:main"
pdftext = "pdftext.scripts.extract_text:extract_text_cli"

0 comments on commit 92fd696

Please sign in to comment.