Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev #34

Merged
merged 2 commits into from
Jan 28, 2025
Merged

Dev #34

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
pip install poetry
poetry install

- name: Run detection benchmark test
- name: Run benchmark
run: |
poetry run python benchmark.py --max 5 --result_path results --pdftext_only
poetry run python scripts/verify_benchmark_scores.py results/results.json
poetry run python benchmark/benchmark.py --max 5 --result_path results --pdftext_only
poetry run python benchmark/verify_benchmark_scores.py results/results.json
28 changes: 28 additions & 0 deletions .github/workflows/scripts.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
name: Script test

on: [push]

env:
TORCH_DEVICE: "cpu"
PYTHONIOENCODING: "utf-8"

jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, windows-latest]
steps:
- uses: actions/checkout@v3

- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install Python dependencies
run: |
pip install poetry
poetry install
- name: Run script
run: |
poetry run pdftext benchmark/adversarial_short.pdf
Binary file added benchmark/adversarial_short.pdf
Binary file not shown.
File renamed without changes.
59 changes: 2 additions & 57 deletions extract_text.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,4 @@
import json
from pathlib import Path
from typing import List

import click
import pypdfium2 as pdfium

from pdftext.extraction import plain_text_output, dictionary_output

def parse_range_str(range_str: str) -> List[int]:
range_lst = range_str.split(",")
page_lst = []
for i in range_lst:
if "-" in i:
start, end = i.split("-")
page_lst += list(range(int(start), int(end) + 1))
else:
page_lst.append(int(i))
page_lst = sorted(list(set(page_lst))) # Deduplicate page numbers and sort in order
return page_lst

@click.command(help="Extract plain text or JSON from PDF.")
@click.argument("pdf_path", type=click.Path(exists=True))
@click.option("--out_path", type=click.Path(exists=False), help="Path to the output text file, defaults to stdout")
@click.option("--json", is_flag=True, help="Output json instead of plain text", default=False)
@click.option("--sort", is_flag=True, help="Attempt to sort the text by reading order", default=False)
@click.option("--keep_hyphens", is_flag=True, help="Keep hyphens in words", default=False)
@click.option("--page_range", type=str, help="Page numbers or ranges to extract, comma separated like 1,2-4,10", default=None)
@click.option("--flatten_pdf", is_flag=True, help="Flatten form fields and annotations into page contents", default=False)
@click.option("--keep_chars", is_flag=True, help="Keep character level information", default=False)
@click.option("--workers", type=int, help="Number of workers to use for parallel processing", default=None)
def main(
pdf_path: Path,
out_path: Path | None,
**kwargs
):
pages = None
if kwargs["page_range"] is not None:
pdf_doc = pdfium.PdfDocument(pdf_path)
pages = parse_range_str(kwargs["page_range"])
doc_len = len(pdf_doc)
pdf_doc.close()
assert all(0 <= p <= doc_len for p in pages), "Invalid page number(s) provided"

if kwargs["json"]:
text = dictionary_output(pdf_path, sort=kwargs["sort"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], keep_chars=kwargs["keep_chars"], workers=kwargs["workers"])
text = json.dumps(text)
else:
text = plain_text_output(pdf_path, sort=kwargs["sort"], hyphens=kwargs["keep_hyphens"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], workers=kwargs["workers"])

if out_path is None:
print(text)
else:
with open(out_path, "w+") as f:
f.write(text)

from pdftext.scripts.extract_text import extract_text_cli

if __name__ == "__main__":
main()
extract_text_cli()
55 changes: 55 additions & 0 deletions pdftext/scripts/extract_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import json
from pathlib import Path
from typing import List

import click
import pypdfium2 as pdfium

from pdftext.extraction import plain_text_output, dictionary_output

def parse_range_str(range_str: str) -> List[int]:
range_lst = range_str.split(",")
page_lst = []
for i in range_lst:
if "-" in i:
start, end = i.split("-")
page_lst += list(range(int(start), int(end) + 1))
else:
page_lst.append(int(i))
page_lst = sorted(list(set(page_lst))) # Deduplicate page numbers and sort in order
return page_lst

@click.command(help="Extract plain text or JSON from PDF.")
@click.argument("pdf_path", type=click.Path(exists=True))
@click.option("--out_path", type=click.Path(exists=False), help="Path to the output text file, defaults to stdout")
@click.option("--json", is_flag=True, help="Output json instead of plain text", default=False)
@click.option("--sort", is_flag=True, help="Attempt to sort the text by reading order", default=False)
@click.option("--keep_hyphens", is_flag=True, help="Keep hyphens in words", default=False)
@click.option("--page_range", type=str, help="Page numbers or ranges to extract, comma separated like 1,2-4,10", default=None)
@click.option("--flatten_pdf", is_flag=True, help="Flatten form fields and annotations into page contents", default=False)
@click.option("--keep_chars", is_flag=True, help="Keep character level information", default=False)
@click.option("--workers", type=int, help="Number of workers to use for parallel processing", default=None)
def extract_text_cli(
pdf_path: Path,
out_path: Path | None,
**kwargs
):
pages = None
if kwargs["page_range"] is not None:
pdf_doc = pdfium.PdfDocument(pdf_path)
pages = parse_range_str(kwargs["page_range"])
doc_len = len(pdf_doc)
pdf_doc.close()
assert all(0 <= p <= doc_len for p in pages), "Invalid page number(s) provided"

if kwargs["json"]:
text = dictionary_output(pdf_path, sort=kwargs["sort"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], keep_chars=kwargs["keep_chars"], workers=kwargs["workers"])
text = json.dumps(text)
else:
text = plain_text_output(pdf_path, sort=kwargs["sort"], hyphens=kwargs["keep_hyphens"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], workers=kwargs["workers"])

if out_path is None:
print(text)
else:
with open(out_path, "w+") as f:
f.write(text)
12 changes: 6 additions & 6 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 1 addition & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@ keywords = ["pdf", "text", "extraction"]
packages = [
{include = "pdftext"}
]
include = [
"extract_text.py"
]

[tool.poetry.dependencies]
python = "^3.10"
Expand All @@ -35,4 +32,4 @@ requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.scripts]
pdftext = "extract_text:main"
pdftext = "pdftext.scripts.extract_text:extract_text_cli"