-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
54914cb
commit 80219ab
Showing
3 changed files
with
58 additions
and
61 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,59 +1,4 @@ | ||
import json | ||
from pathlib import Path | ||
from typing import List | ||
|
||
import click | ||
import pypdfium2 as pdfium | ||
|
||
from pdftext.extraction import plain_text_output, dictionary_output | ||
|
||
def parse_range_str(range_str: str) -> List[int]: | ||
range_lst = range_str.split(",") | ||
page_lst = [] | ||
for i in range_lst: | ||
if "-" in i: | ||
start, end = i.split("-") | ||
page_lst += list(range(int(start), int(end) + 1)) | ||
else: | ||
page_lst.append(int(i)) | ||
page_lst = sorted(list(set(page_lst))) # Deduplicate page numbers and sort in order | ||
return page_lst | ||
|
||
@click.command(help="Extract plain text or JSON from PDF.") | ||
@click.argument("pdf_path", type=click.Path(exists=True)) | ||
@click.option("--out_path", type=click.Path(exists=False), help="Path to the output text file, defaults to stdout") | ||
@click.option("--json", is_flag=True, help="Output json instead of plain text", default=False) | ||
@click.option("--sort", is_flag=True, help="Attempt to sort the text by reading order", default=False) | ||
@click.option("--keep_hyphens", is_flag=True, help="Keep hyphens in words", default=False) | ||
@click.option("--page_range", type=str, help="Page numbers or ranges to extract, comma separated like 1,2-4,10", default=None) | ||
@click.option("--flatten_pdf", is_flag=True, help="Flatten form fields and annotations into page contents", default=False) | ||
@click.option("--keep_chars", is_flag=True, help="Keep character level information", default=False) | ||
@click.option("--workers", type=int, help="Number of workers to use for parallel processing", default=None) | ||
def main( | ||
pdf_path: Path, | ||
out_path: Path | None, | ||
**kwargs | ||
): | ||
pages = None | ||
if kwargs["page_range"] is not None: | ||
pdf_doc = pdfium.PdfDocument(pdf_path) | ||
pages = parse_range_str(kwargs["page_range"]) | ||
doc_len = len(pdf_doc) | ||
pdf_doc.close() | ||
assert all(0 <= p <= doc_len for p in pages), "Invalid page number(s) provided" | ||
|
||
if kwargs["json"]: | ||
text = dictionary_output(pdf_path, sort=kwargs["sort"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], keep_chars=kwargs["keep_chars"], workers=kwargs["workers"]) | ||
text = json.dumps(text) | ||
else: | ||
text = plain_text_output(pdf_path, sort=kwargs["sort"], hyphens=kwargs["keep_hyphens"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], workers=kwargs["workers"]) | ||
|
||
if out_path is None: | ||
print(text) | ||
else: | ||
with open(out_path, "w+") as f: | ||
f.write(text) | ||
|
||
from pdftext.scripts.extract_text import extract_text_cli | ||
|
||
if __name__ == "__main__": | ||
main() | ||
extract_text_cli() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import json | ||
from pathlib import Path | ||
from typing import List | ||
|
||
import click | ||
import pypdfium2 as pdfium | ||
|
||
from pdftext.extraction import plain_text_output, dictionary_output | ||
|
||
def parse_range_str(range_str: str) -> List[int]: | ||
range_lst = range_str.split(",") | ||
page_lst = [] | ||
for i in range_lst: | ||
if "-" in i: | ||
start, end = i.split("-") | ||
page_lst += list(range(int(start), int(end) + 1)) | ||
else: | ||
page_lst.append(int(i)) | ||
page_lst = sorted(list(set(page_lst))) # Deduplicate page numbers and sort in order | ||
return page_lst | ||
|
||
@click.command(help="Extract plain text or JSON from PDF.") | ||
@click.argument("pdf_path", type=click.Path(exists=True)) | ||
@click.option("--out_path", type=click.Path(exists=False), help="Path to the output text file, defaults to stdout") | ||
@click.option("--json", is_flag=True, help="Output json instead of plain text", default=False) | ||
@click.option("--sort", is_flag=True, help="Attempt to sort the text by reading order", default=False) | ||
@click.option("--keep_hyphens", is_flag=True, help="Keep hyphens in words", default=False) | ||
@click.option("--page_range", type=str, help="Page numbers or ranges to extract, comma separated like 1,2-4,10", default=None) | ||
@click.option("--flatten_pdf", is_flag=True, help="Flatten form fields and annotations into page contents", default=False) | ||
@click.option("--keep_chars", is_flag=True, help="Keep character level information", default=False) | ||
@click.option("--workers", type=int, help="Number of workers to use for parallel processing", default=None) | ||
def extract_text_cli( | ||
pdf_path: Path, | ||
out_path: Path | None, | ||
**kwargs | ||
): | ||
pages = None | ||
if kwargs["page_range"] is not None: | ||
pdf_doc = pdfium.PdfDocument(pdf_path) | ||
pages = parse_range_str(kwargs["page_range"]) | ||
doc_len = len(pdf_doc) | ||
pdf_doc.close() | ||
assert all(0 <= p <= doc_len for p in pages), "Invalid page number(s) provided" | ||
|
||
if kwargs["json"]: | ||
text = dictionary_output(pdf_path, sort=kwargs["sort"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], keep_chars=kwargs["keep_chars"], workers=kwargs["workers"]) | ||
text = json.dumps(text) | ||
else: | ||
text = plain_text_output(pdf_path, sort=kwargs["sort"], hyphens=kwargs["keep_hyphens"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], workers=kwargs["workers"]) | ||
|
||
if out_path is None: | ||
print(text) | ||
else: | ||
with open(out_path, "w+") as f: | ||
f.write(text) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters