Skip to content

Commit

Permalink
remove old files and update poetry lock
Browse files: browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Dec 10, 2024
1 parent 7fe4827 commit f9561a7
Show file tree
Hide file tree
Showing 7 changed files with 719 additions and 883 deletions.
4 changes: 2 additions & 2 deletions benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,8 @@
from rapidfuzz import fuzz
import tabulate
from tqdm import tqdm
import pypdfium2 as pdfium

from pdftext.extraction import paginated_plain_text_output
from pdftext.model import get_model
from pdftext.settings import settings


Expand Down Expand Up @@ -94,6 +92,8 @@ def main():
pages = inference_func(pdf_path)
times[tool].append(time.time() - start)
tool_pages[tool] = pages
open(f"{row['__key__']}.{tool}.md", "w").write("\n".join(pages))
open(f"{row['__key__']}.pdf", "wb").write(pdf)

for tool in alignment_tools:
alignments[tool].append(
Expand Down
Binary file removed models/dt.joblib
Binary file not shown.
Binary file removed models/dt.onnx
Binary file not shown.
7 changes: 0 additions & 7 deletions pdftext/model.py

This file was deleted.

3 changes: 0 additions & 3 deletions pdftext/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@


class Settings(BaseSettings):
BASE_PATH: str = os.path.dirname(os.path.dirname(__file__))
MODEL_PATH: str = os.path.join(BASE_PATH, "models", "dt.onnx")

# Inference
BLOCK_THRESHOLD: float = 0.8 # Confidence threshold for block detection
WORKER_PAGE_THRESHOLD: int = 10 # Min number of pages per worker in parallel
Expand Down
1,583 changes: 716 additions & 867 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 1 addition & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,14 @@ packages = [
{include = "pdftext"}
]
include = [
"extract_text.py",
"models/dt.joblib",
"models/dt.onnx"
"extract_text.py"
]

[tool.poetry.dependencies]
python = "^3.10"
pypdfium2 = "^4.29.0"
pydantic = "^2.7.1"
pydantic-settings = "^2.2.1"
onnxruntime = "^1.19.2"

[tool.poetry.group.dev.dependencies]
pymupdf = "^1.24.2"
Expand Down

0 comments on commit f9561a7

Please sign in to comment.