Skip to content

Commit

Permalink
add test for overriding config and update pyproj deps to include mark…
Browse files Browse the repository at this point in the history
…downify and update poetry lock
  • Loading branch information
iammosespaulr committed Nov 18, 2024
1 parent b674363 commit c479d53
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 15 deletions.
21 changes: 15 additions & 6 deletions marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,39 @@
from marker.v2.providers.pdf import PdfProvider
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning

from marker.v2.processors.sectionheader import SectionHeaderProcessor
from marker.v2.providers.pdf import PdfProvider
import tempfile
from typing import List, Optional
from collections import defaultdict
from typing import Dict, Type

import click
import datasets
from pydantic import BaseModel

from marker.v2.builders.document import DocumentBuilder
from marker.v2.builders.layout import LayoutBuilder
from marker.v2.builders.ocr import OcrBuilder
from marker.v2.builders.structure import StructureBuilder
from marker.v2.converters import BaseConverter
from marker.v2.models import setup_detection_model, setup_layout_model, \
setup_recognition_model, setup_table_rec_model, setup_texify_model
from marker.v2.processors.equation import EquationProcessor
from marker.v2.processors.sectionheader import SectionHeaderProcessor
from marker.v2.processors.table import TableProcessor
from marker.v2.models import setup_layout_model, setup_texify_model, setup_recognition_model, setup_table_rec_model, \
setup_detection_model
from marker.v2.renderers.markdown import MarkdownRenderer
from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block
from marker.v2.schema.registry import BLOCK_REGISTRY


class PdfConverter(BaseConverter):
override_map: Dict[BlockTypes, Type[Block]] = defaultdict()

def __init__(self, config=None):
super().__init__(config)

for block_type, override_block_type in self.override_map.items():
BLOCK_REGISTRY[block_type] = override_block_type

self.layout_model = setup_layout_model()
self.texify_model = setup_texify_model()
Expand Down
17 changes: 16 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ filetype = "^1.2.0"
regex = "^2024.4.28"
pdftext = "^0.3.18"
tabled-pdf = { git = "https://github.com/VikParuchuri/tabled.git", branch = "dev-mose/compilation-updates" }
markdownify = "^0.13.1"

[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"
Expand Down
26 changes: 18 additions & 8 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@

import datasets
import pytest
from typing import Dict, Type

from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block
from marker.v2.models import setup_layout_model, setup_texify_model, setup_recognition_model, setup_table_rec_model, \
setup_detection_model
from marker.v2.builders.document import DocumentBuilder
from marker.v2.builders.layout import LayoutBuilder
from marker.v2.builders.ocr import OcrBuilder
from marker.v2.schema.document import Document
from marker.v2.schema.registry import BLOCK_REGISTRY


@pytest.fixture(scope="session")
Expand Down Expand Up @@ -48,13 +52,22 @@ def table_rec_model():


@pytest.fixture(scope="function")
def pdf_provider(request):
def config(request):
    """Provide the test's config dict and apply its block-type overrides.

    The config is the first positional argument of the closest ``config``
    marker on the requesting test, or an empty dict when the test carries no
    such marker. Each entry of the config's ``override_map`` is written into
    ``BLOCK_REGISTRY`` before the test runs. On teardown the touched registry
    entries are put back to their previous state, so an override applied for
    one test does not leak into tests that run afterwards.

    Args:
        request: The pytest ``request`` object for the requesting test.

    Yields:
        The config dict taken from the marker, or ``{}``.
    """
    config_mark = request.node.get_closest_marker("config")
    config = config_mark.args[0] if config_mark else {}

    override_map: Dict[BlockTypes, Type[Block]] = config.get("override_map", {})

    # Snapshot the current value of every key about to be overwritten.
    # A private sentinel marks keys that were absent from the registry.
    # NOTE(review): assumes BLOCK_REGISTRY is a plain dict-like mapping
    # (supports `in`, item access and `pop`) -- confirm against the registry module.
    missing = object()
    previous = {
        block_type: BLOCK_REGISTRY[block_type] if block_type in BLOCK_REGISTRY else missing
        for block_type in override_map
    }

    for block_type, override_block_type in override_map.items():
        BLOCK_REGISTRY[block_type] = override_block_type

    try:
        yield config
    finally:
        # Undo only what this fixture changed, leaving other entries alone.
        for block_type, original in previous.items():
            if original is missing:
                BLOCK_REGISTRY.pop(block_type, None)
            else:
                BLOCK_REGISTRY[block_type] = original


@pytest.fixture(scope="function")
def pdf_provider(request, config):
filename_mark = request.node.get_closest_marker("filename")
filename = filename_mark.args[0] if filename_mark else "adversarial.pdf"

config_mark = request.node.get_closest_marker("config")
config = config_mark.args[0] if config_mark else None

dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
idx = dataset['filename'].index(filename)

Expand All @@ -65,10 +78,7 @@ def pdf_provider(request):


@pytest.fixture(scope="function")
def pdf_document(request, pdf_provider, layout_model, recognition_model, detection_model) -> Document:
config_mark = request.node.get_closest_marker("config")
config = config_mark.args[0] if config_mark else None

def pdf_document(request, config, pdf_provider, layout_model, recognition_model, detection_model) -> Document:
layout_builder = LayoutBuilder(layout_model, config)
ocr_builder = OcrBuilder(detection_model, recognition_model, config)
builder = DocumentBuilder(config)
Expand Down
18 changes: 18 additions & 0 deletions tests/test_overriding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import pytest

from marker.v2.schema import BlockTypes
from marker.v2.schema.document import Document
from marker.v2.schema.blocks import SectionHeader


class NewSectionHeader(SectionHeader):
    """SectionHeader subclass with no changes, used as the override target in the test config below."""


@pytest.mark.config({
    "page_range": [0],
    "override_map": {BlockTypes.SectionHeader: NewSectionHeader},
})
def test_overriding(pdf_document: Document):
    """Check that the first structure entry of page 0 resolves to a NewSectionHeader block."""
    first_page = pdf_document.pages[0]
    # Look up the block referenced by the first entry of the page's structure.
    leading_block = first_page.get_block(first_page.structure[0])
    assert leading_block.__class__ == NewSectionHeader

0 comments on commit c479d53

Please sign in to comment.