Skip to content

Commit

Permalink
update converter interface
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Nov 20, 2024
1 parent 8a542cf commit 36102d3
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 25 deletions.
38 changes: 19 additions & 19 deletions marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,24 +33,22 @@
from marker.v2.schema.blocks import Block
from marker.v2.schema.registry import register_block_class
from marker.v2.processors.debug import DebugProcessor
from marker.v2.processors import BaseProcessor
from marker.v2.renderers import BaseRenderer


class PdfConverter(BaseConverter):
override_map: Dict[BlockTypes, Type[Block]] = defaultdict()

def __init__(self, config=None, model_lst: Optional[List[Any]] = None, processor_list: Optional[List[Any]] = None, output_format="markdown"):
def __init__(self, model_lst: List[Any], processor_list: List[BaseProcessor], renderer: BaseRenderer, config=None):
super().__init__(config)

for block_type, override_block_type in self.override_map.items():
register_block_class(block_type, override_block_type)

self.class_instance_map = {model.__class__: model for model in model_lst}
self.processor_list = processor_list

if output_format == "markdown":
self.renderer = MarkdownRenderer(self.config)
elif output_format == "json":
self.renderer = JSONRenderer(self.config)
self.renderer = renderer

def resolve_dependencies(self, cls):
init_signature = inspect.signature(cls.__init__)
Expand Down Expand Up @@ -127,29 +125,31 @@ def main(fpath: str, output_dir: str, debug: bool, output_format: str, pages: st
DocumentTOCProcessor,
DebugProcessor,
]

if output_format == "markdown":
renderer = MarkdownRenderer(config)
fext = "md"
elif output_format == "json":
renderer = JSONRenderer(config)
fext = "json"
else:
raise ValueError(f"Unknown output format: {output_format}")

converter = PdfConverter(
config=config,
model_lst=model_lst,
processor_list=processor_list,
output_format=output_format
renderer=renderer
)
rendered = converter(fpath)

with open(os.path.join(output_dir, f"{fname_base}.{fext}"), "w+") as f:
f.write(rendered.markdown)
with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+") as f:
f.write(json.dumps(rendered.metadata, indent=2))
if output_format == "markdown":
with open(os.path.join(output_dir, f"{fname_base}.md"), "w+") as f:
f.write(rendered.markdown)

with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+") as f:
f.write(json.dumps(rendered.metadata, indent=2))

for img_name, img in rendered.images.items():
img.save(os.path.join(output_dir, img_name), "PNG")
elif output_format == "json":
with open(os.path.join(output_dir, f"{fname_base}.json"), "w+") as f:
f.write(rendered.model_dump_json(indent=2))

with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+") as f:
f.write(json.dumps(rendered.metadata, indent=2))

print(f"Output written to {output_dir}")

Expand Down
28 changes: 22 additions & 6 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
from marker.v2.processors.text import TextProcessor
from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block
from marker.v2.schema.document import Document
from marker.v2.renderers.markdown import MarkdownRenderer
from marker.v2.renderers.json import JSONRenderer
from marker.v2.schema.registry import register_block_class


Expand Down Expand Up @@ -92,7 +93,7 @@ def pdf_provider(request, config, temp_pdf):


@pytest.fixture(scope="function")
def pdf_document(request, config, pdf_provider, layout_model, recognition_model, detection_model) -> Document:
def pdf_document(request, config, pdf_provider, layout_model, recognition_model, detection_model):
layout_builder = LayoutBuilder(layout_model, config)
ocr_builder = OcrBuilder(detection_model, recognition_model, config)
builder = DocumentBuilder(config)
Expand All @@ -101,7 +102,7 @@ def pdf_document(request, config, pdf_provider, layout_model, recognition_model,


@pytest.fixture(scope="function")
def pdf_converter(request, config, layout_model, texify_model, recognition_model, table_rec_model, detection_model):
def pdf_converter(request, config, layout_model, texify_model, recognition_model, table_rec_model, detection_model, renderer):
model_lst = [layout_model, texify_model, recognition_model, table_rec_model, detection_model]
processor_list = [
EquationProcessor,
Expand All @@ -113,13 +114,28 @@ def pdf_converter(request, config, layout_model, texify_model, recognition_model
DebugProcessor,
]
yield PdfConverter(
config=config,
model_lst=model_lst,
processor_list=processor_list,
output_format="markdown"
renderer=renderer,
config=config
)


@pytest.fixture(scope="function")
def markdown_output(request, temp_pdf, pdf_converter):
def renderer(request, config):
    """Build the renderer requested by the test's ``output_format`` marker.

    The closest ``output_format`` marker on the requesting test node is
    consulted; its first positional argument names the format. When the
    test carries no such marker, a Markdown renderer is returned.

    Raises:
        ValueError: if the marker names a format other than
            ``"markdown"`` or ``"json"``.
    """
    marker = request.node.get_closest_marker("output_format")
    if not marker:
        # No explicit request from the test: fall back to Markdown.
        return MarkdownRenderer(config)

    output_format = marker.args[0]
    if output_format == "markdown":
        return MarkdownRenderer(config)
    if output_format == "json":
        return JSONRenderer(config)
    raise ValueError(f"Unknown output format: {output_format}")


@pytest.fixture(scope="function")
def markdown_output(request, temp_pdf, pdf_converter, renderer):
    """Yield the output of running ``pdf_converter`` on the temporary PDF.

    The output format is decided by the ``renderer`` fixture, which reads
    the ``output_format`` marker from the requesting *test* and falls back
    to Markdown when the test has none.
    """
    # NOTE: no ``@pytest.mark.output_format`` here on purpose. pytest does
    # not propagate marks placed on fixture functions (``request.node`` is
    # the test item), so such a mark never reaches ``renderer``; newer
    # pytest versions also warn about or reject marks on fixtures. Tests
    # that need a specific format must set the marker on the test itself.
    yield pdf_converter(temp_pdf.name)
1 change: 1 addition & 0 deletions tests/converters/test_pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from marker.v2.renderers.markdown import MarkdownOutput


@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [0, 1, 2, 3, 7]})
def test_pdf_converter(pdf_converter: PdfConverter, temp_pdf):
markdown_output: MarkdownOutput = pdf_converter(temp_pdf.name)
Expand Down

0 comments on commit 36102d3

Please sign in to comment.