Skip to content

Commit

Permalink
Wire up convert_single
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 20, 2024
1 parent 9263efc commit 4945edd
Show file tree
Hide file tree
Showing 13 changed files with 221 additions and 305 deletions.
4 changes: 2 additions & 2 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import math

from marker.convert import convert_single_pdf
from marker.output import markdown_exists, save_markdown
from marker.output import markdown_exists, save_output
from marker.pdf.utils import find_filetype
from marker.pdf.extract_text import get_length_of_text
from marker.models import load_all_models
Expand Down Expand Up @@ -58,7 +58,7 @@ def process_single_pdf(args):

full_text, images, out_metadata = convert_single_pdf(filepath, model_refs, metadata=metadata)
if len(full_text.strip()) > 0:
save_markdown(out_folder, fname, full_text, images, out_metadata)
save_output(out_folder, fname, full_text, images, out_metadata)
else:
print(f"Empty file: {filepath}. Could not convert.")
except Exception as e:
Expand Down
56 changes: 27 additions & 29 deletions convert_single.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,39 @@
import time

import pypdfium2 # Needs to be at the top to avoid warnings
import os

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS

import argparse
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
import time

from marker.output import save_markdown

configure_logging()
import click

from marker.converters.pdf import PdfConverter
from marker.logger import configure_logging
from marker.models import create_model_dict
from marker.output import save_output
from marker.config.parser import ConfigParser

def main():
parser = argparse.ArgumentParser()
parser.add_argument("filename", help="PDF file to parse")
parser.add_argument("output", help="Output base folder path")
parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
parser.add_argument("--langs", type=str, help="Optional languages to use for OCR, comma separated", default=None)
parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
args = parser.parse_args()
configure_logging()

langs = args.langs.split(",") if args.langs else None

fname = args.filename
model_lst = load_all_models()
@click.command(help="Convert a single PDF to markdown.")
@click.argument("fpath", type=str)
@ConfigParser.common_options
@click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")
def main(fpath: str, **kwargs):
models = create_model_dict()
start = time.time()
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)

fname = os.path.basename(fname)
subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)

print(f"Saved markdown to the {subfolder_path} folder")
config_parser = ConfigParser(kwargs)

converter = PdfConverter(
config=config_parser.generate_config_dict(),
artifact_dict=models,
processor_list=config_parser.get_processors(),
renderer=config_parser.get_renderer()
)
rendered = converter(fpath)
out_folder = config_parser.get_output_folder(fpath)
save_output(rendered, out_folder, config_parser.get_base_filename(fpath))

print(f"Saved markdown to {out_folder}")
print(f"Total time: {time.time() - start}")


Expand Down
7 changes: 5 additions & 2 deletions marker/builders/ocr.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List

from surya.model.detection.model import EfficientViTForSemanticSegmentation
from surya.model.recognition.encoderdecoder import OCREncoderDecoderModel
from surya.ocr import run_ocr
Expand All @@ -15,8 +17,9 @@


class OcrBuilder(BaseBuilder):
recognition_batch_size = None
detection_batch_size = None
recognition_batch_size: int | None = None
detection_batch_size: int | None = None
languages: List[str] | None = None

def __init__(self, detection_model: EfficientViTForSemanticSegmentation, recognition_model: OCREncoderDecoderModel, config=None):
super().__init__(config)
Expand Down
Empty file added marker/config/__init__.py
Empty file.
96 changes: 96 additions & 0 deletions marker/config/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import json
import os
from typing import Any, Dict

import click

from marker.renderers.html import HTMLRenderer
from marker.renderers.json import JSONRenderer
from marker.renderers.markdown import MarkdownRenderer
from marker.settings import settings
from marker.util import parse_range_str, strings_to_classes, classes_to_string


class ConfigParser:
def __init__(self, cli_options: dict):
self.cli_options = cli_options

@staticmethod
def common_options(fn):
fn = click.option("--output_dir", type=click.Path(exists=False), required=False, default=settings.OUTPUT_DIR,
help="Directory to save output.")(fn)
fn = click.option('--debug', '-d', is_flag=True, help='Enable debug mode.')(fn)
fn = click.option("--output_format", type=click.Choice(["markdown", "json", "html"]), default="markdown",
help="Format to output results in.")(fn)
fn = click.option("--page_range", type=str, default=None,
help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20")(
fn)
fn = click.option("--force_ocr", is_flag=True, help="Force OCR on the whole document.")(fn)
fn = click.option("--processors", type=str, default=None,
help="Comma separated list of processors to use. Must use full module path.")(fn)
fn = click.option("--config_json", type=str, default=None,
help="Path to JSON file with additional configuration.")(fn)
return fn

def generate_config_dict(self) -> Dict[str, any]:
config = {}
output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR)
for k, v in self.cli_options.items():
match k:
case "debug":
if v:
config["debug_pdf_images"] = True
config["debug_layout_images"] = True
config["debug_json"] = True
config["debug_data_folder"] = output_dir
case "page_range":
if v:
config["page_range"] = parse_range_str(v)
case "force_ocr":
if v:
config["force_ocr"] = True
case "languages":
if v:
config["languages"] = v.split(",")
case "config_json":
if v:
with open(v, "r") as f:
config.update(json.load(f))
return config

def get_renderer(self):
match self.cli_options["output_format"]:
case "json":
r = JSONRenderer
case "markdown":
r = MarkdownRenderer
case "html":
r = HTMLRenderer
case _:
raise ValueError("Invalid output format")
return classes_to_string([r])[0]

def get_processors(self):
processors = self.cli_options.get("processors", None)
if processors is not None:
processors = processors.split(",")
for p in processors:
try:
strings_to_classes([p])
except Exception as e:
print(f"Error loading processor: {p} with error: {e}")
raise

return processors

def get_output_folder(self, filepath: str):
output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR)
fname_base = os.path.splitext(os.path.basename(filepath))[0]
output_dir = os.path.join(output_dir, fname_base)
os.makedirs(output_dir, exist_ok=True)
return output_dir

def get_base_filename(self, filepath: str):
basename = os.path.basename(filepath)
return os.path.splitext(basename)[0]

139 changes: 0 additions & 139 deletions marker/convert.py

This file was deleted.

Loading

0 comments on commit 4945edd

Please sign in to comment.