From 2d7cb00e01a3a6beb9100e8ad680d5f82591684d Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 9 May 2024 12:06:23 -0700 Subject: [PATCH] Fix deployment bugs --- README.md | 13 ++++++++++++- convert.py | 20 +++++++++++--------- marker/convert.py | 4 +--- marker/layout/order.py | 2 +- marker/models.py | 6 ++++-- marker/ocr/lang.py | 13 +++++++++++-- marker/ocr/recognition.py | 18 +++++++++++++++++- marker/settings.py | 1 + pyproject.toml | 2 +- 9 files changed, 59 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index e3844c53..147935c3 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,17 @@ MIN_LENGTH=10000 METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 mar Note that the env variables above are specific to this script, and cannot be set in `local.env`. +# Important settings/Troubleshooting + +There are some settings that you may find especially useful if things aren't working the way you expect: + +- `OCR_ALL_PAGES` - set this to true to force OCR on all pages. This can be very useful if the table layouts aren't recognized properly by default, or if there is garbled text. +- `TORCH_DEVICE` - set this to force marker to use a given torch device for inference. +- `OCR_ENGINE` - set this to `surya` or `ocrmypdf` to choose the OCR engine. +- `DEBUG` - set this to `True` to show ray logs when converting multiple PDFs. + +In general, if the output is not what you expect, forcing OCR on the PDF is a good first step. + # Benchmarks Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I convert the latex to text, and compare the reference to the output of text extraction methods. It's noisy, but at least directionally correct. @@ -163,7 +174,7 @@ First 3 are non-arXiv books, last 3 are arXiv papers. 
| marker | 0.536176 | 0.516833 | 0.70515 | 0.710657 | 0.690042 | 0.523467 | | nougat | 0.44009 | 0.588973 | 0.322706 | 0.401342 | 0.160842 | 0.525663 | -Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `5.1GB` for marker. Benchmarks were run on an A6000 Ada. +Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `4.1GB` for marker. Benchmarks were run on an A6000 Ada. **Throughput** diff --git a/convert.py b/convert.py index 9ddb6226..8625679d 100755 --- a/convert.py +++ b/convert.py @@ -20,7 +20,8 @@ @ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0) -def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None): +def process_single_pdf(filepath: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None): + fname = os.path.basename(filepath) if markdown_exists(out_folder, fname): return try: @@ -28,21 +29,21 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option # This can indicate that they were scanned, and not OCRed properly # Usually these files are not recent/high-quality if min_length: - filetype = find_filetype(fname) + filetype = find_filetype(filepath) if filetype == "other": return 0 - length = get_length_of_text(fname) + length = get_length_of_text(filepath) if length < min_length: return - full_text, images, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata) + full_text, images, out_metadata = convert_single_pdf(filepath, model_refs, metadata=metadata) if len(full_text.strip()) > 0: save_markdown(out_folder, fname, full_text, images, out_metadata) else: - print(f"Empty file: {fname}. Could not convert.") + print(f"Empty file: {filepath}. 
Could not convert.") except Exception as e: - print(f"Error converting {fname}: {e}") + print(f"Error converting {filepath}: {e}") print(traceback.format_exc()) @@ -62,6 +63,7 @@ def main(): in_folder = os.path.abspath(args.in_folder) out_folder = os.path.abspath(args.out_folder) files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)] + files = [f for f in files if os.path.isfile(f)] os.makedirs(out_folder, exist_ok=True) # Handle chunks if we're processing in parallel @@ -100,12 +102,12 @@ def main(): print(f"Converting {len(files_to_convert)} pdfs in chunk {args.chunk_idx + 1}/{args.num_chunks} with {total_processes} processes, and storing in {out_folder}") futures = [ process_single_pdf.options(num_gpus=gpu_frac).remote( - filename, + filepath, out_folder, model_refs, - metadata=metadata.get(os.path.basename(filename)), + metadata=metadata.get(os.path.basename(filepath)), min_length=args.min_length - ) for filename in files_to_convert + ) for filepath in files_to_convert ] # Run all ray conversion tasks diff --git a/marker/convert.py b/marker/convert.py index 2253e983..4c6c06a2 100644 --- a/marker/convert.py +++ b/marker/convert.py @@ -1,12 +1,10 @@ import warnings - -from marker.utils import flush_cuda_memory - warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings import pypdfium2 as pdfium from PIL import Image +from marker.utils import flush_cuda_memory from marker.tables.table import format_tables from marker.debug.data import dump_bbox_debug_data from marker.layout.layout import surya_layout, annotate_block_types diff --git a/marker/layout/order.py b/marker/layout/order.py index 5e455e85..3f8cdc7c 100644 --- a/marker/layout/order.py +++ b/marker/layout/order.py @@ -26,7 +26,7 @@ def surya_order(doc, pages: List[Page], order_model, batch_multiplier=1): # Get bboxes for all pages bboxes = [] for page in pages: - bbox = [b.bbox for b in page.layout.bboxes] + bbox = [b.bbox for b in 
page.layout.bboxes][:settings.ORDER_MAX_BBOXES] bboxes.append(bbox) processor = order_model.processor diff --git a/marker/models.py b/marker/models.py index a77fb7e1..26bc0b19 100644 --- a/marker/models.py +++ b/marker/models.py @@ -50,7 +50,9 @@ def load_all_models(langs=None): layout = setup_layout_model() order = setup_order_model() edit = load_editing_model() - ocr = setup_recognition_model(langs) if settings.OCR_ENGINE == "surya" else None + + # Only load recognition model if we'll need it for all pdfs + ocr = setup_recognition_model(langs) if (settings.OCR_ENGINE == "surya" and settings.OCR_ALL_PAGES) else None texify = setup_texify_model() model_lst = [texify, layout, order, edit, detection, ocr] - return model_lst + return model_lst \ No newline at end of file diff --git a/marker/ocr/lang.py b/marker/ocr/lang.py index 82d6cc0e..3e49004c 100644 --- a/marker/ocr/lang.py +++ b/marker/ocr/lang.py @@ -1,14 +1,23 @@ +from typing import List + from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE +from surya.model.recognition.tokenizer import _tokenize as lang_tokenize from marker.ocr.tesseract import LANGUAGE_TO_TESSERACT_CODE, TESSERACT_CODE_TO_LANGUAGE from marker.settings import settings +def langs_to_ids(langs: List[str]): + unique_langs = list(set(langs)) + _, lang_tokens = lang_tokenize("", unique_langs) + return lang_tokens + + def replace_langs_with_codes(langs): if settings.OCR_ENGINE == "surya": for i, lang in enumerate(langs): - if lang in LANGUAGE_TO_CODE: - langs[i] = LANGUAGE_TO_CODE[lang] + if lang.title() in LANGUAGE_TO_CODE: + langs[i] = LANGUAGE_TO_CODE[lang.title()] else: for i, lang in enumerate(langs): if lang in LANGUAGE_TO_CODE: diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py index 1d743bb6..d4eb4b04 100644 --- a/marker/ocr/recognition.py +++ b/marker/ocr/recognition.py @@ -7,7 +7,9 @@ from surya.ocr import run_recognition +from marker.models import setup_recognition_model from marker.ocr.heuristics import 
should_ocr_page, no_text_found, detect_bad_ocr +from marker.ocr.lang import langs_to_ids from marker.pdf.images import render_image from marker.schema.page import Page from marker.schema.block import Block, Line, Span @@ -19,7 +21,7 @@ def get_batch_size(): if settings.RECOGNITION_BATCH_SIZE is not None: return settings.RECOGNITION_BATCH_SIZE elif settings.TORCH_DEVICE_MODEL == "cuda": - return 64 + return 32 elif settings.TORCH_DEVICE_MODEL == "mps": return 32 return 32 @@ -37,11 +39,25 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplie ocr_idxs.append(pnum) ocr_pages += 1 + # No pages need OCR + if ocr_pages == 0: + return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"} + ocr_method = settings.OCR_ENGINE if ocr_method is None: return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"} elif ocr_method == "surya": + # Load model just in time if we're not OCRing everything + del_rec_model = False + if rec_model is None: + lang_tokens = langs_to_ids(langs) + rec_model = setup_recognition_model(lang_tokens) + del_rec_model = True + new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier) + + if del_rec_model: + del rec_model elif ocr_method == "ocrmypdf": new_pages = tesseract_recognition(doc, ocr_idxs, langs) else: diff --git a/marker/settings.py b/marker/settings.py index 2b0e0dae..f28fb187 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -70,6 +70,7 @@ def TORCH_DEVICE_MODEL(self) -> str: # Ordering model SURYA_ORDER_DPI: int = 96 ORDER_BATCH_SIZE: Optional[int] = None # Defaults to 12 for cuda, 6 otherwise + ORDER_MAX_BBOXES: int = 255 # Final editing model EDITOR_BATCH_SIZE: Optional[int] = None # Defaults to 6 for cuda, 12 otherwise diff --git a/pyproject.toml b/pyproject.toml index 964d3129..30f7b52d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = 
"0.2.0" +version = "0.2.2" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md"