From 2d7cb00e01a3a6beb9100e8ad680d5f82591684d Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Thu, 9 May 2024 12:06:23 -0700
Subject: [PATCH] Fix deployment bugs

---
 README.md                 | 13 ++++++++++++-
 convert.py                | 20 +++++++++++---------
 marker/convert.py         |  4 +---
 marker/layout/order.py    |  2 +-
 marker/models.py          |  6 ++++--
 marker/ocr/lang.py        | 13 +++++++++++--
 marker/ocr/recognition.py | 18 +++++++++++++++++-
 marker/settings.py        |  1 +
 pyproject.toml            |  2 +-
 9 files changed, 59 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index e3844c53..147935c3 100644
--- a/README.md
+++ b/README.md
@@ -141,6 +141,17 @@ MIN_LENGTH=10000 METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 mar
 
 Note that the env variables above are specific to this script, and cannot be set in `local.env`.
 
+# Important settings/Troubleshooting
+
+There are some settings that you may find especially useful if things aren't working the way you expect:
+
+- `OCR_ALL_PAGES` - set this to `true` to force OCR on all pages.  This can be very useful if the table layouts aren't recognized properly by default, or if there is garbled text.
+- `TORCH_DEVICE` - set this to force marker to use a given torch device for inference.
+- `OCR_ENGINE` - set this to `surya` or `ocrmypdf` to choose which OCR engine is used.
+- `DEBUG` - set this to `True` to show ray logs when converting multiple PDFs.
+
+In general, if output is not what you expect, trying to OCR the PDF is a good first step.
+
 # Benchmarks
 
 Benchmarking PDF extraction quality is hard.  I've created a test set by finding books and scientific papers that have a pdf version and a latex source.  I convert the latex to text, and compare the reference to the output of text extraction methods.  It's noisy, but at least directionally correct.
@@ -163,7 +174,7 @@ First 3 are non-arXiv books, last 3 are arXiv papers.
 | marker | 0.536176        | 0.516833         | 0.70515         | 0.710657    | 0.690042     | 0.523467  |
 | nougat | 0.44009         | 0.588973         | 0.322706        | 0.401342    | 0.160842     | 0.525663  |
 
-Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `5.1GB` for marker.  Benchmarks were run on an A6000 Ada.
+Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `4.1GB` for marker.  Benchmarks were run on an A6000 Ada.
 
 **Throughput**
 
diff --git a/convert.py b/convert.py
index 9ddb6226..8625679d 100755
--- a/convert.py
+++ b/convert.py
@@ -20,7 +20,8 @@
 
 
 @ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
-def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
+def process_single_pdf(filepath: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
+    fname = os.path.basename(filepath)
     if markdown_exists(out_folder, fname):
         return
     try:
@@ -28,21 +29,21 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option
         # This can indicate that they were scanned, and not OCRed properly
         # Usually these files are not recent/high-quality
         if min_length:
-            filetype = find_filetype(fname)
+            filetype = find_filetype(filepath)
             if filetype == "other":
                 return 0
 
-            length = get_length_of_text(fname)
+            length = get_length_of_text(filepath)
             if length < min_length:
                 return
 
-        full_text, images, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata)
+        full_text, images, out_metadata = convert_single_pdf(filepath, model_refs, metadata=metadata)
         if len(full_text.strip()) > 0:
             save_markdown(out_folder, fname, full_text, images, out_metadata)
         else:
-            print(f"Empty file: {fname}.  Could not convert.")
+            print(f"Empty file: {filepath}.  Could not convert.")
     except Exception as e:
-        print(f"Error converting {fname}: {e}")
+        print(f"Error converting {filepath}: {e}")
         print(traceback.format_exc())
 
 
@@ -62,6 +63,7 @@ def main():
     in_folder = os.path.abspath(args.in_folder)
     out_folder = os.path.abspath(args.out_folder)
     files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
+    files = [f for f in files if os.path.isfile(f)]
     os.makedirs(out_folder, exist_ok=True)
 
     # Handle chunks if we're processing in parallel
@@ -100,12 +102,12 @@ def main():
     print(f"Converting {len(files_to_convert)} pdfs in chunk {args.chunk_idx + 1}/{args.num_chunks} with {total_processes} processes, and storing in {out_folder}")
     futures = [
         process_single_pdf.options(num_gpus=gpu_frac).remote(
-            filename,
+            filepath,
             out_folder,
             model_refs,
-            metadata=metadata.get(os.path.basename(filename)),
+            metadata=metadata.get(os.path.basename(filepath)),
             min_length=args.min_length
-        ) for filename in files_to_convert
+        ) for filepath in files_to_convert
     ]
 
     # Run all ray conversion tasks
diff --git a/marker/convert.py b/marker/convert.py
index 2253e983..4c6c06a2 100644
--- a/marker/convert.py
+++ b/marker/convert.py
@@ -1,12 +1,10 @@
 import warnings
-
-from marker.utils import flush_cuda_memory
-
 warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
 
 import pypdfium2 as pdfium
 from PIL import Image
 
+from marker.utils import flush_cuda_memory
 from marker.tables.table import format_tables
 from marker.debug.data import dump_bbox_debug_data
 from marker.layout.layout import surya_layout, annotate_block_types
diff --git a/marker/layout/order.py b/marker/layout/order.py
index 5e455e85..3f8cdc7c 100644
--- a/marker/layout/order.py
+++ b/marker/layout/order.py
@@ -26,7 +26,7 @@ def surya_order(doc, pages: List[Page], order_model, batch_multiplier=1):
     # Get bboxes for all pages
     bboxes = []
     for page in pages:
-        bbox = [b.bbox for b in page.layout.bboxes]
+        bbox = [b.bbox for b in page.layout.bboxes][:settings.ORDER_MAX_BBOXES]
         bboxes.append(bbox)
 
     processor = order_model.processor
diff --git a/marker/models.py b/marker/models.py
index a77fb7e1..26bc0b19 100644
--- a/marker/models.py
+++ b/marker/models.py
@@ -50,7 +50,9 @@ def load_all_models(langs=None):
     layout = setup_layout_model()
     order = setup_order_model()
     edit = load_editing_model()
-    ocr = setup_recognition_model(langs) if settings.OCR_ENGINE == "surya" else None
+
+    # Only load recognition model if we'll need it for all pdfs
+    ocr = setup_recognition_model(langs) if (settings.OCR_ENGINE == "surya" and settings.OCR_ALL_PAGES) else None
     texify = setup_texify_model()
     model_lst = [texify, layout, order, edit, detection, ocr]
-    return model_lst
+    return model_lst
\ No newline at end of file
diff --git a/marker/ocr/lang.py b/marker/ocr/lang.py
index 82d6cc0e..3e49004c 100644
--- a/marker/ocr/lang.py
+++ b/marker/ocr/lang.py
@@ -1,14 +1,23 @@
+from typing import List
+
 from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE
+from surya.model.recognition.tokenizer import _tokenize as lang_tokenize
 
 from marker.ocr.tesseract import LANGUAGE_TO_TESSERACT_CODE, TESSERACT_CODE_TO_LANGUAGE
 from marker.settings import settings
 
 
+def langs_to_ids(langs: List[str]):  # Returns the language tokens produced by surya's recognition tokenizer for `langs`
+    unique_langs = list(set(langs))  # Drop duplicate languages; set() gives no ordering guarantee
+    _, lang_tokens = lang_tokenize("", unique_langs)  # Tokenize an empty string; only the second return value is kept
+    return lang_tokens
+
+
 def replace_langs_with_codes(langs):
     if settings.OCR_ENGINE == "surya":
         for i, lang in enumerate(langs):
-            if lang in LANGUAGE_TO_CODE:
-                langs[i] = LANGUAGE_TO_CODE[lang]
+            if lang.title() in LANGUAGE_TO_CODE:
+                langs[i] = LANGUAGE_TO_CODE[lang.title()]
     else:
         for i, lang in enumerate(langs):
             if lang in LANGUAGE_TO_CODE:
diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
index 1d743bb6..d4eb4b04 100644
--- a/marker/ocr/recognition.py
+++ b/marker/ocr/recognition.py
@@ -7,7 +7,9 @@
 
 from surya.ocr import run_recognition
 
+from marker.models import setup_recognition_model
 from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
+from marker.ocr.lang import langs_to_ids
 from marker.pdf.images import render_image
 from marker.schema.page import Page
 from marker.schema.block import Block, Line, Span
@@ -19,7 +21,7 @@ def get_batch_size():
     if settings.RECOGNITION_BATCH_SIZE is not None:
         return settings.RECOGNITION_BATCH_SIZE
     elif settings.TORCH_DEVICE_MODEL == "cuda":
-        return 64
+        return 32
     elif settings.TORCH_DEVICE_MODEL == "mps":
         return 32
     return 32
@@ -37,11 +39,25 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplie
             ocr_idxs.append(pnum)
             ocr_pages += 1
 
+    # No pages need OCR
+    if ocr_pages == 0:
+        return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
+
     ocr_method = settings.OCR_ENGINE
     if ocr_method is None:
         return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
     elif ocr_method == "surya":
+        # Load model just in time if we're not OCRing everything
+        del_rec_model = False
+        if rec_model is None:
+            lang_tokens = langs_to_ids(langs)
+            rec_model = setup_recognition_model(lang_tokens)
+            del_rec_model = True
+
         new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)
+
+        if del_rec_model:
+            del rec_model
     elif ocr_method == "ocrmypdf":
         new_pages = tesseract_recognition(doc, ocr_idxs, langs)
     else:
diff --git a/marker/settings.py b/marker/settings.py
index 2b0e0dae..f28fb187 100644
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -70,6 +70,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
     # Ordering model
     SURYA_ORDER_DPI: int = 96
     ORDER_BATCH_SIZE: Optional[int] = None  # Defaults to 12 for cuda, 6 otherwise
+    ORDER_MAX_BBOXES: int = 255
 
     # Final editing model
     EDITOR_BATCH_SIZE: Optional[int] = None # Defaults to 6 for cuda, 12 otherwise
diff --git a/pyproject.toml b/pyproject.toml
index 964d3129..30f7b52d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.2.0"
+version = "0.2.2"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"