Fix deployment bugs

VikParuchuri · May 9, 2024 · 2d7cb00 · 2d7cb00
1 parent 4966f7a
commit 2d7cb00
Show file tree

Hide file tree

Showing 9 changed files with 59 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -141,6 +141,17 @@ MIN_LENGTH=10000 METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 mar
 
 Note that the env variables above are specific to this script, and cannot be set in `local.env`.
 
+# Important settings/Troubleshooting
+
+There are some settings that you may find especially useful if things aren't working the way you expect:
+
+- `OCR_ALL_PAGES` - set this to true to force OCR on all pages.  This can be very useful if the table layouts aren't recognized properly by default, or if there is garbled text.
+- `TORCH_DEVICE` - set this to force marker to use a given torch device for inference.
+- `OCR_ENGINE` - you can set this to `surya` or `ocrmypdf`.
+- `DEBUG` - setting this to `True` shows ray logs when converting multiple PDFs.
+
+In general, if output is not what you expect, trying to OCR the PDF is a good first step.
+
 # Benchmarks
 
 Benchmarking PDF extraction quality is hard.  I've created a test set by finding books and scientific papers that have a pdf version and a latex source.  I convert the latex to text, and compare the reference to the output of text extraction methods.  It's noisy, but at least directionally correct.
@@ -163,7 +174,7 @@ First 3 are non-arXiv books, last 3 are arXiv papers.
 | marker | 0.536176        | 0.516833         | 0.70515         | 0.710657    | 0.690042     | 0.523467  |
 | nougat | 0.44009         | 0.588973         | 0.322706        | 0.401342    | 0.160842     | 0.525663  |
 
-Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `5.1GB` for marker.  Benchmarks were run on an A6000 Ada.
+Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `4.1GB` for marker.  Benchmarks were run on an A6000 Ada.
 
 **Throughput**
 

diff --git a/convert.py b/convert.py
@@ -20,29 +20,30 @@
 
 
 @ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
-def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
+def process_single_pdf(filepath: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
+    fname = os.path.basename(filepath)
     if markdown_exists(out_folder, fname):
         return
     try:
         # Skip trying to convert files that don't have a lot of embedded text
         # This can indicate that they were scanned, and not OCRed properly
         # Usually these files are not recent/high-quality
         if min_length:
-            filetype = find_filetype(fname)
+            filetype = find_filetype(filepath)
             if filetype == "other":
                 return 0
 
-            length = get_length_of_text(fname)
+            length = get_length_of_text(filepath)
             if length < min_length:
                 return
 
-        full_text, images, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata)
+        full_text, images, out_metadata = convert_single_pdf(filepath, model_refs, metadata=metadata)
         if len(full_text.strip()) > 0:
             save_markdown(out_folder, fname, full_text, images, out_metadata)
         else:
-            print(f"Empty file: {fname}.  Could not convert.")
+            print(f"Empty file: {filepath}.  Could not convert.")
     except Exception as e:
-        print(f"Error converting {fname}: {e}")
+        print(f"Error converting {filepath}: {e}")
         print(traceback.format_exc())
 
 
@@ -62,6 +63,7 @@ def main():
     in_folder = os.path.abspath(args.in_folder)
     out_folder = os.path.abspath(args.out_folder)
     files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
+    files = [f for f in files if os.path.isfile(f)]
     os.makedirs(out_folder, exist_ok=True)
 
     # Handle chunks if we're processing in parallel
@@ -100,12 +102,12 @@ def main():
     print(f"Converting {len(files_to_convert)} pdfs in chunk {args.chunk_idx + 1}/{args.num_chunks} with {total_processes} processes, and storing in {out_folder}")
     futures = [
         process_single_pdf.options(num_gpus=gpu_frac).remote(
-            filename,
+            filepath,
             out_folder,
             model_refs,
-            metadata=metadata.get(os.path.basename(filename)),
+            metadata=metadata.get(os.path.basename(filepath)),
             min_length=args.min_length
-        ) for filename in files_to_convert
+        ) for filepath in files_to_convert
     ]
 
     # Run all ray conversion tasks

diff --git a/marker/convert.py b/marker/convert.py
@@ -1,12 +1,10 @@
 import warnings
-
-from marker.utils import flush_cuda_memory
-
 warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
 
 import pypdfium2 as pdfium
 from PIL import Image
 
+from marker.utils import flush_cuda_memory
 from marker.tables.table import format_tables
 from marker.debug.data import dump_bbox_debug_data
 from marker.layout.layout import surya_layout, annotate_block_types

diff --git a/marker/layout/order.py b/marker/layout/order.py
@@ -26,7 +26,7 @@ def surya_order(doc, pages: List[Page], order_model, batch_multiplier=1):
     # Get bboxes for all pages
     bboxes = []
     for page in pages:
-        bbox = [b.bbox for b in page.layout.bboxes]
+        bbox = [b.bbox for b in page.layout.bboxes][:settings.ORDER_MAX_BBOXES]
         bboxes.append(bbox)
 
     processor = order_model.processor

diff --git a/marker/models.py b/marker/models.py
@@ -50,7 +50,9 @@ def load_all_models(langs=None):
     layout = setup_layout_model()
     order = setup_order_model()
     edit = load_editing_model()
-    ocr = setup_recognition_model(langs) if settings.OCR_ENGINE == "surya" else None
+
+    # Only load recognition model if we'll need it for all pdfs
+    ocr = setup_recognition_model(langs) if (settings.OCR_ENGINE == "surya" and settings.OCR_ALL_PAGES) else None
     texify = setup_texify_model()
     model_lst = [texify, layout, order, edit, detection, ocr]
-    return model_lst
+    return model_lst
diff --git a/marker/ocr/lang.py b/marker/ocr/lang.py
@@ -1,14 +1,23 @@
+from typing import List
+
 from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE
+from surya.model.recognition.tokenizer import _tokenize as lang_tokenize
 
 from marker.ocr.tesseract import LANGUAGE_TO_TESSERACT_CODE, TESSERACT_CODE_TO_LANGUAGE
 from marker.settings import settings
 
 
+def langs_to_ids(langs: List[str]):
+    unique_langs = list(set(langs))
+    _, lang_tokens = lang_tokenize("", unique_langs)
+    return lang_tokens
+
+
 def replace_langs_with_codes(langs):
     if settings.OCR_ENGINE == "surya":
         for i, lang in enumerate(langs):
-            if lang in LANGUAGE_TO_CODE:
-                langs[i] = LANGUAGE_TO_CODE[lang]
+            if lang.title() in LANGUAGE_TO_CODE:
+                langs[i] = LANGUAGE_TO_CODE[lang.title()]
     else:
         for i, lang in enumerate(langs):
             if lang in LANGUAGE_TO_CODE:

diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
@@ -7,7 +7,9 @@
 
 from surya.ocr import run_recognition
 
+from marker.models import setup_recognition_model
 from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
+from marker.ocr.lang import langs_to_ids
 from marker.pdf.images import render_image
 from marker.schema.page import Page
 from marker.schema.block import Block, Line, Span
@@ -19,7 +21,7 @@ def get_batch_size():
     if settings.RECOGNITION_BATCH_SIZE is not None:
         return settings.RECOGNITION_BATCH_SIZE
     elif settings.TORCH_DEVICE_MODEL == "cuda":
-        return 64
+        return 32
     elif settings.TORCH_DEVICE_MODEL == "mps":
         return 32
     return 32
@@ -37,11 +39,25 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplie
             ocr_idxs.append(pnum)
             ocr_pages += 1
 
+    # No pages need OCR
+    if ocr_pages == 0:
+        return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
+
     ocr_method = settings.OCR_ENGINE
     if ocr_method is None:
         return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
     elif ocr_method == "surya":
+        # Load model just in time if we're not OCRing everything
+        del_rec_model = False
+        if rec_model is None:
+            lang_tokens = langs_to_ids(langs)
+            rec_model = setup_recognition_model(lang_tokens)
+            del_rec_model = True
+
         new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)
+
+        if del_rec_model:
+            del rec_model
     elif ocr_method == "ocrmypdf":
         new_pages = tesseract_recognition(doc, ocr_idxs, langs)
     else:

diff --git a/marker/settings.py b/marker/settings.py
@@ -70,6 +70,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
     # Ordering model
     SURYA_ORDER_DPI: int = 96
     ORDER_BATCH_SIZE: Optional[int] = None  # Defaults to 12 for cuda, 6 otherwise
+    ORDER_MAX_BBOXES: int = 255
 
     # Final editing model
     EDITOR_BATCH_SIZE: Optional[int] = None # Defaults to 6 for cuda, 12 otherwise

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.2.0"
+version = "0.2.2"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"