Skip to content

Commit

Permalink
Fix deployment bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed May 9, 2024
1 parent 4966f7a commit 2d7cb00
Show file tree
Hide file tree
Showing 9 changed files with 59 additions and 20 deletions.
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,17 @@ MIN_LENGTH=10000 METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 mar

Note that the env variables above are specific to this script, and cannot be set in `local.env`.

# Important settings/Troubleshooting

There are some settings that you may find especially useful if things aren't working the way you expect:

- `OCR_ALL_PAGES` - set this to `True` to force OCR on all pages. This can be very useful if table layouts aren't recognized properly by default, or if the extracted text is garbled.
- `TORCH_DEVICE` - set this to force marker to use a given torch device for inference.
- `OCR_ENGINE` - set this to `surya` or `ocrmypdf` to choose which OCR engine is used.
- `DEBUG` - setting this to `True` shows Ray logs when converting multiple PDFs.

In general, if the output is not what you expect, forcing OCR of the PDF is a good first step.

# Benchmarks

Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have both a PDF version and a LaTeX source. I convert the LaTeX to text, and compare that reference text to the output of text extraction methods. It's noisy, but at least directionally correct.
Expand All @@ -163,7 +174,7 @@ First 3 are non-arXiv books, last 3 are arXiv papers.
| marker | 0.536176 | 0.516833 | 0.70515 | 0.710657 | 0.690042 | 0.523467 |
| nougat | 0.44009 | 0.588973 | 0.322706 | 0.401342 | 0.160842 | 0.525663 |

Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `5.1GB` for marker. Benchmarks were run on an A6000 Ada.
Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `4.1GB` for marker. Benchmarks were run on an A6000 Ada.

**Throughput**

Expand Down
20 changes: 11 additions & 9 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,29 +20,30 @@


@ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
def process_single_pdf(filepath: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
fname = os.path.basename(filepath)
if markdown_exists(out_folder, fname):
return
try:
# Skip trying to convert files that don't have a lot of embedded text
# This can indicate that they were scanned, and not OCRed properly
# Usually these files are not recent/high-quality
if min_length:
filetype = find_filetype(fname)
filetype = find_filetype(filepath)
if filetype == "other":
return 0

length = get_length_of_text(fname)
length = get_length_of_text(filepath)
if length < min_length:
return

full_text, images, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata)
full_text, images, out_metadata = convert_single_pdf(filepath, model_refs, metadata=metadata)
if len(full_text.strip()) > 0:
save_markdown(out_folder, fname, full_text, images, out_metadata)
else:
print(f"Empty file: {fname}. Could not convert.")
print(f"Empty file: {filepath}. Could not convert.")
except Exception as e:
print(f"Error converting {fname}: {e}")
print(f"Error converting {filepath}: {e}")
print(traceback.format_exc())


Expand All @@ -62,6 +63,7 @@ def main():
in_folder = os.path.abspath(args.in_folder)
out_folder = os.path.abspath(args.out_folder)
files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
files = [f for f in files if os.path.isfile(f)]
os.makedirs(out_folder, exist_ok=True)

# Handle chunks if we're processing in parallel
Expand Down Expand Up @@ -100,12 +102,12 @@ def main():
print(f"Converting {len(files_to_convert)} pdfs in chunk {args.chunk_idx + 1}/{args.num_chunks} with {total_processes} processes, and storing in {out_folder}")
futures = [
process_single_pdf.options(num_gpus=gpu_frac).remote(
filename,
filepath,
out_folder,
model_refs,
metadata=metadata.get(os.path.basename(filename)),
metadata=metadata.get(os.path.basename(filepath)),
min_length=args.min_length
) for filename in files_to_convert
) for filepath in files_to_convert
]

# Run all ray conversion tasks
Expand Down
4 changes: 1 addition & 3 deletions marker/convert.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import warnings

from marker.utils import flush_cuda_memory

warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings

import pypdfium2 as pdfium
from PIL import Image

from marker.utils import flush_cuda_memory
from marker.tables.table import format_tables
from marker.debug.data import dump_bbox_debug_data
from marker.layout.layout import surya_layout, annotate_block_types
Expand Down
2 changes: 1 addition & 1 deletion marker/layout/order.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def surya_order(doc, pages: List[Page], order_model, batch_multiplier=1):
# Get bboxes for all pages
bboxes = []
for page in pages:
bbox = [b.bbox for b in page.layout.bboxes]
bbox = [b.bbox for b in page.layout.bboxes][:settings.ORDER_MAX_BBOXES]
bboxes.append(bbox)

processor = order_model.processor
Expand Down
6 changes: 4 additions & 2 deletions marker/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@ def load_all_models(langs=None):
layout = setup_layout_model()
order = setup_order_model()
edit = load_editing_model()
ocr = setup_recognition_model(langs) if settings.OCR_ENGINE == "surya" else None

# Only load recognition model if we'll need it for all pdfs
ocr = setup_recognition_model(langs) if (settings.OCR_ENGINE == "surya" and settings.OCR_ALL_PAGES) else None
texify = setup_texify_model()
model_lst = [texify, layout, order, edit, detection, ocr]
return model_lst
return model_lst
13 changes: 11 additions & 2 deletions marker/ocr/lang.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
from typing import List

from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE
from surya.model.recognition.tokenizer import _tokenize as lang_tokenize

from marker.ocr.tesseract import LANGUAGE_TO_TESSERACT_CODE, TESSERACT_CODE_TO_LANGUAGE
from marker.settings import settings


def langs_to_ids(langs: List[str]):
    """Return the language tokens for the distinct entries of *langs*.

    The languages are de-duplicated and handed to the surya recognition
    tokenizer together with an empty text string; only the language-token
    part of the tokenizer's result is returned.
    """
    # Going through a set drops duplicates; the resulting order is arbitrary.
    distinct_langs = list(set(langs))
    _unused, language_ids = lang_tokenize("", distinct_langs)
    return language_ids


def replace_langs_with_codes(langs):
if settings.OCR_ENGINE == "surya":
for i, lang in enumerate(langs):
if lang in LANGUAGE_TO_CODE:
langs[i] = LANGUAGE_TO_CODE[lang]
if lang.title() in LANGUAGE_TO_CODE:
langs[i] = LANGUAGE_TO_CODE[lang.title()]
else:
for i, lang in enumerate(langs):
if lang in LANGUAGE_TO_CODE:
Expand Down
18 changes: 17 additions & 1 deletion marker/ocr/recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@

from surya.ocr import run_recognition

from marker.models import setup_recognition_model
from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
from marker.ocr.lang import langs_to_ids
from marker.pdf.images import render_image
from marker.schema.page import Page
from marker.schema.block import Block, Line, Span
Expand All @@ -19,7 +21,7 @@ def get_batch_size():
if settings.RECOGNITION_BATCH_SIZE is not None:
return settings.RECOGNITION_BATCH_SIZE
elif settings.TORCH_DEVICE_MODEL == "cuda":
return 64
return 32
elif settings.TORCH_DEVICE_MODEL == "mps":
return 32
return 32
Expand All @@ -37,11 +39,25 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplie
ocr_idxs.append(pnum)
ocr_pages += 1

# No pages need OCR
if ocr_pages == 0:
return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}

ocr_method = settings.OCR_ENGINE
if ocr_method is None:
return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
elif ocr_method == "surya":
# Load model just in time if we're not OCRing everything
del_rec_model = False
if rec_model is None:
lang_tokens = langs_to_ids(langs)
rec_model = setup_recognition_model(lang_tokens)
del_rec_model = True

new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)

if del_rec_model:
del rec_model
elif ocr_method == "ocrmypdf":
new_pages = tesseract_recognition(doc, ocr_idxs, langs)
else:
Expand Down
1 change: 1 addition & 0 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
# Ordering model
SURYA_ORDER_DPI: int = 96
ORDER_BATCH_SIZE: Optional[int] = None # Defaults to 12 for cuda, 6 otherwise
ORDER_MAX_BBOXES: int = 255

# Final editing model
EDITOR_BATCH_SIZE: Optional[int] = None # Defaults to 6 for cuda, 12 otherwise
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "0.2.0"
version = "0.2.2"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <github@vikas.sh>"]
readme = "README.md"
Expand Down

0 comments on commit 2d7cb00

Please sign in to comment.