Set OCR engine to None

VikParuchuri · Aug 19, 2024 · 96379ed
1 parent d2aae0b
commit 96379ed
Show file tree

Hide file tree

Showing 5 changed files with 1,116 additions and 9 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -32,7 +32,7 @@ jobs:
       - name: Run table benchmark
         run: |
           poetry run python benchmarks/table.py tables.json
-          poetry run python scripts/verify_benchmark_scores.py report.json --type table
+          poetry run python scripts/verify_benchmark_scores.py tables.json --type table
         
           
 
diff --git a/README.md b/README.md
@@ -141,7 +141,7 @@ MIN_LENGTH=10000 METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 mar
 
 - `METADATA_FILE` is an optional path to a json file with metadata about the pdfs.  See above for the format.
 - `NUM_DEVICES` is the number of GPUs to use.  Should be `2` or greater.
-- `NUM_WORKERS` is the number of parallel processes to run on each GPU.  Per-GPU parallelism will not increase beyond `INFERENCE_RAM / VRAM_PER_TASK`.
+- `NUM_WORKERS` is the number of parallel processes to run on each GPU.
 - `MIN_LENGTH` is the minimum number of characters that need to be extracted from a pdf before it will be considered for processing.  If you're processing a lot of pdfs, I recommend setting this to avoid OCRing pdfs that are mostly images. (slows everything down)
 
 Note that the env variables above are specific to this script, and cannot be set in `local.env`.

diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
@@ -45,7 +45,7 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplie
         return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
 
     ocr_method = settings.OCR_ENGINE
-    if ocr_method is None:
+    if ocr_method is None or ocr_method == "None":
         return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
     elif ocr_method == "surya":
         new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)