Address a bunch of GitHub issues

VikParuchuri · May 8, 2024 · 287f546 · 287f546
1 parent 7f18bb9
commit 287f546
Show file tree

Hide file tree

Showing 7 changed files with 27 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -103,7 +103,7 @@ First, some configuration.  Note that settings can be overridden with env vars,
 - Your torch device will be automatically detected, but you can manually set it also.  For example, `TORCH_DEVICE=cuda` or `TORCH_DEVICE=mps`. `cpu` is the default.
   - If using GPU, set `INFERENCE_RAM` to your GPU VRAM (per GPU).  For example, if you have 16 GB of VRAM, set `INFERENCE_RAM=16`.
   - Depending on your document types, marker's average memory usage per task can vary slightly.  You can configure `VRAM_PER_TASK` to adjust this if you notice tasks failing with GPU out of memory errors.
-- By default, marker will use `surya` for OCR.  Surya is slower on CPU, but more accurate than tesseract.  If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above).
+- By default, marker will use `surya` for OCR.  Surya is slower on CPU, but more accurate than tesseract.  If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above).  If you don't want OCR at all, set `OCR_ENGINE` to `None`.
 - Inspect the other settings in `marker/settings.py`.  You can override any settings in the `local.env` file, or by setting environment variables.
 
 

diff --git a/marker/cleaners/code.py b/marker/cleaners/code.py
@@ -107,7 +107,10 @@ def indent_blocks(pages: List[Page]):
             blank_line = False
             for line in lines:
                 text = line[1]
-                prefix = " " * int((line[0][0] - min_left) / col_width)
+                if col_width == 0:
+                    prefix = ""
+                else:
+                    prefix = " " * int((line[0][0] - min_left) / col_width)
                 current_line_blank = len(text.strip()) == 0
                 if blank_line and current_line_blank:
                     # Don't put multiple blank lines in a row

diff --git a/marker/equations/equations.py b/marker/equations/equations.py
@@ -78,7 +78,6 @@ def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnu
     idx = 0
     success_count = 0
     fail_count = 0
-    total_inserted = 0
     for block_number, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
         latex_text = predictions[block_number]
         conditions = [
@@ -91,7 +90,7 @@ def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnu
             lines=[Line(
                 spans=[
                     Span(
-                        text=block_text.replace("\n", " "),
+                        text="\n\n" + block_text.replace("\n", " ") + "\n\n",
                         bbox=equation_bbox,
                         span_id=f"{pnum}_{idx}_fixeq",
                         font="Latex",

diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
@@ -28,10 +28,14 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, parallel_factor
             ocr_pages += 1
 
     ocr_method = settings.OCR_ENGINE
-    if ocr_method == "surya":
+    if ocr_method is None:
+        return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
+    elif ocr_method == "surya":
         new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages)
-    else:
+    elif ocr_method == "ocrmypdf":
         new_pages = tesseract_recognition(doc, ocr_idxs, langs)
+    else:
+        raise ValueError(f"Unknown OCR method {ocr_method}")
 
     for orig_idx, page in zip(ocr_idxs, new_pages):
         if detect_bad_ocr(page.prelim_text) or len(page.prelim_text) == 0:

diff --git a/marker/postprocessors/markdown.py b/marker/postprocessors/markdown.py
@@ -5,6 +5,13 @@
 from typing import List
 
 
+def escape_markdown(text):
+    # List of characters that need to be escaped in markdown
+    characters_to_escape = r"[#]"
+    # Escape each of these characters with a backslash
+    escaped_text = re.sub(characters_to_escape, r'\\\g<0>', text)
+    return escaped_text
+
 def surround_text(s, char_to_insert):
     leading_whitespace = re.match(r'^(\s*)', s).group(1)
     trailing_whitespace = re.search(r'(\s*)$', s).group(1)
@@ -73,9 +80,11 @@ def block_surround(text, block_type):
     elif block_type == "Table":
         text = "\n" + text + "\n"
     elif block_type == "List-item":
-        pass
+        text = escape_markdown(text)
     elif block_type == "Code":
-        text = "\n" + text + "\n"
+        text = "\n" + escape_markdown(text) + "\n"
+    elif block_type == "Text":
+        text = escape_markdown(text)
     return text
 
 

diff --git a/marker/settings.py b/marker/settings.py
@@ -1,4 +1,4 @@
-from typing import Optional, List, Dict
+from typing import Optional, List, Dict, Literal
 
 from dotenv import find_dotenv
 from pydantic import computed_field
@@ -41,7 +41,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
 
     # OCR
     INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
-    OCR_ENGINE: Optional[str] = "surya" # Which OCR engine to use, either "surya" or "ocrmypdf".  Defaults to "ocrmypdf" on CPU, "surya" on GPU.
+    OCR_ENGINE: Optional[Literal["surya", "ocrmypdf"]] = "surya" # Which OCR engine to use, either "surya" or "ocrmypdf".  Set to None to disable OCR.  Defaults to "surya".
     OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted
 
     ## Surya
@@ -89,7 +89,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
     @computed_field
     @property
     def CUDA(self) -> bool:
-        return "cuda" in self.TORCH_DEVICE
+        return "cuda" in self.TORCH_DEVICE_MODEL
 
     @computed_field
     @property

diff --git a/marker/tables/table.py b/marker/tables/table.py
@@ -140,7 +140,7 @@ def format_tables(pages: List[Page]):
             if len(table_rows) == 0:
                 continue
 
-            table_text = tabulate(table_rows, headers="firstrow", tablefmt="github")
+            table_text = tabulate(table_rows, headers="firstrow", tablefmt="github", disable_numparse=True)
             table_block = Block(
                 bbox=table_box,
                 block_type="Table",