Skip to content

Commit

Permalink
Address a bunch of Github issues
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed May 8, 2024
1 parent 7f18bb9 commit 287f546
Show file tree
Hide file tree
Showing 7 changed files with 27 additions and 12 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ First, some configuration. Note that settings can be overridden with env vars,
- Your torch device will be detected automatically, but you can also set it manually. For example, `TORCH_DEVICE=cuda` or `TORCH_DEVICE=mps`. `cpu` is the default.
- If using GPU, set `INFERENCE_RAM` to your GPU VRAM (per GPU). For example, if you have 16 GB of VRAM, set `INFERENCE_RAM=16`.
- Depending on your document types, marker's average memory usage per task can vary slightly. You can configure `VRAM_PER_TASK` to adjust this if you notice tasks failing with GPU out of memory errors.
- By default, marker will use `surya` for OCR. Surya is slower on CPU, but more accurate than tesseract. If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above).
- By default, marker will use `surya` for OCR. Surya is slower on CPU, but more accurate than tesseract. If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above). If you don't want OCR at all, set `OCR_ENGINE` to `None`.
- Inspect the other settings in `marker/settings.py`. You can override any settings in the `local.env` file, or by setting environment variables.


Expand Down
5 changes: 4 additions & 1 deletion marker/cleaners/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,10 @@ def indent_blocks(pages: List[Page]):
blank_line = False
for line in lines:
text = line[1]
prefix = " " * int((line[0][0] - min_left) / col_width)
if col_width == 0:
prefix = ""
else:
prefix = " " * int((line[0][0] - min_left) / col_width)
current_line_blank = len(text.strip()) == 0
if blank_line and current_line_blank:
# Don't put multiple blank lines in a row
Expand Down
3 changes: 1 addition & 2 deletions marker/equations/equations.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnu
idx = 0
success_count = 0
fail_count = 0
total_inserted = 0
for block_number, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
latex_text = predictions[block_number]
conditions = [
Expand All @@ -91,7 +90,7 @@ def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnu
lines=[Line(
spans=[
Span(
text=block_text.replace("\n", " "),
text="\n\n" + block_text.replace("\n", " ") + "\n\n",
bbox=equation_bbox,
span_id=f"{pnum}_{idx}_fixeq",
font="Latex",
Expand Down
8 changes: 6 additions & 2 deletions marker/ocr/recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,14 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, parallel_factor
ocr_pages += 1

ocr_method = settings.OCR_ENGINE
if ocr_method == "surya":
if ocr_method is None:
return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
elif ocr_method == "surya":
new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages)
else:
elif ocr_method == "ocrmypdf":
new_pages = tesseract_recognition(doc, ocr_idxs, langs)
else:
raise ValueError(f"Unknown OCR method {ocr_method}")

for orig_idx, page in zip(ocr_idxs, new_pages):
if detect_bad_ocr(page.prelim_text) or len(page.prelim_text) == 0:
Expand Down
13 changes: 11 additions & 2 deletions marker/postprocessors/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
from typing import List


def escape_markdown(text: str, characters: str = "#") -> str:
    """Backslash-escape Markdown-significant characters in ``text``.

    Every occurrence of a character listed in ``characters`` is prefixed
    with a single backslash so a Markdown renderer treats it as literal
    text instead of markup (for example, a ``#`` being read as a heading
    marker).

    Args:
        text: The string to escape.
        characters: The characters to escape, given as a plain string where
            each character is taken literally. Defaults to ``"#"``, the only
            character escaped historically, so existing one-argument calls
            keep producing the same output.

    Returns:
        A copy of ``text`` with each listed character preceded by a
        backslash. Text containing none of the characters is returned
        unchanged.
    """
    # A translation table does the replacement in a single pass and treats
    # every character literally, so no regex escaping rules apply to the
    # contents of ``characters``.
    escape_table = {ord(ch): "\\" + ch for ch in characters}
    return text.translate(escape_table)

def surround_text(s, char_to_insert):
leading_whitespace = re.match(r'^(\s*)', s).group(1)
trailing_whitespace = re.search(r'(\s*)$', s).group(1)
Expand Down Expand Up @@ -73,9 +80,11 @@ def block_surround(text, block_type):
elif block_type == "Table":
text = "\n" + text + "\n"
elif block_type == "List-item":
pass
text = escape_markdown(text)
elif block_type == "Code":
text = "\n" + text + "\n"
text = "\n" + escape_markdown(text) + "\n"
elif block_type == "Text":
text = escape_markdown(text)
return text


Expand Down
6 changes: 3 additions & 3 deletions marker/settings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional, List, Dict
from typing import Optional, List, Dict, Literal

from dotenv import find_dotenv
from pydantic import computed_field
Expand Down Expand Up @@ -41,7 +41,7 @@ def TORCH_DEVICE_MODEL(self) -> str:

# OCR
INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
OCR_ENGINE: Optional[str] = "surya" # Which OCR engine to use, either "surya" or "ocrmypdf". Defaults to "ocrmypdf" on CPU, "surya" on GPU.
OCR_ENGINE: Optional[Literal["surya", "ocrmypdf"]] = "surya" # Which OCR engine to use, either "surya" or "ocrmypdf". Defaults to "ocrmypdf" on CPU, "surya" on GPU.
OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted

## Surya
Expand Down Expand Up @@ -89,7 +89,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
@computed_field
@property
def CUDA(self) -> bool:
return "cuda" in self.TORCH_DEVICE
return "cuda" in self.TORCH_DEVICE_MODEL

@computed_field
@property
Expand Down
2 changes: 1 addition & 1 deletion marker/tables/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def format_tables(pages: List[Page]):
if len(table_rows) == 0:
continue

table_text = tabulate(table_rows, headers="firstrow", tablefmt="github")
table_text = tabulate(table_rows, headers="firstrow", tablefmt="github", disable_numparse=True)
table_block = Block(
bbox=table_box,
block_type="Table",
Expand Down

0 comments on commit 287f546

Please sign in to comment.