From 4ad83425e06fe78eef182d66e6d6131d7204a73d Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 24 Jan 2025 06:43:02 -0500 Subject: [PATCH 1/9] Fix parsing logic for table cells --- marker/processors/llm/llm_table.py | 2 +- marker/renderers/markdown.py | 19 +++++++++++++++- pyproject.toml | 2 +- tests/renderers/test_markdown_renderer.py | 27 +++++++++++++++++++++++ 4 files changed, 47 insertions(+), 3 deletions(-) diff --git a/marker/processors/llm/llm_table.py b/marker/processors/llm/llm_table.py index cce504d9..f9484fd4 100644 --- a/marker/processors/llm/llm_table.py +++ b/marker/processors/llm/llm_table.py @@ -41,7 +41,7 @@ class LLMTableProcessor(BaseLLMProcessor): - Make sure to reproduce the original values as faithfully as possible. - If you see any math in a table cell, fence it with the tag. Block math should be fenced with . - Replace any images with a description, like "Image: [description]". -- Only use the tags th, td, tr, span, i, b, math, and table. Only use the attributes display, style, colspan, and rowspan if necessary. +- Only use the tags th, td, tr, br, span, i, b, math, and table. Only use the attributes display, style, colspan, and rowspan if necessary. You can use br to break up text lines in cells. **Instructions:** 1. Carefully examine the provided text block image. 
diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py index bf907c6a..0c42e3f0 100644 --- a/marker/renderers/markdown.py +++ b/marker/renderers/markdown.py @@ -3,6 +3,7 @@ from typing import Annotated, Tuple import regex +from bs4 import NavigableString from markdownify import MarkdownConverter from pydantic import BaseModel @@ -17,7 +18,23 @@ def cleanup_text(full_text): return full_text.strip() def get_text_with_br(element): - return ''.join(str(content) if content.name == 'br' else content.strip() for content in element.contents) + text = [] + for content in element.descendants: + if isinstance(content, NavigableString): + stripped = content.strip() + if stripped: + text.append(stripped) + elif content.name == 'br': + text.append('
') + full_text = "" + for i, t in enumerate(text): + if t == '
': + full_text += t + elif i > 0 and text[i - 1] != '
': + full_text += " " + t + else: + full_text += t + return full_text class Markdownify(MarkdownConverter): diff --git a/pyproject.toml b/pyproject.toml index ef1fd92a..30e0d45c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "1.3.0" +version = "1.3.1" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md" diff --git a/tests/renderers/test_markdown_renderer.py b/tests/renderers/test_markdown_renderer.py index 0752b100..d25d69eb 100644 --- a/tests/renderers/test_markdown_renderer.py +++ b/tests/renderers/test_markdown_renderer.py @@ -1,6 +1,8 @@ import pytest from marker.renderers.markdown import MarkdownRenderer +from marker.schema import BlockTypes +from marker.schema.blocks import TableCell @pytest.mark.config({"page_range": [0]}) @@ -35,3 +37,28 @@ def test_markdown_renderer_images(pdf_document): assert len(markdown_output.images) == 0 assert '![](' not in markdown_output.markdown + +@pytest.mark.config({"page_range": [5]}) +def test_markdown_renderer_tables(pdf_document): + table = pdf_document.contained_blocks((BlockTypes.Table,))[0] + page = pdf_document.pages[0] + + cell = TableCell( + polygon=table.polygon, + text_lines=["54.4567
89"], + rowspan=1, + colspan=1, + row_id=0, + col_id=0, + is_header=False, + page_id=page.page_id, + ) + page.add_full_block(cell) + table.structure = [] + table.add_structure(cell) + + renderer = MarkdownRenderer() + md = renderer(pdf_document).markdown + assert "54 .45 67
89" in md + + From 04498ca6703361f2494031f70d17ce269583fd2b Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 24 Jan 2025 10:45:01 -0500 Subject: [PATCH 2/9] Store equations as html --- marker/processors/equation.py | 66 +++++++++++++++++++- marker/processors/llm/llm_equation.py | 59 ++++++++++-------- marker/schema/blocks/equation.py | 67 +-------------------- tests/processors/test_equation_processor.py | 2 +- 4 files changed, 103 insertions(+), 91 deletions(-) diff --git a/marker/processors/equation.py b/marker/processors/equation.py index d2481944..868f98a2 100644 --- a/marker/processors/equation.py +++ b/marker/processors/equation.py @@ -65,7 +65,27 @@ def __call__(self, document: Document): continue block = document.get_block(equation_d["block_id"]) - block.latex = prediction + block.html = self.parse_latex_to_html(prediction) + + def parse_latex_to_html(self, latex: str): + html_out = "" + try: + latex = self.parse_latex(latex) + except ValueError as e: + # If we have mismatched delimiters, we'll treat it as a single block + # Strip the $'s from the latex + latex = [ + {"class": "block", "content": latex.replace("$", "")} + ] + + for el in latex: + if el["class"] == "block": + html_out += f'{el["content"]}' + elif el["class"] == "inline": + html_out += f'{el["content"]}' + else: + html_out += f" {el['content']} " + return html_out.strip() def get_batch_size(self): if self.texify_batch_size is not None: @@ -110,3 +130,47 @@ def get_total_texify_tokens(self, text): tokenizer = self.texify_model.processor.tokenizer tokens = tokenizer(text) return len(tokens["input_ids"]) + + + @staticmethod + def parse_latex(text: str): + if text.count("$") % 2 != 0: + raise ValueError("Mismatched delimiters in LaTeX") + + DELIMITERS = [ + ("$$", "block"), + ("$", "inline") + ] + + text = text.replace("\n", "
") # we can't handle \n's inside

properly if we don't do this + + i = 0 + stack = [] + result = [] + buffer = "" + + while i < len(text): + for delim, class_name in DELIMITERS: + if text[i:].startswith(delim): + if stack and stack[-1] == delim: # Closing + stack.pop() + result.append({"class": class_name, "content": buffer}) + buffer = "" + i += len(delim) + break + elif not stack: # Opening + if buffer: + result.append({"class": "text", "content": buffer}) + stack.append(delim) + buffer = "" + i += len(delim) + break + else: + raise ValueError(f"Nested {class_name} delimiters not supported") + else: # No delimiter match + buffer += text[i] + i += 1 + + if buffer: + result.append({"class": "text", "content": buffer}) + return result diff --git a/marker/processors/llm/llm_equation.py b/marker/processors/llm/llm_equation.py index 93b13c34..74cfc4a3 100644 --- a/marker/processors/llm/llm_equation.py +++ b/marker/processors/llm/llm_equation.py @@ -15,54 +15,64 @@ class LLMEquationProcessor(BaseLLMProcessor): min_equation_height: Annotated[ float, "The minimum ratio between equation height and page height to consider for processing.", - ] = 0.1 + ] = 0.08 + equation_image_expansion_ratio: Annotated[ + float, + "The ratio to expand the image by when cropping.", + ] = 0.05 # Equations sometimes get bboxes that are too tight equation_latex_prompt: Annotated[ str, "The prompt to use for generating LaTeX from equations.", "Default is a string containing the Gemini prompt." - ] = """You're an expert mathematician who is good at writing LaTeX code for equations'. -You will receive an image of a math block that may contain one or more equations. Your job is to write the LaTeX code for the equation, along with markdown for any other text. + ] = """You're an expert mathematician who is good at writing LaTeX code and html for equations. +You'll receive an image of a math block that may contain one or more equations. 
Your job is to write html that represents the content of the image, with the equations in LaTeX format, and fenced by delimiters. Some guidelines: -- Keep the LaTeX code simple and concise. -- Make it KaTeX compatible. -- Use $$ as a block equation delimiter and $ for inline equations. Block equations should also be on their own line. Do not use any other delimiters. -- You can include text in between equation blocks as needed. Try to put long text segments into plain text and not inside the equations. +- Output valid html, where all the equations can render properly. +- Use as a block equation delimiter and for inline equations. +- Keep the LaTeX code inside the math tags simple, concise, and KaTeX compatible. +- Enclose all equations in the correct math tags. Use multiple math tags inside the html to represent multiple equations. +- Only use the html tags math, i, b, p, and br. +- Make sure to include all the equations in the image in the html output. **Instructions:** 1. Carefully examine the provided image. -2. Analyze the existing markdown, which may include LaTeX code. -3. If the markdown and LaTeX are correct, write "No corrections needed." -4. If the markdown and LaTeX are incorrect, generate the corrected markdown and LaTeX. -5. Output only the corrected text or "No corrections needed." +2. Analyze the existing html, which may include LaTeX code. +3. If the html and LaTeX are correct, write "No corrections needed." +4. If the html and LaTeX are incorrect, generate the corrected html. +5. Output only the corrected html or "No corrections needed." **Example:** Input: -```markdown +```html Equation 1: -$$x^2 + y^2 = z2$$ +x2 + y2 = z2 +Equation 2: +\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t} ``` Output: -```markdown -Equation 1: -$$x^2 + y^2 = z^2$$ +```html +

Equation 1:

+x^{2} + y^{2} = z^{2} +

Equation 2:

+\frac{ab \cdot x^{5} + x^{2} + 2 \cdot x + 123}{t} ``` **Input:** -```markdown +```html {equation} ``` """ def process_rewriting(self, document: Document, page: PageGroup, block: Equation): - text = block.latex if block.latex else block.raw_text(document) + text = block.html if block.html else block.raw_text(document) prompt = self.equation_latex_prompt.replace("{equation}", text) image = self.extract_image(document, block) response_schema = content.Schema( type=content.Type.OBJECT, enum=[], - required=["markdown_equation"], + required=["html_equation"], properties={ - "markdown_equation": content.Schema( + "html_equation": content.Schema( type=content.Type.STRING ) }, @@ -70,13 +80,12 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Equation response = self.model.generate_response(prompt, image, block, response_schema) - if not response or "markdown_equation" not in response: + if not response or "html_equation" not in response: block.update_metadata(llm_error_count=1) return - markdown_equation = response["markdown_equation"] - if len(markdown_equation) < len(text) * .5: + html_equation = response["html_equation"] + if len(html_equation) < len(text) * .5: block.update_metadata(llm_error_count=1) return - - block.latex = markdown_equation + block.html = html_equation diff --git a/marker/schema/blocks/equation.py b/marker/schema/blocks/equation.py index d01c349b..0c3d367c 100644 --- a/marker/schema/blocks/equation.py +++ b/marker/schema/blocks/equation.py @@ -6,76 +6,15 @@ class Equation(Block): block_type: BlockTypes = BlockTypes.Equation - latex: str | None = None + html: str | None = None block_description: str = "A block math equation." def assemble_html(self, document, child_blocks, parent_structure=None): - if self.latex: + if self.html: child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference] html_out = super().assemble_html(document, child_ref_blocks, parent_structure) - html_out += f"

" - - try: - latex = self.parse_latex(html.escape(self.latex)) - except ValueError as e: - # If we have mismatched delimiters, we'll treat it as a single block - # Strip the $'s from the latex - latex = [ - {"class": "block", "content": self.latex.replace("$", "")} - ] - - for el in latex: - if el["class"] == "block": - html_out += f'{el["content"]}' - elif el["class"] == "inline": - html_out += f'{el["content"]}' - else: - html_out += el["content"] - html_out += "

" + html_out += f"""

{self.html}

""" return html_out else: template = super().assemble_html(document, child_blocks, parent_structure) return f"

{template}

" - - @staticmethod - def parse_latex(text: str): - if text.count("$") % 2 != 0: - raise ValueError("Mismatched delimiters in LaTeX") - - DELIMITERS = [ - ("$$", "block"), - ("$", "inline") - ] - - text = text.replace("\n", "
") # we can't handle \n's inside

properly if we don't do this - - i = 0 - stack = [] - result = [] - buffer = "" - - while i < len(text): - for delim, class_name in DELIMITERS: - if text[i:].startswith(delim): - if stack and stack[-1] == delim: # Closing - stack.pop() - result.append({"class": class_name, "content": buffer}) - buffer = "" - i += len(delim) - break - elif not stack: # Opening - if buffer: - result.append({"class": "text", "content": buffer}) - stack.append(delim) - buffer = "" - i += len(delim) - break - else: - raise ValueError(f"Nested {class_name} delimiters not supported") - else: # No delimiter match - buffer += text[i] - i += 1 - - if buffer: - result.append({"class": "text", "content": buffer}) - return result diff --git a/tests/processors/test_equation_processor.py b/tests/processors/test_equation_processor.py index 64f6bca0..b5b4669a 100644 --- a/tests/processors/test_equation_processor.py +++ b/tests/processors/test_equation_processor.py @@ -11,4 +11,4 @@ def test_equation_processor(pdf_document, texify_model): for block in pdf_document.pages[0].children: if block.block_type == BlockTypes.Equation: - assert block.latex is not None + assert block.html is not None From 956a718e133e99b7742626142e193ab4533e3e47 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 24 Jan 2025 10:46:01 -0500 Subject: [PATCH 3/9] Bump surya --- marker/renderers/markdown.py | 2 +- poetry.lock | 14 +++++++------- pyproject.toml | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py index 0c42e3f0..ad5452c3 100644 --- a/marker/renderers/markdown.py +++ b/marker/renderers/markdown.py @@ -98,7 +98,7 @@ def convert_table(self, el, text, convert_as_inline): col_idx += 1 # Fill in grid - value = get_text_with_br(cell).replace("\n", " ").replace("|", " ") + value = get_text_with_br(cell).replace("\n", " ").replace("|", " ").strip() rowspan = int(cell.get('rowspan', 1)) colspan = int(cell.get('colspan', 1)) diff --git 
a/poetry.lock b/poetry.lock index 1a782467..c371d07c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4589,13 +4589,13 @@ tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] [[package]] name = "starlette" -version = "0.45.2" +version = "0.45.3" description = "The little ASGI library that shines." optional = false python-versions = ">=3.9" files = [ - {file = "starlette-0.45.2-py3-none-any.whl", hash = "sha256:4daec3356fb0cb1e723a5235e5beaf375d2259af27532958e2d79df549dad9da"}, - {file = "starlette-0.45.2.tar.gz", hash = "sha256:bba1831d15ae5212b22feab2f218bab6ed3cd0fc2dc1d4442443bb1ee52260e0"}, + {file = "starlette-0.45.3-py3-none-any.whl", hash = "sha256:dfb6d332576f136ec740296c7e8bb8c8a7125044e7c6da30744718880cdd059d"}, + {file = "starlette-0.45.3.tar.gz", hash = "sha256:2cbcba2a75806f8a41c722141486f37c28e30a0921c5f6fe4346cb0dcee1302f"}, ] [package.dependencies] @@ -4641,13 +4641,13 @@ snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python[ [[package]] name = "surya-ocr" -version = "0.9.2" +version = "0.9.3" description = "OCR, layout, reading order, and table recognition in 90+ languages" optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "surya_ocr-0.9.2-py3-none-any.whl", hash = "sha256:95866f38a05d97c7faad7d82fb7d95f96df6cf9471617b7a6fa01ba5b1367622"}, - {file = "surya_ocr-0.9.2.tar.gz", hash = "sha256:ae57c7de6b4507ef4db30c18cee387d2d6e69d15e6708789b0ce2a4412713984"}, + {file = "surya_ocr-0.9.3-py3-none-any.whl", hash = "sha256:6013131f3af004f93ab5422dfa8c49a83aa72beb2f8120fd59dca04803d98009"}, + {file = "surya_ocr-0.9.3.tar.gz", hash = "sha256:a69347a3c85c04d48e3df62d11f045dc13e22ab8b3efebfdae1dd94f05a25b99"}, ] [package.dependencies] @@ -5489,4 +5489,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "f38cc17855cc95babd721161782ec64728a1061602236fe2845519d027966482" +content-hash = "6eb647ac20025351bfd8048a8407855c8f0a51760a2944f1da6c3685b9a8ada7" 
diff --git a/pyproject.toml b/pyproject.toml index 30e0d45c..1c647961 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ tqdm = "^4.66.1" ftfy = "^6.1.1" texify = "^0.2.1" rapidfuzz = "^3.8.1" -surya-ocr = "~0.9.2" +surya-ocr = "~0.9.3" regex = "^2024.4.28" pdftext = "~0.5.0" markdownify = "^0.13.1" From ac8b59335f80da457a506f4230d604063a672115 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 24 Jan 2025 11:27:44 -0500 Subject: [PATCH 4/9] Fix pdftext worker count --- .github/workflows/scripts.yml | 4 +++- marker/processors/table.py | 6 +++++- marker/scripts/convert.py | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/scripts.yml b/.github/workflows/scripts.yml index 02d47eb4..217e4221 100644 --- a/.github/workflows/scripts.yml +++ b/.github/workflows/scripts.yml @@ -26,4 +26,6 @@ jobs: - name: Test single script run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 - name: Test convert script - run: poetry run marker benchmark_data/pdfs --max_files 1 --workers 1 --page_range 0 \ No newline at end of file + run: poetry run marker benchmark_data/pdfs --max_files 1 --workers 1 --page_range 0 + - name: Test convert script multiple workers + run: poetry run marker benchmark_data/pdfs --max_files 2 --workers 2 --page_range 0-5 \ No newline at end of file diff --git a/marker/processors/table.py b/marker/processors/table.py index bd191ac2..83aa919d 100644 --- a/marker/processors/table.py +++ b/marker/processors/table.py @@ -49,6 +49,10 @@ class TableProcessor(BaseProcessor): List[BlockTypes], "Block types to remove if they're contained inside the tables." 
] = (BlockTypes.Text, BlockTypes.TextInlineMath) + pdftext_workers: Annotated[ + int, + "The number of workers to use for pdftext.", + ] = 4 def __init__( self, @@ -273,7 +277,7 @@ def assign_pdftext_lines(self, extract_blocks: list, filepath: str): "tables": tables, "img_size": img_size }) - cell_text = table_output(filepath, table_inputs, page_range=unique_pages) + cell_text = table_output(filepath, table_inputs, page_range=unique_pages, workers=self.pdftext_workers) assert len(cell_text) == len(unique_pages), "Number of pages and table inputs must match" for pidx, (page_tables, pnum) in enumerate(zip(cell_text, unique_pages)): diff --git a/marker/scripts/convert.py b/marker/scripts/convert.py index b859f9f9..d6b09833 100644 --- a/marker/scripts/convert.py +++ b/marker/scripts/convert.py @@ -86,7 +86,7 @@ def convert_cli(in_folder: str, **kwargs): files_to_convert = files_to_convert[:kwargs["max_files"]] # Disable nested multiprocessing - kwargs["disable_multiprocessing"] = True + kwargs["pdftext_workers"] = 1 total_processes = min(len(files_to_convert), kwargs["workers"]) From 727a475147224d3b25bc6310182d06ceff868f95 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 24 Jan 2025 12:07:16 -0500 Subject: [PATCH 5/9] Clean up config parsing --- marker/config/printer.py | 3 ++- marker/scripts/convert.py | 4 ++-- tests/config/test_config.py | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) create mode 100644 tests/config/test_config.py diff --git a/marker/config/printer.py b/marker/config/printer.py index 2c728553..bb9890f5 100644 --- a/marker/config/printer.py +++ b/marker/config/printer.py @@ -41,7 +41,7 @@ def parse_args(self, ctx, args): ["--" + attr], type=info['type'], help=" ".join(info['metadata']) + f" (Applies to: {', '.join(info['classes'])})", - default=info['default'], + default=None, # This is important, or it sets all the default keys again in config is_flag=info['is_flag'], ) ) @@ -71,6 +71,7 @@ def 
parse_args(self, ctx, args): type=attr_type, help=" ".join(metadata), is_flag=is_flag, + default=None # This is important, or it sets all the default keys again in config ) ) diff --git a/marker/scripts/convert.py b/marker/scripts/convert.py index d6b09833..f9910c4e 100644 --- a/marker/scripts/convert.py +++ b/marker/scripts/convert.py @@ -63,12 +63,12 @@ def process_single_pdf(args): @click.command(cls=CustomClickPrinter) @click.argument("in_folder", type=str) -@ConfigParser.common_options @click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert") @click.option("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel") @click.option("--max_files", type=int, default=None, help="Maximum number of pdfs to convert") @click.option("--workers", type=int, default=5, help="Number of worker processes to use.") @click.option("--skip_existing", is_flag=True, default=False, help="Skip existing converted files.") +@ConfigParser.common_options def convert_cli(in_folder: str, **kwargs): in_folder = os.path.abspath(in_folder) files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)] @@ -86,7 +86,7 @@ def convert_cli(in_folder: str, **kwargs): files_to_convert = files_to_convert[:kwargs["max_files"]] # Disable nested multiprocessing - kwargs["pdftext_workers"] = 1 + kwargs["disable_multiprocessing"] = True total_processes = min(len(files_to_convert), kwargs["workers"]) diff --git a/tests/config/test_config.py b/tests/config/test_config.py new file mode 100644 index 00000000..7458c070 --- /dev/null +++ b/tests/config/test_config.py @@ -0,0 +1,36 @@ +import sys +from contextlib import suppress +from marker.config.parser import ConfigParser + +import click + +from marker.config.printer import CustomClickPrinter + + +def test_config_parser(): + command = click.command(cls=CustomClickPrinter) + captured_kwargs = {} + + def parse_args(**kwargs): + captured_kwargs.update(kwargs) + return kwargs + + original_argv = sys.argv + 
sys.argv = ['test', '--disable_multiprocessing', '--output_dir', 'output_dir', "--height_tolerance", "0.5"] + try: + with suppress(SystemExit): + command(ConfigParser.common_options(parse_args))() + finally: + sys.argv = original_argv + + kwargs = captured_kwargs + parser = ConfigParser(kwargs) + config_dict = parser.generate_config_dict() + + # Validate kwarg capturing + assert captured_kwargs["disable_multiprocessing"] == True + assert captured_kwargs["output_dir"] == "output_dir" + + assert config_dict["pdftext_workers"] == 1 # disabling multiprocessing does this + assert config_dict["height_tolerance"] == 0.5 + assert "output_dir" not in config_dict # This is not a config key \ No newline at end of file From 54339d99ac3850026cf58ced8a2f852e0c45f1db Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 24 Jan 2025 12:12:07 -0500 Subject: [PATCH 6/9] Add additional test --- tests/config/test_config.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/tests/config/test_config.py b/tests/config/test_config.py index 7458c070..1cd5b258 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -1,13 +1,13 @@ import sys from contextlib import suppress -from marker.config.parser import ConfigParser - import click from marker.config.printer import CustomClickPrinter +from marker.config.crawler import crawler +from marker.config.parser import ConfigParser -def test_config_parser(): +def capture_kwargs(argv): command = click.command(cls=CustomClickPrinter) captured_kwargs = {} @@ -16,21 +16,32 @@ def parse_args(**kwargs): return kwargs original_argv = sys.argv - sys.argv = ['test', '--disable_multiprocessing', '--output_dir', 'output_dir', "--height_tolerance", "0.5"] + sys.argv = argv try: with suppress(SystemExit): command(ConfigParser.common_options(parse_args))() finally: sys.argv = original_argv - kwargs = captured_kwargs + return captured_kwargs + + +def test_config_parser(): + sys.argv = ['test', 
'--disable_multiprocessing', '--output_dir', 'output_dir', "--height_tolerance", "0.5"] + kwargs = capture_kwargs(sys.argv) parser = ConfigParser(kwargs) config_dict = parser.generate_config_dict() # Validate kwarg capturing - assert captured_kwargs["disable_multiprocessing"] == True - assert captured_kwargs["output_dir"] == "output_dir" + assert kwargs["disable_multiprocessing"] == True + assert kwargs["output_dir"] == "output_dir" assert config_dict["pdftext_workers"] == 1 # disabling multiprocessing does this assert config_dict["height_tolerance"] == 0.5 - assert "output_dir" not in config_dict # This is not a config key \ No newline at end of file + assert "output_dir" not in config_dict # This is not a config key + +def test_config_none(): + kwargs = capture_kwargs(['test']) + + for key in crawler.attr_set: + assert kwargs.get(key) is None From d3d08c396a0c1c15ff33ee390b2816c132edcbcc Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 24 Jan 2025 12:41:26 -0500 Subject: [PATCH 7/9] Fix math inside tables --- marker/processors/llm/llm_table.py | 4 ++-- marker/renderers/markdown.py | 14 +++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/marker/processors/llm/llm_table.py b/marker/processors/llm/llm_table.py index f9484fd4..e0c738a0 100644 --- a/marker/processors/llm/llm_table.py +++ b/marker/processors/llm/llm_table.py @@ -172,11 +172,11 @@ def rewrite_single_chunk(self, page: PageGroup, block: Block, block_html: str, c return parsed_cells @staticmethod - def get_cell_text(element, keep_tags=('br',)): + def get_cell_text(element, keep_tags=('br','i', 'b', 'span', 'math')) -> str: for tag in element.find_all(True): if tag.name not in keep_tags: tag.unwrap() - return element.decode_contents().replace("
", "\n") + return element.decode_contents() def parse_html_table(self, html_text: str, block: Block, page: PageGroup) -> List[TableCell]: soup = BeautifulSoup(html_text, 'html.parser') diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py index ad5452c3..fef44e30 100644 --- a/marker/renderers/markdown.py +++ b/marker/renderers/markdown.py @@ -17,15 +17,23 @@ def cleanup_text(full_text): full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text) return full_text.strip() -def get_text_with_br(element): +def get_formatted_table_text(element): text = [] - for content in element.descendants: + for content in element.contents: + if content is None: + continue + if isinstance(content, NavigableString): stripped = content.strip() if stripped: text.append(stripped) elif content.name == 'br': text.append('
') + elif content.name == "math": + text.append("$" + content.text + "$") + else: + text.append(str(content)) + full_text = "" for i, t in enumerate(text): if t == '
': @@ -98,7 +106,7 @@ def convert_table(self, el, text, convert_as_inline): col_idx += 1 # Fill in grid - value = get_text_with_br(cell).replace("\n", " ").replace("|", " ").strip() + value = get_formatted_table_text(cell).replace("\n", " ").replace("|", " ").strip() rowspan = int(cell.get('rowspan', 1)) colspan = int(cell.get('colspan', 1)) From 5fc0546ab2b9be4af8e93650c440494d6f7697eb Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 24 Jan 2025 12:43:16 -0500 Subject: [PATCH 8/9] Add test --- tests/processors/test_llm_processors.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/processors/test_llm_processors.py b/tests/processors/test_llm_processors.py index 63adf878..f8d0bc38 100644 --- a/tests/processors/test_llm_processors.py +++ b/tests/processors/test_llm_processors.py @@ -65,7 +65,7 @@ def test_llm_table_processor(pdf_document, detection_model, table_rec_model, rec Column 4 - Value 1 + Value 1 x Value 2 Value 3 Value 4 @@ -93,6 +93,9 @@ def test_llm_table_processor(pdf_document, detection_model, table_rec_model, rec table_cells = tables[0].contained_blocks(pdf_document, (BlockTypes.TableCell,)) assert table_cells[0].text == "Column 1" + markdown = MarkdownRenderer()(pdf_document).markdown + assert "Value 1 $x$" in markdown + @pytest.mark.filename("adversarial.pdf") @pytest.mark.config({"page_range": [0]}) From d3c43d6eb04b7c20da229c02fa6496ac9b191f74 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 24 Jan 2025 13:10:09 -0500 Subject: [PATCH 9/9] Fix test --- tests/renderers/test_markdown_renderer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/renderers/test_markdown_renderer.py b/tests/renderers/test_markdown_renderer.py index d25d69eb..3404f73a 100644 --- a/tests/renderers/test_markdown_renderer.py +++ b/tests/renderers/test_markdown_renderer.py @@ -45,7 +45,7 @@ def test_markdown_renderer_tables(pdf_document): cell = TableCell( polygon=table.polygon, - text_lines=["54.4567
89"], + text_lines=["54.4567
89x"], rowspan=1, colspan=1, row_id=0, @@ -59,6 +59,6 @@ def test_markdown_renderer_tables(pdf_document): renderer = MarkdownRenderer() md = renderer(pdf_document).markdown - assert "54 .45 67
89" in md + assert "54 .45 67
89 $x$" in md