From cfde6d62690297a3e5d001b911aff8bd22291fab Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Sat, 1 Feb 2025 09:49:48 +0000 Subject: [PATCH 1/4] add llm text support for references, superscripts etc --- marker/processors/llm/llm_text.py | 24 ++++++++++++++++-------- marker/schema/text/line.py | 8 ++++++++ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/marker/processors/llm/llm_text.py b/marker/processors/llm/llm_text.py index 8a71b54e..02a6f1bf 100644 --- a/marker/processors/llm/llm_text.py +++ b/marker/processors/llm/llm_text.py @@ -28,10 +28,11 @@ class LLMTextProcessor(BaseLLMProcessor): * Inline math: Ensure all mathematical expressions are correctly formatted and rendered. * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters. * Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies. -5. Do not remove any formatting i.e bold, italics, etc from the extracted lines unless it is necessary to correct the error. -6. Ensure that inline math is properly with inline math tags. -7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines. -8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below. +5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error. +6. DO not remove any ... tags, those are important for references and are coming directly from the document, you MUST always keep them. +7. Ensure that inline math is properly with inline math tags. +8. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines. +9. Output the corrected lines in JSON format with a "lines" field, as shown in the example below. **Example:** @@ -120,34 +121,41 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block): minimum_position=0, maximum_position=0, formats=[span['type']], + url=span.get('url'), page_id=text_line.page_id, text_extraction_method="gemini", ) ) text_line.structure.append(span_block.id) - def text_to_spans(self, text): + @staticmethod + def text_to_spans(text): soup = BeautifulSoup(text, 'html.parser') tag_types = { 'b': 'bold', 'i': 'italic', - 'math': 'math' + 'math': 'math', } spans = [] for element in soup.descendants: if not len(list(element.parents)) == 1: continue + + url = element.attrs.get('href') if hasattr(element, 'attrs') else None + if element.name in tag_types: spans.append({ 'type': tag_types[element.name], - 'content': element.get_text() + 'content': element.get_text(), + 'url': url }) elif element.string: spans.append({ 'type': 'plain', - 'content': element.string + 'content': element.string, + 'url': url }) return spans diff --git a/marker/schema/text/line.py b/marker/schema/text/line.py index 30525a38..6285ee88 100644 --- a/marker/schema/text/line.py +++ b/marker/schema/text/line.py @@ -42,10 +42,18 @@ def formatted_text(self, document): for block in self.contained_blocks(document, (BlockTypes.Span,)): block_text = html.escape(block.text) + if block.has_superscript: + block_text = re.sub(r"^([0-9\W]+)(.*)", r"\1\2", block_text) + + if block.url: + block_text = f"{block_text}" + if block.italic: text += f"{block_text}" elif block.bold: text += f"{block_text}" + elif block.math: + text += f"{block_text}" else: text += block_text From 225ff44c965d4a07b320180bbecd003f9357f3ac Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Sat, 1 Feb 2025 09:51:22 +0000 Subject: [PATCH 2/4] fix typo [skip ci] --- marker/processors/llm/llm_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marker/processors/llm/llm_text.py b/marker/processors/llm/llm_text.py index 02a6f1bf..dbbd23b4 100644 --- a/marker/processors/llm/llm_text.py +++ b/marker/processors/llm/llm_text.py @@ -29,7 +29,7 @@ class LLMTextProcessor(BaseLLMProcessor): * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters. * Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies. 5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error. -6. DO not remove any ... tags, those are important for references and are coming directly from the document, you MUST always keep them. +6. Do not remove any ... tags, those are important for references and are coming directly from the document, you MUST always keep them. 7. Ensure that inline math is properly with inline math tags. 8. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines. 9. Output the corrected lines in JSON format with a "lines" field, as shown in the example below. From 93deddda79f48a2356e88e9a1dcd7da30810e366 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Sat, 1 Feb 2025 11:48:15 +0000 Subject: [PATCH 3/4] refine prompt [skip ci] --- marker/processors/llm/llm_text.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/marker/processors/llm/llm_text.py b/marker/processors/llm/llm_text.py index dbbd23b4..a87ed862 100644 --- a/marker/processors/llm/llm_text.py +++ b/marker/processors/llm/llm_text.py @@ -29,10 +29,10 @@ class LLMTextProcessor(BaseLLMProcessor): * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters. * Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies. 5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error. -6. Do not remove any ... tags, those are important for references and are coming directly from the document, you MUST always keep them. -7. Ensure that inline math is properly with inline math tags. -8. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines. -9. Output the corrected lines in JSON format with a "lines" field, as shown in the example below. +6. Ensure that inline math is properly with inline math tags. +7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines. +8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below. +9. You absolutely cannot remove any ... tags, those are extremely important for references and are coming directly from the document, you MUST always preserve them. **Example:** @@ -40,7 +40,7 @@ class LLMTextProcessor(BaseLLMProcessor): ``` { "extracted_lines": [ - "Adversarial training (AT) [23], which aims to minimize\n", + "Adversarial training (AT) [23], which aims to minimize\n", "the model's risk under the worst-case perturbations, is cur-\n", "rently the most effective approach for improving the robust-\n", "ness of deep neural networks. For a given neural network\n", @@ -55,7 +55,7 @@ class LLMTextProcessor(BaseLLMProcessor): ```json { "corrected_lines": [ - "Adversarial training (AT) [23], which aims to minimize\n", + "Adversarial training (AT) [23], which aims to minimize\n", "the model's risk under the worst-case perturbations, is cur-\n", "rently the most effective approach for improving the robust-\n", "ness of deep neural networks. For a given neural network\n", From 4e0fadc55482425aa027a3a862600ecf03af1650 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Mon, 3 Feb 2025 03:53:14 +0000 Subject: [PATCH 4/4] fix llm table merging error --- marker/processors/llm/llm_table_merge.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/marker/processors/llm/llm_table_merge.py b/marker/processors/llm/llm_table_merge.py index e2012998..af512bf6 100644 --- a/marker/processors/llm/llm_table_merge.py +++ b/marker/processors/llm/llm_table_merge.py @@ -114,6 +114,9 @@ class LLMTableMergeProcessor(BaseLLMProcessor): @staticmethod def get_row_count(cells: List[TableCell]): + if not cells: + return 0 + max_rows = None for col_id in set([cell.col_id for cell in cells]): col_cells = [cell for cell in cells if cell.col_id == col_id] @@ -126,6 +129,9 @@ def get_row_count(cells: List[TableCell]): @staticmethod def get_column_count(cells: List[TableCell]): + if not cells: + return 0 + max_cols = None for row_id in set([cell.row_id for cell in cells]): row_cells = [cell for cell in cells if cell.row_id == row_id]