From cfde6d62690297a3e5d001b911aff8bd22291fab Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Sat, 1 Feb 2025 09:49:48 +0000 Subject: [PATCH] add llm text support for references, superscripts etc --- marker/processors/llm/llm_text.py | 24 ++++++++++++++++-------- marker/schema/text/line.py | 8 ++++++++ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/marker/processors/llm/llm_text.py b/marker/processors/llm/llm_text.py index 8a71b54e..02a6f1bf 100644 --- a/marker/processors/llm/llm_text.py +++ b/marker/processors/llm/llm_text.py @@ -28,10 +28,11 @@ class LLMTextProcessor(BaseLLMProcessor): * Inline math: Ensure all mathematical expressions are correctly formatted and rendered. * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters. * Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies. -5. Do not remove any formatting i.e bold, italics, etc from the extracted lines unless it is necessary to correct the error. -6. Ensure that inline math is properly with inline math tags. -7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines. -8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below. +5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error. +6. DO not remove any ... tags, those are important for references and are coming directly from the document, you MUST always keep them. +7. Ensure that inline math is properly with inline math tags. +8. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines. +9. Output the corrected lines in JSON format with a "lines" field, as shown in the example below. **Example:** @@ -120,34 +121,41 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block): minimum_position=0, maximum_position=0, formats=[span['type']], + url=span.get('url'), page_id=text_line.page_id, text_extraction_method="gemini", ) ) text_line.structure.append(span_block.id) - def text_to_spans(self, text): + @staticmethod + def text_to_spans(text): soup = BeautifulSoup(text, 'html.parser') tag_types = { 'b': 'bold', 'i': 'italic', - 'math': 'math' + 'math': 'math', } spans = [] for element in soup.descendants: if not len(list(element.parents)) == 1: continue + + url = element.attrs.get('href') if hasattr(element, 'attrs') else None + if element.name in tag_types: spans.append({ 'type': tag_types[element.name], - 'content': element.get_text() + 'content': element.get_text(), + 'url': url }) elif element.string: spans.append({ 'type': 'plain', - 'content': element.string + 'content': element.string, + 'url': url }) return spans diff --git a/marker/schema/text/line.py b/marker/schema/text/line.py index 30525a38..6285ee88 100644 --- a/marker/schema/text/line.py +++ b/marker/schema/text/line.py @@ -42,10 +42,18 @@ def formatted_text(self, document): for block in self.contained_blocks(document, (BlockTypes.Span,)): block_text = html.escape(block.text) + if block.has_superscript: + block_text = re.sub(r"^([0-9\W]+)(.*)", r"\1\2", block_text) + + if block.url: + block_text = f"{block_text}" + if block.italic: text += f"{block_text}" elif block.bold: text += f"{block_text}" + elif block.math: + text += f"{block_text}" else: text += block_text