Add LLM text support for references, superscripts, etc.

VikParuchuri · Feb 1, 2025 · cfde6d6 · cfde6d6
1 parent 4ec1467
commit cfde6d6
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 8 deletions.
diff --git a/marker/processors/llm/llm_text.py b/marker/processors/llm/llm_text.py
@@ -28,10 +28,11 @@ class LLMTextProcessor(BaseLLMProcessor):
     * Inline math: Ensure all mathematical expressions are correctly formatted and rendered.
     * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters.
     * Other inaccuracies:  If the image is handwritten then you may correct any spelling errors, or other discrepancies.
-5. Do not remove any formatting i.e bold, italics, etc from the extracted lines unless it is necessary to correct the error.
-6. Ensure that inline math is properly with inline math tags.
-7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines.
-8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.
+5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error.
+6. DO not remove any <a href='#...'>...</a> tags, those are important for references and are coming directly from the document, you MUST always keep them.
+7. Ensure that inline math is properly with inline math tags.
+8. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines.
+9. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.
 
 **Example:**
 
@@ -120,34 +121,41 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
                         minimum_position=0,
                         maximum_position=0,
                         formats=[span['type']],
+                        url=span.get('url'),
                         page_id=text_line.page_id,
                         text_extraction_method="gemini",
                     )
                 )
                 text_line.structure.append(span_block.id)
 
-    def text_to_spans(self, text):
+    @staticmethod
+    def text_to_spans(text):
         soup = BeautifulSoup(text, 'html.parser')
 
         tag_types = {
             'b': 'bold',
             'i': 'italic',
-            'math': 'math'
+            'math': 'math',
         }
         spans = []
 
         for element in soup.descendants:
             if not len(list(element.parents)) == 1:
                 continue
+
+            url = element.attrs.get('href') if hasattr(element, 'attrs') else None
+
             if element.name in tag_types:
                 spans.append({
                     'type': tag_types[element.name],
-                    'content': element.get_text()
+                    'content': element.get_text(),
+                    'url': url
                 })
             elif element.string:
                 spans.append({
                     'type': 'plain',
-                    'content': element.string
+                    'content': element.string,
+                    'url': url
                 })
 
         return spans
diff --git a/marker/schema/text/line.py b/marker/schema/text/line.py
@@ -42,10 +42,18 @@ def formatted_text(self, document):
         for block in self.contained_blocks(document, (BlockTypes.Span,)):
             block_text = html.escape(block.text)
 
+            if block.has_superscript:
+                block_text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", block_text)
+
+            if block.url:
+                block_text = f"<a href='{block.url}'>{block_text}</a>"
+
             if block.italic:
                 text += f"<i>{block_text}</i>"
             elif block.bold:
                 text += f"<b>{block_text}</b>"
+            elif block.math:
+                text += f"<math display='inline'>{block_text}</math>"
             else:
                 text += block_text