VikParuchuri · iammosespaulr · Feb 1, 2025 · Feb 1, 2025 · Feb 1, 2025 · Feb 3, 2025
diff --git a/marker/processors/llm/llm_table_merge.py b/marker/processors/llm/llm_table_merge.py
@@ -114,6 +114,9 @@ class LLMTableMergeProcessor(BaseLLMProcessor):
 
     @staticmethod
     def get_row_count(cells: List[TableCell]):
+        if not cells:
+            return 0
+
         max_rows = None
         for col_id in set([cell.col_id for cell in cells]):
             col_cells = [cell for cell in cells if cell.col_id == col_id]
@@ -126,6 +129,9 @@ def get_row_count(cells: List[TableCell]):
 
     @staticmethod
     def get_column_count(cells: List[TableCell]):
+        if not cells:
+            return 0
+
         max_cols = None
         for row_id in set([cell.row_id for cell in cells]):
             row_cells = [cell for cell in cells if cell.row_id == row_id]

diff --git a/marker/processors/llm/llm_text.py b/marker/processors/llm/llm_text.py
@@ -28,18 +28,19 @@ class LLMTextProcessor(BaseLLMProcessor):
     * Inline math: Ensure all mathematical expressions are correctly formatted and rendered.
     * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters.
     * Other inaccuracies:  If the image is handwritten then you may correct any spelling errors, or other discrepancies.
-5. Do not remove any formatting i.e bold, italics, etc from the extracted lines unless it is necessary to correct the error.
+5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error.
 6. Ensure that inline math is properly with inline math tags.
 7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines.
 8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.
+9. You absolutely cannot remove any <a href='#...'>...</a> tags, those are extremely important for references and are coming directly from the document, you MUST always preserve them.
 
 **Example:**
 
 Input:
 ```
 {
  "extracted_lines": [
-  "Adversarial training (AT) [23], which aims to minimize\n",
+  "Adversarial training (AT) <a href='#page-9-1'>[23]</a>, which aims to minimize\n",
   "the model's risk under the worst-case perturbations, is cur-\n",
   "rently the most effective approach for improving the robust-\n",
   "ness of deep neural networks. For a given neural network\n",
@@ -54,7 +55,7 @@ class LLMTextProcessor(BaseLLMProcessor):
 ```json
 {
  "corrected_lines": [
-  "Adversarial training (AT) [23], which aims to minimize\n",
+  "Adversarial training (AT) <a href='#page-9-1'>[23]</a>, which aims to minimize\n",
   "the model's risk under the worst-case perturbations, is cur-\n",
   "rently the most effective approach for improving the robust-\n",
   "ness of deep neural networks. For a given neural network\n",
@@ -120,34 +121,41 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
                         minimum_position=0,
                         maximum_position=0,
                         formats=[span['type']],
+                        url=span.get('url'),
                         page_id=text_line.page_id,
                         text_extraction_method="gemini",
                     )
                 )
                 text_line.structure.append(span_block.id)
 
-    def text_to_spans(self, text):
+    @staticmethod
+    def text_to_spans(text):
+        # Parse `text` as HTML and flatten its top-level nodes into a list of
+        # dicts with 'type', 'content' and 'url' keys; that list is returned.
         soup = BeautifulSoup(text, 'html.parser')
 
+        # Tag names that map to a dedicated span type; every other node is
+        # handled by the 'plain' branch further down.
         tag_types = {
             'b': 'bold',
             'i': 'italic',
-            'math': 'math'
+            'math': 'math',
         }
         spans = []
 
         for element in soup.descendants:
+            # Only nodes with exactly one ancestor are turned into spans;
+            # deeper descendants are skipped by this check.
             if not len(list(element.parents)) == 1:
                 continue
+
+            # href of this node itself, or None. The hasattr guard covers nodes
+            # that expose no `attrs` (presumably bare text nodes - confirm).
+            # NOTE(review): only the top-level node's own href is read, so a
+            # link nested inside <b>/<i>/<math> yields url=None here. The
+            # formatted_text hunk in marker/schema/text/line.py wraps <a> inside
+            # those tags - confirm whether such links need to survive.
+            url = element.attrs.get('href') if hasattr(element, 'attrs') else None
+
             if element.name in tag_types:
                 spans.append({
                     'type': tag_types[element.name],
-                    'content': element.get_text()
+                    'content': element.get_text(),
+                    'url': url
                 })
+            # Any other node with a truthy `.string` becomes a 'plain' span;
+            # nodes matching neither branch produce no span at all.
             elif element.string:
                 spans.append({
                     'type': 'plain',
-                    'content': element.string
+                    'content': element.string,
+                    'url': url
                 })
 
         return spans
diff --git a/marker/schema/text/line.py b/marker/schema/text/line.py
@@ -42,10 +42,18 @@ def formatted_text(self, document):
         for block in self.contained_blocks(document, (BlockTypes.Span,)):
             block_text = html.escape(block.text)
 
+            if block.has_superscript:
+                block_text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", block_text)
+
+            if block.url:
+                block_text = f"<a href='{block.url}'>{block_text}</a>"
+
             if block.italic:
                 text += f"<i>{block_text}</i>"
             elif block.bold:
                 text += f"<b>{block_text}</b>"
+            elif block.math:
+                text += f"<math display='inline'>{block_text}</math>"
             else:
                 text += block_text