Add LLM handwriting processor

VikParuchuri · Jan 17, 2025 · 9ad7dc3 · 9ad7dc3
1 parent d91dcb8
commit 9ad7dc3
Show file tree

Hide file tree

Showing 11 changed files with 227 additions and 33 deletions.
diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py
@@ -1,4 +1,7 @@
 import os
+
+from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
+
 os.environ["TOKENIZERS_PARALLELISM"] = "false"  # disables a tokenizers warning
 
 import inspect
@@ -33,7 +36,7 @@
 from marker.processors.sectionheader import SectionHeaderProcessor
 from marker.processors.table import TableProcessor
 from marker.processors.text import TextProcessor
-from marker.providers.pdf import PdfProvider
+from marker.processors.llm.llm_equation import LLMEquationProcessor
 from marker.renderers.markdown import MarkdownRenderer
 from marker.schema import BlockTypes
 from marker.schema.blocks import Block
@@ -75,6 +78,8 @@ class PdfConverter(BaseConverter):
         LLMTextProcessor,
         LLMComplexRegionProcessor,
         LLMImageDescriptionProcessor,
+        LLMEquationProcessor,
+        LLMHandwritingProcessor,
         DebugProcessor,
     )
 

diff --git a/marker/processors/llm/__init__.py b/marker/processors/llm/__init__.py
@@ -40,11 +40,6 @@ class BaseLLMProcessor(BaseProcessor):
         float,
         "The ratio to expand the image by when cropping.",
     ] = 0.01
-    gemini_rewriting_prompt: Annotated[
-        str,
-        "The prompt to use for rewriting text.",
-        "Default is a string containing the Gemini rewriting prompt."
-    ] = ''
     use_llm: Annotated[
         bool,
         "Whether to use the LLM model.",

diff --git a/marker/processors/llm/llm_complex.py b/marker/processors/llm/llm_complex.py
@@ -12,9 +12,9 @@
 
 class LLMComplexRegionProcessor(BaseLLMProcessor):
     block_types = (BlockTypes.ComplexRegion,)
-    gemini_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
+    complex_region_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
 You will receive an image of a text block and the text that can be extracted from the image.
-Your task is to correct any errors in the text, and format it properly.  Do not omit any text from the block - make sure everything is included in the markdown representation.  The markdown representation should be as faithful to the original text as possible.
+Your task is to generate markdown to properly represent the content of the image.  Do not omit any text present in the image - make sure everything is included in the markdown representation.  The markdown representation should be as faithful to the original image as possible.
 
 Formatting should be in markdown, with the following rules:
 - * for italics, ** for bold, and ` for inline code.
@@ -29,26 +29,31 @@ class LLMComplexRegionProcessor(BaseLLMProcessor):
 
 **Instructions:**
 1. Carefully examine the provided block image.
-2. Analyze the text representation
-3. If the text representation is largely correct, then write "No corrections needed."
-4. If the text representation contains errors, generate the corrected markdown representation.
-5. Output only either the corrected markdown representation or "No corrections needed."
+2. Analyze the existing text representation.
+3. Generate the markdown representation of the content in the image.
 **Example:**
 Input:
 ```text
-This is an example text block.
+Table 1: Car Sales
 ```
 Output:
 ```markdown
-No corrections needed.
+## Table 1: Car Sales
+
+| Car | Sales |
+| --- | --- |
+| Honda | 100 |
+| Toyota | 200 |
 ```
 **Input:**
+```text
+{extracted_text}
+```
 """
 
     def process_rewriting(self, document: Document, page: PageGroup, block: Block):
         text = block.raw_text(document)
-
-        prompt = self.gemini_rewriting_prompt + '```text\n`' + text + '`\n```\n'
+        prompt = self.complex_region_prompt.replace("{extracted_text}", text)
         image = self.extract_image(document, block)
         response_schema = content.Schema(
             type=content.Type.OBJECT,

diff --git a/marker/processors/llm/llm_equation.py b/marker/processors/llm/llm_equation.py
@@ -0,0 +1,82 @@
+from marker.processors.llm import BaseLLMProcessor
+
+from google.ai.generativelanguage_v1beta.types import content
+
+from marker.schema import BlockTypes
+from marker.schema.blocks import Equation
+from marker.schema.document import Document
+from marker.schema.groups.page import PageGroup
+
+from typing import Annotated
+
+
+class LLMEquationProcessor(BaseLLMProcessor):
+    """Ask the LLM to check and, if needed, correct the LaTeX/markdown of Equation blocks.
+
+    The block image and the current LaTeX (or raw text when no LaTeX is set)
+    are sent to the model; an accepted answer is stored in ``block.latex``.
+    """
+    block_types = (BlockTypes.Equation,)
+    # NOTE(review): not referenced anywhere in this class as shown — presumably
+    # meant to skip small equations; confirm whether the base class reads it.
+    min_equation_height: Annotated[
+        float,
+        "The minimum ratio between equation height and page height to consider for processing.",
+    ] = 0.1
+    equation_latex_prompt: Annotated[
+        str,
+        "The prompt to use for generating LaTeX from equations.",
+        "Default is a string containing the Gemini prompt."
+    ] = """You're an expert mathematician who is good at writing LaTeX code for equations'.
+You will receive an image of a math block that may contain one or more equations. Your job is to write the LaTeX code for the equation, along with markdown for any other text.
+
+Some guidelines:
+- Keep the LaTeX code simple and concise.
+- Make it KaTeX compatible.
+- Use $$ as a block equation delimiter and $ for inline equations.  Block equations should also be on their own line.  Do not use any other delimiters.
+- You can include text in between equation blocks as needed.  Try to put long text segments into plain text and not inside the equations.
+
+**Instructions:**
+1. Carefully examine the provided image.
+2. Analyze the existing markdown, which may include LaTeX code.
+3. If the markdown and LaTeX are correct, write "No corrections needed."
+4. If the markdown and LaTeX are incorrect, generate the corrected markdown and LaTeX.
+5. Output only the corrected text or "No corrections needed."
+**Example:**
+Input:
+```markdown
+Equation 1: 
+$$x^2 + y^2 = z2$$
+```
+Output:
+```markdown
+Equation 1: 
+$$x^2 + y^2 = z^2$$
+```
+**Input:**
+```markdown
+{equation}
+```
+"""
+
+    def process_rewriting(self, document: Document, page: PageGroup, block: Equation):
+        """Send one equation block to the LLM and store the corrected LaTeX.
+
+        Args:
+            document: Document the block belongs to; used for raw text and image extraction.
+            page: Page group containing the block (not used in this method).
+            block: Equation block whose ``latex`` may be replaced.
+
+        Returns:
+            None. ``block.latex`` is updated on success; on a missing or
+            suspiciously short response ``llm_error_count`` is bumped and the
+            block is left unchanged.
+        """
+        # Prefer the existing LaTeX; fall back to the raw text of the block.
+        text = block.latex if block.latex else block.raw_text(document)
+        prompt = self.equation_latex_prompt.replace("{equation}", text)
+
+        image = self.extract_image(document, block)
+        response_schema = content.Schema(
+            type=content.Type.OBJECT,
+            enum=[],
+            required=["markdown_equation"],
+            properties={
+                "markdown_equation": content.Schema(
+                    type=content.Type.STRING
+                )
+            },
+        )
+
+        response = self.model.generate_response(prompt, image, block, response_schema)
+
+        if not response or "markdown_equation" not in response:
+            block.update_metadata(llm_error_count=1)
+            return
+
+        markdown_equation = response["markdown_equation"]
+
+        # The prompt explicitly allows the answer "No corrections needed.".
+        # That sentinel must never be stored as LaTeX: it is not an error,
+        # it simply means the existing content is kept.
+        if "no corrections" in markdown_equation.lower():
+            return
+
+        # A response much shorter than the input most likely dropped content.
+        if len(markdown_equation) < len(text) * .5:
+            block.update_metadata(llm_error_count=1)
+            return
+
+        block.latex = markdown_equation
diff --git a/marker/processors/llm/llm_form.py b/marker/processors/llm/llm_form.py
@@ -1,5 +1,3 @@
-import markdown2
-
 from marker.processors.llm import BaseLLMProcessor
 
 from google.ai.generativelanguage_v1beta.types import content
@@ -12,9 +10,9 @@
 
 class LLMFormProcessor(BaseLLMProcessor):
     block_types = (BlockTypes.Form,)
-    gemini_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
+    form_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
 You will receive an image of a text block and an html representation of the form in the image.
-Your task is to correct any errors in the htmlrepresentation, and format it properly.
+Your task is to correct any errors in the html representation, and format it properly.
 Values and labels should appear in html tables, with the labels on the left side, and values on the right.  The headers should be "Labels" and "Values".  Other text in the form can appear between the tables.  Only use the tags `table, p, span, i, b, th, td, tr, and div`.  Do not omit any text from the form - make sure everything is included in the html representation.  It should be as faithful to the original form as possible.
 **Instructions:**
 1. Carefully examine the provided form block image.
@@ -60,6 +58,9 @@ class LLMFormProcessor(BaseLLMProcessor):
 </table>
 ```
 **Input:**
+```html
+{block_html}
+```
 """
 
     def process_rewriting(self, document: Document, page: PageGroup, block: Block):
@@ -69,8 +70,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
             return
 
         block_html = block.render(document).html
+        prompt = self.form_rewriting_prompt.replace("{block_html}", block_html)
 
-        prompt = self.gemini_rewriting_prompt + '```html\n`' + block_html + '`\n```\n'
         image = self.extract_image(document, block)
         response_schema = content.Schema(
             type=content.Type.OBJECT,

diff --git a/marker/processors/llm/llm_handwriting.py b/marker/processors/llm/llm_handwriting.py
@@ -0,0 +1,86 @@
+import markdown2
+
+from marker.processors.llm import BaseLLMProcessor
+
+from google.ai.generativelanguage_v1beta.types import content
+
+from marker.schema import BlockTypes
+from marker.schema.blocks import Equation
+from marker.schema.document import Document
+from marker.schema.groups.page import PageGroup
+
+from typing import Annotated
+
+
+class LLMHandwritingProcessor(BaseLLMProcessor):
+    """Ask the LLM to transcribe a block image to markdown and store it as HTML.
+
+    The block image and its extracted raw text are sent to the model; an
+    accepted answer is converted with ``markdown2`` and written to ``block.html``.
+    """
+    # NOTE(review): this targets Equation blocks, the same type handled by
+    # LLMEquationProcessor — looks copied from there. Presumably a handwriting
+    # block type was intended; confirm against BlockTypes before changing.
+    block_types = (BlockTypes.Equation,)
+    # NOTE(review): not referenced anywhere in this class as shown — confirm
+    # whether the base class reads it.
+    min_handwriting_height: Annotated[
+        float,
+        "The minimum ratio between handwriting height and page height to consider for processing.",
+    ] = 0.1
+    handwriting_generation_prompt: Annotated[
+        str,
+        "The prompt to use for OCRing handwriting.",
+        "Default is a string containing the Gemini prompt."
+    ] = """You are an expert editor specializing in accurately reproducing text from images.
+You will receive an image of a text block, along with the text that can be extracted. Your task is to generate markdown to properly represent the content of the image.  Do not omit any text present in the image - make sure everything is included in the markdown representation.  The markdown representation should be as faithful to the original image as possible.
+
+Formatting should be in markdown, with the following rules:
+- * for italics, ** for bold, and ` for inline code.
+- Headers should be formatted with #, with one # for the largest header, and up to 6 for the smallest.
+- Lists should be formatted with either - or 1. for unordered and ordered lists, respectively.
+- Links should be formatted with [text](url).
+- Use ``` for code blocks.
+- Inline math should be formatted with <math>math expression</math>.
+- Display math should be formatted with <math display="block">math expression</math>.
+- Values and labels should be extracted from forms, and put into markdown tables, with the labels on the left side, and values on the right.  The headers should be "Labels" and "Values".  Other text in the form can appear between the tables.
+- Tables should be formatted with markdown tables, with the headers bolded.
+
+**Instructions:**
+1. Carefully examine the provided block image.
+2. Analyze the existing text representation.
+3. Output the markdown representing the content of the image.
+**Example:**
+Input:
+```text
+This i sm handwritting.
+```
+Output:
+```markdown
+This is some *handwriting*.
+```
+**Input:**
+```text
+{extracted_text}
+```
+"""
+
+    def process_rewriting(self, document: Document, page: PageGroup, block: Equation):
+        """Send one block to the LLM and store the returned markdown as HTML.
+
+        Args:
+            document: Document the block belongs to; used for raw text and image extraction.
+            page: Page group containing the block (not used in this method).
+            block: Block whose ``html`` is replaced on success.
+
+        Returns:
+            None. ``block.html`` is updated on success; on a missing or
+            suspiciously short response ``llm_error_count`` is bumped and the
+            block is left unchanged.
+        """
+        text = block.raw_text(document)
+        # The placeholder name must match the one in the prompt template
+        # ("{extracted_text}"); otherwise the literal placeholder is sent to
+        # the model and the extracted text is silently dropped.
+        prompt = self.handwriting_generation_prompt.replace("{extracted_text}", text)
+
+        image = self.extract_image(document, block)
+        response_schema = content.Schema(
+            type=content.Type.OBJECT,
+            enum=[],
+            required=["markdown"],
+            properties={
+                "markdown": content.Schema(
+                    type=content.Type.STRING
+                )
+            },
+        )
+
+        response = self.model.generate_response(prompt, image, block, response_schema)
+
+        if not response or "markdown" not in response:
+            block.update_metadata(llm_error_count=1)
+            return
+
+        markdown = response["markdown"]
+        # A response much shorter than the input most likely dropped content.
+        if len(markdown) < len(text) * .5:
+            block.update_metadata(llm_error_count=1)
+            return
+
+        # Remove a wrapping ```markdown ... ``` fence if the model echoed the
+        # example format. str.lstrip/rstrip take a *set of characters*, not a
+        # prefix/suffix, and would eat leading letters such as "a", "d", "m",
+        # "n", "o", "r", "w" from real content, so slice explicitly instead.
+        markdown = markdown.strip()
+        fence_open = "```markdown"
+        fence_close = "```"
+        if markdown.startswith(fence_open):
+            markdown = markdown[len(fence_open):]
+            if markdown.endswith(fence_close):
+                markdown = markdown[:-len(fence_close)]
+            markdown = markdown.strip()
+
+        block.html = markdown2.markdown(markdown)
diff --git a/marker/processors/llm/llm_image_description.py b/marker/processors/llm/llm_image_description.py
@@ -36,6 +36,9 @@ class LLMImageDescriptionProcessor(BaseLLMProcessor):
 Output:
 In this figure, a bar chart titled "Fruit Preference Survey" is showing the number of people who prefer different types of fruits.  The x-axis shows the types of fruits, and the y-axis shows the number of people.  The bar chart shows that most people prefer apples, followed by bananas and oranges.  20 people prefer apples, 15 people prefer bananas, and 10 people prefer oranges.
 **Input:**
+```text
+{raw_text}
+```
 """
 
     def process_rewriting(self, document: Document, page: PageGroup, block: Block):
@@ -44,7 +47,7 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
             # Since this processor replaces images with descriptions
             return
 
-        prompt = self.image_description_prompt + '```text\n`' + block.raw_text(document) + '`\n```\n'
+        prompt = self.image_description_prompt.replace("{raw_text}", block.raw_text(document))
         image = self.extract_image(document, block)
         response_schema = content.Schema(
             type=content.Type.OBJECT,

diff --git a/marker/processors/llm/llm_table.py b/marker/processors/llm/llm_table.py
@@ -16,18 +16,25 @@ class LLMTableProcessor(BaseLLMProcessor):
         Tuple[BlockTypes],
         "The block types to process.",
     ] = (BlockTypes.Table, BlockTypes.TableOfContents)
-    gemini_rewriting_prompt: Annotated[
+    table_rewriting_prompt: Annotated[
         str,
         "The prompt to use for rewriting text.",
         "Default is a string containing the Gemini rewriting prompt."
     ] = """You are a text correction expert specializing in accurately reproducing text from images.
 You will receive an image of a text block and an html representation of the table in the image.
 Your task is to correct any errors in the html representation.  The html representation should be as faithful to the original table as possible.
+
+Some guidelines:
+- Make sure to reproduce the original values as faithfully as possible.
+- If you see any math in a table cell, fence it with the <math display="inline"> tag.  Block math should be fenced with <math display="block">.
+- Replace any images with a description, like "Image: [description]".
+- Only use the tags th, td, tr, span, i, b, math, and table.  Only use the attributes display, style, colspan, and rowspan if necessary.
+
 **Instructions:**
 1. Carefully examine the provided text block image.
 2. Analyze the html representation of the table.
 3. If the html representation is largely correct, then write "No corrections needed."
-4. If the html representation contains errors, generate the corrected html representation.  Only use the tags th, td, tr, and table.  Only use the attributes colspan and rowspan if necessary.
+4. If the html representation contains errors, generate the corrected html representation.  
 5. Output only either the corrected html representation or "No corrections needed."
 **Example:**
 Input:
@@ -50,6 +57,9 @@ class LLMTableProcessor(BaseLLMProcessor):
 No corrections needed.
 ```
 **Input:**
+```html
+{block_html}
+```
 """
 
     def process_rewriting(self, document: Document, page: PageGroup, block: Block):
@@ -59,8 +69,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
             return
 
         block_html = block.render(document).html
+        prompt = self.table_rewriting_prompt.replace("{block_html}", block_html)
 
-        prompt = self.gemini_rewriting_prompt + '```html\n`' + block_html + '`\n```\n'
         image = self.extract_image(document, block)
         response_schema = content.Schema(
             type=content.Type.OBJECT,

diff --git a/marker/processors/llm/llm_table_merge.py b/marker/processors/llm/llm_table_merge.py
@@ -36,7 +36,7 @@ class LLMTableMergeProcessor(BaseLLMProcessor):
         int,
         "The maximum gap between columns to merge tables"
     ] = 50
-    gemini_table_merge_prompt: Annotated[
+    table_merge_prompt: Annotated[
         str,
         "The prompt to use for rewriting text.",
         "Default is a string containing the Gemini rewriting prompt."
@@ -212,7 +212,7 @@ def process_rewriting(self, document: Document, blocks: List[Block]):
             start_html = start_block.render(document).html
             curr_html = curr_block.render(document).html
 
-            prompt = self.gemini_table_merge_prompt.replace("{{table1}}", start_html).replace("{{table2}}", curr_html)
+            prompt = self.table_merge_prompt.replace("{{table1}}", start_html).replace("{{table2}}", curr_html)
 
             response_schema = content.Schema(
                 type=content.Type.OBJECT,