Skip to content

Commit

Permalink
Add LLM handwriting processor
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 17, 2025
1 parent d91dcb8 commit 9ad7dc3
Show file tree
Hide file tree
Showing 11 changed files with 227 additions and 33 deletions.
7 changes: 6 additions & 1 deletion marker/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import os

from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor

os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning

import inspect
Expand Down Expand Up @@ -33,7 +36,7 @@
from marker.processors.sectionheader import SectionHeaderProcessor
from marker.processors.table import TableProcessor
from marker.processors.text import TextProcessor
from marker.providers.pdf import PdfProvider
from marker.processors.llm.llm_equation import LLMEquationProcessor
from marker.renderers.markdown import MarkdownRenderer
from marker.schema import BlockTypes
from marker.schema.blocks import Block
Expand Down Expand Up @@ -75,6 +78,8 @@ class PdfConverter(BaseConverter):
LLMTextProcessor,
LLMComplexRegionProcessor,
LLMImageDescriptionProcessor,
LLMEquationProcessor,
LLMHandwritingProcessor,
DebugProcessor,
)

Expand Down
5 changes: 0 additions & 5 deletions marker/processors/llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,6 @@ class BaseLLMProcessor(BaseProcessor):
float,
"The ratio to expand the image by when cropping.",
] = 0.01
gemini_rewriting_prompt: Annotated[
str,
"The prompt to use for rewriting text.",
"Default is a string containing the Gemini rewriting prompt."
] = ''
use_llm: Annotated[
bool,
"Whether to use the LLM model.",
Expand Down
25 changes: 15 additions & 10 deletions marker/processors/llm/llm_complex.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@

class LLMComplexRegionProcessor(BaseLLMProcessor):
block_types = (BlockTypes.ComplexRegion,)
gemini_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
complex_region_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
You will receive an image of a text block and the text that can be extracted from the image.
Your task is to correct any errors in the text, and format it properly. Do not omit any text from the block - make sure everything is included in the markdown representation. The markdown representation should be as faithful to the original text as possible.
Your task is to generate markdown to properly represent the content of the image. Do not omit any text present in the image - make sure everything is included in the markdown representation. The markdown representation should be as faithful to the original image as possible.
Formatting should be in markdown, with the following rules:
- * for italics, ** for bold, and ` for inline code.
Expand All @@ -29,26 +29,31 @@ class LLMComplexRegionProcessor(BaseLLMProcessor):
**Instructions:**
1. Carefully examine the provided block image.
2. Analyze the text representation
3. If the text representation is largely correct, then write "No corrections needed."
4. If the text representation contains errors, generate the corrected markdown representation.
5. Output only either the corrected markdown representation or "No corrections needed."
2. Analyze the existing text representation.
3. Generate the markdown representation of the content in the image.
**Example:**
Input:
```text
This is an example text block.
Table 1: Car Sales
```
Output:
```markdown
No corrections needed.
## Table 1: Car Sales
| Car | Sales |
| --- | --- |
| Honda | 100 |
| Toyota | 200 |
```
**Input:**
```text
{extracted_text}
```
"""

def process_rewriting(self, document: Document, page: PageGroup, block: Block):
text = block.raw_text(document)

prompt = self.gemini_rewriting_prompt + '```text\n`' + text + '`\n```\n'
prompt = self.complex_region_prompt.replace("{extracted_text}", text)
image = self.extract_image(document, block)
response_schema = content.Schema(
type=content.Type.OBJECT,
Expand Down
82 changes: 82 additions & 0 deletions marker/processors/llm/llm_equation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from marker.processors.llm import BaseLLMProcessor

from google.ai.generativelanguage_v1beta.types import content

from marker.schema import BlockTypes
from marker.schema.blocks import Equation
from marker.schema.document import Document
from marker.schema.groups.page import PageGroup

from typing import Annotated


class LLMEquationProcessor(BaseLLMProcessor):
    """Ask the LLM to check and, if needed, correct the LaTeX of equation blocks.

    For each block, the current LaTeX (or the raw text when no LaTeX is set) is
    substituted into ``equation_latex_prompt`` and sent to the model together
    with an image of the block. A usable answer replaces ``block.latex``.
    """

    block_types = (BlockTypes.Equation,)
    # NOTE(review): not referenced anywhere in this class; presumably intended
    # to skip equations that are small relative to the page - confirm.
    min_equation_height: Annotated[
        float,
        "The minimum ratio between equation height and page height to consider for processing.",
    ] = 0.1
    equation_latex_prompt: Annotated[
        str,
        "The prompt to use for generating LaTeX from equations.",
        "Default is a string containing the Gemini prompt."
    ] = """You're an expert mathematician who is good at writing LaTeX code for equations'.
You will receive an image of a math block that may contain one or more equations. Your job is to write the LaTeX code for the equation, along with markdown for any other text.
Some guidelines:
- Keep the LaTeX code simple and concise.
- Make it KaTeX compatible.
- Use $$ as a block equation delimiter and $ for inline equations. Block equations should also be on their own line. Do not use any other delimiters.
- You can include text in between equation blocks as needed. Try to put long text segments into plain text and not inside the equations.
**Instructions:**
1. Carefully examine the provided image.
2. Analyze the existing markdown, which may include LaTeX code.
3. If the markdown and LaTeX are correct, write "No corrections needed."
4. If the markdown and LaTeX are incorrect, generate the corrected markdown and LaTeX.
5. Output only the corrected text or "No corrections needed."
**Example:**
Input:
```markdown
Equation 1:
$$x^2 + y^2 = z2$$
```
Output:
```markdown
Equation 1:
$$x^2 + y^2 = z^2$$
```
**Input:**
```markdown
{equation}
```
"""

    def process_rewriting(self, document: Document, page: PageGroup, block: Equation):
        """Replace ``block.latex`` with the model's corrected markdown/LaTeX.

        Args:
            document: Document the block belongs to; used to read the raw text
                and to crop the block image.
            page: Page group containing the block (not used here).
            block: Equation block to correct; updated in place.

        Returns:
            None. On a missing or suspiciously short answer the block is
            flagged through ``update_metadata(llm_error_count=1)`` and left
            unchanged. When the model reports that no corrections are needed
            the block is left unchanged without being flagged.
        """
        # Prefer LaTeX that is already set; fall back to the raw text.
        text = block.latex if block.latex else block.raw_text(document)
        prompt = self.equation_latex_prompt.replace("{equation}", text)

        image = self.extract_image(document, block)
        response_schema = content.Schema(
            type=content.Type.OBJECT,
            enum=[],
            required=["markdown_equation"],
            properties={
                "markdown_equation": content.Schema(
                    type=content.Type.STRING
                )
            },
        )

        response = self.model.generate_response(prompt, image, block, response_schema)

        if not response or "markdown_equation" not in response:
            block.update_metadata(llm_error_count=1)
            return

        markdown_equation = response["markdown_equation"]

        # The prompt asks the model to answer "No corrections needed." when the
        # input is already right. That sentinel is a valid answer: it must not
        # be counted as an error (it is usually shorter than the equation) and
        # must never be stored as the equation's LaTeX.
        if "no corrections" in markdown_equation.lower():
            return

        # An answer less than half as long as the input most likely dropped content.
        if len(markdown_equation) < len(text) * .5:
            block.update_metadata(llm_error_count=1)
            return

        block.latex = markdown_equation
11 changes: 6 additions & 5 deletions marker/processors/llm/llm_form.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import markdown2

from marker.processors.llm import BaseLLMProcessor

from google.ai.generativelanguage_v1beta.types import content
Expand All @@ -12,9 +10,9 @@

class LLMFormProcessor(BaseLLMProcessor):
block_types = (BlockTypes.Form,)
gemini_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
form_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
You will receive an image of a text block and an html representation of the form in the image.
Your task is to correct any errors in the htmlrepresentation, and format it properly.
Your task is to correct any errors in the html representation, and format it properly.
Values and labels should appear in html tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables. Only use the tags `table, p, span, i, b, th, td, tr, and div`. Do not omit any text from the form - make sure everything is included in the html representation. It should be as faithful to the original form as possible.
**Instructions:**
1. Carefully examine the provided form block image.
Expand Down Expand Up @@ -60,6 +58,9 @@ class LLMFormProcessor(BaseLLMProcessor):
</table>
```
**Input:**
```html
{block_html}
```
"""

def process_rewriting(self, document: Document, page: PageGroup, block: Block):
Expand All @@ -69,8 +70,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
return

block_html = block.render(document).html
prompt = self.form_rewriting_prompt.replace("{block_html}", block_html)

prompt = self.gemini_rewriting_prompt + '```html\n`' + block_html + '`\n```\n'
image = self.extract_image(document, block)
response_schema = content.Schema(
type=content.Type.OBJECT,
Expand Down
86 changes: 86 additions & 0 deletions marker/processors/llm/llm_handwriting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import markdown2

from marker.processors.llm import BaseLLMProcessor

from google.ai.generativelanguage_v1beta.types import content

from marker.schema import BlockTypes
from marker.schema.blocks import Equation
from marker.schema.document import Document
from marker.schema.groups.page import PageGroup

from typing import Annotated


class LLMHandwritingProcessor(BaseLLMProcessor):
    """Ask the LLM to transcribe a block image to markdown and store it as HTML.

    The block's extracted text is substituted into
    ``handwriting_generation_prompt`` and sent to the model together with an
    image of the block. The returned markdown is converted with ``markdown2``
    and assigned to ``block.html``.
    """

    # NOTE(review): this targets Equation blocks, the same type handled by the
    # equation processor; it looks copied from there. Presumably a
    # handwriting-specific block type was intended - confirm against BlockTypes.
    block_types = (BlockTypes.Equation,)
    # NOTE(review): not referenced anywhere in this class - confirm intended use.
    min_handwriting_height: Annotated[
        float,
        "The minimum ratio between handwriting height and page height to consider for processing.",
    ] = 0.1
    handwriting_generation_prompt: Annotated[
        str,
        "The prompt to use for OCRing handwriting.",
        "Default is a string containing the Gemini prompt."
    ] = """You are an expert editor specializing in accurately reproducing text from images.
You will receive an image of a text block, along with the text that can be extracted. Your task is to generate markdown to properly represent the content of the image. Do not omit any text present in the image - make sure everything is included in the markdown representation. The markdown representation should be as faithful to the original image as possible.
Formatting should be in markdown, with the following rules:
- * for italics, ** for bold, and ` for inline code.
- Headers should be formatted with #, with one # for the largest header, and up to 6 for the smallest.
- Lists should be formatted with either - or 1. for unordered and ordered lists, respectively.
- Links should be formatted with [text](url).
- Use ``` for code blocks.
- Inline math should be formatted with <math>math expression</math>.
- Display math should be formatted with <math display="block">math expression</math>.
- Values and labels should be extracted from forms, and put into markdown tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables.
- Tables should be formatted with markdown tables, with the headers bolded.
**Instructions:**
1. Carefully examine the provided block image.
2. Analyze the existing text representation.
3. Output the markdown representing the content of the image.
**Example:**
Input:
```text
This i sm handwritting.
```
Output:
```markdown
This is some *handwriting*.
```
**Input:**
```text
{extracted_text}
```
"""

    @staticmethod
    def _strip_markdown_fence(markdown: str) -> str:
        """Remove one wrapping markdown code fence around the model output, if present."""
        cleaned = markdown.strip()
        # Match the fence as a whole prefix. str.lstrip("```markdown") treats its
        # argument as a character set and would also eat leading letters of the
        # real content (e.g. "and more" -> "more").
        for opener in ("```markdown", "```\n"):
            if cleaned.startswith(opener):
                cleaned = cleaned[len(opener):]
                # Only drop a closing fence that pairs with the opener, so a
                # code block that legitimately ends the content stays intact.
                if cleaned.endswith("```"):
                    cleaned = cleaned[:-len("```")]
                break
        return cleaned.strip()

    def process_rewriting(self, document: Document, page: PageGroup, block: Equation):
        """Replace ``block.html`` with HTML rendered from the model's markdown.

        Args:
            document: Document the block belongs to; used to read the raw text
                and to crop the block image.
            page: Page group containing the block (not used here).
            block: Block to transcribe; updated in place.

        Returns:
            None. On a missing or suspiciously short answer the block is
            flagged through ``update_metadata(llm_error_count=1)`` and left
            unchanged.
        """
        text = block.raw_text(document)
        # The placeholder must match the one in the prompt template
        # ("{extracted_text}"); otherwise the model never sees the extracted text.
        prompt = self.handwriting_generation_prompt.replace("{extracted_text}", text)

        image = self.extract_image(document, block)
        response_schema = content.Schema(
            type=content.Type.OBJECT,
            enum=[],
            required=["markdown"],
            properties={
                "markdown": content.Schema(
                    type=content.Type.STRING
                )
            },
        )

        response = self.model.generate_response(prompt, image, block, response_schema)

        if not response or "markdown" not in response:
            block.update_metadata(llm_error_count=1)
            return

        markdown = response["markdown"]
        # An answer less than half as long as the input most likely dropped content.
        if len(markdown) < len(text) * .5:
            block.update_metadata(llm_error_count=1)
            return

        markdown = self._strip_markdown_fence(markdown)
        block.html = markdown2.markdown(markdown)
5 changes: 4 additions & 1 deletion marker/processors/llm/llm_image_description.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ class LLMImageDescriptionProcessor(BaseLLMProcessor):
Output:
In this figure, a bar chart titled "Fruit Preference Survey" is showing the number of people who prefer different types of fruits. The x-axis shows the types of fruits, and the y-axis shows the number of people. The bar chart shows that most people prefer apples, followed by bananas and oranges. 20 people prefer apples, 15 people prefer bananas, and 10 people prefer oranges.
**Input:**
```text
{raw_text}
```
"""

def process_rewriting(self, document: Document, page: PageGroup, block: Block):
Expand All @@ -44,7 +47,7 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
# Since this processor replaces images with descriptions
return

prompt = self.image_description_prompt + '```text\n`' + block.raw_text(document) + '`\n```\n'
prompt = self.image_description_prompt.replace("{raw_text}", block.raw_text(document))
image = self.extract_image(document, block)
response_schema = content.Schema(
type=content.Type.OBJECT,
Expand Down
16 changes: 13 additions & 3 deletions marker/processors/llm/llm_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,25 @@ class LLMTableProcessor(BaseLLMProcessor):
Tuple[BlockTypes],
"The block types to process.",
] = (BlockTypes.Table, BlockTypes.TableOfContents)
gemini_rewriting_prompt: Annotated[
table_rewriting_prompt: Annotated[
str,
"The prompt to use for rewriting text.",
"Default is a string containing the Gemini rewriting prompt."
] = """You are a text correction expert specializing in accurately reproducing text from images.
You will receive an image of a text block and an html representation of the table in the image.
Your task is to correct any errors in the html representation. The html representation should be as faithful to the original table as possible.
Some guidelines:
- Make sure to reproduce the original values as faithfully as possible.
- If you see any math in a table cell, fence it with the <math display="inline"> tag. Block math should be fenced with <math display="block">.
- Replace any images with a description, like "Image: [description]".
- Only use the tags th, td, tr, span, i, b, math, and table. Only use the attributes display, style, colspan, and rowspan if necessary.
**Instructions:**
1. Carefully examine the provided text block image.
2. Analyze the html representation of the table.
3. If the html representation is largely correct, then write "No corrections needed."
4. If the html representation contains errors, generate the corrected html representation. Only use the tags th, td, tr, and table. Only use the attributes colspan and rowspan if necessary.
4. If the html representation contains errors, generate the corrected html representation.
5. Output only either the corrected html representation or "No corrections needed."
**Example:**
Input:
Expand All @@ -50,6 +57,9 @@ class LLMTableProcessor(BaseLLMProcessor):
No corrections needed.
```
**Input:**
```html
{block_html}
```
"""

def process_rewriting(self, document: Document, page: PageGroup, block: Block):
Expand All @@ -59,8 +69,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
return

block_html = block.render(document).html
prompt = self.table_rewriting_prompt.replace("{block_html}", block_html)

prompt = self.gemini_rewriting_prompt + '```html\n`' + block_html + '`\n```\n'
image = self.extract_image(document, block)
response_schema = content.Schema(
type=content.Type.OBJECT,
Expand Down
4 changes: 2 additions & 2 deletions marker/processors/llm/llm_table_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class LLMTableMergeProcessor(BaseLLMProcessor):
int,
"The maximum gap between columns to merge tables"
] = 50
gemini_table_merge_prompt: Annotated[
table_merge_prompt: Annotated[
str,
"The prompt to use for rewriting text.",
"Default is a string containing the Gemini rewriting prompt."
Expand Down Expand Up @@ -212,7 +212,7 @@ def process_rewriting(self, document: Document, blocks: List[Block]):
start_html = start_block.render(document).html
curr_html = curr_block.render(document).html

prompt = self.gemini_table_merge_prompt.replace("{{table1}}", start_html).replace("{{table2}}", curr_html)
prompt = self.table_merge_prompt.replace("{{table1}}", start_html).replace("{{table2}}", curr_html)

response_schema = content.Schema(
type=content.Type.OBJECT,
Expand Down
Loading

0 comments on commit 9ad7dc3

Please sign in to comment.