Skip to content

Commit

Permalink
Update llm processors
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 15, 2025
1 parent dbe1fc4 commit 230a299
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 13 deletions.
18 changes: 8 additions & 10 deletions marker/builders/llm_layout.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Annotated, Optional
+from typing import Annotated

from google.ai.generativelanguage_v1beta.types import content
from surya.layout import LayoutPredictor
Expand Down Expand Up @@ -30,7 +29,7 @@ class LLMLayoutBuilder(LayoutBuilder):
confidence_threshold: Annotated[
float,
"The confidence threshold to use for relabeling.",
-] = 0.75
+] = 0.8
picture_height_threshold: Annotated[
float,
"The height threshold for pictures that may actually be complex regions.",
Expand All @@ -55,12 +54,12 @@ class LLMLayoutBuilder(LayoutBuilder):
str,
"The prompt to use for relabelling blocks.",
"Default is a string containing the Gemini relabelling prompt."
-] = """You are a layout expert specializing in document analysis.
+] = """You're a layout expert specializing in document analysis.
Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
You will be provided with an image of a layout block and the top k predictions from the current model, along with the per-label confidence scores.
Your job is to analyze the image and choose the single most appropriate label from the provided top k predictions.
Do not invent any new labels.
-Carefully examine the image and consider the provided predictions. Take the model confidence scores into account. If the existing label is the most appropriate, you should not change it.
+Carefully examine the image and consider the provided predictions. Take the model confidence scores into account. The confidence is reported on a 0-1 scale, with 1 being 100% confident. If the existing label is the most appropriate, you should not change it.
**Instructions**
1. Analyze the image and consider the provided top k predictions.
2. Write a short description of the image, and which of the potential labels you believe is the most accurate representation of the layout block.
Expand All @@ -78,7 +77,7 @@ class LLMLayoutBuilder(LayoutBuilder):
str,
"The prompt to use for complex relabelling blocks.",
"Default is a string containing the complex relabelling prompt."
-] = """You are a layout expert specializing in document analysis.
+] = """You're a layout expert specializing in document analysis.
Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
You will be provided with an image of a layout block and some potential labels that might be appropriate.
Your job is to analyze the image and choose the single most appropriate label from the provided labels.
Expand Down Expand Up @@ -134,25 +133,23 @@ def process_block_topk_relabeling(self, document: Document, page: PageGroup, blo
potential_labels = ""
for block_type in topk_types:
label_cls = get_block_class(block_type)
-potential_labels += f"- `{block_type}` - {label_cls.block_description}\n"
+potential_labels += f"- `{block_type}` - {label_cls.model_fields['block_description'].default}\n"

topk = ""
for k,v in block.top_k.items():
topk += f"- `{k}` - Confidence {round(v, 3)}\n"

prompt = self.topk_relabelling_prompt.replace("{potential_labels}", potential_labels).replace("{top_k}", topk)
-print(prompt)

return self.process_block_relabeling(document, page, block, prompt)

def process_block_complex_relabeling(self, document: Document, page: PageGroup, block: Block):
potential_labels = ""
for block_type in [BlockTypes.Figure, BlockTypes.Picture, BlockTypes.ComplexRegion, BlockTypes.Table, BlockTypes.Form]:
label_cls = get_block_class(block_type)
-potential_labels += f"- `{block_type}` - {label_cls.block_description}\n"
+potential_labels += f"- `{block_type}` - {label_cls.model_fields['block_description'].default}\n"

complex_prompt = self.complex_relabeling_prompt.replace("{potential_labels}", potential_labels)
-print(complex_prompt)
return self.process_block_relabeling(document, page, block, complex_prompt)

def process_block_relabeling(self, document: Document, page: PageGroup, block: Block, prompt: str):
Expand All @@ -172,6 +169,7 @@ def process_block_relabeling(self, document: Document, page: PageGroup, block: B
)

response = self.model.generate_response(prompt, image, block, response_schema)
+print(response)
generated_label = None
if response and "label" in response:
generated_label = response["label"]
Expand Down
14 changes: 11 additions & 3 deletions marker/processors/llm/llm_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,17 @@ def parse_html_table(self, html_text: str, block: Block, page: PageGroup) -> Lis
# Initialize grid
rows = table.find_all('tr')
cells = []
-max_cols = max(len(row.find_all(['td', 'th'])) for row in rows)
-if max_cols == 0:
-return []

+# Find maximum number of columns in colspan-aware way
+max_cols = 0
+for row in rows:
+row_tds = row.find_all(['td', 'th'])
+curr_cols = 0
+for cell in row_tds:
+colspan = int(cell.get('colspan', 1))
+curr_cols += colspan
+if curr_cols > max_cols:
+max_cols = curr_cols

grid = [[True] * max_cols for _ in range(len(rows))]

Expand Down

0 comments on commit 230a299

Please sign in to comment.