From dbe1fc4c811a36711dacfcefa2e727b13511cd8d Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 15 Jan 2025 15:03:03 -0500 Subject: [PATCH] Update layout prompts --- benchmarks/table/table.py | 105 ++++++++++++++++++++--- convert_single.py | 1 - marker/builders/llm_layout.py | 61 +++++++++---- marker/converters/table.py | 2 + marker/processors/llm/llm_table_merge.py | 23 +++-- marker/processors/table.py | 5 +- marker/renderers/json.py | 3 + marker/schema/blocks/base.py | 1 + marker/schema/blocks/caption.py | 1 + marker/schema/blocks/code.py | 1 + marker/schema/blocks/complexregion.py | 1 + marker/schema/blocks/equation.py | 1 + marker/schema/blocks/figure.py | 1 + marker/schema/blocks/footnote.py | 1 + marker/schema/blocks/form.py | 1 + marker/schema/blocks/handwriting.py | 1 + marker/schema/blocks/inlinemath.py | 1 + marker/schema/blocks/listitem.py | 1 + marker/schema/blocks/pagefooter.py | 1 + marker/schema/blocks/pageheader.py | 1 + marker/schema/blocks/picture.py | 1 + marker/schema/blocks/sectionheader.py | 1 + marker/schema/blocks/table.py | 1 + marker/schema/blocks/tablecell.py | 1 + marker/schema/blocks/text.py | 1 + marker/schema/blocks/toc.py | 1 + marker/schema/groups/figure.py | 1 + marker/schema/groups/list.py | 1 + marker/schema/groups/page.py | 1 + marker/schema/groups/picture.py | 1 + marker/schema/groups/table.py | 1 + marker/schema/text/line.py | 1 + marker/schema/text/span.py | 1 + 33 files changed, 186 insertions(+), 40 deletions(-) diff --git a/benchmarks/table/table.py b/benchmarks/table/table.py index a6e4bd82..a5a0ef24 100644 --- a/benchmarks/table/table.py +++ b/benchmarks/table/table.py @@ -1,4 +1,10 @@ import os +from typing import List + +import numpy as np + +from marker.renderers.json import JSONOutput, JSONBlockOutput + os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS import base64 @@ -10,8 +16,9 @@ from tabulate import tabulate import json from bs4 import BeautifulSoup -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor from pypdfium2._helpers.misc import PdfiumError +from marker.util import matrix_intersection_area from marker.config.parser import ConfigParser from marker.converters.table import TableConverter @@ -27,13 +34,24 @@ def update_teds_score(result): return result +def extract_tables(children: List[JSONBlockOutput]): + tables = [] + for child in children: + if child.block_type == 'Table': + tables.append(child) + elif child.children: + tables.extend(extract_tables(child.children)) + return tables + + @click.command(help="Benchmark Table to HTML Conversion") @click.argument("out_file", type=str) @click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use") @click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process") -def main(out_file: str, dataset: str, max_rows: int): +@click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use") +def main(out_file: str, dataset: str, max_rows: int, max_workers: int): models = create_model_dict() - config_parser = ConfigParser({'output_format': 'html'}) + config_parser = ConfigParser({'output_format': 'json'}) start = time.time() @@ -45,6 +63,7 @@ def main(out_file: str, dataset: str, max_rows: int): iterations = min(max_rows, len(dataset)) results = [] + total_unaligned = 0 for i in tqdm(range(iterations), desc='Converting Tables'): try: row = dataset[i] @@ -61,19 +80,74 @@ def main(out_file: str, dataset: str, max_rows: int): with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file: temp_pdf_file.write(pdf_binary) temp_pdf_file.seek(0) - marker_table_html = converter(temp_pdf_file.name).html + tqdm.disable = True + marker_json = converter(temp_pdf_file.name).children + tqdm.disable = False - marker_table_soup = BeautifulSoup(marker_table_html, 'html.parser') - marker_detected_tables = marker_table_soup.find_all('table') - if len(marker_detected_tables)==0: + if len(marker_json) == 0 or len(gt_tables) == 0: print(f'No tables detected, skipping...') + total_unaligned += len(gt_tables) + continue + + marker_tables = extract_tables(marker_json) + marker_table_boxes = [table.bbox for table in marker_tables] + page_bbox = marker_json[0].bbox + + # Normalize the bboxes + for bbox in marker_table_boxes: + bbox[0] = bbox[0] / page_bbox[2] + bbox[1] = bbox[1] / page_bbox[3] + bbox[2] = bbox[2] / page_bbox[2] + bbox[3] = bbox[3] / page_bbox[3] + + gt_boxes = [table['normalized_bbox'] for table in gt_tables] + gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes] + marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes] + table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes) + + aligned_tables = [] + used_tables = set() + unaligned_tables = set() + for table_idx, alignment in enumerate(table_alignments): + try: + max_area = np.max(alignment) + aligned_idx = np.argmax(alignment) + except ValueError: + # No alignment found + unaligned_tables.add(table_idx) + continue + + if aligned_idx in used_tables: + # Marker table already aligned with another gt table + unaligned_tables.add(table_idx) + continue + + # Gt table doesn't align well with any marker table + gt_table_pct = gt_areas[table_idx] / max_area + if not .75 < gt_table_pct < 1.25: + unaligned_tables.add(table_idx) + continue + + # Marker table doesn't align with gt table + marker_table_pct = marker_areas[aligned_idx] / max_area + if not .75 < marker_table_pct < 1.25: + unaligned_tables.add(table_idx) + continue + + aligned_tables.append( + (marker_tables[aligned_idx], gt_tables[table_idx]) + ) + used_tables.add(aligned_idx) + + total_unaligned += len(unaligned_tables) - for marker_table_soup, gt_table in zip(marker_detected_tables, gt_tables): + for marker_table, gt_table in aligned_tables: gt_table_html = gt_table['html'] - + #marker wraps the table in which fintabnet data doesn't - marker_table_soup.find('tbody').unwrap() #Fintabnet doesn't use th tags, need to be replaced for fair comparison + marker_table_soup = BeautifulSoup(marker_table.html, 'html.parser') + marker_table_soup.find('tbody').unwrap() for th_tag in marker_table_soup.find_all('th'): th_tag.name = 'td' marker_table_html = str(marker_table_soup) @@ -86,10 +160,15 @@ def main(out_file: str, dataset: str, max_rows: int): print('Broken PDF, Skipping...') continue - print(f"Total time: {time.time() - start}") + print(f"Total time: {time.time() - start}.") + print(f"Could not align {total_unaligned} tables from fintabnet.") - with ThreadPoolExecutor(max_workers=16) as executor: - results = list(tqdm(executor.map(update_teds_score, results), desc='Computing alignment scores', total=len(results))) + with ProcessPoolExecutor(max_workers=max_workers) as executor: + results = list( + tqdm( + executor.map(update_teds_score, results), desc='Computing alignment scores', total=len(results) + ) + ) avg_score = sum([r["score"] for r in results]) / len(results) headers = ["Avg score", "Total tables"] diff --git a/convert_single.py b/convert_single.py index 271833b6..637e1b10 100755 --- a/convert_single.py +++ b/convert_single.py @@ -9,7 +9,6 @@ from marker.config.parser import ConfigParser from marker.config.printer import CustomClickPrinter -from marker.converters.pdf import PdfConverter from marker.logger import configure_logging from marker.models import create_model_dict from marker.output import save_output diff --git a/marker/builders/llm_layout.py b/marker/builders/llm_layout.py index 0b574b76..17a007d3 100644 --- a/marker/builders/llm_layout.py +++ b/marker/builders/llm_layout.py @@ -30,7 +30,7 @@ class LLMLayoutBuilder(LayoutBuilder): confidence_threshold: Annotated[ float, "The confidence threshold to use for relabeling.", - ] = 0.7 + ] = 0.75 picture_height_threshold: Annotated[ float, "The height threshold for pictures that may actually be complex regions.", @@ -57,14 +57,22 @@ class LLMLayoutBuilder(LayoutBuilder): "Default is a string containing the Gemini relabelling prompt." ] = """You are a layout expert specializing in document analysis. Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model. -You will be provided with an image of a layout block and the top k predictions from the current model, along with their confidence scores. +You will be provided with an image of a layout block and the top k predictions from the current model, along with the per-label confidence scores. Your job is to analyze the image and choose the single most appropriate label from the provided top k predictions. Do not invent any new labels. -Carefully examine the image and consider the provided predictions. -Choose the label you believe is the most accurate representation of the layout block. +Carefully examine the image and consider the provided predictions. Take the model confidence scores into account. If the existing label is the most appropriate, you should not change it. +**Instructions** +1. Analyze the image and consider the provided top k predictions. +2. Write a short description of the image, and which of the potential labels you believe is the most accurate representation of the layout block. +3. Choose the single most appropriate label from the provided top k predictions. -Here are the top k predictions from the model followed by the image: +Here are descriptions of the layout blocks you can choose from: +{potential_labels} + +Here are the top k predictions from the model: + +{top_k} """ complex_relabeling_prompt: Annotated[ str, @@ -72,23 +80,19 @@ class LLMLayoutBuilder(LayoutBuilder): "Default is a string containing the complex relabelling prompt." ] = """You are a layout expert specializing in document analysis. Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model. -You will be provided with an image of a layout block and some potential labels. +You will be provided with an image of a layout block and some potential labels that might be appropriate. Your job is to analyze the image and choose the single most appropriate label from the provided labels. Do not invent any new labels. -Carefully examine the image and consider the provided predictions. -Choose the label you believe is the most accurate representation of the layout block. +**Instructions** +1. Analyze the image and consider the potential labels. +2. Write a short description of the image, and which of the potential labels you believe is the most accurate representation of the layout block. +3. Choose the single most appropriate label from the provided labels. Potential labels: -- Picture -- Table -- Form -- Figure - A graph or diagram with text. -- ComplexRegion - a complex region containing multiple text and other elements. +{potential_labels} Respond only with one of `Figure`, `Picture`, `ComplexRegion`, `Table`, or `Form`. - -Here is the image of the layout block: """ def __init__(self, layout_model: LayoutPredictor, ocr_error_model: OCRErrorPredictor, config=None): @@ -126,13 +130,29 @@ def relabel_blocks(self, document: Document): pbar.close() def process_block_topk_relabeling(self, document: Document, page: PageGroup, block: Block): - topk = {str(k): round(v, 3) for k, v in block.top_k.items()} + topk_types = list(block.top_k.keys()) + potential_labels = "" + for block_type in topk_types: + label_cls = get_block_class(block_type) + potential_labels += f"- `{block_type}` - {label_cls.block_description}\n" + + topk = "" + for k,v in block.top_k.items(): + topk += f"- `{k}` - Confidence {round(v, 3)}\n" + + prompt = self.topk_relabelling_prompt.replace("{potential_labels}", potential_labels).replace("{top_k}", topk) + print(prompt) - prompt = self.topk_relabelling_prompt + '```json' + json.dumps(topk) + '```\n' return self.process_block_relabeling(document, page, block, prompt) def process_block_complex_relabeling(self, document: Document, page: PageGroup, block: Block): - complex_prompt = self.complex_relabeling_prompt + potential_labels = "" + for block_type in [BlockTypes.Figure, BlockTypes.Picture, BlockTypes.ComplexRegion, BlockTypes.Table, BlockTypes.Form]: + label_cls = get_block_class(block_type) + potential_labels += f"- `{block_type}` - {label_cls.block_description}\n" + + complex_prompt = self.complex_relabeling_prompt.replace("{potential_labels}", potential_labels) + print(complex_prompt) return self.process_block_relabeling(document, page, block, complex_prompt) def process_block_relabeling(self, document: Document, page: PageGroup, block: Block, prompt: str): @@ -140,8 +160,11 @@ def process_block_relabeling(self, document: Document, page: PageGroup, block: B response_schema = content.Schema( type=content.Type.OBJECT, enum=[], - required=["label"], + required=["image_description", "label"], properties={ + "image_description": content.Schema( + type=content.Type.STRING, + ), "label": content.Schema( type=content.Type.STRING, ), diff --git a/marker/converters/table.py b/marker/converters/table.py index a664fa49..1a56fc82 100644 --- a/marker/converters/table.py +++ b/marker/converters/table.py @@ -1,3 +1,4 @@ +from functools import cache from typing import Tuple, List from marker.builders.document import DocumentBuilder @@ -23,6 +24,7 @@ class TableConverter(PdfConverter): ) converter_block_types: List[BlockTypes] = (BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents) + @cache def build_document(self, filepath: str): provider_cls = provider_from_filepath(filepath) layout_builder = self.resolve_dependencies(self.layout_builder_class) diff --git a/marker/processors/llm/llm_table_merge.py b/marker/processors/llm/llm_table_merge.py index d1561b76..4476c31e 100644 --- a/marker/processors/llm/llm_table_merge.py +++ b/marker/processors/llm/llm_table_merge.py @@ -32,6 +32,10 @@ class LLMTableMergeProcessor(BaseLLMProcessor): int, "The maximum distance between table edges for adjacency." ] = 20 + column_gap_threshold: Annotated[ + int, + "The maximum gap between columns to merge tables" + ] = 50 gemini_table_merge_prompt: Annotated[ str, "The prompt to use for rewriting text.", @@ -133,10 +137,8 @@ def rewrite_blocks(self, document: Document): for page in document.pages: page_blocks = page.contained_blocks(document, self.block_types) for block in page_blocks: - if prev_block is None: - subsequent_page_table = False - same_page_vertical_table = False - else: + merge_condition = False + if prev_block is not None: prev_cells = prev_block.contained_blocks(document, (BlockTypes.TableCell,)) curr_cells = block.contained_blocks(document, (BlockTypes.TableCell,)) row_match = abs(self.get_row_count(prev_cells) - self.get_row_count(curr_cells)) < 5, # Similar number of rows @@ -154,11 +156,20 @@ def rewrite_blocks(self, document: Document): prev_block.page_id == block.page_id, # On the same page (1 - self.vertical_table_height_threshold) < prev_block.polygon.height / block.polygon.height < (1 + self.vertical_table_height_threshold), # Similar height abs(block.polygon.x_start - prev_block.polygon.x_end) < self.vertical_table_distance_threshold, # Close together in x + abs(block.polygon.y_start - prev_block.polygon.y_start) < self.vertical_table_distance_threshold, # Close together in y row_match ]) - if prev_block is not None and \ - (subsequent_page_table or same_page_vertical_table): + same_page_new_column = all([ + prev_block.page_id == block.page_id, # On the same page + abs(block.polygon.x_start - prev_block.polygon.x_end) < self.column_gap_threshold, + block.y_start < prev_block.y_end, + block.polygon.width * (1 - self.vertical_table_height_threshold) < prev_block.polygon.width < block.polygon.width * (1 + self.vertical_table_height_threshold), # Similar width + col_match + ]) + merge_condition = any([subsequent_page_table, same_page_vertical_table, same_page_new_column]) + + if prev_block is not None and merge_condition: if prev_block not in table_run: table_run.append(prev_block) table_run.append(block) diff --git a/marker/processors/table.py b/marker/processors/table.py index 45c6bcc8..100090ed 100644 --- a/marker/processors/table.py +++ b/marker/processors/table.py @@ -105,7 +105,7 @@ def __call__(self, document: Document): colspan=cell.colspan, row_id=cell.row_id, col_id=cell.col_id, - is_header=cell.is_header, + is_header=bool(cell.is_header), page_id=page.page_id, ) page.add_full_block(cell_block) @@ -133,6 +133,9 @@ def normalize_spaces(text): def split_combined_rows(self, tables: List[TableResult]): for table in tables: + if len(table.cells) == 0: + # Skip empty tables + continue unique_rows = sorted(list(set([c.row_id for c in table.cells]))) new_cells = [] shift_up = 0 diff --git a/marker/renderers/json.py b/marker/renderers/json.py index fce9c289..116bc557 100644 --- a/marker/renderers/json.py +++ b/marker/renderers/json.py @@ -14,6 +14,7 @@ class JSONBlockOutput(BaseModel): block_type: str html: str polygon: List[List[float]] + bbox: List[float] children: List['JSONBlockOutput'] | None = None section_hierarchy: Dict[int, str] | None = None images: dict | None = None @@ -52,6 +53,7 @@ def extract_json(self, document: Document, block_output: BlockOutput): return JSONBlockOutput( html=html, polygon=block_output.polygon.polygon, + bbox=block_output.polygon.bbox, id=str(block_output.id), block_type=str(block_output.id.block_type), images=images, @@ -66,6 +68,7 @@ def extract_json(self, document: Document, block_output: BlockOutput): return JSONBlockOutput( html=block_output.html, polygon=block_output.polygon.polygon, + bbox=block_output.polygon.bbox, id=str(block_output.id), block_type=str(block_output.id.block_type), children=children, diff --git a/marker/schema/blocks/base.py b/marker/schema/blocks/base.py index 0fbafe0e..b2a03fa0 100644 --- a/marker/schema/blocks/base.py +++ b/marker/schema/blocks/base.py @@ -71,6 +71,7 @@ def to_path(self): class Block(BaseModel): polygon: PolygonBox + block_description: str block_type: Optional[BlockTypes] = None block_id: Optional[int] = None page_id: Optional[int] = None diff --git a/marker/schema/blocks/caption.py b/marker/schema/blocks/caption.py index 6e424747..3ca7544a 100644 --- a/marker/schema/blocks/caption.py +++ b/marker/schema/blocks/caption.py @@ -4,6 +4,7 @@ class Caption(Block): block_type: BlockTypes = BlockTypes.Caption + block_description: str = "A text caption that is directly above or below an image or table. Only used for text describing the image or table. " def assemble_html(self, document, child_blocks, parent_structure): template = super().assemble_html(document, child_blocks, parent_structure) diff --git a/marker/schema/blocks/code.py b/marker/schema/blocks/code.py index ff65966c..337ca04e 100644 --- a/marker/schema/blocks/code.py +++ b/marker/schema/blocks/code.py @@ -7,6 +7,7 @@ class Code(Block): block_type: BlockTypes = BlockTypes.Code code: str | None = None + block_description: str = "A programming code block." def assemble_html(self, document, child_blocks, parent_structure): code = self.code or "" diff --git a/marker/schema/blocks/complexregion.py b/marker/schema/blocks/complexregion.py index cdfe8179..7b4f6e67 100644 --- a/marker/schema/blocks/complexregion.py +++ b/marker/schema/blocks/complexregion.py @@ -5,6 +5,7 @@ class ComplexRegion(Block): block_type: BlockTypes = BlockTypes.ComplexRegion html: str | None = None + block_description: str = "A complex region that can consist of multiple different types of blocks mixed with images. This block is chosen when it is difficult to categorize the region as a single block type." def assemble_html(self, document, child_blocks, parent_structure): if self.html: diff --git a/marker/schema/blocks/equation.py b/marker/schema/blocks/equation.py index b82f3c7d..e4ab59eb 100644 --- a/marker/schema/blocks/equation.py +++ b/marker/schema/blocks/equation.py @@ -7,6 +7,7 @@ class Equation(Block): block_type: BlockTypes = BlockTypes.Equation latex: str | None = None + block_description: str = "A block math equation." def assemble_html(self, document, child_blocks, parent_structure=None): if self.latex: diff --git a/marker/schema/blocks/figure.py b/marker/schema/blocks/figure.py index 74270bbe..4eade60d 100644 --- a/marker/schema/blocks/figure.py +++ b/marker/schema/blocks/figure.py @@ -5,6 +5,7 @@ class Figure(Block): block_type: BlockTypes = BlockTypes.Figure description: str | None = None + block_description: str = "A chart or other image that contains data." def assemble_html(self, document, child_blocks, parent_structure): if self.description: diff --git a/marker/schema/blocks/footnote.py b/marker/schema/blocks/footnote.py index f58983fb..1cd1978d 100644 --- a/marker/schema/blocks/footnote.py +++ b/marker/schema/blocks/footnote.py @@ -4,6 +4,7 @@ class Footnote(Block): block_type: BlockTypes = BlockTypes.Footnote + block_description: str = "A footnote that explains a term or concept in the document." def assemble_html(self, document, child_blocks, parent_structure): template = super().assemble_html(document, child_blocks, parent_structure) diff --git a/marker/schema/blocks/form.py b/marker/schema/blocks/form.py index 4185520d..30731a97 100644 --- a/marker/schema/blocks/form.py +++ b/marker/schema/blocks/form.py @@ -6,3 +6,4 @@ class Form(BaseTable): block_type: BlockTypes = BlockTypes.Form + block_description: str = "A form, such as a tax form, that contains fields and labels." diff --git a/marker/schema/blocks/handwriting.py b/marker/schema/blocks/handwriting.py index 540369ae..4eafa6f3 100644 --- a/marker/schema/blocks/handwriting.py +++ b/marker/schema/blocks/handwriting.py @@ -4,6 +4,7 @@ class Handwriting(Block): block_type: BlockTypes = BlockTypes.Handwriting + block_description: str = "A region that contains handwriting." def assemble_html(self, document, child_blocks, parent_structure): template = super().assemble_html(document, child_blocks, parent_structure) diff --git a/marker/schema/blocks/inlinemath.py b/marker/schema/blocks/inlinemath.py index 6e415745..d669406a 100644 --- a/marker/schema/blocks/inlinemath.py +++ b/marker/schema/blocks/inlinemath.py @@ -7,6 +7,7 @@ class InlineMath(Block): has_continuation: bool = False blockquote: bool = False blockquote_level: int = 0 + block_description: str = "A text block that contains inline math. This is not used for italic text or references - only for text that contains math." def assemble_html(self, document, child_blocks, parent_structure): if self.ignore_for_output: diff --git a/marker/schema/blocks/listitem.py b/marker/schema/blocks/listitem.py index 91ab539d..d8c45e6e 100644 --- a/marker/schema/blocks/listitem.py +++ b/marker/schema/blocks/listitem.py @@ -19,6 +19,7 @@ def replace_bullets(child_blocks): class ListItem(Block): block_type: BlockTypes = BlockTypes.ListItem list_indent_level: int = 0 + block_description: str = "A list item that is part of a list. This block is used to represent a single item in a list." def assemble_html(self, document, child_blocks, parent_structure): template = super().assemble_html(document, child_blocks, parent_structure) diff --git a/marker/schema/blocks/pagefooter.py b/marker/schema/blocks/pagefooter.py index e1127a6c..945199e5 100644 --- a/marker/schema/blocks/pagefooter.py +++ b/marker/schema/blocks/pagefooter.py @@ -4,6 +4,7 @@ class PageFooter(Block): block_type: str = BlockTypes.PageFooter + block_description: str = "Text that appears at the bottom of a page, like a page number." def assemble_html(self, document, child_blocks, parent_structure): if self.ignore_for_output: diff --git a/marker/schema/blocks/pageheader.py b/marker/schema/blocks/pageheader.py index 3b648c3b..b27e0b30 100644 --- a/marker/schema/blocks/pageheader.py +++ b/marker/schema/blocks/pageheader.py @@ -4,6 +4,7 @@ class PageHeader(Block): block_type: BlockTypes = BlockTypes.PageHeader + block_description: str = "Text that appears at the top of a page, like a page title." def assemble_html(self, document, child_blocks, parent_structure): if self.ignore_for_output: diff --git a/marker/schema/blocks/picture.py b/marker/schema/blocks/picture.py index a2be8394..5d0d633b 100644 --- a/marker/schema/blocks/picture.py +++ b/marker/schema/blocks/picture.py @@ -5,6 +5,7 @@ class Picture(Block): block_type: BlockTypes = BlockTypes.Picture description: str | None = None + block_description: str = "An image block that represents a picture." def assemble_html(self, document, child_blocks, parent_structure): if self.description: diff --git a/marker/schema/blocks/sectionheader.py b/marker/schema/blocks/sectionheader.py index 2a104f24..32468433 100644 --- a/marker/schema/blocks/sectionheader.py +++ b/marker/schema/blocks/sectionheader.py @@ -7,6 +7,7 @@ class SectionHeader(Block): block_type: BlockTypes = BlockTypes.SectionHeader heading_level: Optional[int] = None + block_description: str = "The header of a section of text or other blocks." def assemble_html(self, document, child_blocks, parent_structure): if self.ignore_for_output: diff --git a/marker/schema/blocks/table.py b/marker/schema/blocks/table.py index 812fd57d..3f45cdb7 100644 --- a/marker/schema/blocks/table.py +++ b/marker/schema/blocks/table.py @@ -4,3 +4,4 @@ class Table(BaseTable): block_type: BlockTypes = BlockTypes.Table + block_description: str = "A table of data, like a results table." diff --git a/marker/schema/blocks/tablecell.py b/marker/schema/blocks/tablecell.py index 276def77..09eaaa75 100644 --- a/marker/schema/blocks/tablecell.py +++ b/marker/schema/blocks/tablecell.py @@ -10,6 +10,7 @@ class TableCell(Block): col_id: int is_header: bool text: str = "" + block_description: str = "A cell in a table." def assemble_html(self, document, child_blocks, parent_structure=None): tag = "th" if self.is_header else "td" diff --git a/marker/schema/blocks/text.py b/marker/schema/blocks/text.py index edb25969..853a73a4 100644 --- a/marker/schema/blocks/text.py +++ b/marker/schema/blocks/text.py @@ -7,6 +7,7 @@ class Text(Block): has_continuation: bool = False blockquote: bool = False blockquote_level: int = 0 + block_description: str = "A paragraph or line of text." def assemble_html(self, document, child_blocks, parent_structure): if self.ignore_for_output: diff --git a/marker/schema/blocks/toc.py b/marker/schema/blocks/toc.py index 3e458372..e5a043ae 100644 --- a/marker/schema/blocks/toc.py +++ b/marker/schema/blocks/toc.py @@ -4,3 +4,4 @@ class TableOfContents(BaseTable): block_type: str = BlockTypes.TableOfContents + block_description: str = "A table of contents." diff --git a/marker/schema/groups/figure.py b/marker/schema/groups/figure.py index ce916cc6..e3517eb4 100644 --- a/marker/schema/groups/figure.py +++ b/marker/schema/groups/figure.py @@ -4,3 +4,4 @@ class FigureGroup(Group): block_type: BlockTypes = BlockTypes.FigureGroup + block_description: str = "A group that contains a figure and associated captions." diff --git a/marker/schema/groups/list.py b/marker/schema/groups/list.py index e45bdd15..6e4304dc 100644 --- a/marker/schema/groups/list.py +++ b/marker/schema/groups/list.py @@ -5,6 +5,7 @@ class ListGroup(Group): block_type: BlockTypes = BlockTypes.ListGroup has_continuation: bool = False + block_description: str = "A group of list items that should be rendered together." def assemble_html(self, document, child_blocks, parent_structure): template = super().assemble_html(document, child_blocks, parent_structure) diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py index 4051cb55..6094faa9 100644 --- a/marker/schema/groups/page.py +++ b/marker/schema/groups/page.py @@ -22,6 +22,7 @@ class PageGroup(Group): layout_sliced: bool = False # Whether the layout model had to slice the image (order may be wrong) excluded_block_types: Sequence[BlockTypes] = (BlockTypes.Line, BlockTypes.Span,) maximum_assignment_distance: float = 20 # pixels + block_description: str = "A single page in the document." def incr_block_id(self): if self.block_id is None: diff --git a/marker/schema/groups/picture.py b/marker/schema/groups/picture.py index 36097dc2..c233ce27 100644 --- a/marker/schema/groups/picture.py +++ b/marker/schema/groups/picture.py @@ -4,3 +4,4 @@ class PictureGroup(Group): block_type: BlockTypes = BlockTypes.PictureGroup + block_description: str = "A picture along with associated captions." diff --git a/marker/schema/groups/table.py b/marker/schema/groups/table.py index 374f2a3e..86c13890 100644 --- a/marker/schema/groups/table.py +++ b/marker/schema/groups/table.py @@ -4,3 +4,4 @@ class TableGroup(Group): block_type: BlockTypes = BlockTypes.TableGroup + block_description: str = "A table along with associated captions." diff --git a/marker/schema/text/line.py b/marker/schema/text/line.py index 70469757..30525a38 100644 --- a/marker/schema/text/line.py +++ b/marker/schema/text/line.py @@ -35,6 +35,7 @@ def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str: class Line(Block): block_type: BlockTypes = BlockTypes.Line + block_description: str = "A line of text." def formatted_text(self, document): text = "" diff --git a/marker/schema/text/span.py b/marker/schema/text/span.py index 4ac5ebca..185059b3 100644 --- a/marker/schema/text/span.py +++ b/marker/schema/text/span.py @@ -14,6 +14,7 @@ def cleanup_text(full_text): class Span(Block): block_type: BlockTypes = BlockTypes.Span + block_description: str = "A span of text inside a line." text: str font: str