Skip to content

Commit

Permalink
Merge pull request #375 from VikParuchuri/vik_v2
Browse files Browse the repository at this point in the history
Add code processor, fix issues with structure
  • Loading branch information
VikParuchuri authored Nov 20, 2024
2 parents 7b817ff + bf9199f commit d9352be
Show file tree
Hide file tree
Showing 10 changed files with 112 additions and 16 deletions.
6 changes: 3 additions & 3 deletions marker/v2/builders/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
class LayoutBuilder(BaseBuilder):
batch_size = None
layout_coverage_min_lines = 1
layout_coverage_threshold = .5
layout_coverage_threshold = .3

def __init__(self, layout_model, config=None):
self.layout_model = layout_model
Expand Down Expand Up @@ -73,7 +73,7 @@ def check_layout_coverage(
total_blocks = 0
for layout_block_id in document_page.structure:
layout_block = document_page.get_block(layout_block_id)
if layout_block.block_type in [BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table]:
if layout_block.block_type in [BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table, BlockTypes.FigureGroup, BlockTypes.TableGroup, BlockTypes.PictureGroup]:
continue

total_blocks += 1
Expand All @@ -85,5 +85,5 @@ def check_layout_coverage(
if intersecting_lines > self.layout_coverage_min_lines:
covered_blocks += 1

coverage_ratio = covered_blocks / max(total_blocks, 1)
coverage_ratio = covered_blocks / total_blocks if total_blocks > 0 else 1
return coverage_ratio >= self.layout_coverage_threshold
26 changes: 21 additions & 5 deletions marker/v2/builders/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

class StructureBuilder(BaseBuilder):
gap_threshold: int = .05
list_gap_threshold: int = .1

def __init__(self, config=None):
super().__init__(config)
Expand All @@ -19,11 +20,17 @@ def __call__(self, document: Document):

def group_caption_blocks(self, page: PageGroup):
gap_threshold_px = self.gap_threshold * page.polygon.height
for i, block_id in enumerate(page.structure):
static_page_structure = page.structure.copy()
remove_ids = list()

for i, block_id in enumerate(static_page_structure):
block = page.get_block(block_id)
if block.block_type not in [BlockTypes.Table, BlockTypes.Figure, BlockTypes.Picture]:
continue

if block.block_id in remove_ids:
continue

block_structure = [block_id]
selected_polygons = [block.polygon]
for j, prev_block_id in enumerate(page.structure[:i][::-1]):
Expand Down Expand Up @@ -57,14 +64,21 @@ def group_caption_blocks(self, page: PageGroup):

# Update the structure of the page to reflect the new block
page.update_structure_item(block_id, group_block.id)
page.remove_structure_items(block_structure)
remove_ids.extend(block_structure)
page.remove_structure_items(remove_ids)

def group_lists(self, page: PageGroup):
gap_threshold_px = self.gap_threshold * page.polygon.height
for i, block_id in enumerate(page.structure):
gap_threshold_px = self.list_gap_threshold * page.polygon.height
static_page_structure = page.structure.copy()
remove_ids = list()
for i, block_id in enumerate(static_page_structure):
block = page.get_block(block_id)
if block.block_type not in [BlockTypes.ListItem]:
continue

if block.id in remove_ids:
continue

block_structure = [block_id]
selected_polygons = [block.polygon]

Expand All @@ -86,4 +100,6 @@ def group_lists(self, page: PageGroup):

# Update the structure of the page to reflect the new block
page.update_structure_item(block_id, group_block.id)
page.remove_structure_items(block_structure)
remove_ids.extend(block_structure)

page.remove_structure_items(remove_ids)
5 changes: 4 additions & 1 deletion marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import json

from marker.settings import settings
from marker.v2.processors.code import CodeProcessor
from marker.v2.processors.document_toc import DocumentTOCProcessor
from marker.v2.providers.pdf import PdfProvider
import os

from marker.v2.renderers.json import JSONRenderer
from marker.v2.util import parse_range_str

os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning

Expand Down Expand Up @@ -65,6 +67,7 @@ def __call__(self, filepath: str):
EquationProcessor(self.texify_model, self.config),
TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model, self.config),
SectionHeaderProcessor(self.config),
CodeProcessor(self.config),
DocumentTOCProcessor(self.config),
DebugProcessor(self.config),
]
Expand All @@ -84,7 +87,7 @@ def __call__(self, filepath: str):
@click.option("--force_ocr", is_flag=True)
def main(fpath: str, output_dir: str, debug: bool, output_format: str, pages: str, force_ocr: bool):
if pages is not None:
pages = list(map(int, pages.split(",")))
pages = parse_range_str(pages)

fname_base = os.path.splitext(os.path.basename(fpath))[0]
output_dir = os.path.join(output_dir, fname_base)
Expand Down
46 changes: 46 additions & 0 deletions marker/v2/processors/code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from marker.v2.processors import BaseProcessor
from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Code
from marker.v2.schema.document import Document


class CodeProcessor(BaseProcessor):
    """Rebuild the text of Code blocks, inferring indentation from geometry.

    For each code block, the left edge of every line is compared with the
    left-most line of the block; the horizontal offset is converted into a
    number of leading spaces using the block's average character width.
    The result is stored on ``block.code``.
    """
    block_types = (BlockTypes.Code, )
    y_top_threshold = 2  # pixels (not referenced by the methods of this class)

    def __call__(self, document: Document):
        """Format every block in ``page.children`` whose type is in ``block_types``."""
        for page in document.pages:
            for block in page.children:
                if block.block_type not in self.block_types:
                    continue
                self.format_block(document, block)

    def format_block(self, document: Document, block: Code):
        """Compute the indented text of ``block`` and assign it to ``block.code``.

        Args:
            document: Document used to resolve the ids in ``block.structure``
                and to extract each line's raw text.
            block: The code block to format; its ``code`` attribute is
                overwritten.
        """
        # Resolve every line exactly once: (left x-coord, width, raw text).
        # `or []` guards against a missing structure.
        # NOTE(review): assumes `structure` may be None/empty for blocks
        # without children - confirm against the Block schema.
        lines = []
        for line_id in block.structure or []:
            line = document.get_block(line_id)
            lines.append((
                line.polygon.bbox[0],
                line.polygon.width,
                line.raw_text(document),
            ))

        if not lines:
            block.code = ""
            return

        # x-coordinate of column 0. Taking a real minimum avoids a magic
        # sentinel that would be wrong for coordinates larger than it.
        min_left = min(left for left, _, _ in lines)
        total_width = sum(width for _, width, _ in lines)
        total_chars = sum(len(text) for _, _, text in lines)
        avg_char_width = total_width / max(total_chars, 1)

        pieces = []
        # Only indent a line when the previous piece ended with a newline,
        # i.e. when this piece really starts a new visual row.
        is_new_line = False
        for left, _, text in lines:
            if avg_char_width == 0:
                prefix = ""
            else:
                total_spaces = int((left - min_left) / avg_char_width)
                prefix = " " * max(0, total_spaces)

            if is_new_line:
                text = prefix + text

            pieces.append(text)
            is_new_line = text.endswith("\n")

        block.code = "".join(pieces)
2 changes: 1 addition & 1 deletion marker/v2/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def raw_text(self, document: Document) -> str:
for block_id in self.structure:
block = document.get_block(block_id)
text += block.raw_text(document)
if isinstance(block, Line):
if isinstance(block, Line) and not text.endswith("\n"):
text += "\n"
return text

Expand Down
11 changes: 7 additions & 4 deletions marker/v2/schema/blocks/code.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import html

from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block


class Code(Block):
block_type: BlockTypes = BlockTypes.Code
code: str | None = None

def assemble_html(self, child_blocks, parent_structure):
template = ""
for c in child_blocks:
template += f"<content-ref src='{c.id}'></content-ref>\n"
return f"<pre>{template}</pre>"
code = self.code or ""
return (f"<pre>"
f"{html.escape(code)}"
f"</pre>")
8 changes: 7 additions & 1 deletion marker/v2/schema/blocks/equation.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import html

from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block

Expand All @@ -7,4 +9,8 @@ class Equation(Block):
latex: str | None = None

def assemble_html(self, child_blocks, parent_structure=None):
return f"<p><math>{self.latex}</math></p>"
if self.latex:
return f"<p><math>{html.escape(self.latex)}</math></p>"
else:
template = super().assemble_html(child_blocks, parent_structure)
return f"<p>{template}</p>"
6 changes: 5 additions & 1 deletion marker/v2/schema/blocks/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,8 @@ class Table(Block):
cells: List[SpanTableCell] | None = None

def assemble_html(self, child_blocks, parent_structure=None):
return html_format(self.cells)
if self.cells:
return html_format(self.cells)
else:
template = super().assemble_html(child_blocks, parent_structure)
return f"<p>{template}</p>"
3 changes: 3 additions & 0 deletions marker/v2/schema/text/span.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import html
from typing import List, Literal

from marker.v2.schema import BlockTypes
Expand Down Expand Up @@ -40,6 +41,8 @@ def assemble_html(self, child_blocks, parent_structure):
if replaced_newline:
text += " "

text = html.escape(text)

if len(text) > 3:
if self.italic:
return f"<i>{text}</i>"
Expand Down
15 changes: 15 additions & 0 deletions marker/v2/util.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List

from pydantic import BaseModel


Expand All @@ -23,3 +25,16 @@ def assign_config(cls, config: BaseModel | dict | None):

if hasattr(cls, split_k):
setattr(cls, split_k, dict_config[k])


def parse_range_str(range_str: str) -> List[int]:
    """Parse a comma-separated page specification into a sorted list of ints.

    Each comma-separated segment is either a single integer (``"4"``) or an
    inclusive range written ``"start-end"`` (``"2-5"``). Duplicates are
    removed and the result is sorted ascending. Whitespace around segments
    is ignored, and empty segments (e.g. from a trailing comma) are skipped.

    Args:
        range_str: Specification such as ``"0,3-5,8"``.

    Returns:
        Sorted list of unique page numbers. A reversed range such as
        ``"5-3"`` contributes no pages.

    Raises:
        ValueError: If a segment is not an integer or a ``start-end`` pair
            of integers, or if the string contains no segments at all.
    """
    segments = [segment.strip() for segment in range_str.split(",")]
    segments = [segment for segment in segments if segment]
    if not segments:
        raise ValueError(f"No page numbers found in {range_str!r}")

    pages = set()
    for segment in segments:
        if "-" in segment:
            # Split on the first dash only; anything malformed (extra dashes,
            # missing bound) is rejected by int() with a ValueError.
            start, _, end = segment.partition("-")
            pages.update(range(int(start), int(end) + 1))
        else:
            pages.add(int(segment))
    return sorted(pages)  # deduplicated and in ascending order

0 comments on commit d9352be

Please sign in to comment.