Skip to content

Commit

Permalink
Rollback change to move to double quotes
Browse files Browse the repository at this point in the history
  • Loading branch information
jpbalarini committed Nov 19, 2024
1 parent d1a4bba commit 9f2b0b0
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 99 deletions.
97 changes: 38 additions & 59 deletions llmsherpa/readers/layout_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,14 @@ class Block:
tag: str

def __init__(self, block_json=None):
self.tag = block_json["tag"] if block_json and "tag" in block_json else None
self.level = block_json["level"] if block_json and "level" in block_json else -1
self.page_idx = (
block_json["page_idx"] if block_json and "page_idx" in block_json else -1
)
self.block_idx = (
block_json["block_idx"] if block_json and "block_idx" in block_json else -1
)
self.top = block_json["top"] if block_json and "top" in block_json else -1
self.left = block_json["left"] if block_json and "left" in block_json else -1
self.bbox = block_json["bbox"] if block_json and "bbox" in block_json else []
self.sentences = (
block_json["sentences"] if block_json and "sentences" in block_json else []
)
self.tag = block_json['tag'] if block_json and 'tag' in block_json else None
self.level = block_json['level'] if block_json and 'level' in block_json else -1
self.page_idx = block_json['page_idx'] if block_json and 'page_idx' in block_json else -1
self.block_idx = block_json['block_idx'] if block_json and 'block_idx' in block_json else -1
self.top = block_json['top'] if block_json and 'top' in block_json else -1
self.left = block_json['left'] if block_json and 'left' in block_json else -1
self.bbox = block_json['bbox'] if block_json and 'bbox' in block_json else []
self.sentences = block_json['sentences'] if block_json and 'sentences' in block_json else []
self.children = []
self.parent = None
self.block_json = block_json
Expand Down Expand Up @@ -97,7 +91,7 @@ def parent_text(self):
for p in parent_chain:
if p.tag == "header":
header_texts.append(p.to_text())
elif p.tag in ["list_item", "para"]:
elif p.tag in ['list_item', 'para']:
para_texts.append(p.to_text())
text = " > ".join(header_texts)
if len(para_texts) > 0:
Expand All @@ -111,7 +105,7 @@ def to_context_text(self, include_section_info=True):
text = ""
if include_section_info:
text += self.parent_text() + "\n"
if self.tag in ["list_item", "para", "table"]:
if self.tag in ['list_item', 'para', 'table']:
text += self.to_text(include_children=True, recurse=True)
else:
text += self.to_text()
Expand All @@ -124,7 +118,7 @@ def iter_children(self, node, level, node_visitor):
for child in node.children:
node_visitor(child)
# print("-"*level, child.tag, f"({len(child.children)})", child.to_text())
if child.tag not in ["list_item", "para", "table"]:
if child.tag not in ['list_item', 'para', 'table']:
self.iter_children(child, level + 1, node_visitor)

def paragraphs(self):
Expand All @@ -134,7 +128,7 @@ def paragraphs(self):
paragraphs = []

def para_collector(node):
if node.tag == "para":
if node.tag == 'para':
paragraphs.append(node)

self.iter_children(self, 0, para_collector)
Expand All @@ -147,7 +141,7 @@ def chunks(self):
chunks = []

def chunk_collector(node):
if node.tag in ["para", "list_item", "table"]:
if node.tag in ['para', 'list_item', 'table']:
chunks.append(node)

self.iter_children(self, 0, chunk_collector)
Expand All @@ -160,9 +154,8 @@ def tables(self):
tables = []

def chunk_collector(node):
if node.tag in ["table"]:
if node.tag in ['table']:
tables.append(node)

self.iter_children(self, 0, chunk_collector)
return tables

Expand All @@ -173,13 +166,12 @@ def sections(self):
sections = []

def chunk_collector(node):
if node.tag in ["header"]:
if node.tag in ['header']:
sections.append(node)

self.iter_children(self, 0, chunk_collector)
return sections


class Paragraph(Block):
"""
A paragraph is a block of text. It can have children such as lists. A paragraph has tag 'para'.
Expand All @@ -188,6 +180,7 @@ class Paragraph(Block):
def __init__(self, para_json):
super().__init__(para_json)


def to_text(self, include_children=False, recurse=False):
"""
Converts the paragraph to text. If include_children is True, then the text of the children is also included. If recurse is True, then the text of the children's children is also included.
Expand All @@ -202,9 +195,7 @@ def to_text(self, include_children=False, recurse=False):
para_text = "\n".join(self.sentences)
if include_children:
for child in self.children:
para_text += "\n" + child.to_text(
include_children=recurse, recurse=recurse
)
para_text += "\n" + child.to_text(include_children=recurse, recurse=recurse)
return para_text

def to_html(self, include_children=False, recurse=False):
Expand All @@ -224,9 +215,7 @@ def to_html(self, include_children=False, recurse=False):
if len(self.children) > 0:
html_str += "<ul>"
for child in self.children:
html_str = html_str + child.to_html(
include_children=recurse, recurse=recurse
)
html_str = html_str + child.to_html(include_children=recurse, recurse=recurse)
html_str += "</ul>"
html_str = html_str + "</p>"
return html_str
Expand Down Expand Up @@ -265,6 +254,7 @@ def __init__(self, section_json):
super().__init__(section_json)
self.title = "\n".join(self.sentences)


def to_text(self, include_children=False, recurse=False):
"""
Converts the section to text. If include_children is True, then the text of the children is also included. If recurse is True, then the text of the children's children is also included.
Expand Down Expand Up @@ -325,14 +315,12 @@ class ListItem(Block):
"""
A list item is a block of text. It can have child list items. A list item has tag 'list_item'.
"""

def __init__(self, list_json):
super().__init__(list_json)

def to_text(self, include_children=False, recurse=False):
"""
Converts the list item to text. If include_children is True, then the text of the children is also included. If recurse is True, then the text of the children's children is also included.
Parameters
----------
include_children: bool
Expand All @@ -349,7 +337,6 @@ def to_text(self, include_children=False, recurse=False):
def to_html(self, include_children=False, recurse=False):
"""
Converts the list item to HTML. If include_children is True, then the HTML of the children is also included. If recurse is True, then the HTML of the children's children is also included.
Parameters
----------
include_children: bool
Expand All @@ -363,9 +350,7 @@ def to_html(self, include_children=False, recurse=False):
if len(self.children) > 0:
html_str += "<ul>"
for child in self.children:
html_str = html_str + child.to_html(
include_children=recurse, recurse=recurse
)
html_str = html_str + child.to_html(include_children=recurse, recurse=recurse)
html_str += "</ul>"
html_str = html_str + f"</li>"
return html_str
Expand Down Expand Up @@ -400,8 +385,8 @@ class TableCell(Block):

def __init__(self, cell_json):
super().__init__(cell_json)
self.col_span = cell_json["col_span"] if "col_span" in cell_json else 1
self.cell_value = cell_json["cell_value"]
self.col_span = cell_json['col_span'] if 'col_span' in cell_json else 1
self.cell_value = cell_json['cell_value']
if not isinstance(self.cell_value, str):
self.cell_node = Paragraph(self.cell_value)
else:
Expand Down Expand Up @@ -446,11 +431,11 @@ class TableRow(Block):

def __init__(self, row_json):
self.cells = []
if row_json["type"] == "full_row":
if row_json['type'] == 'full_row':
cell = TableCell(row_json)
self.cells.append(cell)
else:
for cell_json in row_json["cells"]:
for cell_json in row_json['cells']:
cell = TableCell(cell_json)
self.cells.append(cell)

Expand Down Expand Up @@ -491,7 +476,7 @@ class TableHeader(Block):
def __init__(self, row_json):
super().__init__(row_json)
self.cells = []
for cell_json in row_json["cells"]:
for cell_json in row_json['cells']:
cell = TableCell(cell_json)
self.cells.append(cell)

Expand Down Expand Up @@ -543,9 +528,9 @@ def __init__(self, table_json, parent):
self.rows = []
self.headers = []
self.name = table_json["name"]
if "table_rows" in table_json:
for row_json in table_json["table_rows"]:
if row_json["type"] == "table_header":
if 'table_rows' in table_json:
for row_json in table_json['table_rows']:
if row_json['type'] == 'table_header':
row = TableHeader(row_json)
self.headers.append(row)
else:
Expand Down Expand Up @@ -595,9 +580,7 @@ class LayoutReader:
def debug(self, pdf_root):
def iter_children(node, level):
for child in node.children:
print(
"-" * level, child.tag, f"({len(child.children)})", child.to_text()
)
print("-" * level, child.tag, f"({len(child.children)})", child.to_text())
iter_children(child, level + 1)

iter_children(pdf_root, 0)
Expand All @@ -613,44 +596,40 @@ def read(self, blocks_json):
parent = root
list_stack = []
for block in blocks_json:
if block["tag"] != "list_item" and len(list_stack) > 0:
if block['tag'] != 'list_item' and len(list_stack) > 0:
list_stack = []
if block["tag"] == "para":
if block['tag'] == 'para':
node = Paragraph(block)
parent.add_child(node)
elif block["tag"] == "table":
elif block['tag'] == 'table':
node = Table(block, prev_node)
parent.add_child(node)
elif block["tag"] == "list_item":
elif block['tag'] == 'list_item':
node = ListItem(block)
# add lists as children to previous paragraph
# this handles cases such as "The following items need to be addressed: 1) item 1 2) item 2", etc.
if prev_node.tag == "para" and prev_node.level == node.level:
if prev_node.tag == 'para' and prev_node.level == node.level:
list_stack.append(prev_node)
# sometimes there are lists within lists in legal documents
elif prev_node.tag == "list_item":
elif prev_node.tag == 'list_item':
if node.level > prev_node.level:
list_stack.append(prev_node)
elif node.level < prev_node.level:
while (
len(list_stack) > 0 and list_stack.pop().level > node.level
):
while (len(list_stack) > 0 and list_stack.pop().level > node.level):
pass
# list_stack.append(node)
if len(list_stack) > 0:
list_stack[-1].add_child(node)
else:
parent.add_child(node)

elif block["tag"] == "header":
elif block['tag'] == 'header':
node = Section(block)
if node.level > parent.level:
parent_stack.append(node)
parent.add_child(node)
else:
while (
len(parent_stack) > 1 and parent_stack[-1].level >= node.level
):
while (len(parent_stack) > 1 and parent_stack[-1].level >= node.level):
parent_stack.pop()
parent_stack[-1].add_child(node)
parent_stack.append(node)
Expand Down
Loading

0 comments on commit 9f2b0b0

Please sign in to comment.