diff --git a/llmsherpa/readers/layout_reader.py b/llmsherpa/readers/layout_reader.py
index dd5279d..73dc35a 100644
--- a/llmsherpa/readers/layout_reader.py
+++ b/llmsherpa/readers/layout_reader.py
@@ -32,20 +32,14 @@ class Block:
tag: str
def __init__(self, block_json=None):
- self.tag = block_json["tag"] if block_json and "tag" in block_json else None
- self.level = block_json["level"] if block_json and "level" in block_json else -1
- self.page_idx = (
- block_json["page_idx"] if block_json and "page_idx" in block_json else -1
- )
- self.block_idx = (
- block_json["block_idx"] if block_json and "block_idx" in block_json else -1
- )
- self.top = block_json["top"] if block_json and "top" in block_json else -1
- self.left = block_json["left"] if block_json and "left" in block_json else -1
- self.bbox = block_json["bbox"] if block_json and "bbox" in block_json else []
- self.sentences = (
- block_json["sentences"] if block_json and "sentences" in block_json else []
- )
+ self.tag = block_json['tag'] if block_json and 'tag' in block_json else None
+ self.level = block_json['level'] if block_json and 'level' in block_json else -1
+ self.page_idx = block_json['page_idx'] if block_json and 'page_idx' in block_json else -1
+ self.block_idx = block_json['block_idx'] if block_json and 'block_idx' in block_json else -1
+ self.top = block_json['top'] if block_json and 'top' in block_json else -1
+ self.left = block_json['left'] if block_json and 'left' in block_json else -1
+ self.bbox = block_json['bbox'] if block_json and 'bbox' in block_json else []
+ self.sentences = block_json['sentences'] if block_json and 'sentences' in block_json else []
self.children = []
self.parent = None
self.block_json = block_json
@@ -97,7 +91,7 @@ def parent_text(self):
for p in parent_chain:
if p.tag == "header":
header_texts.append(p.to_text())
- elif p.tag in ["list_item", "para"]:
+ elif p.tag in ['list_item', 'para']:
para_texts.append(p.to_text())
text = " > ".join(header_texts)
if len(para_texts) > 0:
@@ -111,7 +105,7 @@ def to_context_text(self, include_section_info=True):
text = ""
if include_section_info:
text += self.parent_text() + "\n"
- if self.tag in ["list_item", "para", "table"]:
+ if self.tag in ['list_item', 'para', 'table']:
text += self.to_text(include_children=True, recurse=True)
else:
text += self.to_text()
@@ -124,7 +118,7 @@ def iter_children(self, node, level, node_visitor):
for child in node.children:
node_visitor(child)
# print("-"*level, child.tag, f"({len(child.children)})", child.to_text())
- if child.tag not in ["list_item", "para", "table"]:
+ if child.tag not in ['list_item', 'para', 'table']:
self.iter_children(child, level + 1, node_visitor)
def paragraphs(self):
@@ -134,7 +128,7 @@ def paragraphs(self):
paragraphs = []
def para_collector(node):
- if node.tag == "para":
+ if node.tag == 'para':
paragraphs.append(node)
self.iter_children(self, 0, para_collector)
@@ -147,7 +141,7 @@ def chunks(self):
chunks = []
def chunk_collector(node):
- if node.tag in ["para", "list_item", "table"]:
+ if node.tag in ['para', 'list_item', 'table']:
chunks.append(node)
self.iter_children(self, 0, chunk_collector)
@@ -160,9 +154,8 @@ def tables(self):
tables = []
def chunk_collector(node):
- if node.tag in ["table"]:
+ if node.tag in ['table']:
tables.append(node)
-
self.iter_children(self, 0, chunk_collector)
return tables
@@ -173,13 +166,12 @@ def sections(self):
sections = []
def chunk_collector(node):
- if node.tag in ["header"]:
+ if node.tag in ['header']:
sections.append(node)
self.iter_children(self, 0, chunk_collector)
return sections
-
class Paragraph(Block):
"""
A paragraph is a block of text. It can have children such as lists. A paragraph has tag 'para'.
@@ -188,6 +180,7 @@ class Paragraph(Block):
def __init__(self, para_json):
super().__init__(para_json)
+
def to_text(self, include_children=False, recurse=False):
"""
Converts the paragraph to text. If include_children is True, then the text of the children is also included. If recurse is True, then the text of the children's children are also included.
@@ -202,9 +195,7 @@ def to_text(self, include_children=False, recurse=False):
para_text = "\n".join(self.sentences)
if include_children:
for child in self.children:
- para_text += "\n" + child.to_text(
- include_children=recurse, recurse=recurse
- )
+ para_text += "\n" + child.to_text(include_children=recurse, recurse=recurse)
return para_text
def to_html(self, include_children=False, recurse=False):
@@ -224,9 +215,7 @@ def to_html(self, include_children=False, recurse=False):
if len(self.children) > 0:
html_str += "
"
for child in self.children:
- html_str = html_str + child.to_html(
- include_children=recurse, recurse=recurse
- )
+ html_str = html_str + child.to_html(include_children=recurse, recurse=recurse)
html_str += "
"
html_str = html_str + ""
return html_str
@@ -265,6 +254,7 @@ def __init__(self, section_json):
super().__init__(section_json)
self.title = "\n".join(self.sentences)
+
def to_text(self, include_children=False, recurse=False):
"""
Converts the section to text. If include_children is True, then the text of the children is also included. If recurse is True, then the text of the children's children are also included.
@@ -325,14 +315,12 @@ class ListItem(Block):
"""
A list item is a block of text. It can have child list items. A list item has tag 'list_item'.
"""
-
def __init__(self, list_json):
super().__init__(list_json)
def to_text(self, include_children=False, recurse=False):
"""
Converts the list item to text. If include_children is True, then the text of the children is also included. If recurse is True, then the text of the children's children are also included.
-
Parameters
----------
include_children: bool
@@ -349,7 +337,6 @@ def to_text(self, include_children=False, recurse=False):
def to_html(self, include_children=False, recurse=False):
"""
Converts the list item to html. If include_children is True, then the html of the children is also included. If recurse is True, then the html of the children's children are also included.
-
Parameters
----------
include_children: bool
@@ -363,9 +350,7 @@ def to_html(self, include_children=False, recurse=False):
if len(self.children) > 0:
html_str += ""
for child in self.children:
- html_str = html_str + child.to_html(
- include_children=recurse, recurse=recurse
- )
+ html_str = html_str + child.to_html(include_children=recurse, recurse=recurse)
html_str += "
"
html_str = html_str + f""
return html_str
@@ -400,8 +385,8 @@ class TableCell(Block):
def __init__(self, cell_json):
super().__init__(cell_json)
- self.col_span = cell_json["col_span"] if "col_span" in cell_json else 1
- self.cell_value = cell_json["cell_value"]
+ self.col_span = cell_json['col_span'] if 'col_span' in cell_json else 1
+ self.cell_value = cell_json['cell_value']
if not isinstance(self.cell_value, str):
self.cell_node = Paragraph(self.cell_value)
else:
@@ -446,11 +431,11 @@ class TableRow(Block):
def __init__(self, row_json):
self.cells = []
- if row_json["type"] == "full_row":
+ if row_json['type'] == 'full_row':
cell = TableCell(row_json)
self.cells.append(cell)
else:
- for cell_json in row_json["cells"]:
+ for cell_json in row_json['cells']:
cell = TableCell(cell_json)
self.cells.append(cell)
@@ -491,7 +476,7 @@ class TableHeader(Block):
def __init__(self, row_json):
super().__init__(row_json)
self.cells = []
- for cell_json in row_json["cells"]:
+ for cell_json in row_json['cells']:
cell = TableCell(cell_json)
self.cells.append(cell)
@@ -543,9 +528,9 @@ def __init__(self, table_json, parent):
self.rows = []
self.headers = []
self.name = table_json["name"]
- if "table_rows" in table_json:
- for row_json in table_json["table_rows"]:
- if row_json["type"] == "table_header":
+ if 'table_rows' in table_json:
+ for row_json in table_json['table_rows']:
+ if row_json['type'] == 'table_header':
row = TableHeader(row_json)
self.headers.append(row)
else:
@@ -595,9 +580,7 @@ class LayoutReader:
def debug(self, pdf_root):
def iter_children(node, level):
for child in node.children:
- print(
- "-" * level, child.tag, f"({len(child.children)})", child.to_text()
- )
+ print("-" * level, child.tag, f"({len(child.children)})", child.to_text())
iter_children(child, level + 1)
iter_children(pdf_root, 0)
@@ -613,28 +596,26 @@ def read(self, blocks_json):
parent = root
list_stack = []
for block in blocks_json:
- if block["tag"] != "list_item" and len(list_stack) > 0:
+ if block['tag'] != 'list_item' and len(list_stack) > 0:
list_stack = []
- if block["tag"] == "para":
+ if block['tag'] == 'para':
node = Paragraph(block)
parent.add_child(node)
- elif block["tag"] == "table":
+ elif block['tag'] == 'table':
node = Table(block, prev_node)
parent.add_child(node)
- elif block["tag"] == "list_item":
+ elif block['tag'] == 'list_item':
node = ListItem(block)
# add lists as children to previous paragraph
# this handles examples like - The following items need to be addressed: 1) item 1 2) item 2 etc.
- if prev_node.tag == "para" and prev_node.level == node.level:
+ if prev_node.tag == 'para' and prev_node.level == node.level:
list_stack.append(prev_node)
# sometimes there are lists within lists in legal documents
- elif prev_node.tag == "list_item":
+ elif prev_node.tag == 'list_item':
if node.level > prev_node.level:
list_stack.append(prev_node)
elif node.level < prev_node.level:
- while (
- len(list_stack) > 0 and list_stack.pop().level > node.level
- ):
+ while (len(list_stack) > 0 and list_stack.pop().level > node.level):
pass
# list_stack.append(node)
if len(list_stack) > 0:
@@ -642,15 +623,13 @@ def read(self, blocks_json):
else:
parent.add_child(node)
- elif block["tag"] == "header":
+ elif block['tag'] == 'header':
node = Section(block)
if node.level > parent.level:
parent_stack.append(node)
parent.add_child(node)
else:
- while (
- len(parent_stack) > 1 and parent_stack[-1].level >= node.level
- ):
+ while (len(parent_stack) > 1 and parent_stack[-1].level >= node.level):
parent_stack.pop()
parent_stack[-1].add_child(node)
parent_stack.append(node)
diff --git a/llmsherpa/readers/tests/test_layout_reader.py b/llmsherpa/readers/tests/test_layout_reader.py
index 71b414f..3c5a03a 100644
--- a/llmsherpa/readers/tests/test_layout_reader.py
+++ b/llmsherpa/readers/tests/test_layout_reader.py
@@ -31,20 +31,20 @@ def get_document(self, file_name):
def test_list_child_of_header(self):
pdf = self.read_layout("list_test.json")
self.assertEqual(len(pdf.children[0].children), 3)
- self.assertEqual(pdf.children[0].children[0].tag, "list_item")
+ self.assertEqual(pdf.children[0].children[0].tag, 'list_item')
# self.assertEqual(sum([1, 2, 3]), 6, "Should be 6")
def test_list_child_of_para(self):
doc = self.read_layout("list_test.json")
- self.assertEqual(doc.children[0].children[2].tag, "para")
- self.assertEqual(len(doc.children[0].children[2].children), 2)
- self.assertEqual(doc.children[0].children[2].children[0].tag, "list_item")
+ self.assertEqual(doc.children[0].children[2].tag, 'para')
+ self.assertEqual(len(doc.children[0].children[2].children), 2)
+ self.assertEqual(doc.children[0].children[2].children[0].tag, 'list_item')
def test_nested_lists(self):
doc = self.read_layout("nested_list_test.json")
- self.assertEqual(len(doc.children[0].children), 2)
- self.assertEqual(len(doc.children[0].children[0].children), 2)
- self.assertEqual(len(doc.children[1].children[0].children[1].children), 2)
+ self.assertEqual(len(doc.children[0].children), 2)
+ self.assertEqual(len(doc.children[0].children[0].children), 2)
+ self.assertEqual(len(doc.children[1].children[0].children[1].children), 2)
parent_text = """
Article II
Section 1
@@ -58,43 +58,37 @@ def test_nested_lists(self):
def test_nested_lists_with_para(self):
doc = self.read_layout("nested_list_test.json")
- self.assertEqual(doc.children[2].children[0].tag, "para")
+ self.assertEqual(doc.children[2].children[0].tag, 'para')
self.assertEqual(len(doc.children[2].children), 2)
- self.assertEqual(len(doc.children[2].children[0].children), 2)
- self.assertEqual(doc.children[2].children[1].tag, "para")
+ self.assertEqual(len(doc.children[2].children[0].children), 2)
+ self.assertEqual(doc.children[2].children[1].tag, 'para')
def test_nested_headers(self):
doc = self.read_layout("header_test.json")
- self.assertEqual(len(doc.children[0].children), 2)
- self.assertEqual(len(doc.children[0].children[0].children), 2)
- self.assertEqual(len(doc.children[1].children[0].children[1].children), 2)
- self.assertEqual(
- doc.children[1].children[0].children[1].parent_text(),
- "Article II > Section 1",
- )
+ self.assertEqual(len(doc.children[0].children), 2)
+ self.assertEqual(len(doc.children[0].children[0].children), 2)
+ self.assertEqual(len(doc.children[1].children[0].children[1].children), 2)
+ self.assertEqual(doc.children[1].children[0].children[1].parent_text(), "Article II > Section 1")
def test_ooo_nested_headers(self):
# OutOfOrder Header test case
doc = self.read_layout("ooo_header_test.json")
- self.assertEqual(len(doc.children[0].children), 0)
- self.assertEqual(len(doc.children[1].children), 0)
- self.assertEqual(len(doc.children[2].children), 2)
- self.assertEqual(len(doc.children[2].children[0].children), 2)
- self.assertEqual(len(doc.children[3].children[0].children[1].children), 2)
- self.assertEqual(
- doc.children[3].children[0].children[1].parent_text(),
- "Article II > Section 1",
- )
+ self.assertEqual(len(doc.children[0].children), 0)
+ self.assertEqual(len(doc.children[1].children), 0)
+ self.assertEqual(len(doc.children[2].children), 2)
+ self.assertEqual(len(doc.children[2].children[0].children), 2)
+ self.assertEqual(len(doc.children[3].children[0].children[1].children), 2)
+ self.assertEqual(doc.children[3].children[0].children[1].parent_text(),"Article II > Section 1")
def test_ooo_nested_header_children(self):
# OutOfOrder Header children test case
doc = self.read_layout("ooo_header_child_test.json")
- self.assertEqual(len(doc.children[0].children), 0)
- self.assertEqual(len(doc.children[1].children), 3)
- self.assertEqual(len(doc.children[1].children[0].children), 0)
- self.assertEqual(len(doc.children[1].children[1].children), 2)
- self.assertEqual(len(doc.children[1].children[2].children), 3)
- self.assertEqual(len(doc.children[1].children[2].children[0].children), 0)
+ self.assertEqual(len(doc.children[0].children), 0)
+ self.assertEqual(len(doc.children[1].children), 3)
+ self.assertEqual(len(doc.children[1].children[0].children), 0)
+ self.assertEqual(len(doc.children[1].children[1].children), 2)
+ self.assertEqual(len(doc.children[1].children[2].children), 3)
+ self.assertEqual(len(doc.children[1].children[2].children[0].children), 0)
def test_table(self):
doc = self.read_layout("table_test.json")
@@ -113,10 +107,7 @@ def test_paragraph_iterator(self):
b) Disclaimer 2
"""
correct_text = self.clean_text(correct_text)
- self.assertEqual(
- paras[0].to_text(include_children=True, recurse=True),
- correct_text.replace(" *", ""),
- )
+ self.assertEqual(paras[0].to_text(include_children=True, recurse=True), correct_text.replace(" *", ""))
def test_chunk_iterator(self):
doc = self.read_layout("chunk_test.json")
@@ -166,9 +157,7 @@ def test_to_text(self):
def test_to_html(self):
doc = self.get_document("to_html_test.json")
- correct_html = (
- "Heading 1
Heading 2
Heading 3
"
- )
+ correct_html = ("Heading 1
Heading 2
Heading 3
")
self.assertEqual(doc.to_html(), correct_html)
def test_to_markdown(self):
@@ -180,5 +169,5 @@ def test_to_markdown(self):
self.assertEqual(doc.to_markdown(), correct_markdown)
-if __name__ == "__main__":
+if __name__ == '__main__':
unittest.main()