diff --git a/llmsherpa/readers/layout_reader.py b/llmsherpa/readers/layout_reader.py index dd5279d..73dc35a 100644 --- a/llmsherpa/readers/layout_reader.py +++ b/llmsherpa/readers/layout_reader.py @@ -32,20 +32,14 @@ class Block: tag: str def __init__(self, block_json=None): - self.tag = block_json["tag"] if block_json and "tag" in block_json else None - self.level = block_json["level"] if block_json and "level" in block_json else -1 - self.page_idx = ( - block_json["page_idx"] if block_json and "page_idx" in block_json else -1 - ) - self.block_idx = ( - block_json["block_idx"] if block_json and "block_idx" in block_json else -1 - ) - self.top = block_json["top"] if block_json and "top" in block_json else -1 - self.left = block_json["left"] if block_json and "left" in block_json else -1 - self.bbox = block_json["bbox"] if block_json and "bbox" in block_json else [] - self.sentences = ( - block_json["sentences"] if block_json and "sentences" in block_json else [] - ) + self.tag = block_json['tag'] if block_json and 'tag' in block_json else None + self.level = block_json['level'] if block_json and 'level' in block_json else -1 + self.page_idx = block_json['page_idx'] if block_json and 'page_idx' in block_json else -1 + self.block_idx = block_json['block_idx'] if block_json and 'block_idx' in block_json else -1 + self.top = block_json['top'] if block_json and 'top' in block_json else -1 + self.left = block_json['left'] if block_json and 'left' in block_json else -1 + self.bbox = block_json['bbox'] if block_json and 'bbox' in block_json else [] + self.sentences = block_json['sentences'] if block_json and 'sentences' in block_json else [] self.children = [] self.parent = None self.block_json = block_json @@ -97,7 +91,7 @@ def parent_text(self): for p in parent_chain: if p.tag == "header": header_texts.append(p.to_text()) - elif p.tag in ["list_item", "para"]: + elif p.tag in ['list_item', 'para']: para_texts.append(p.to_text()) text = " > ".join(header_texts) if len(para_texts) > 0: @@ -111,7 +105,7 @@ def to_context_text(self, include_section_info=True): text = "" if include_section_info: text += self.parent_text() + "\n" - if self.tag in ["list_item", "para", "table"]: + if self.tag in ['list_item', 'para', 'table']: text += self.to_text(include_children=True, recurse=True) else: text += self.to_text() @@ -124,7 +118,7 @@ def iter_children(self, node, level, node_visitor): for child in node.children: node_visitor(child) # print("-"*level, child.tag, f"({len(child.children)})", child.to_text()) - if child.tag not in ["list_item", "para", "table"]: + if child.tag not in ['list_item', 'para', 'table']: self.iter_children(child, level + 1, node_visitor) def paragraphs(self): @@ -134,7 +128,7 @@ def paragraphs(self): paragraphs = [] def para_collector(node): - if node.tag == "para": + if node.tag == 'para': paragraphs.append(node) self.iter_children(self, 0, para_collector) @@ -147,7 +141,7 @@ def chunks(self): chunks = [] def chunk_collector(node): - if node.tag in ["para", "list_item", "table"]: + if node.tag in ['para', 'list_item', 'table']: chunks.append(node) self.iter_children(self, 0, chunk_collector) @@ -160,9 +154,8 @@ def tables(self): tables = [] def chunk_collector(node): - if node.tag in ["table"]: + if node.tag in ['table']: tables.append(node) - self.iter_children(self, 0, chunk_collector) return tables @@ -173,13 +166,12 @@ def sections(self): sections = [] def chunk_collector(node): - if node.tag in ["header"]: + if node.tag in ['header']: sections.append(node) self.iter_children(self, 0, chunk_collector) return sections - class Paragraph(Block): """ A paragraph is a block of text. It can have children such as lists. A paragraph has tag 'para'. @@ -188,6 +180,7 @@ class Paragraph(Block): def __init__(self, para_json): super().__init__(para_json) + def to_text(self, include_children=False, recurse=False): """ Converts the paragraph to text. If include_children is True, then the text of the children is also included. If recurse is True, then the text of the children's children are also included. @@ -202,9 +195,7 @@ def to_text(self, include_children=False, recurse=False): para_text = "\n".join(self.sentences) if include_children: for child in self.children: - para_text += "\n" + child.to_text( - include_children=recurse, recurse=recurse - ) + para_text += "\n" + child.to_text(include_children=recurse, recurse=recurse) return para_text def to_html(self, include_children=False, recurse=False): @@ -224,9 +215,7 @@ def to_html(self, include_children=False, recurse=False): if len(self.children) > 0: html_str += "" html_str = html_str + "

" return html_str @@ -265,6 +254,7 @@ def __init__(self, section_json): super().__init__(section_json) self.title = "\n".join(self.sentences) + def to_text(self, include_children=False, recurse=False): """ Converts the section to text. If include_children is True, then the text of the children is also included. If recurse is True, then the text of the children's children are also included. @@ -325,14 +315,12 @@ class ListItem(Block): """ A list item is a block of text. It can have child list items. A list item has tag 'list_item'. """ - def __init__(self, list_json): super().__init__(list_json) def to_text(self, include_children=False, recurse=False): """ Converts the list item to text. If include_children is True, then the text of the children is also included. If recurse is True, then the text of the children's children are also included. - Parameters ---------- include_children: bool @@ -349,7 +337,6 @@ def to_text(self, include_children=False, recurse=False): def to_html(self, include_children=False, recurse=False): """ Converts the list item to html. If include_children is True, then the html of the children is also included. If recurse is True, then the html of the children's children are also included. - Parameters ---------- include_children: bool @@ -363,9 +350,7 @@ def to_html(self, include_children=False, recurse=False): if len(self.children) > 0: html_str += "" html_str = html_str + f"" return html_str @@ -400,8 +385,8 @@ class TableCell(Block): def __init__(self, cell_json): super().__init__(cell_json) - self.col_span = cell_json["col_span"] if "col_span" in cell_json else 1 - self.cell_value = cell_json["cell_value"] + self.col_span = cell_json['col_span'] if 'col_span' in cell_json else 1 + self.cell_value = cell_json['cell_value'] if not isinstance(self.cell_value, str): self.cell_node = Paragraph(self.cell_value) else: @@ -446,11 +431,11 @@ class TableRow(Block): def __init__(self, row_json): self.cells = [] - if row_json["type"] == "full_row": + if row_json['type'] == 'full_row': cell = TableCell(row_json) self.cells.append(cell) else: - for cell_json in row_json["cells"]: + for cell_json in row_json['cells']: cell = TableCell(cell_json) self.cells.append(cell) @@ -491,7 +476,7 @@ class TableHeader(Block): def __init__(self, row_json): super().__init__(row_json) self.cells = [] - for cell_json in row_json["cells"]: + for cell_json in row_json['cells']: cell = TableCell(cell_json) self.cells.append(cell) @@ -543,9 +528,9 @@ def __init__(self, table_json, parent): self.rows = [] self.headers = [] self.name = table_json["name"] - if "table_rows" in table_json: - for row_json in table_json["table_rows"]: - if row_json["type"] == "table_header": + if 'table_rows' in table_json: + for row_json in table_json['table_rows']: + if row_json['type'] == 'table_header': row = TableHeader(row_json) self.headers.append(row) else: @@ -595,9 +580,7 @@ class LayoutReader: def debug(self, pdf_root): def iter_children(node, level): for child in node.children: - print( - "-" * level, child.tag, f"({len(child.children)})", child.to_text() - ) + print("-" * level, child.tag, f"({len(child.children)})", child.to_text()) iter_children(child, level + 1) iter_children(pdf_root, 0) @@ -613,28 +596,26 @@ def read(self, blocks_json): parent = root list_stack = [] for block in blocks_json: - if block["tag"] != "list_item" and len(list_stack) > 0: + if block['tag'] != 'list_item' and len(list_stack) > 0: list_stack = [] - if block["tag"] == "para": + if block['tag'] == 'para': node = Paragraph(block) parent.add_child(node) - elif block["tag"] == "table": + elif block['tag'] == 'table': node = Table(block, prev_node) parent.add_child(node) - elif block["tag"] == "list_item": + elif block['tag'] == 'list_item': node = ListItem(block) # add lists as children to previous paragraph # this handles examples like - The following items need to be addressed: 1) item 1 2) item 2 etc. - if prev_node.tag == "para" and prev_node.level == node.level: + if prev_node.tag == 'para' and prev_node.level == node.level: list_stack.append(prev_node) # sometimes there are lists within lists in legal documents - elif prev_node.tag == "list_item": + elif prev_node.tag == 'list_item': if node.level > prev_node.level: list_stack.append(prev_node) elif node.level < prev_node.level: - while ( - len(list_stack) > 0 and list_stack.pop().level > node.level - ): + while (len(list_stack) > 0 and list_stack.pop().level > node.level): pass # list_stack.append(node) if len(list_stack) > 0: @@ -642,15 +623,13 @@ def read(self, blocks_json): else: parent.add_child(node) - elif block["tag"] == "header": + elif block['tag'] == 'header': node = Section(block) if node.level > parent.level: parent_stack.append(node) parent.add_child(node) else: - while ( - len(parent_stack) > 1 and parent_stack[-1].level >= node.level - ): + while (len(parent_stack) > 1 and parent_stack[-1].level >= node.level): parent_stack.pop() parent_stack[-1].add_child(node) parent_stack.append(node) diff --git a/llmsherpa/readers/tests/test_layout_reader.py b/llmsherpa/readers/tests/test_layout_reader.py index 71b414f..3c5a03a 100644 --- a/llmsherpa/readers/tests/test_layout_reader.py +++ b/llmsherpa/readers/tests/test_layout_reader.py @@ -31,20 +31,20 @@ def get_document(self, file_name): def test_list_child_of_header(self): pdf = self.read_layout("list_test.json") self.assertEqual(len(pdf.children[0].children), 3) - self.assertEqual(pdf.children[0].children[0].tag, "list_item") + self.assertEqual(pdf.children[0].children[0].tag, 'list_item') # self.assertEqual(sum([1, 2, 3]), 6, "Should be 6") def test_list_child_of_para(self): doc = self.read_layout("list_test.json") - self.assertEqual(doc.children[0].children[2].tag, "para") - self.assertEqual(len(doc.children[0].children[2].children), 2) - self.assertEqual(doc.children[0].children[2].children[0].tag, "list_item") + self.assertEqual(doc.children[0].children[2].tag, 'para') + self.assertEqual(len(doc.children[0].children[2].children), 2) + self.assertEqual(doc.children[0].children[2].children[0].tag, 'list_item') def test_nested_lists(self): doc = self.read_layout("nested_list_test.json") - self.assertEqual(len(doc.children[0].children), 2) - self.assertEqual(len(doc.children[0].children[0].children), 2) - self.assertEqual(len(doc.children[1].children[0].children[1].children), 2) + self.assertEqual(len(doc.children[0].children), 2) + self.assertEqual(len(doc.children[0].children[0].children), 2) + self.assertEqual(len(doc.children[1].children[0].children[1].children), 2) parent_text = """ Article II Section 1 @@ -58,43 +58,37 @@ def test_nested_lists(self): def test_nested_lists_with_para(self): doc = self.read_layout("nested_list_test.json") - self.assertEqual(doc.children[2].children[0].tag, "para") + self.assertEqual(doc.children[2].children[0].tag, 'para') self.assertEqual(len(doc.children[2].children), 2) - self.assertEqual(len(doc.children[2].children[0].children), 2) - self.assertEqual(doc.children[2].children[1].tag, "para") + self.assertEqual(len(doc.children[2].children[0].children), 2) + self.assertEqual(doc.children[2].children[1].tag, 'para') def test_nested_headers(self): doc = self.read_layout("header_test.json") - self.assertEqual(len(doc.children[0].children), 2) - self.assertEqual(len(doc.children[0].children[0].children), 2) - self.assertEqual(len(doc.children[1].children[0].children[1].children), 2) - self.assertEqual( - doc.children[1].children[0].children[1].parent_text(), - "Article II > Section 1", - ) + self.assertEqual(len(doc.children[0].children), 2) + self.assertEqual(len(doc.children[0].children[0].children), 2) + self.assertEqual(len(doc.children[1].children[0].children[1].children), 2) + self.assertEqual(doc.children[1].children[0].children[1].parent_text(), "Article II > Section 1") def test_ooo_nested_headers(self): # OutOfOrder Header test case doc = self.read_layout("ooo_header_test.json") - self.assertEqual(len(doc.children[0].children), 0) - self.assertEqual(len(doc.children[1].children), 0) - self.assertEqual(len(doc.children[2].children), 2) - self.assertEqual(len(doc.children[2].children[0].children), 2) - self.assertEqual(len(doc.children[3].children[0].children[1].children), 2) - self.assertEqual( - doc.children[3].children[0].children[1].parent_text(), - "Article II > Section 1", - ) + self.assertEqual(len(doc.children[0].children), 0) + self.assertEqual(len(doc.children[1].children), 0) + self.assertEqual(len(doc.children[2].children), 2) + self.assertEqual(len(doc.children[2].children[0].children), 2) + self.assertEqual(len(doc.children[3].children[0].children[1].children), 2) + self.assertEqual(doc.children[3].children[0].children[1].parent_text(),"Article II > Section 1") def test_ooo_nested_header_children(self): # OutOfOrder Header children test case doc = self.read_layout("ooo_header_child_test.json") - self.assertEqual(len(doc.children[0].children), 0) - self.assertEqual(len(doc.children[1].children), 3) - self.assertEqual(len(doc.children[1].children[0].children), 0) - self.assertEqual(len(doc.children[1].children[1].children), 2) - self.assertEqual(len(doc.children[1].children[2].children), 3) - self.assertEqual(len(doc.children[1].children[2].children[0].children), 0) + self.assertEqual(len(doc.children[0].children), 0) + self.assertEqual(len(doc.children[1].children), 3) + self.assertEqual(len(doc.children[1].children[0].children), 0) + self.assertEqual(len(doc.children[1].children[1].children), 2) + self.assertEqual(len(doc.children[1].children[2].children), 3) + self.assertEqual(len(doc.children[1].children[2].children[0].children), 0) def test_table(self): doc = self.read_layout("table_test.json") @@ -113,10 +107,7 @@ def test_paragraph_iterator(self): b) Disclaimer 2 """ correct_text = self.clean_text(correct_text) - self.assertEqual( - paras[0].to_text(include_children=True, recurse=True), - correct_text.replace(" *", ""), - ) + self.assertEqual(paras[0].to_text(include_children=True, recurse=True), correct_text.replace(" *", "")) def test_chunk_iterator(self): doc = self.read_layout("chunk_test.json") @@ -166,9 +157,7 @@ def test_to_text(self): def test_to_html(self): doc = self.get_document("to_html_test.json") - correct_html = ( - "

Heading 1

Heading 2

Heading 3

" - ) + correct_html = ("

Heading 1

Heading 2

Heading 3

") self.assertEqual(doc.to_html(), correct_html) def test_to_markdown(self): @@ -180,5 +169,5 @@ def test_to_markdown(self): self.assertEqual(doc.to_markdown(), correct_markdown) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main()