-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f26834c
commit 360126e
Showing
11 changed files
with
100 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
from statistics import mean | ||
|
||
from marker.processors import BaseProcessor | ||
from marker.schema import BlockTypes | ||
from marker.schema.document import Document | ||
|
||
|
||
class OrderProcessor(BaseProcessor):
    """
    Re-sorts the top-level blocks of a page when their order may be wrong.
    This can help when the layout image was sliced.
    """
    block_types = tuple()

    def __call__(self, document: Document):
        """
        Reorder ``page.structure`` in place for every eligible page.

        A page is eligible only when its ``text_extraction_method`` is
        ``"pdftext"`` and its ``layout_sliced`` flag is truthy. Each block id
        in the page structure receives a numeric sort key, and the structure
        is then replaced by a copy sorted on that key.
        """
        for page in document.pages:
            # Both conditions must hold; every other page is left untouched.
            if page.text_extraction_method != "pdftext" or not page.layout_sliced:
                continue

            # Pass 1: blocks that contain spans are keyed by the midpoint of
            # the first span's minimum_position and the last span's
            # maximum_position.
            order_keys = {}
            for block_id in page.structure:
                span_blocks = document.get_block(block_id).contained_blocks(
                    document, (BlockTypes.Span, )
                )
                if len(span_blocks) > 0:
                    first_span, last_span = span_blocks[0], span_blocks[-1]
                    order_keys[block_id] = (
                        first_span.minimum_position + last_span.maximum_position
                    ) / 2

            # Pass 2: blocks with no key yet (or a non-positive one) borrow a
            # key from the nearest keyed neighbour. Keys assigned here are
            # visible to later iterations of this same loop.
            for block_id in page.structure:
                if order_keys.get(block_id, 0) > 0:
                    continue

                current = document.get_block(block_id)
                before = document.get_prev_block(current)
                after = document.get_next_block(current)

                # Walk backwards to the closest block that already has a key.
                while before and before.id not in order_keys:
                    before = document.get_prev_block(before)

                if before:
                    # Sort directly after the keyed predecessor.
                    order_keys[block_id] = order_keys[before.id] + 1
                    continue

                # No keyed predecessor: walk forwards instead.
                while after and after.id not in order_keys:
                    after = document.get_next_block(after)

                if after:
                    # Sort directly before the keyed successor.
                    order_keys[block_id] = order_keys[after.id] - 1
                else:
                    # No keyed neighbour in either direction.
                    order_keys[block_id] = 0

            page.structure = sorted(page.structure, key=lambda bid: order_keys[bid])
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,28 @@ | ||
import re | ||
|
||
from marker.schema import BlockTypes | ||
from marker.schema.blocks import Block | ||
|
||
|
||
def superscript(child_blocks):
    """
    Wrap the leading digit/symbol run of the first line in ``<sup>`` tags.

    Follows the first child at each level of ``child_blocks`` down to the
    deepest descendant. If that descendant's ``id.block_type`` is
    ``BlockTypes.Line``, its ``html`` is left-stripped and a leading run of
    digits and non-word characters, if present, is wrapped in
    ``<sup>...</sup>``. The block is modified in place.

    Args:
        child_blocks: Sequence of child outputs exposing ``children``, ``id``
            and ``html``. ``None`` or an empty sequence is a no-op.

    Returns:
        None.
    """
    first_block = None
    # Truthiness (rather than len()) also ends the descent cleanly when a
    # node reports its children as None instead of an empty list.
    while child_blocks:
        first_block = child_blocks[0]
        child_blocks = first_block.children

    if first_block is not None and first_block.id.block_type == BlockTypes.Line:
        # Group 1: leading digits and/or non-word characters (\W also covers
        # whitespace and punctuation). Group 2: the rest of that line.
        digit_start = r"^([0-9\W]+)(.*)"
        first_block.html = re.sub(digit_start, r"<sup>\1</sup>\2", first_block.html.lstrip())
|
||
|
||
class Footnote(Block):
    # Block subclass whose block_type is BlockTypes.Footnote; its HTML is
    # emitted as a single <p> element.
    block_type: BlockTypes = BlockTypes.Footnote

    def assemble_html(self, child_blocks, parent_structure):
        """
        Build the footnote HTML.

        The parent implementation assembles the inner HTML, newlines in it are
        replaced with spaces, ``superscript`` is applied to ``child_blocks``,
        and the flattened inner HTML is returned wrapped in ``<p>`` tags.
        """
        inner_html = super().assemble_html(child_blocks, parent_structure)
        single_line = inner_html.replace("\n", " ")

        # Superscript the leading marker of the first line (mutates the child
        # output in place; the returned string is built from single_line).
        superscript(child_blocks)
        return f"<p>{single_line}</p>"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters