Skip to content

Commit

Permalink
Fix issues with fixed thresholds
Browse files (browse the repository at this point in the history)
  • Loading branch information
VikParuchuri committed May 7, 2024
1 parent c8c1f06 commit 01c18b8
Show file tree
Hide file tree
Showing 12 changed files with 127 additions and 168 deletions.
4 changes: 3 additions & 1 deletion convert_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ def main():
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel_factor=args.parallel_factor, langs=langs)

fname = os.path.basename(fname)
save_markdown(args.output, fname, full_text, images, out_meta)
subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)

print(f"Saved markdown to the {subfolder_path} folder")


if __name__ == "__main__":
Expand Down
7 changes: 5 additions & 2 deletions marker/cleaners/headers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@
from typing import List, Tuple


def filter_common_elements(lines, page_count):
def filter_common_elements(lines, page_count, threshold=.6):
# We can't filter if we don't have enough pages to find common elements
if page_count < 3:
return []
text = [s.text for line in lines for s in line.spans if len(s.text) > 4]
counter = Counter(text)
common = [k for k, v in counter.items() if v > page_count * .6]
common = [k for k, v in counter.items() if v > page_count * threshold]
bad_span_ids = [s.span_id for line in lines for s in line.spans if s.text in common]
return bad_span_ids

Expand Down
3 changes: 2 additions & 1 deletion marker/cleaners/headings.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import List

from marker.settings import settings
from marker.schema.bbox import rescale_bbox
from marker.schema.block import bbox_from_lines
from marker.schema.page import Page
Expand All @@ -21,7 +22,7 @@ def split_heading_blocks(pages: List[Page]):
heading_lines = []
for line_idx, line in enumerate(block.lines):
for (heading_box, label) in page_heading_boxes:
if line.intersection_pct(heading_box) > .8:
if line.intersection_pct(heading_box) > settings.BBOX_INTERSECTION_THRESH:
heading_lines.append((line_idx, label))
break

Expand Down
2 changes: 1 addition & 1 deletion marker/equations/equations.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def find_equation_blocks(page, processor):
for region_idx, region in enumerate(equation_regions):
for block_idx, block in enumerate(page.blocks):
for line_idx, line in enumerate(block.lines):
if line.intersection_pct(region) > .8:
if line.intersection_pct(region) > settings.BBOX_INTERSECTION_THRESH:
# We will remove this line from the block
lines_to_remove[region_idx].append((block_idx, line_idx))
equation_lines[region_idx].append(line)
Expand Down
3 changes: 2 additions & 1 deletion marker/images/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from marker.pdf.images import render_bbox_image
from marker.schema.bbox import rescale_bbox
from marker.schema.block import find_insert_block, Span
from marker.settings import settings


def find_image_blocks(page):
Expand All @@ -13,7 +14,7 @@ def find_image_blocks(page):
for region_idx, region in enumerate(image_regions):
for block_idx, block in enumerate(page.blocks):
for line_idx, line in enumerate(block.lines):
if line.intersection_pct(region) > .8:
if line.intersection_pct(region) > settings.BBOX_INTERSECTION_THRESH:
line.spans = [] # We will remove this line from the block

if region_idx not in insert_points:
Expand Down
4 changes: 3 additions & 1 deletion marker/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,6 @@ def save_markdown(out_folder, fname, full_text, images, out_metadata):

for filename, image in images.items():
image_filepath = os.path.join(subfolder_path, filename)
image.save(image_filepath, "PNG")
image.save(image_filepath, "PNG")

return subfolder_path
4 changes: 0 additions & 4 deletions marker/pdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@ def find_filetype(fpath):
# The mimetype is not always consistent, so use a substring (`in`) check to match the most common formats
if "pdf" in mimetype:
return "pdf"
#elif "epub" in mimetype:
# return "epub"
#elif "mobi" in mimetype:
# return "mobi"
elif mimetype in settings.SUPPORTED_FILETYPES:
return settings.SUPPORTED_FILETYPES[mimetype]
else:
Expand Down
5 changes: 1 addition & 4 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,6 @@ def TORCH_DEVICE_MODEL(self) -> str:

SUPPORTED_FILETYPES: Dict = {
"application/pdf": "pdf",
#"application/epub+zip": "epub",
#"application/x-mobipocket-ebook": "mobi",
#"application/vnd.ms-xpsdocument": "xps",
#"application/x-fictionbook+xml": "fb2"
}

# Text line Detection
Expand Down Expand Up @@ -69,6 +65,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
SURYA_LAYOUT_DPI: int = 96
BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
LAYOUT_MODEL_CHECKPOINT: str = "vikp/surya_layout2"
BBOX_INTERSECTION_THRESH: float = 0.7 # How much the layout and pdf bboxes need to overlap to be the same

# Ordering model
SURYA_ORDER_DPI: int = 96
Expand Down
4 changes: 3 additions & 1 deletion marker/tables/cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from sklearn.cluster import DBSCAN
import numpy as np

from marker.settings import settings


def cluster_coords(coords):
if len(coords) == 0:
Expand All @@ -28,7 +30,7 @@ def find_column_separators(page: Page, table_box, round_factor=4, min_count=1):

line_boxes = [p.bbox for p in page.text_lines.bboxes]
line_boxes = [rescale_bbox(page.text_lines.image_bbox, page.bbox, l) for l in line_boxes]
line_boxes = [l for l in line_boxes if box_intersection_pct(l, table_box) > .8]
line_boxes = [l for l in line_boxes if box_intersection_pct(l, table_box) > settings.BBOX_INTERSECTION_THRESH]

for cell in line_boxes:
left_edges.append(cell[0] / round_factor * round_factor)
Expand Down
5 changes: 3 additions & 2 deletions marker/tables/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from tabulate import tabulate
from typing import List

from marker.settings import settings
from marker.tables.cells import assign_cells_to_columns
from marker.tables.utils import sort_table_blocks, replace_dots, replace_newlines

Expand Down Expand Up @@ -57,7 +58,7 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01, round_factor=4) -> L
for line_idx, line in enumerate(sorted_lines):
line_bbox = line["bbox"]
intersect_pct = box_intersection_pct(line_bbox, table_box)
if intersect_pct < .7:
if intersect_pct < settings.BBOX_INTERSECTION_THRESH:
continue
for span in line["spans"]:
for char in span["chars"]:
Expand Down Expand Up @@ -118,7 +119,7 @@ def format_tables(pages: List[Page]):
for table_idx, table_box in enumerate(page_table_boxes):
for block_idx, block in enumerate(page.blocks):
intersect_pct = block.intersection_pct(table_box)
if intersect_pct > .7 and block.block_type == "Table":
if intersect_pct > settings.BBOX_INTERSECTION_THRESH and block.block_type == "Table":
if table_idx not in table_insert_points:
table_insert_points[table_idx] = block_idx - len(blocks_to_remove) + table_idx # Where to insert the new table
blocks_to_remove.add(block_idx)
Expand Down
Loading

0 comments on commit 01c18b8

Please sign in to comment.