From 04bb7ad25bd808630c6878420a4f47ad43ca0d85 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 15 Jan 2025 13:01:50 -0500 Subject: [PATCH] Minor cleanups --- README.md | 2 +- benchmarks/table/scoring.py | 33 ++++++++++++++------------------- benchmarks/table/table.py | 18 +++++++++--------- 3 files changed, 24 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 7f67e3fe..418f0395 100644 --- a/README.md +++ b/README.md @@ -421,7 +421,7 @@ python benchmarks/overall.py data/pdfs data/references report.json The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with: ```shell -python benchmarks/table/table.py table_report.json --max 1000 +python benchmarks/table/table.py table_report.json --max_rows 1000 ``` # Thanks diff --git a/benchmarks/table/scoring.py b/benchmarks/table/scoring.py index 81715182..940bd6e4 100644 --- a/benchmarks/table/scoring.py +++ b/benchmarks/table/scoring.py @@ -1,16 +1,12 @@ -''' +"""" TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD -''' +""" -from typing import List - -from tqdm import tqdm import distance from apted import APTED, Config from apted.helpers import Tree from lxml import html from collections import deque -import numpy as np def wrap_table_html(table_html:str)->str: return f'{table_html}' @@ -21,7 +17,9 @@ def __init__(self, tag, colspan=None, rowspan=None, content=None, *children): self.colspan = colspan self.rowspan = rowspan self.content = content - self.children = list(children) + + # Sets self.name and self.children + super().__init__(tag, *children) def bracket(self): """Show tree using brackets notation""" @@ -37,17 +35,12 @@ def bracket(self): class CustomConfig(Config): @staticmethod def maximum(*sequences): - """Get maximum possible value - """ return max(map(len, sequences)) def normalized_distance(self, *sequences): - """Get distance from 0 to 1 - """ return float(distance.levenshtein(*sequences)) / self.maximum(*sequences) def rename(self, node1, node2): - """Compares attributes of trees""" if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): return 1. if node1.tag == 'td': @@ -56,8 +49,9 @@ def rename(self, node1, node2): return 0. def tokenize(node): - ''' Tokenizes table cells - ''' + """ + Tokenizes table cells + """ global __tokens__ __tokens__.append('<%s>' % node.tag) if node.text is not None: @@ -70,8 +64,9 @@ def tokenize(node): __tokens__ += list(node.tail) def tree_convert_html(node, convert_cell=False, parent=None): - ''' Converts HTML tree to the format required by apted - ''' + """ + Converts HTML tree to the format required by apted + """ global __tokens__ if node.tag == 'td': if convert_cell: @@ -95,9 +90,9 @@ def tree_convert_html(node, convert_cell=False, parent=None): return new_node def similarity_eval_html(pred, true, structure_only=False): - ''' Computes TEDS score between the prediction and the ground truth of a - given samples - ''' + """ + Computes TEDS score between the prediction and the ground truth of a given samples + """ pred, true = html.fromstring(pred), html.fromstring(true) if pred.xpath('body/table') and true.xpath('body/table'): pred = pred.xpath('body/table')[0] diff --git a/benchmarks/table/table.py b/benchmarks/table/table.py index a11c0cf7..a6e4bd82 100644 --- a/benchmarks/table/table.py +++ b/benchmarks/table/table.py @@ -1,5 +1,7 @@ -import base64 import os +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS + +import base64 import time import datasets from tqdm import tqdm @@ -11,8 +13,6 @@ from concurrent.futures import ThreadPoolExecutor from pypdfium2._helpers.misc import PdfiumError -os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS - from marker.config.parser import ConfigParser from marker.converters.table import TableConverter from marker.models import create_model_dict @@ -30,10 +30,10 @@ def update_teds_score(result): @click.command(help="Benchmark Table to HTML Conversion") @click.argument("out_file", type=str) @click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use") -@click.option("--max", type=int, default=None, help="Maximum number of PDFs to process") -def main(out_file, dataset, max): +@click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process") +def main(out_file: str, dataset: str, max_rows: int): models = create_model_dict() - config_parser = ConfigParser({}) + config_parser = ConfigParser({'output_format': 'html'}) start = time.time() @@ -41,8 +41,8 @@ def main(out_file, dataset, max): dataset = dataset.shuffle(seed=0) iterations = len(dataset) - if max is not None: - iterations = min(max, len(dataset)) + if max_rows is not None: + iterations = min(max_rows, len(dataset)) results = [] for i in tqdm(range(iterations), desc='Converting Tables'): @@ -55,7 +55,7 @@ def main(out_file, dataset, max): config=config_parser.generate_config_dict(), artifact_dict=models, processor_list=config_parser.get_processors(), - renderer='marker.renderers.html.HTMLRenderer' + renderer=config_parser.get_renderer() ) with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file: