From 04bb7ad25bd808630c6878420a4f47ad43ca0d85 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Wed, 15 Jan 2025 13:01:50 -0500
Subject: [PATCH] Minor cleanups

---
 README.md                   |  2 +-
 benchmarks/table/scoring.py | 33 ++++++++++++++-------------------
 benchmarks/table/table.py   | 18 +++++++++---------
 3 files changed, 24 insertions(+), 29 deletions(-)
diff --git a/README.md b/README.md
index 7f67e3fe..418f0395 100644
--- a/README.md
+++ b/README.md
@@ -421,7 +421,7 @@ python benchmarks/overall.py data/pdfs data/references report.json
 The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with:
 
 ```shell
-python benchmarks/table/table.py table_report.json --max 1000
+python benchmarks/table/table.py table_report.json --max_rows 1000
 ```
 
 # Thanks
diff --git a/benchmarks/table/scoring.py b/benchmarks/table/scoring.py
index 81715182..940bd6e4 100644
--- a/benchmarks/table/scoring.py
+++ b/benchmarks/table/scoring.py
@@ -1,16 +1,12 @@
-'''
+"""
 TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
-'''
+"""
 
-from typing import List
-
-from tqdm import tqdm
 import distance
 from apted import APTED, Config
 from apted.helpers import Tree
 from lxml import html
 from collections import deque
-import numpy as np
 
 def wrap_table_html(table_html:str)->str:
     return f'<html><body>{table_html}</body></html>'
@@ -21,7 +17,9 @@ def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
         self.colspan = colspan
         self.rowspan = rowspan
         self.content = content
-        self.children = list(children)
+
+        # Sets self.name and self.children
+        super().__init__(tag, *children)
 
     def bracket(self):
         """Show tree using brackets notation"""
@@ -37,17 +35,12 @@ def bracket(self):
 class CustomConfig(Config):
     @staticmethod
     def maximum(*sequences):
-        """Get maximum possible value
-        """
         return max(map(len, sequences))
 
     def normalized_distance(self, *sequences):
-        """Get distance from 0 to 1
-        """
         return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
 
     def rename(self, node1, node2):
-        """Compares attributes of trees"""
         if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
             return 1.
         if node1.tag == 'td':
@@ -56,8 +49,9 @@ def rename(self, node1, node2):
         return 0.
 
 def tokenize(node):
-    ''' Tokenizes table cells
-    '''
+    """
+    Tokenizes table cells
+    """
     global __tokens__
     __tokens__.append('<%s>' % node.tag)
     if node.text is not None:
@@ -70,8 +64,9 @@ def tokenize(node):
             __tokens__ += list(node.tail)
 
 def tree_convert_html(node, convert_cell=False, parent=None):
-    ''' Converts HTML tree to the format required by apted
-    '''
+    """
+    Converts HTML tree to the format required by apted
+    """
     global __tokens__
     if node.tag == 'td':
         if convert_cell:
@@ -95,9 +90,9 @@ def tree_convert_html(node, convert_cell=False, parent=None):
         return new_node
 
 def similarity_eval_html(pred, true, structure_only=False):
-    ''' Computes TEDS score between the prediction and the ground truth of a
-        given samples
-    '''
+    """
+    Computes TEDS score between the prediction and the ground truth of a given sample
+    """
     pred, true = html.fromstring(pred), html.fromstring(true)
     if pred.xpath('body/table') and true.xpath('body/table'):
         pred = pred.xpath('body/table')[0]
diff --git a/benchmarks/table/table.py b/benchmarks/table/table.py
index a11c0cf7..a6e4bd82 100644
--- a/benchmarks/table/table.py
+++ b/benchmarks/table/table.py
@@ -1,5 +1,7 @@
-import base64
 import os
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # Transformers uses .isin for a simple op, which is not supported on MPS
+
+import base64
 import time
 import datasets
 from tqdm import tqdm
@@ -11,8 +13,6 @@
 from concurrent.futures import ThreadPoolExecutor
 from pypdfium2._helpers.misc import PdfiumError
 
-os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # Transformers uses .isin for a simple op, which is not supported on MPS
-
 from marker.config.parser import ConfigParser
 from marker.converters.table import TableConverter
 from marker.models import create_model_dict
@@ -30,10 +30,10 @@ def update_teds_score(result):
 @click.command(help="Benchmark Table to HTML Conversion")
 @click.argument("out_file", type=str)
 @click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
-@click.option("--max", type=int, default=None, help="Maximum number of PDFs to process")
-def main(out_file, dataset, max):
+@click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process")
+def main(out_file: str, dataset: str, max_rows: int):
     models = create_model_dict()
-    config_parser = ConfigParser({})
+    config_parser = ConfigParser({'output_format': 'html'})
     start = time.time()
 
 
@@ -41,8 +41,8 @@ def main(out_file, dataset, max):
     dataset = dataset.shuffle(seed=0)
 
     iterations = len(dataset)
-    if max is not None:
-        iterations = min(max, len(dataset))
+    if max_rows is not None:
+        iterations = min(max_rows, len(dataset))
 
     results = []
     for i in tqdm(range(iterations), desc='Converting Tables'):
@@ -55,7 +55,7 @@ def main(out_file, dataset, max):
                 config=config_parser.generate_config_dict(),
                 artifact_dict=models,
                 processor_list=config_parser.get_processors(),
-                renderer='marker.renderers.html.HTMLRenderer'
+                renderer=config_parser.get_renderer()
             )
 
             with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file: