Merge pull request #480 from tarun-menta/table_benchmarks
Table benchmarks
VikParuchuri authored Jan 15, 2025
2 parents 9324f10 + b7e32ef commit 50b5573
Showing 5 changed files with 418 additions and 3 deletions.
18 changes: 17 additions & 1 deletion README.md
@@ -370,7 +370,7 @@ There are some settings that you may find useful if things aren't working the wa
Pass the `debug` option to activate debug mode. This will save images of each page with detected layout and text, as well as output a json file with additional bounding box information.
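
For example, a hedged invocation (assuming the `marker_single` entry point exposes the `debug` config option as a `--debug` flag):

```shell
# Assumes marker_single surfaces the debug config option as a CLI flag
marker_single /path/to/file.pdf --debug
```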

# Benchmarks

## Overall PDF Conversion
Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I convert the latex to text, and compare the reference to the output of text extraction methods. It's noisy, but at least directionally correct.

**Speed**
@@ -393,6 +393,13 @@ Marker takes about 6GB of VRAM on average per task, so you can convert 8 documen

![Benchmark results](data/images/per_doc.png)

## Table Conversion
Marker can extract tables from your PDFs using `marker.converters.table.TableConverter`. Table extraction performance is measured by comparing the extracted HTML representation of each table against the original HTML, using the test split of [FinTabNet](https://developer.ibm.com/exchanges/data/all/fintabnet/). The HTML representations are compared with a tree-edit-distance-based metric that scores both structure and cell content. Marker detects all tables in a PDF page and identifies their structure, achieving an average score of `0.65` with this approach; a minimal usage sketch follows the table below.

| Avg score | Total tables |
|-------------|----------------|
| 0.65 | 1149 |
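
For reference, a minimal sketch of driving the `TableConverter` directly, mirroring the setup used by the benchmark script added in this commit (`benchmarks/table/table.py`); the PDF path is a placeholder:

```python
# Minimal sketch: extract the tables in one PDF as HTML with TableConverter.
from marker.config.parser import ConfigParser
from marker.converters.table import TableConverter
from marker.models import create_model_dict

config_parser = ConfigParser({})
converter = TableConverter(
    config=config_parser.generate_config_dict(),
    artifact_dict=create_model_dict(),
    processor_list=config_parser.get_processors(),
    renderer="marker.renderers.html.HTMLRenderer",
)
rendered = converter("example.pdf")  # placeholder path
print(rendered.html)  # HTML containing one <table> element per detected table
```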

## Running your own benchmarks

You can benchmark the performance of marker on your machine. Install marker manually with:
@@ -402,12 +409,21 @@
```shell
git clone https://github.com/VikParuchuri/marker.git
poetry install
```

### Overall PDF Conversion

Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. Then run the overall benchmark like this:

```shell
python benchmarks/overall.py data/pdfs data/references report.json
```

### Table Conversion
The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with:

```shell
python benchmarks/table/table.py table_report.json --max 1000
```
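
Each predicted table is scored against its ground-truth HTML with the TEDS-style tree edit distance implemented in `benchmarks/table/scoring.py` (added in this commit). A minimal sketch of scoring a single pair, assuming it is run from the `benchmarks/table` directory so the local `scoring` module resolves; the two HTML snippets are toy examples:

```python
# Toy example of the TEDS-style scoring used by the table benchmark.
from scoring import wrap_table_html, similarity_eval_html

pred = "<table><tr><td>Revenue</td><td>100</td></tr></table>"
gt = "<table><tr><td>Revenue</td><td>120</td></tr></table>"

score = similarity_eval_html(wrap_table_html(pred), wrap_table_html(gt))
print(f"TEDS score: {score:.3f}")  # 1.0 means a perfect structure and content match
```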

# Thanks

This work would not have been possible without amazing open source models and datasets, including (but not limited to):
114 changes: 114 additions & 0 deletions benchmarks/table/scoring.py
@@ -0,0 +1,114 @@
'''
TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
'''

from typing import List

from tqdm import tqdm
import distance
from apted import APTED, Config
from apted.helpers import Tree
from lxml import html
from collections import deque
import numpy as np

def wrap_table_html(table_html:str)->str:
return f'<html><body>{table_html}</body></html>'

class TableTree(Tree):
def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
self.tag = tag
self.colspan = colspan
self.rowspan = rowspan
self.content = content
self.children = list(children)

def bracket(self):
"""Show tree using brackets notation"""
if self.tag == 'td':
result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
(self.tag, self.colspan, self.rowspan, self.content)
else:
result = '"tag": %s' % self.tag
for child in self.children:
result += child.bracket()
return "{{{}}}".format(result)

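# APTED cost model: renaming costs 1 when tag/colspan/rowspan differ; for matching <td>
# nodes the cost is the normalized Levenshtein distance between their cell token lists.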
class CustomConfig(Config):
@staticmethod
def maximum(*sequences):
"""Get maximum possible value
"""
return max(map(len, sequences))

def normalized_distance(self, *sequences):
"""Get distance from 0 to 1
"""
return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)

def rename(self, node1, node2):
"""Compares attributes of trees"""
if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
return 1.
if node1.tag == 'td':
if node1.content or node2.content:
return self.normalized_distance(node1.content, node2.content)
return 0.

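# tokenize() and tree_convert_html() share the module-level __tokens__ list as a scratch
# buffer, collecting a cell's tag and character tokens before they are attached to the tree.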
def tokenize(node):
''' Tokenizes table cells
'''
global __tokens__
__tokens__.append('<%s>' % node.tag)
if node.text is not None:
__tokens__ += list(node.text)
for n in node.getchildren():
tokenize(n)
if node.tag != 'unk':
__tokens__.append('</%s>' % node.tag)
if node.tag != 'td' and node.tail is not None:
__tokens__ += list(node.tail)

def tree_convert_html(node, convert_cell=False, parent=None):
''' Converts HTML tree to the format required by apted
'''
global __tokens__
if node.tag == 'td':
if convert_cell:
__tokens__ = []
tokenize(node)
cell = __tokens__[1:-1].copy()
else:
cell = []
new_node = TableTree(node.tag,
int(node.attrib.get('colspan', '1')),
int(node.attrib.get('rowspan', '1')),
cell, *deque())
else:
new_node = TableTree(node.tag, None, None, None, *deque())
if parent is not None:
parent.children.append(new_node)
if node.tag != 'td':
for n in node.getchildren():
tree_convert_html(n, convert_cell, new_node)
if parent is None:
return new_node

def similarity_eval_html(pred, true, structure_only=False):
    ''' Computes the TEDS score between the prediction and the ground truth of a
        given sample
    '''
pred, true = html.fromstring(pred), html.fromstring(true)
if pred.xpath('body/table') and true.xpath('body/table'):
pred = pred.xpath('body/table')[0]
true = true.xpath('body/table')[0]
n_nodes_pred = len(pred.xpath(".//*"))
n_nodes_true = len(true.xpath(".//*"))
tree_pred = tree_convert_html(pred, convert_cell=not structure_only)
tree_true = tree_convert_html(true, convert_cell=not structure_only)
n_nodes = max(n_nodes_pred, n_nodes_true)
distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
return 1.0 - (float(distance) / n_nodes)
else:
return 0.0

105 changes: 105 additions & 0 deletions benchmarks/table/table.py
@@ -0,0 +1,105 @@
import base64
import os
import time
import datasets
from tqdm import tqdm
import tempfile
import click
from tabulate import tabulate
import json
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from pypdfium2._helpers.misc import PdfiumError

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS

from marker.config.parser import ConfigParser
from marker.converters.table import TableConverter
from marker.models import create_model_dict

from scoring import wrap_table_html, similarity_eval_html
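# Benchmark flow: render each FinTabNet PDF page with TableConverter, pair the detected
# <table> elements with the ground-truth HTML (both in reading order), then score each
# pair with the TEDS-style tree edit distance from scoring.py.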

def update_teds_score(result):
prediction, ground_truth = result['marker_table'], result['gt_table']
prediction, ground_truth = wrap_table_html(prediction), wrap_table_html(ground_truth)
score = similarity_eval_html(prediction, ground_truth)
result.update({'score':score})
return result


@click.command(help="Benchmark Table to HTML Conversion")
@click.argument("out_file", type=str)
@click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
@click.option("--max", type=int, default=None, help="Maximum number of PDFs to process")
def main(out_file, dataset, max):
models = create_model_dict()
config_parser = ConfigParser({})
start = time.time()


dataset = datasets.load_dataset(dataset, split='train')
dataset = dataset.shuffle(seed=0)

iterations = len(dataset)
if max is not None:
iterations = min(max, len(dataset))

results = []
for i in tqdm(range(iterations), desc='Converting Tables'):
try:
row = dataset[i]
pdf_binary = base64.b64decode(row['pdf'])
gt_tables = row['tables'] #Already sorted by reading order, which is what marker returns

converter = TableConverter(
config=config_parser.generate_config_dict(),
artifact_dict=models,
processor_list=config_parser.get_processors(),
renderer='marker.renderers.html.HTMLRenderer'
)

with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
temp_pdf_file.write(pdf_binary)
temp_pdf_file.seek(0)
marker_table_html = converter(temp_pdf_file.name).html

marker_table_soup = BeautifulSoup(marker_table_html, 'html.parser')
marker_detected_tables = marker_table_soup.find_all('table')
            if len(marker_detected_tables) == 0:
                print('No tables detected, skipping...')
                continue

for marker_table_soup, gt_table in zip(marker_detected_tables, gt_tables):
gt_table_html = gt_table['html']

#marker wraps the table in <tbody> which fintabnet data doesn't
marker_table_soup.find('tbody').unwrap()
#Fintabnet doesn't use th tags, need to be replaced for fair comparison
for th_tag in marker_table_soup.find_all('th'):
th_tag.name = 'td'
marker_table_html = str(marker_table_soup)

results.append({
"marker_table": marker_table_html,
"gt_table": gt_table_html
})
except PdfiumError:
print('Broken PDF, Skipping...')
continue

print(f"Total time: {time.time() - start}")

with ThreadPoolExecutor(max_workers=16) as executor:
results = list(tqdm(executor.map(update_teds_score, results), desc='Computing alignment scores', total=len(results)))
avg_score = sum([r["score"] for r in results]) / len(results)

headers = ["Avg score", "Total tables"]
data = [f"{avg_score:.3f}", len(results)]
table = tabulate([data], headers=headers, tablefmt="github")
print(table)
print("Avg score computed by comparing marker predicted HTML with original HTML")

with open(out_file, "w+") as f:
json.dump(results, f, indent=2)

if __name__ == '__main__':
main()