-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into block-display-tool-pr
- Loading branch information
Showing
194 changed files
with
146,544 additions
and
46,420 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Workflow that runs the marker command-line scripts against benchmark PDFs
# on every push.
name: Test CLI scripts

on: [push]

# Environment variables shared by all steps.
env:
  TORCH_DEVICE: "cpu"
  OCR_ENGINE: "surya"

jobs:
  tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML keeps the version a string rather than a float.
          python-version: "3.11"
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install
      - name: Download benchmark data
        run: |
          wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
          unzip -o benchmark_data.zip
      - name: Test single script
        run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0
      - name: Test convert script
        run: poetry run marker benchmark_data/pdfs --max_files 1 --workers 1 --page_range 0
      - name: Test convert script multiple workers
        run: poetry run marker benchmark_data/pdfs --max_files 2 --workers 2 --page_range 0-5
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import json | ||
from PIL import Image | ||
import google.generativeai as genai | ||
from google.ai.generativelanguage_v1beta.types import content | ||
from marker.settings import settings | ||
|
||
# Instruction text passed to the model together with the table image. It asks
# for the table as HTML restricted to <table>, <tr>, and <td> tags.
prompt = """
You're an expert document analyst who is good at turning tables in documents into HTML. Analyze the provided image, and convert it to a faithful HTML representation.
Guidelines:
- Keep the HTML simple and concise.
- Only include the <table> tag and contents.
- Only use <table>, <tr>, and <td> tags. Only use the colspan and rowspan attributes if necessary. Do not use <tbody>, <thead>, or <th> tags.
- Make sure the table is as faithful to the image as possible with the given tags.
**Instructions**
1. Analyze the image, and determine the table structure.
2. Convert the table image to HTML, following the guidelines above.
3. Output only the HTML for the table, starting with the <table> tag and ending with the </table> tag.
""".strip()

# Runs at import time: hands the API key from marker's settings to the
# google.generativeai client.
genai.configure(api_key=settings.GOOGLE_API_KEY)
|
||
def gemini_table_rec(image: Image.Image):
    """Send a table image to Gemini and return the HTML it produces.

    The request asks for a JSON reply matching a schema with one required
    string field, ``table_html``; the value of that field is returned.

    Args:
        image: The table image to convert.

    Returns:
        The ``table_html`` value parsed from the model's JSON reply.
    """
    generation_config = {
        "temperature": 0,
        "response_schema": content.Schema(
            type=content.Type.OBJECT,
            required=["table_html"],
            properties={
                "table_html": content.Schema(type=content.Type.STRING),
            },
        ),
        "response_mime_type": "application/json",
    }

    reply = genai.GenerativeModel("gemini-1.5-flash").generate_content(
        # According to gemini docs, it performs better if the image is the first element
        [image, prompt],
        stream=False,
        generation_config=generation_config,
        request_options={'timeout': 60},
    )

    # Only the first part of the first candidate is read.
    first_part = reply.candidates[0].content.parts[0]
    return json.loads(first_part.text)["table_html"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
"""
TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
"""
|
||
import distance | ||
from apted import APTED, Config | ||
from apted.helpers import Tree | ||
from lxml import html | ||
from collections import deque | ||
|
||
def wrap_table_html(table_html: str) -> str:
    """Embed ``table_html`` inside a minimal ``<html><body>`` document."""
    return '<html><body>{}</body></html>'.format(table_html)
|
||
class TableTree(Tree):
    """Tree node for one HTML element, carrying table-specific attributes.

    In addition to what the ``Tree`` base class stores, each node keeps its
    tag plus optional ``colspan``, ``rowspan`` and ``content`` values.
    """

    def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
        # The base initializer sets self.name and self.children.
        super().__init__(tag, *children)
        self.tag = tag
        self.colspan = colspan
        self.rowspan = rowspan
        self.content = content

    def bracket(self):
        """Show tree using brackets notation"""
        if self.tag == 'td':
            # Only cells include their span attributes and content.
            head = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % (
                self.tag, self.colspan, self.rowspan, self.content)
        else:
            head = '"tag": %s' % self.tag
        nested = "".join(child.bracket() for child in self.children)
        return "{" + head + nested + "}"
|
||
class CustomConfig(Config):
    """APTED configuration that defines the rename cost between table nodes."""

    @staticmethod
    def maximum(*sequences):
        """Return the length of the longest sequence in ``sequences``."""
        return max(map(len, sequences))

    def normalized_distance(self, *sequences):
        """Return the Levenshtein distance divided by the longest length.

        When every sequence is empty the result is 0.0; this avoids a
        division by zero.
        """
        longest = self.maximum(*sequences)
        if longest == 0:
            return 0.0
        return float(distance.levenshtein(*sequences)) / longest

    def rename(self, node1, node2):
        """Return the cost of relabelling ``node1`` as ``node2``.

        The cost is 1.0 when tag, colspan or rowspan differ. For matching
        ``td`` nodes where at least one has content, it is the normalized
        distance between their contents. Otherwise it is 0.0.
        """
        if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
            return 1.
        if node1.tag == 'td':
            if node1.content or node2.content:
                return self.normalized_distance(node1.content, node2.content)
        return 0.
|
||
def tokenize(node):
    """
    Tokenizes table cells

    Appends to the module-level ``__tokens__`` list, which the caller must
    have initialised beforehand. In order, it appends: an opening-tag token,
    the characters of ``node.text``, the tokens of every child element, a
    closing-tag token (skipped for ``unk`` tags), and, for any tag other
    than ``td``, the characters of ``node.tail``.
    """
    global __tokens__
    __tokens__.append('<%s>' % node.tag)
    if node.text is not None:
        __tokens__ += list(node.text)
    # Iterate the element directly; getchildren() is deprecated.
    for child in node:
        tokenize(child)
    if node.tag != 'unk':
        __tokens__.append('</%s>' % node.tag)
    if node.tag != 'td' and node.tail is not None:
        __tokens__ += list(node.tail)
|
||
def tree_convert_html(node, convert_cell=False, parent=None):
    """
    Converts HTML tree to the format required by apted

    Args:
        node: The HTML element to convert.
        convert_cell: When true, each ``td`` node stores the tokens of its
            contents; otherwise its content is an empty list.
        parent: The already-converted parent node to attach to, or ``None``
            for the root call.

    Returns:
        The converted root node when ``parent`` is ``None``; otherwise
        ``None``, with the new node appended to ``parent.children``.
    """
    global __tokens__
    if node.tag == 'td':
        if convert_cell:
            __tokens__ = []
            tokenize(node)
            # Drop the first and last tokens: the cell's own <td> and </td>.
            cell = __tokens__[1:-1]
        else:
            cell = []
        new_node = TableTree(
            node.tag,
            int(node.attrib.get('colspan', '1')),
            int(node.attrib.get('rowspan', '1')),
            cell,
        )
    else:
        new_node = TableTree(node.tag, None, None, None)
    if parent is not None:
        parent.children.append(new_node)
    if node.tag != 'td':
        # Cells are leaves here; only non-cell elements are descended into.
        # Iterate the element directly; getchildren() is deprecated.
        for child in node:
            tree_convert_html(child, convert_cell, new_node)
    if parent is None:
        return new_node
    return None
|
||
def similarity_eval_html(pred, true, structure_only=False):
    """
    Computes the TEDS score between a prediction and its ground truth

    Args:
        pred: Predicted HTML document containing a ``body/table`` element.
        true: Ground-truth HTML document containing a ``body/table`` element.
        structure_only: When true, cell contents are not tokenized, so only
            tags and span attributes are compared.

    Returns:
        ``1.0 - edit_distance / n_nodes``, where ``n_nodes`` is the larger
        descendant count of the two tables. Returns 0.0 when either document
        has no ``body/table``, and 1.0 when both tables have no descendants.
    """
    pred_tables = html.fromstring(pred).xpath('body/table')
    true_tables = html.fromstring(true).xpath('body/table')
    if not (pred_tables and true_tables):
        return 0.0

    pred_table, true_table = pred_tables[0], true_tables[0]
    n_nodes = max(len(pred_table.xpath(".//*")), len(true_table.xpath(".//*")))
    if n_nodes == 0:
        # Two tables with no descendants are identical; avoid dividing by zero.
        return 1.0

    tree_pred = tree_convert_html(pred_table, convert_cell=not structure_only)
    tree_true = tree_convert_html(true_table, convert_cell=not structure_only)
    # Named edit_distance so it does not shadow the imported distance module.
    edit_distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
    return 1.0 - (float(edit_distance) / n_nodes)
|
Oops, something went wrong.