diff --git a/README.md b/README.md index 880dba9e..5d952a0f 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,7 @@ Options: - `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults. - `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`. The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables. -The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you don't need OCR, marker can work with any language. +The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py). If you don't need OCR, marker can work with any language. ## Convert multiple files @@ -445,4 +445,4 @@ This work would not have been possible without amazing open source models and da - Pypdfium2/pdfium - DocLayNet from IBM -Thank you to the authors of these models and datasets for making them available to the community! \ No newline at end of file +Thank you to the authors of these models and datasets for making them available to the community! diff --git a/marker/scripts/server.py b/marker/scripts/server.py index 24d6746b..47a1875b 100644 --- a/marker/scripts/server.py +++ b/marker/scripts/server.py @@ -62,7 +62,7 @@ class CommonParams(BaseModel): ] = None languages: Annotated[ Optional[str], - Field(description="Comma separated list of languages to use for OCR. Must be either the names or codes from from https://github.com/VikParuchuri/surya/blob/master/surya/languages.py.", example=None) + Field(description="Comma separated list of languages to use for OCR. 
Must be either the names or codes from https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py.", example=None) ] = None force_ocr: Annotated[ bool, diff --git a/marker/scripts/streamlit_app.py b/marker/scripts/streamlit_app.py index e4f107d8..5b38e052 100644 --- a/marker/scripts/streamlit_app.py +++ b/marker/scripts/streamlit_app.py @@ -7,18 +7,41 @@ import base64 import io +import json import re +import string import tempfile from typing import Any, Dict import pypdfium2 import streamlit as st +import streamlit.components.v1 as components from PIL import Image from marker.converters.pdf import PdfConverter from marker.models import create_model_dict from marker.config.parser import ConfigParser from marker.output import text_from_rendered +from marker.schema import BlockTypes + +COLORS = [ + "#4e79a7", + "#f28e2c", + "#e15759", + "#76b7b2", + "#59a14f", + "#edc949", + "#af7aa1", + "#ff9da7", + "#9c755f", + "#bab0ab" +] + +with open( + os.path.join(os.path.dirname(__file__), "streamlit_app_blocks_viz.html") +) as f: + BLOCKS_VIZ_TMPL = string.Template(f.read()) + @st.cache_resource() def load_models(): @@ -86,6 +109,31 @@ def page_count(pdf_file: UploadedFile): return 1 +def pillow_image_to_base64_string(img: Image) -> str: + buffered = io.BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + + +def block_display(image: Image, blocks: dict = {}, dpi=96): + image_data_url = ( + 'data:image/jpeg;base64,' + pillow_image_to_base64_string(image) + ) + + template_values = { + "image_data_url": image_data_url, + "image_width": image.width, "image_height": image.height, + "blocks_json": blocks, "colors_json": json.dumps(COLORS), + "block_types_json": json.dumps({ + bt.name: i for i, bt in enumerate(BlockTypes) + }) + } + return components.html( + BLOCKS_VIZ_TMPL.substitute(**template_values), + height=image.height, width=image.width + ) + + st.set_page_config(layout="wide") col1, col2 = 
st.columns([.5, .5]) @@ -111,14 +159,18 @@ def page_count(pdf_file: UploadedFile): page_count = page_count(in_file) page_number = st.number_input(f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count) pil_image = get_page_image(in_file, page_number) + image_placeholder = st.empty() + + with image_placeholder: + block_display(pil_image) - st.image(pil_image, caption="File preview", use_container_width=True) page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number}") output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0) run_marker = st.sidebar.button("Run Marker") use_llm = st.sidebar.checkbox("Use LLM", help="Use LLM for higher quality processing", value=False) +show_blocks = st.sidebar.checkbox("Show Blocks", help="Display detected blocks, only when output is JSON", value=False, disabled=output_format != "json") force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False) strip_existing_ocr = st.sidebar.checkbox("Strip existing OCR", help="Strip existing OCR text from the PDF and re-OCR.", value=False) debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False) @@ -158,6 +210,10 @@ def page_count(pdf_file: UploadedFile): elif output_format == "html": st.html(text) +if output_format == "json" and show_blocks: + with image_placeholder: + block_display(pil_image, text) + if debug: with col1: debug_data_path = rendered.metadata.get("debug_data_path") @@ -168,4 +224,3 @@ def page_count(pdf_file: UploadedFile): layout_image_path = os.path.join(debug_data_path, f"layout_page_{first_page}.png") img = Image.open(layout_image_path) st.image(img, caption="Layout debug image", use_container_width=True) - diff --git a/marker/scripts/streamlit_app_blocks_viz.html b/marker/scripts/streamlit_app_blocks_viz.html new file mode 100644 index 00000000..b31ee0a8 --- /dev/null +++ 
b/marker/scripts/streamlit_app_blocks_viz.html @@ -0,0 +1,234 @@ + + + + + + + + +
+ + +

+
+
+ +
+ Image + +
+ + + + + diff --git a/pyproject.toml b/pyproject.toml index 0377a77a..6f43ac0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,8 @@ packages = [ {include = "marker"} ] include = [ - "marker/scripts/*.sh" + "marker/scripts/*.sh", + "marker/scripts/streamlit_app_blocks_viz.html", ] [tool.poetry.dependencies] diff --git a/signatures/version1/cla.json b/signatures/version1/cla.json index aeba1b2d..cf7a0a19 100644 --- a/signatures/version1/cla.json +++ b/signatures/version1/cla.json @@ -143,6 +143,22 @@ "created_at": "2025-01-05T16:23:12Z", "repoId": 712111618, "pullRequestNo": 464 + }, + { + "name": "jazzido", + "id": 27584, + "comment_id": 2610428000, + "created_at": "2025-01-23T17:01:02Z", + "repoId": 712111618, + "pullRequestNo": 502 + }, + { + "name": "tagliala", + "id": 556268, + "comment_id": 2614522545, + "created_at": "2025-01-26T17:44:13Z", + "repoId": 712111618, + "pullRequestNo": 507 } ] } \ No newline at end of file