Merge remote-tracking branch 'origin/dev' into dev

VikParuchuri · Jan 28, 2025 · 44c289b · 44c289b
2 parents 7118921 + 5f0bfd3
commit 44c289b
Show file tree

Hide file tree

Showing 6 changed files with 312 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -117,7 +117,7 @@ Options:
 - `config --help`: List all available builders, processors, and converters, and their associated configuration.  These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
 - `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`.  The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables.
 
-The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py).  If you don't need OCR, marker can work with any language.
+The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py).  If you don't need OCR, marker can work with any language.
 
 ## Convert multiple files
 
@@ -445,4 +445,4 @@ This work would not have been possible without amazing open source models and da
 - Pypdfium2/pdfium
 - DocLayNet from IBM
 
-Thank you to the authors of these models and datasets for making them available to the community!
+Thank you to the authors of these models and datasets for making them available to the community!
diff --git a/marker/scripts/server.py b/marker/scripts/server.py
@@ -62,7 +62,7 @@ class CommonParams(BaseModel):
     ] = None
     languages: Annotated[
         Optional[str],
-        Field(description="Comma separated list of languages to use for OCR. Must be either the names or codes from from https://github.com/VikParuchuri/surya/blob/master/surya/languages.py.", example=None)
+        Field(description="Comma separated list of languages to use for OCR. Must be either the names or codes from from https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py.", example=None)
     ] = None
     force_ocr: Annotated[
         bool,

diff --git a/marker/scripts/streamlit_app.py b/marker/scripts/streamlit_app.py
@@ -7,18 +7,41 @@
 
 import base64
 import io
+import json
 import re
+import string
 import tempfile
 from typing import Any, Dict
 
 import pypdfium2
 import streamlit as st
+import streamlit.components.v1 as components
 from PIL import Image
 
 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
 from marker.config.parser import ConfigParser
 from marker.output import text_from_rendered
+from marker.schema import BlockTypes
+
+# Palette for the block overlay. It is passed to the visualization template as
+# JSON; the template picks a color per block type by taking the type's index
+# modulo the number of colors, so types beyond the tenth reuse colors.
+COLORS = [
+    "#4e79a7",
+    "#f28e2c",
+    "#e15759",
+    "#76b7b2",
+    "#59a14f",
+    "#edc949",
+    "#af7aa1",
+    "#ff9da7",
+    "#9c755f",
+    "#bab0ab"
+]
+
+with open(
+    os.path.join(os.path.dirname(__file__), "streamlit_app_blocks_viz.html")
+) as f:
+    BLOCKS_VIZ_TMPL = string.Template(f.read())
+
 
 @st.cache_resource()
 def load_models():
@@ -86,6 +109,31 @@ def page_count(pdf_file: UploadedFile):
         return 1
 
 
+def pillow_image_to_base64_string(img: Image) -> str:
+    buffered = io.BytesIO()
+    img.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode("utf-8")
+
+
+def block_display(image: Image, blocks: dict = {}, dpi=96):
+    image_data_url = (
+        'data:image/jpeg;base64,' + pillow_image_to_base64_string(image)
+    )
+
+    template_values = {
+        "image_data_url": image_data_url,
+        "image_width": image.width, "image_height": image.height,
+        "blocks_json": blocks, "colors_json": json.dumps(COLORS),
+        "block_types_json": json.dumps({
+            bt.name: i for i, bt in enumerate(BlockTypes)
+        })
+    }
+    return components.html(
+        BLOCKS_VIZ_TMPL.substitute(**template_values),
+        height=image.height, width=image.width
+    )
+
+
 st.set_page_config(layout="wide")
 col1, col2 = st.columns([.5, .5])
 
@@ -111,14 +159,18 @@ def page_count(pdf_file: UploadedFile):
     page_count = page_count(in_file)
     page_number = st.number_input(f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count)
     pil_image = get_page_image(in_file, page_number)
+    image_placeholder = st.empty()
+
+    with image_placeholder:
+        block_display(pil_image)
 
-    st.image(pil_image, caption="File preview", use_container_width=True)
 
 page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number}")
 output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
 run_marker = st.sidebar.button("Run Marker")
 
 use_llm = st.sidebar.checkbox("Use LLM", help="Use LLM for higher quality processing", value=False)
+show_blocks = st.sidebar.checkbox("Show Blocks", help="Display detected blocks, only when output is JSON", value=False, disabled=output_format != "json")
 force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
 strip_existing_ocr = st.sidebar.checkbox("Strip existing OCR", help="Strip existing OCR text from the PDF and re-OCR.", value=False)
 debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
@@ -158,6 +210,10 @@ def page_count(pdf_file: UploadedFile):
     elif output_format == "html":
         st.html(text)
 
+# When JSON output is selected and "Show Blocks" is ticked, redraw the page
+# preview in the same placeholder, this time passing `text` (the output
+# rendered above) so the template can overlay the detected blocks.
+if output_format == "json" and show_blocks:
+    with image_placeholder:
+        block_display(pil_image, text)
+
 if debug:
     with col1:
         debug_data_path = rendered.metadata.get("debug_data_path")
@@ -168,4 +224,3 @@ def page_count(pdf_file: UploadedFile):
             layout_image_path = os.path.join(debug_data_path, f"layout_page_{first_page}.png")
             img = Image.open(layout_image_path)
             st.image(img, caption="Layout debug image", use_container_width=True)
-
diff --git a/marker/scripts/streamlit_app_blocks_viz.html b/marker/scripts/streamlit_app_blocks_viz.html
@@ -0,0 +1,234 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <style>
+        body {
+            font-family: "Source Sans Pro",sans-serif;
+            font-weight: 400;
+            -moz-osx-font-smoothing: auto
+        }
+
+        .tippy-box {
+            font-size: 10px
+        }
+
+        .image-container {
+            position: relative;
+            width: 90%
+        }
+
+        .image-container img {
+            width: 100%;
+            height: auto
+        }
+
+        .blocks-overlay {
+            position: absolute;
+            top: 0;
+            left: 0;
+            width: 100%;
+            height: 100%
+        }
+
+        .blocks-overlay rect.block {
+            fill-opacity: .2;
+            stroke-opacity: .5
+        }
+
+        .blocks-overlay rect.block:hover {
+            stroke-opacity: 1;
+            cursor: pointer
+        }
+
+        #block-info-dialog {
+            width: 65%
+        }
+
+        #block-info-dialog button.close-button {
+            font-size: 20px;
+            position: absolute;
+            top: 0;
+            right: 0;
+            margin: 0;
+            border: 0;
+            background: 0 0;
+            padding: 0 4px 0 0;
+            cursor: pointer
+        }
+
+        #block-info-dialog button.close-button:focus {
+            outline: 0
+        }
+
+        #block-info-dialog button.close-button::after {
+            content: "╳"
+        }
+
+        #block-info-dialog button.copy-json-button {
+            font-size: 10px;
+            color: #bababa;
+            cursor: pointer;
+            position: absolute;
+            bottom: 3px;
+            right: 3px;
+            border: 0;
+            background: 0 0
+        }
+
+        #block-info-dialog button.copy-json-button:hover {
+            color: #666
+        }
+
+        #block-info-dialog button.copy-json-button:active {
+            color: #000
+        }
+
+        #block-info-dialog h1 {
+            margin: 0 0 10px;
+            text-align: left;
+            font-size: 1em
+        }
+
+        #block-info-dialog .text-content {
+            overflow-y: auto;
+            font-family: monospace;
+            white-space: pre
+        }
+
+        #block-info-dialog .images {
+            display: flex;
+            flex-wrap: wrap;
+            justify-content: center;
+            gap: 10px;
+            margin-top: 10px
+        }
+
+        #block-info-dialog .images img {
+            max-width: 40%;
+            height: auto
+        }
+    </style>
+  </head>
+  <body>
+    <div style="text-align: center" class="image-container">
+      <dialog id="block-info-dialog">
+        <button 
+          class="close-button"
+          onclick="document.querySelector('#block-info-dialog').close()"
+        ></button>
+        <h1></h1>
+        <div class="text-content"></div>
+        <div class="images"></div>
+        <button
+          class="copy-json-button"
+          onclick="navigator.clipboard.writeText(this.parentNode.dataset.blockJSON)">
+          copy block JSON
+        </button>
+      </dialog>
+      <img
+        src="$image_data_url"
+        style="max-width: 100%; height: auto"
+        alt="Image"
+      />
+      <svg
+        class="blocks-overlay"
+        width="$image_width"
+        height="$image_height"
+      ></svg>
+    </div>
+    <script src="https://unpkg.com/@popperjs/core@2"></script>
+    <script src="https://unpkg.com/tippy.js@6"></script>
+    <script>
+      const f = () => {
+        const BLOCKS = $blocks_json;
+        const COLORS = $colors_json;
+        const BLOCK_TYPES = $block_types_json;
+        const blocksById = {};
+        const blockInfoDialog = document.querySelector("dialog#block-info-dialog");
+
+        // Map a block type name to a palette color: the type's index in
+        // BLOCK_TYPES, wrapped around the length of COLORS.
+        function blockTypeColor(blockType) {
+          return COLORS[BLOCK_TYPES[blockType] % COLORS.length];
+        }
+
+        // Recursively build the SVG markup for a block and all of its
+        // descendants, and record every drawn block in blocksById so the
+        // click handler can look it up by the rect's id.
+        // Returns the concatenated <rect> markup as a string.
+        function traverseAndGenerateSVG(block) {
+          let svg = "";
+
+          if (block.polygon) {
+            const color = blockTypeColor(block.block_type);
+
+            // Dollar signs are doubled because this file is read into a
+            // Python string.Template, where a doubled dollar sign yields a
+            // literal one.
+            // NOTE(review): the rect math assumes polygon holds four [x, y]
+            // corners of an axis-aligned box, presumably ordered top-left,
+            // top-right, bottom-right, bottom-left -- confirm against the
+            // JSON renderer.
+            svg += `<rect id="$${block.id}"
+                                class="block type-$${block.block_type}"
+                                data-type="$${block.block_type}"
+                                x="$${block.polygon[0][0]}" y="$${block.polygon[0][1]}"
+                                width="$${
+                                  block.polygon[1][0] - block.polygon[0][0]
+                                }"
+                                height="$${
+                                  block.polygon[3][1] - block.polygon[1][1]
+                                }"
+                                fill=$${color} stroke=$${color}>
+                          </rect>`;
+
+            blocksById[block.id] = block;
+          }
+
+          // Children are appended after their parent's rect.
+          if (Array.isArray(block.children) && block.children.length > 0) {
+            block.children.forEach((child) => {
+              svg += traverseAndGenerateSVG(child);
+            });
+          }
+
+          return svg;
+        }
+
+        if (Object.keys(BLOCKS).length == 0) {
+          // bail out if no blocks
+          return;
+        }
+
+        const [vbWidth, vbHeight] = BLOCKS.children[0].polygon[2];
+        document
+            .querySelector("svg")
+            .setAttribute("viewBox", `0 0 $${vbWidth} $${vbHeight}`);
+
+        const blocksOverlay = document.querySelector(".blocks-overlay");
+        blocksOverlay.innerHTML = traverseAndGenerateSVG(BLOCKS.children[0]);
+
+        tippy("rect.block", {
+            content: (block) => block.getAttribute("data-type"),
+            placement: "top-start",
+            arrow: false,
+            offset: [0, 5],
+        });
+
+        blocksOverlay.addEventListener("click", (event) => {
+            if (event.target.tagName !== "rect") return;
+
+            const blockId = event.target.id;
+            const block = blocksById[blockId];
+
+            blockInfoDialog.querySelector("h1").innerHTML = `
+              $${blockId} <span style="color: $${blockTypeColor(block.block_type)}">($${block.block_type})</span>
+            `;
+            blockInfoDialog.querySelector(".text-content").textContent = block.html;
+
+            blockInfoDialog.dataset.blockJSON = JSON.stringify(block, null, 2);
+
+            if (block.images) {
+                const imagesDiv = blockInfoDialog.querySelector(".images");
+                imagesDiv.innerHTML = "";
+                for ([id, image] of Object.entries(block.images)) {
+                  const img = document.createElement("img");
+                  img.src = "data:image/jpeg;base64," + image;
+                  imagesDiv.appendChild(img);
+                }
+            }
+            blockInfoDialog.showModal();
+        });
+      }; f();
+    </script>
+  </body>
+</html>
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,7 +11,8 @@ packages = [
     {include = "marker"}
 ]
 include = [
-    "marker/scripts/*.sh"
+    "marker/scripts/*.sh",
+    # Template that marker/scripts/streamlit_app.py opens at import time;
+    # the path is relative to the project root.
+    "marker/scripts/streamlit_app_blocks_viz.html",
 ]
 
 [tool.poetry.dependencies]