Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 28, 2025
2 parents 7118921 + 5f0bfd3 commit 44c289b
Show file tree
Hide file tree
Showing 6 changed files with 312 additions and 6 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ Options:
- `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
- `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`. The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables.

The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you don't need OCR, marker can work with any language.
The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py). If you don't need OCR, marker can work with any language.

## Convert multiple files

Expand Down Expand Up @@ -445,4 +445,4 @@ This work would not have been possible without amazing open source models and da
- Pypdfium2/pdfium
- DocLayNet from IBM

Thank you to the authors of these models and datasets for making them available to the community!
Thank you to the authors of these models and datasets for making them available to the community!
2 changes: 1 addition & 1 deletion marker/scripts/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class CommonParams(BaseModel):
] = None
languages: Annotated[
Optional[str],
Field(description="Comma separated list of languages to use for OCR. Must be either the names or codes from https://github.com/VikParuchuri/surya/blob/master/surya/languages.py.", example=None)
Field(description="Comma separated list of languages to use for OCR. Must be either the names or codes from https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py.", example=None)
] = None
force_ocr: Annotated[
bool,
Expand Down
59 changes: 57 additions & 2 deletions marker/scripts/streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,41 @@

import base64
import io
import json
import re
import string
import tempfile
from typing import Any, Dict

import pypdfium2
import streamlit as st
import streamlit.components.v1 as components
from PIL import Image

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser
from marker.output import text_from_rendered
from marker.schema import BlockTypes

# Ten-colour palette for the block overlay. It is handed to the HTML template
# as JSON, where a block type's ordinal (modulo the palette length) picks the
# fill/stroke colour.
COLORS = (
    "#4e79a7 #f28e2c #e15759 #76b7b2 #59a14f "
    "#edc949 #af7aa1 #ff9da7 #9c755f #bab0ab"
).split()

# Load the block-visualisation HTML once at import time and wrap it in a
# string.Template so block_display() can substitute its placeholders.
# The template declares UTF-8 and contains non-ASCII characters, so decode it
# explicitly instead of relying on the platform's locale default encoding.
with open(
    os.path.join(os.path.dirname(__file__), "streamlit_app_blocks_viz.html"),
    encoding="utf-8",
) as f:
    BLOCKS_VIZ_TMPL = string.Template(f.read())


@st.cache_resource()
def load_models():
Expand Down Expand Up @@ -86,6 +109,31 @@ def page_count(pdf_file: UploadedFile):
return 1


def pillow_image_to_base64_string(img: "Image.Image") -> str:
    """Encode a Pillow image as a base64 JPEG string.

    Args:
        img: Pillow image to encode.

    Returns:
        The JPEG bytes of ``img``, base64-encoded and returned as text
        (without any ``data:`` URL prefix).
    """
    # JPEG cannot store alpha or palette data; Pillow raises OSError when
    # asked to write e.g. RGBA or P images as JPEG. Normalise anything outside
    # the modes the JPEG writer accepts to RGB first.
    if img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        img = img.convert("RGB")

    buffered = io.BytesIO()
    img.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def block_display(image: "Image.Image", blocks: "dict | str | None" = None, dpi=96):
    """Render the page image with the interactive block overlay component.

    Args:
        image: Page image shown underneath the overlay; its width and height
            also size the embedded component.
        blocks: Block tree to draw, either as a dict or as an already
            serialised JSON string (this app passes the converter's text
            output here). ``None`` or an empty value shows the image only.
        dpi: Currently unused; kept so existing callers keep working.

    Returns:
        The result of ``components.html`` for the rendered template.
    """
    # The value is pasted verbatim into a <script> block, so it must be valid
    # JSON text: serialise dicts here instead of relying on their Python repr.
    if not blocks:
        blocks_json = "{}"
    elif isinstance(blocks, str):
        blocks_json = blocks
    else:
        blocks_json = json.dumps(blocks)
    # Block HTML comes from the converted document; a literal "</script>" in
    # it would terminate the inline script. "<\/" is an equivalent JSON escape.
    blocks_json = blocks_json.replace("</", "<\\/")

    template_values = {
        "image_data_url": (
            "data:image/jpeg;base64," + pillow_image_to_base64_string(image)
        ),
        "image_width": image.width,
        "image_height": image.height,
        "blocks_json": blocks_json,
        "colors_json": json.dumps(COLORS),
        # Map each block type name to its ordinal so the template can pick a
        # stable colour per type.
        "block_types_json": json.dumps(
            {bt.name: i for i, bt in enumerate(BlockTypes)}
        ),
    }
    return components.html(
        BLOCKS_VIZ_TMPL.substitute(**template_values),
        height=image.height,
        width=image.width,
    )


st.set_page_config(layout="wide")
col1, col2 = st.columns([.5, .5])

Expand All @@ -111,14 +159,18 @@ def page_count(pdf_file: UploadedFile):
# NOTE: this rebinds the name `page_count` from the helper function to its
# integer result for the rest of this script run.
page_count = page_count(in_file)
# Page numbers start at 0 (min_value=0), so the last selectable page is
# page_count - 1; allowing page_count itself would point one past the end.
page_number = st.number_input(f"Page number out of {page_count}:", min_value=0, value=0, max_value=max(page_count - 1, 0))
pil_image = get_page_image(in_file, page_number)
image_placeholder = st.empty()

with image_placeholder:
block_display(pil_image)

st.image(pil_image, caption="File preview", use_container_width=True)

page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number}")
output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
run_marker = st.sidebar.button("Run Marker")

use_llm = st.sidebar.checkbox("Use LLM", help="Use LLM for higher quality processing", value=False)
show_blocks = st.sidebar.checkbox("Show Blocks", help="Display detected blocks, only when output is JSON", value=False, disabled=output_format != "json")
force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
strip_existing_ocr = st.sidebar.checkbox("Strip existing OCR", help="Strip existing OCR text from the PDF and re-OCR.", value=False)
debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
Expand Down Expand Up @@ -158,6 +210,10 @@ def page_count(pdf_file: UploadedFile):
elif output_format == "html":
st.html(text)

if output_format == "json" and show_blocks:
with image_placeholder:
block_display(pil_image, text)

if debug:
with col1:
debug_data_path = rendered.metadata.get("debug_data_path")
Expand All @@ -168,4 +224,3 @@ def page_count(pdf_file: UploadedFile):
layout_image_path = os.path.join(debug_data_path, f"layout_page_{first_page}.png")
img = Image.open(layout_image_path)
st.image(img, caption="Layout debug image", use_container_width=True)

234 changes: 234 additions & 0 deletions marker/scripts/streamlit_app_blocks_viz.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<style>
body {
font-family: "Source Sans Pro",sans-serif;
font-weight: 400;
-moz-osx-font-smoothing: auto
}

.tippy-box {
font-size: 10px
}

.image-container {
position: relative;
width: 90%
}

.image-container img {
width: 100%;
height: auto
}

.blocks-overlay {
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%
}

.blocks-overlay rect.block {
fill-opacity: .2;
stroke-opacity: .5
}

.blocks-overlay rect.block:hover {
stroke-opacity: 1;
cursor: pointer
}

#block-info-dialog {
width: 65%
}

#block-info-dialog button.close-button {
font-size: 20px;
position: absolute;
top: 0;
right: 0;
margin: 0;
border: 0;
background: 0 0;
padding: 0 4px 0 0;
cursor: pointer
}

#block-info-dialog button.close-button:focus {
outline: 0
}

#block-info-dialog button.close-button::after {
content: "╳"
}

#block-info-dialog button.copy-json-button {
font-size: 10px;
color: #bababa;
cursor: pointer;
position: absolute;
bottom: 3px;
right: 3px;
border: 0;
background: 0 0
}

#block-info-dialog button.copy-json-button:hover {
color: #666
}

#block-info-dialog button.copy-json-button:active {
color: #000
}

#block-info-dialog h1 {
margin: 0 0 10px;
text-align: left;
font-size: 1em
}

#block-info-dialog .text-content {
overflow-y: auto;
font-family: monospace;
white-space: pre
}

#block-info-dialog .images {
display: flex;
flex-wrap: wrap;
justify-content: center;
gap: 10px;
margin-top: 10px
}

#block-info-dialog .images img {
max-width: 40%;
height: auto
}
</style>
</head>
<body>
<div style="text-align: center" class="image-container">
<dialog id="block-info-dialog">
<button
class="close-button"
onclick="document.querySelector('#block-info-dialog').close()"
></button>
<h1></h1>
<div class="text-content"></div>
<div class="images"></div>
<button
class="copy-json-button"
onclick="navigator.clipboard.writeText(this.parentNode.dataset.blockJSON)">
copy block JSON
</button>
</dialog>
<img
src="$image_data_url"
style="max-width: 100%; height: auto"
alt="Image"
/>
<svg
class="blocks-overlay"
width="$image_width"
height="$image_height"
></svg>
</div>
<script src="https://unpkg.com/@popperjs/core@2"></script>
<script src="https://unpkg.com/tippy.js@6"></script>
<script>
// Draws the block overlay on top of the page image and wires up the hover
// tooltips and the click-to-inspect dialog.
// NOTE: this file is read into a Python string.Template, so the dollar sign
// of every JS template-literal placeholder below is doubled on purpose.
const f = () => {
  const BLOCKS = $blocks_json;
  const COLORS = $colors_json;
  const BLOCK_TYPES = $block_types_json;
  const blocksById = {};
  const blockInfoDialog = document.querySelector("dialog#block-info-dialog");
  const blocksOverlay = document.querySelector(".blocks-overlay");

  // Colour for a block type: its ordinal in BLOCK_TYPES, wrapped to the palette.
  function blockTypeColor(blockType) {
    return COLORS[BLOCK_TYPES[blockType] % COLORS.length];
  }

  // Recursively builds one <rect> per block that has a polygon and records
  // each such block in blocksById so the click handler can look it up.
  function traverseAndGenerateSVG(block) {
    let svg = "";

    if (block.polygon) {
      const color = blockTypeColor(block.block_type);

      svg += `<rect id="$${block.id}"
        class="block type-$${block.block_type}"
        data-type="$${block.block_type}"
        x="$${block.polygon[0][0]}" y="$${block.polygon[0][1]}"
        width="$${block.polygon[1][0] - block.polygon[0][0]}"
        height="$${block.polygon[3][1] - block.polygon[1][1]}"
        fill="$${color}" stroke="$${color}">
      </rect>`;

      blocksById[block.id] = block;
    }

    if (Array.isArray(block.children) && block.children.length > 0) {
      block.children.forEach((child) => {
        svg += traverseAndGenerateSVG(child);
      });
    }

    return svg;
  }

  // Only BLOCKS.children[0] is drawn; bail out when there is nothing to draw
  // (the default empty object, or a tree without a usable first child).
  const root = Array.isArray(BLOCKS.children) ? BLOCKS.children[0] : undefined;
  if (!root || !root.polygon) {
    return;
  }

  // The third polygon point of the root block supplies the viewBox size.
  const [vbWidth, vbHeight] = root.polygon[2];
  blocksOverlay.setAttribute("viewBox", `0 0 $${vbWidth} $${vbHeight}`);
  blocksOverlay.innerHTML = traverseAndGenerateSVG(root);

  // Tooltips are optional: if the CDN script failed to load, keep the
  // click-to-inspect dialog working instead of aborting here.
  if (typeof tippy === "function") {
    tippy("rect.block", {
      content: (block) => block.getAttribute("data-type"),
      placement: "top-start",
      arrow: false,
      offset: [0, 5],
    });
  }

  blocksOverlay.addEventListener("click", (event) => {
    if (event.target.tagName !== "rect") return;

    const blockId = event.target.id;
    const block = blocksById[blockId];
    if (!block) return;

    // Build the heading from text nodes so block data is never parsed as HTML.
    const heading = blockInfoDialog.querySelector("h1");
    const typeLabel = document.createElement("span");
    typeLabel.style.color = blockTypeColor(block.block_type);
    typeLabel.textContent = "(" + block.block_type + ")";
    heading.textContent = blockId + " ";
    heading.appendChild(typeLabel);

    blockInfoDialog.querySelector(".text-content").textContent = block.html || "";

    blockInfoDialog.dataset.blockJSON = JSON.stringify(block, null, 2);

    // Always reset the image strip so images from a previously opened block
    // do not linger when this block has none.
    const imagesDiv = blockInfoDialog.querySelector(".images");
    imagesDiv.innerHTML = "";
    for (const image of Object.values(block.images || {})) {
      const img = document.createElement("img");
      img.src = "data:image/jpeg;base64," + image;
      imagesDiv.appendChild(img);
    }

    blockInfoDialog.showModal();
  });
}; f();
</script>
</body>
</html>
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ packages = [
{include = "marker"}
]
include = [
"marker/scripts/*.sh"
"marker/scripts/*.sh",
"marker/scripts/streamlit_app_blocks_viz.html",
]

[tool.poetry.dependencies]
Expand Down
Loading

0 comments on commit 44c289b

Please sign in to comment.