From e147ae63a5477768ea4cea533f997faed0fca2ce Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Fri, 24 Jan 2025 17:03:15 -0500
Subject: [PATCH] Fix span id issue

---
NOTE(review): this copy of the patch had been flattened onto two lines, and
text inside angle brackets was stripped during extraction. Affected spots: the
author address, the f-string bodies in convert_span, and the expected strings
in tests/builders/test_pdf_links.py (presumably `<span id=...>` markup, which
is why several -/+ pairs below look identical or empty). Line structure is
restored here; the stripped text is NOT reconstructed. Confirm against the
upstream commit before applying.

 marker/renderers/markdown.py     | 8 +++++---
 pyproject.toml                   | 2 +-
 tests/builders/test_pdf_links.py | 4 ++--
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py
index fef44e30..0762ab3c 100644
--- a/marker/renderers/markdown.py
+++ b/marker/renderers/markdown.py
@@ -168,11 +168,13 @@ def convert_table(self, el, text, convert_as_inline):
     def convert_a(self, el, text, convert_as_inline):
         text = self.escape(text)
         text = re.sub(r"([\[\]])", r"\\\1", text)
-        return super().convert_a(el, self.escape(text), convert_as_inline)
+        return super().convert_a(el, text, convert_as_inline)

     def convert_span(self, el, text, convert_as_inline):
-        return f''
-
+        if el.get("id"):
+            return f'{text}'
+        else:
+            return text

 class MarkdownOutput(BaseModel):
     markdown: str
diff --git a/pyproject.toml b/pyproject.toml
index 1c647961..0377a77a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "1.3.1"
+version = "1.3.2"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri "]
 readme = "README.md"
diff --git a/tests/builders/test_pdf_links.py b/tests/builders/test_pdf_links.py
index c639c59e..300a7579 100644
--- a/tests/builders/test_pdf_links.py
+++ b/tests/builders/test_pdf_links.py
@@ -29,7 +29,7 @@ def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf
     markdown = markdown_output.markdown

     assert '[II.](#page-1-0)' in markdown
-    assert 'II. THEORETICAL FRAMEWORK' in markdown
+    assert 'II. THEORETICAL FRAMEWORK' in markdown

-    for ref in set([f'' for m in re.findall(r'\]\(#page-(\d+)-(\d+)\)', markdown)]):
+    for ref in set([f'' for m in re.findall(r'\]\(#page-(\d+)-(\d+)\)', markdown)]):
         assert ref in markdown, f"Reference {ref} not found in markdown"