diff --git a/marker/converters/table.py b/marker/converters/table.py index ac555d87..a664fa49 100644 --- a/marker/converters/table.py +++ b/marker/converters/table.py @@ -25,7 +25,6 @@ class TableConverter(PdfConverter): def build_document(self, filepath: str): provider_cls = provider_from_filepath(filepath) - pdf_provider = provider_cls(filepath, self.config) layout_builder = self.resolve_dependencies(self.layout_builder_class) ocr_builder = self.resolve_dependencies(OcrBuilder) document_builder = DocumentBuilder(self.config) diff --git a/marker/renderers/__init__.py b/marker/renderers/__init__.py index 5a53432c..e45744e7 100644 --- a/marker/renderers/__init__.py +++ b/marker/renderers/__init__.py @@ -47,7 +47,11 @@ def merge_consecutive_tags(html, tag): return html def replace_whitespace(match): - return match.group(1) + whitespace = match.group(1) + if len(whitespace) == 0: + return "" + else: + return " " pattern = fr'(\s*)<{tag}>' @@ -57,9 +61,6 @@ def replace_whitespace(match): break html = new_merged - # Replace consecutive whitespace - html = re.sub(r'\s+', ' ', html) - return html def generate_page_stats(self, document: Document, document_output): diff --git a/marker/renderers/html.py b/marker/renderers/html.py index 6f31a738..4f3a27c7 100644 --- a/marker/renderers/html.py +++ b/marker/renderers/html.py @@ -78,12 +78,10 @@ def extract_html(self, document, document_output, level=0): images.update(sub_images) ref.replace_with(BeautifulSoup(f"{content}", 'html.parser')) + output = str(soup) if level == 0: - output = soup.prettify() output = self.merge_consecutive_tags(output, 'b') output = self.merge_consecutive_tags(output, 'i') - else: - output = str(soup) return output, images