Skip to content

Commit

Permalink
Fix whitespace inside tags
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 15, 2025
1 parent 81fd95e commit 7678463
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 7 deletions.
9 changes: 5 additions & 4 deletions marker/renderers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,11 @@ def merge_consecutive_tags(html, tag):
return html

         def replace_whitespace(match):
-            return match.group(1)
+            whitespace = match.group(1)
+            if len(whitespace) == 0:
+                return ""
+            else:
+                return " "

         pattern = fr'</{tag}>(\s*)<{tag}>'

Expand All @@ -57,9 +61,6 @@ def replace_whitespace(match):
                 break
             html = new_merged

-        # Replace consecutive whitespace
-        html = re.sub(r'\s+', ' ', html)
-
         return html

def generate_page_stats(self, document: Document, document_output):
Expand Down
4 changes: 1 addition & 3 deletions marker/renderers/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,10 @@ def extract_html(self, document, document_output, level=0):
images.update(sub_images)
ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))

+        output = str(soup)
         if level == 0:
-            output = soup.prettify()
             output = self.merge_consecutive_tags(output, 'b')
             output = self.merge_consecutive_tags(output, 'i')
-        else:
-            output = str(soup)

         return output, images

Expand Down

0 comments on commit 7678463

Please sign in to comment.