Skip to content

Commit

Permalink
Notion: Fix bugs in nesting, callouts, #-escapes, <br> in formatting,…
Browse files Browse the repository at this point in the history
… highlights (#335)

* fixes nested blocks in callouts & callout titles;
* fixes leading <br>s before <strong|em> tags;
* improve block equation stability
* fix issue with nested highlights
* fix unescaped #tags from Notion — updated regex
* fix #s escaping within inline code
* fixes bug where <br> breaks Notion formatting;
* fixed splitBrs func that had no effect;
* now parses strikethroughs correctly;
* fix leading <br> for inline math [Notion]
* fix type stability in Notion callout conversion
  • Loading branch information
felciabatta authored Dec 17, 2024
1 parent 239464c commit 52cef2b
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 38 deletions.
150 changes: 116 additions & 34 deletions src/formats/notion/convert-to-md.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,14 @@ import { FrontMatterCache, htmlToMarkdown, moment } from 'obsidian';
import { parseFilePath } from '../../filesystem';
import { parseHTML, serializeFrontMatter } from '../../util';
import { ZipEntryFile } from '../../zip';
import { NotionLink, NotionProperty, NotionPropertyType, NotionResolverInfo, YamlProperty } from './notion-types';
import {
NotionLink,
NotionProperty,
NotionPropertyType,
NotionResolverInfo,
YamlProperty,
FormatTagName,
} from './notion-types';
import {
escapeHashtags,
getNotionId,
Expand Down Expand Up @@ -52,9 +59,8 @@ export async function readToMarkdown(info: NotionResolverInfo, file: ZipEntryFil
}
}

replaceNestedTags(body, 'strong');
replaceNestedTags(body, 'em');
fixNotionEmbeds(body);
fixFormatTags(body, ['strong', 'em', 'mark', 'del']);
fixNotionBookmarks(body);
// fixEquations must come before fixNotionCallouts
fixEquations(body);
stripLinkFormatting(body);
Expand All @@ -73,14 +79,7 @@ export async function readToMarkdown(info: NotionResolverInfo, file: ZipEntryFil
replaceTableOfContents(body);
formatDatabases(body);

let htmlString = body.innerHTML;

// Simpler to just use the HTML string for this replacement
splitBrsInFormatting(htmlString, 'strong');
splitBrsInFormatting(htmlString, 'em');


let markdownBody = htmlToMarkdown(htmlString);
let markdownBody = htmlToMarkdown(body.innerHTML);
if (info.singleLineBreaks) {
// Making sure that any blockquote is preceded by an empty line (otherwise messes up formatting with consecutive blockquotes / callouts)
markdownBody = markdownBody.replace(/\n\n(?!>)/g, '\n');
Expand Down Expand Up @@ -230,12 +229,20 @@ function fixDoubleBackslash(markdownBody: string) {
function fixEquations(body: HTMLElement) {
// Style tags before equations mess up formatting
removeTags(body, 'style');
// Notion adds an extra <br> if there is math just after a linebreak
stripLeadingBr(body, 'span.notion-text-equation-token');
const dom = body.ownerDocument;
// Display Equations
const figEqnEls = body.findAll('figure.equation');
for (const figEqn of figEqnEls) {
const annotation = figEqn.find('annotation');
if (!annotation) continue;
figEqn.replaceWith(`$$${formatMath(annotation.textContent)}$$`);
// Turn into <div> for reliable Markdown conversion
const mathDiv = dom.createElement('div');
mathDiv.className = 'annotation';
// Put in <div> to aid stability of htmlToMarkdown conversion
mathDiv.appendText(`$$${formatMath(annotation.textContent)}$$`);
figEqn.replaceWith(mathDiv);
}
// Inline Equations
const spanEqnEls = body.findAll('span.notion-text-equation-token');
Expand All @@ -254,7 +261,7 @@ function fixEquations(body: HTMLElement) {
* matched by "\\\\" and "\s" in the regex.
*/
function formatMath(math: string | null | undefined, inline: boolean=false): string {
let regex = new RegExp(/^(?:[\s\r\n]|\\\\|\\\s)*(.*?)[\s\r\n\\]*$/, 's');
let regex = new RegExp(/^(?:\s|\\\\|\\\s)*(.*?)[\s\\]*$/, 's');
return math?.replace(regex, '$1').replace(/[\r\n]+/g, (inline ? ' ' : '\n')) ?? '';
}

Expand All @@ -268,29 +275,56 @@ function isCallout(element: Element) {
}

function fixNotionCallouts(body: HTMLElement) {
const dom = body.ownerDocument;
for (let callout of body.findAll('figure.callout')) {
// Can have 1–2 children; we always want .lastElementChild for callout content.
const description = callout.lastElementChild?.textContent;
let calloutBlock = `> [!important]\n> ${description}\n`;
if (callout.nextElementSibling && isCallout(callout.nextElementSibling)) {
calloutBlock += '\n';
}
const content = callout.lastElementChild?.childNodes;
if (!content) continue;
// Reformat as blockquote; HTMLtoMarkdown will convert automatically
const calloutBlock = dom.createElement('blockquote');
calloutBlock.append(...Array.from(content));
// Add & format callout title element
quoteToCallout(calloutBlock);
callout.replaceWith(calloutBlock);
}
}

function fixNotionEmbeds(body: HTMLElement) {
// Notion embeds are a box with images and description, we simplify for Obsidian.
for (let embed of body.findAll('a.bookmark.source')) {
const link = embed.getAttribute('href');
const title = embed.find('div.bookmark-title')?.textContent;
const description = stripToSentence(embed.find('div.bookmark-description')?.textContent ?? '');
/**
 * Converts a blockquote into an Obsidian-style callout, in place.
 *
 * The first child of the blockquote is inspected to decide whether it can serve as
 * the callout title:
 * - a text node: its text becomes the title
 * - a <p>: its inner HTML becomes the title
 * - an <em>, <strong>, <del> or <mark>: the whole element becomes the title
 * - anything else (or an empty blockquote): the title is left empty and the
 *   existing first child is kept as callout content
 *
 * The title is always wrapped in its own <p>, to avoid a #text node concatenating
 * with other elements. Keeping the blockquote structure lets htmlToMarkdown deal
 * with nesting.
 *
 * If the callout is empty, an empty callout will still be created.
 *
 * @param quoteBlock - blockquote element to rewrite; it is mutated.
 */
function quoteToCallout(quoteBlock: HTMLQuoteElement): void {
	const first: ChildNode | null = quoteBlock.firstChild;
	const name = first?.nodeName ?? '';
	const titlePar = quoteBlock.ownerDocument.createElement('p');
	let titleHtml = '';

	if (name === '#text') {
		// textContent is decoded text, but it is written back through innerHTML below.
		// Re-escape it so literal `<`, `>` and `&` in the title are not parsed as markup.
		titleHtml = (first?.textContent ?? '')
			.replace(/&/g, '&amp;')
			.replace(/</g, '&lt;')
			.replace(/>/g, '&gt;');
	}
	else if (name === 'P') {
		titleHtml = (first as Element).innerHTML;
	}
	else if (['EM', 'STRONG', 'DEL', 'MARK'].includes(name)) {
		titleHtml = (first as Element).outerHTML;
	}
	else {
		// First child is not a usable title: insert the (still empty) title paragraph
		// in front of it, so the replaceWith below only swaps the paragraph with itself.
		quoteBlock.prepend(titlePar);
	}

	// callout title must fit on one line in the MD file
	titleHtml = titleHtml.replace(/<br>/g, '&lt;br&gt;');
	titlePar.innerHTML = `[!important] ${titleHtml}`;
	quoteBlock.firstChild?.replaceWith(titlePar);
}

function fixNotionBookmarks(body: HTMLElement) {
// Notion bookmarks are a box with images and description, we simplify for Obsidian.
for (let bookmark of body.findAll('a.bookmark.source')) {
const link = bookmark.getAttribute('href');
const title = bookmark.find('div.bookmark-title')?.textContent;
const description = stripToSentence(bookmark.find('div.bookmark-description')?.textContent ?? '');
let calloutBlock = `> [!info] ${title}\n` + `> ${description}\n` + `> [${link}](${link})\n`;
if (embed.nextElementSibling && isCallout(embed.nextElementSibling)) {
if (bookmark.nextElementSibling && isCallout(bookmark.nextElementSibling)) {
// separate callouts with spaces
calloutBlock += '\n';
}
embed.replaceWith(calloutBlock);
bookmark.replaceWith(calloutBlock);
}
}

Expand Down Expand Up @@ -328,7 +362,24 @@ function removeTags(body: HTMLElement, tag: string) {
}
}

function replaceNestedTags(body: HTMLElement, tag: 'strong' | 'em') {
/**
 * Fixes issues with formatting tags in Notion HTML export.
 *
 * Four passes are applied to `body`, each one for every entry of `tagNames`
 * before the next pass starts:
 * 1. reducing nested tags
 * 2. merging adjacent tags
 * 3. stripping leading <br> artifacts
 * 4. splitting tags at nested <br> points
 */
function fixFormatTags(body: HTMLElement, tagNames: FormatTagName[]) {
	// must occur in the order shown
	const passes = [replaceNestedTags, mergeAdjacentTags, stripLeadingBr, splitBrsInFormatting];
	for (const pass of passes) {
		for (const t of tagNames) pass(body, t);
	}
}

function replaceNestedTags(body: HTMLElement, tag: FormatTagName) {
for (const el of body.findAll(tag)) {
if (!el.parentElement || el.parentElement.tagName === tag.toUpperCase()) {
continue;
Expand All @@ -341,15 +392,46 @@ function replaceNestedTags(body: HTMLElement, tag: 'strong' | 'em') {
}
}

function splitBrsInFormatting(htmlString: string, tag: 'strong' | 'em') {
const tags = htmlString.match(new RegExp(`<${tag}>(.|\n)*</${tag}>`));
/**
 * Merges identical tags that sit next to each other.
 *
 * For every element matching `tagName`, the parent's HTML is rewritten so that a
 * closing tag followed (optionally after spaces) by an opening tag of the same
 * name is removed; the spaces between them are kept.
 */
function mergeAdjacentTags(body: HTMLElement, tagName: FormatTagName) {
	const matches = body.findAll(tagName);
	if (!matches) return;
	const boundary = new RegExp(`</${tagName}>( *)<${tagName}>`, 'g');
	matches.forEach((el) => {
		const container = el?.parentElement;
		if (!container) return;
		// Reassigning innerHTML re-parses the parent's children.
		container.innerHTML = container.innerHTML.replace(boundary, '$1');
	});
}

/**
 * Strips leading <br> artifacts created by Notion.
 * These often occur before strong | em | mark | del tags.
 *
 * For every element matching `tagName`, the node immediately before it is removed
 * if that node is a <br>.
 */
function stripLeadingBr(body: HTMLElement, tagName: FormatTagName) {
	const targets = body.findAll(tagName);
	if (!targets) return;
	targets.forEach((el) => {
		const before = el.previousSibling;
		if (before !== null && before.nodeName === 'BR') {
			before.remove();
		}
	});
}

function splitBrsInFormatting(body: HTMLElement, tagName: FormatTagName) {
// Simpler to just use the HTML string for this replacement
let htmlString = body.innerHTML;
const tags = htmlString.match(new RegExp(`<${tagName}>.*?</${tagName}>`, 'sg'));
if (!tags) return;
for (let tag of tags.filter((tag) => tag.contains('<br />'))) {
for (let tag of tags.filter((tag) => tag.includes('<br>'))) {
htmlString = htmlString.replace(
tag,
tag.split('<br />').join(`</${tag}><br /><${tag}>`)
tag.split('<br>').join(`</${tagName}><br><${tagName}>`)
);
}
body.innerHTML = htmlString;
}

function replaceTableOfContents(body: HTMLElement) {
Expand All @@ -362,8 +444,8 @@ function replaceTableOfContents(body: HTMLElement) {
}

function encodeNewlinesToBr(body: HTMLElement) {
body.innerHTML = body.innerHTML.replace(/\n/g, '<br />');
// Since <br /> is ignored in codeblocks, we replace with newlines
body.innerHTML = body.innerHTML.replace(/(?:\n|<br ?\/>)/g, '<br>');
// Since <br> is ignored in codeblocks, we replace with newlines
for (const block of body.findAll('code')) {
for (const br of block.findAll('br')) {
br.replaceWith('\n');
Expand Down
2 changes: 2 additions & 0 deletions src/formats/notion/notion-types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ export type YamlProperty = {
title: string;
};

/**
 * Name of an inline formatting tag handled by the Notion converter.
 *
 * NOTE(review): the trailing `| string` widens the whole union to plain `string`,
 * so the listed literals serve as documentation only and are not enforced by the
 * compiler. Callers appear to rely on this to pass selector strings — confirm
 * before narrowing.
 */
export type FormatTagName = 'strong' | 'em' | 'mark' | 'del' | string;

export type NotionLink =
{
type: 'relation';
Expand Down
28 changes: 24 additions & 4 deletions src/formats/notion/notion-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,28 @@ export function stripParentDirectories(relativeURI: string) {
return relativeURI.replace(/^(\.\.\/)+/, '');
}

/**
* Replace all tag-like things #<word> in the document with \#<word>.
* Useful for programs (like Notion) that don't support #<word> tags.
*
* Obsidian #tag may contain
* - Alphanumeric chars
* - Any non-ASCII char (U+0080 and greater)
* - Forward slashes, hyphens, underscores
*
* Must contain at least one non-numeric char
* Full #tag regex is:
*
* /#\d*?(?:[-_/a-z]|[^\x00-\x7F])(?:[-/\w]|[^\x00-\x7F])*()/gi
*
* But only need up to first non-numeric char to match valid #tag:
*
* /#\d*?(?:[-_/a-z]|[^\x00-\x7F])/gi
*
* @todo Currently cannot ignore #s in multiline code/math blocks as this function parses one line at a time.
*/
export function escapeHashtags(body: string) {
const tagExp = /#[a-z0-9\-]+/gi;
const tagExp = /#\d*?(?:[-_/a-z]|[^\x00-\x7F])/gi;

if (!tagExp.test(body)) return body;
const lines = body.split('\n');
Expand All @@ -41,11 +61,11 @@ export function escapeHashtags(body: string) {
if (!hashtags) continue;
let newLine = lines[i];
for (let hashtag of hashtags) {
// skipping any internal links [[ # ]], URLS [ # ]() or []( # ), or already escaped hashtags \#, replace all tag-like things #<word> in the document with \#<word>. Useful for programs (like Notion) that don't support #<word> tags.
// skipping any internal links [[ # ]], URLS [ # ]() or []( # ),
// code ` # ` or already escaped hashtags \#
const hashtagInLink = new RegExp(
`\\[\\[[^\\]]*${hashtag}[^\\]]*\\]\\]|\\[[^\\]]*${hashtag}[^\\]]*\\]\\([^\\)]*\\)|\\[[^\\]]*\\]\\([^\\)]*${hashtag}[^\\)]*\\)|\\\\${hashtag}`
`\\[\\[[^\\]]*${hashtag}[^\\]]*\\]\\]|\\[[^\\]]*${hashtag}[^\\]]*\\]\\([^\\)]*\\)|\\[[^\\]]*\\]\\([^\\)]*${hashtag}[^\\)]*\\)|\\\\${hashtag}|\`[^\`]*${hashtag}[^\`]*\``
);

if (hashtagInLink.test(newLine)) continue;
newLine = newLine.replace(hashtag, '\\' + hashtag);
}
Expand Down
Binary file removed tests/notion/math-test.zip
Binary file not shown.
Binary file added tests/notion/notion-testspace.zip
Binary file not shown.

0 comments on commit 52cef2b

Please sign in to comment.