Skip to content

Commit

Permalink
Ignore paragraphs that are wrapped in subscript (HTML importer)
Browse files Browse the repository at this point in the history
  • Loading branch information
Blake-Madden committed Dec 31, 2023
1 parent 0b747b8 commit 685892d
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 57 deletions.
48 changes: 42 additions & 6 deletions src/import/html_extract_text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,12 @@ namespace lily_of_the_valley
//------------------------------------------------------------------
void html_extract_text::parse_raw_text(const wchar_t* text, size_t textSize)
{
/* Note about superscripts and subscripts:
Some pages wrap entire paragraphs in these tags to make them appear in
a smaller font, which is how browsers then render them.
There is no way to tell what the author's intent was, but if the text is longer
than four characters, then it's probably not really meant to be a superscript or subscript. */
constexpr size_t maxSubscriptLength{ 4 };
if (textSize > 0)
{
size_t currentStartPosition{ 0 };
Expand Down Expand Up @@ -344,14 +350,29 @@ namespace lily_of_the_valley
if (m_superscript_stack > 0)
{
// convert what we can along the way
for (size_t i = 0; i < index; ++i)
{ add_character(string_util::to_superscript(text[i])); }
if (index <= maxSubscriptLength)
{
for (size_t i = 0; i < index; ++i)
{ add_character(string_util::to_superscript(text[i])); }
}
else
{
add_characters({ text, index });
}
}
else if (m_subscript_stack > 0)
{
// convert what we can along the way
for (size_t i = 0; i < index; ++i)
{ add_character(string_util::to_subscript(text[i])); }
// (if really a subscript)
if (index <= maxSubscriptLength)
{
for (size_t i = 0; i < index; ++i)
{ add_character(string_util::to_subscript(text[i])); }
}
else
{
add_characters({ text, index });
}
}
else
{ add_characters(text, index); }
Expand All @@ -376,14 +397,29 @@ namespace lily_of_the_valley
if (m_superscript_stack > 0)
{
// convert what we can along the way
if (textSize <= maxSubscriptLength)
{
for (size_t i = 0; i < textSize; ++i)
{ add_character(string_util::to_superscript(text[i])); }
}
else
{
add_characters({ text, textSize });
}
}
else if (m_subscript_stack > 0)
{
// convert what we can along the way
for (size_t i = 0; i < textSize; ++i)
{ add_character(string_util::to_subscript(text[i])); }
// (if really a subscript)
if (textSize <= maxSubscriptLength)
{
for (size_t i = 0; i < textSize; ++i)
{ add_character(string_util::to_subscript(text[i])); }
}
else
{
add_characters({ text, textSize });
}
}
else
{ add_characters(text, textSize); }
Expand Down
121 changes: 70 additions & 51 deletions tests/htmlimporttests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,76 @@ TEST_CASE("Str Chr Not Quoted", "[html import]")
}
}

// Checks how the HTML importer treats text wrapped in <sup>/<sub> elements:
// short runs are expected to come back as Unicode superscript/subscript
// characters, while longer runs are expected to come back as ordinary text.
TEST_CASE("HTML parser subscripts", "[html import]")
{
    SECTION("Superscript")
    {
        html_extract_text filter_html;
        // every character here has a superscript form
        const wchar_t* text = L"H<sup>2</sup>O<sup>37i</sup>";
        std::wstring p = filter_html(text, std::wcslen(text), true, false);
        CHECK(std::wstring(L"H²O³⁷ⁱ") == p);
        // 'Z' is expected to pass through unchanged while its neighbors are converted
        text = L"H<sup>2</sup>O<sup>37Zi</sup>";
        p = filter_html(text, std::wcslen(text), true, false);
        CHECK(std::wstring(L"H²O³⁷Zⁱ") == p);
    }
    SECTION("Subscript")
    {
        html_extract_text filter_html;
        // every character here has a subscript form
        const wchar_t* text = L"H<sub>2</sub>O<sub>37h</sub>";
        std::wstring p = filter_html(text, std::wcslen(text), true, false);
        CHECK(std::wstring(L"H₂O₃₇ₕ") == p);
        // 'Z' is expected to pass through unchanged while its neighbors are converted
        text = L"H<sub>2</sub>O<sub>37Zh</sub>";
        p = filter_html(text, std::wcslen(text), true, false);
        CHECK(std::wstring(L"H₂O₃₇Zₕ") == p);
    }
    SECTION("Not really a script")
    {
        html_extract_text filter_html;
        // Runs longer than four characters are expected to be left as regular text,
        // even though they contain digits that could otherwise be converted.
        // (The pointer must be to const: a string literal cannot bind to a
        // non-const wchar_t* in standard C++11 and later.)
        const wchar_t* text = L"<sub>Hello22</sub> some text <sub>Hello2</sub>";
        std::wstring p = filter_html(text, std::wcslen(text), true, false);
        CHECK(std::wstring(L"Hello22 some text Hello2") == p);

        text = L"<sup>Hello2</sup>";
        p = filter_html(text, std::wcslen(text), true, false);
        CHECK(std::wstring(L"Hello2") == p);
    }
}

// Exercises html_extract_text::find_tag(): each check searches the contents of
// an opening element for an attribute name and compares the returned pointer
// with the expected offset into the searched text (or nullptr when no match
// is expected).
TEST_CASE("HTML parser tags", "[html import]")
{
    SECTION("Find Tag")
    {
        const wchar_t* const markup =
            L"body bgcolor=\'#FF0000\' color=\'#FF0000\'>there<br />world<br >!";
        constexpr bool insideQuotes{ false };

        // the lookup is expected to be case insensitive
        CHECK(html_extract_text::find_tag(markup, L"bgcolor", 7, insideQuotes) == markup + 5);
        CHECK(html_extract_text::find_tag(markup, L"BGCOLOR", 7, insideQuotes) == markup + 5);
        // "color" must not match the tail end of "bgcolor"
        CHECK(html_extract_text::find_tag(markup, L"color", 5, insideQuotes) == markup + 23);
        // attribute that is not present
        CHECK(html_extract_text::find_tag(markup, L"width", 5, insideQuotes) == nullptr);
        // null arguments
        CHECK(html_extract_text::find_tag(nullptr, L"width", 5, insideQuotes) == nullptr);
        CHECK(html_extract_text::find_tag(markup, nullptr, 0, insideQuotes) == nullptr);
        // match at the very start of the text
        CHECK(html_extract_text::find_tag(markup, L"body", 4, insideQuotes) == markup);
    }
    SECTION("Find Tag 2")
    {
        const wchar_t* const markup =
            L"body style=\"color=#FF0000 width=250\">there<br />world<br >!";
        constexpr bool insideQuotes{ false };

        CHECK(html_extract_text::find_tag(markup, L"STYLE", 5, insideQuotes) == markup + 5);
        // with the flag off, names that only appear inside the quoted style value
        // are expected to be skipped
        CHECK(html_extract_text::find_tag(markup, L"color", 5, insideQuotes) == nullptr);
        CHECK(html_extract_text::find_tag(markup, L"width", 5, insideQuotes) == nullptr);
        // null arguments
        CHECK(html_extract_text::find_tag(nullptr, L"width", 5, insideQuotes) == nullptr);
        CHECK(html_extract_text::find_tag(markup, nullptr, 0, insideQuotes) == nullptr);
        CHECK(html_extract_text::find_tag(markup, L"body", 4, insideQuotes) == markup);
    }
    SECTION("Find Tag Quotable")
    {
        const wchar_t* const markup =
            L"body style=\"color=#FF0000 width=250\">there<br />world<br >!";
        constexpr bool insideQuotes{ true };

        CHECK(html_extract_text::find_tag(markup, L"STYLE", 5, insideQuotes) == markup + 5);
        // with the flag on, names inside the quoted style value are expected to be found
        CHECK(html_extract_text::find_tag(markup, L"color", 5, insideQuotes) == markup + 12);
        CHECK(html_extract_text::find_tag(markup, L"width", 5, insideQuotes) == markup + 26);
        // null arguments
        CHECK(html_extract_text::find_tag(nullptr, L"width", 5, insideQuotes) == nullptr);
        CHECK(html_extract_text::find_tag(markup, nullptr, 0, insideQuotes) == nullptr);
        CHECK(html_extract_text::find_tag(markup, L"body", 4, insideQuotes) == markup);
    }
}

TEST_CASE("HTML Parser", "[html import]")
{
SECTION("Find Bookmark")
Expand Down Expand Up @@ -345,26 +415,6 @@ TEST_CASE("HTML Parser", "[html import]")
p = filter_html(text, std::wcslen(text), true, false);
CHECK(std::wstring(L"hello\n\n\n\nItem 1:\tThe definition\n\n") == p);
}
SECTION("Superscript")
{
html_extract_text filter_html;
const wchar_t* text = L"H<sup>2</sup>O<sup>37i</sup>";
std::wstring p = filter_html(text, std::wcslen(text), true, false);
CHECK(std::wstring(L"H²O³⁷ⁱ") == p);
text = L"H<sup>2</sup>O<sup>37Zi</sup>";
p = filter_html(text, std::wcslen(text), true, false);
CHECK(std::wstring(L"H²O³⁷Zⁱ") == p);
}
SECTION("Subscript")
{
html_extract_text filter_html;
const wchar_t* text = L"H<sub>2</sub>O<sub>37h</sub>";
std::wstring p = filter_html(text, std::wcslen(text), true, false);
CHECK(std::wstring(L"H₂O₃₇ₕ") == p);
text = L"H<sub>2</sub>O<sub>37Zh</sub>";
p = filter_html(text, std::wcslen(text), true, false);
CHECK(std::wstring(L"H₂O₃₇Zₕ") == p);
}
SECTION("Table")
{
html_extract_text filter_html;
Expand Down Expand Up @@ -462,37 +512,6 @@ TEST_CASE("HTML Parser", "[html import]")
p = filter_html(text, std::wcslen(text), true, false);
CHECK(std::wcscmp(p, L"A comprehensive (4 pages long) review of") == 0);
}
SECTION("Find Tag")
{
const wchar_t* text = L"body bgcolor=\'#FF0000\' color=\'#FF0000\'>there<br />world<br >!";
CHECK(html_extract_text::find_tag(text, L"bgcolor", 7, false) == text+5);
CHECK(html_extract_text::find_tag(text, L"BGCOLOR", 7, false) == text+5);
CHECK(html_extract_text::find_tag(text, L"color", 5, false) == text+23);
CHECK(html_extract_text::find_tag(text, L"width", 5, false) == nullptr);
CHECK(html_extract_text::find_tag(nullptr, L"width", 5, false) == nullptr);
CHECK(html_extract_text::find_tag(text, nullptr, 0, false) == nullptr);
CHECK(html_extract_text::find_tag(text, L"body", 4, false) == text);
}
SECTION("Find Tag 2")
{
const wchar_t* text = L"body style=\"color=#FF0000 width=250\">there<br />world<br >!";
CHECK(html_extract_text::find_tag(text, L"STYLE", 5, false) == text+5);
CHECK(html_extract_text::find_tag(text, L"color", 5, false) == nullptr);
CHECK(html_extract_text::find_tag(text, L"width", 5, false) == nullptr);
CHECK(html_extract_text::find_tag(nullptr, L"width", 5, false) == nullptr);
CHECK(html_extract_text::find_tag(text, nullptr, 0, false) == nullptr);
CHECK(html_extract_text::find_tag(text, L"body", 4, false) == text);
}
SECTION("Find Tag Quotable")
{
const wchar_t* text = L"body style=\"color=#FF0000 width=250\">there<br />world<br >!";
CHECK(html_extract_text::find_tag(text, L"STYLE", 5, true) == text+5);
CHECK(html_extract_text::find_tag(text, L"color", 5, true) == text+12);
CHECK(html_extract_text::find_tag(text, L"width", 5, true) == text+26);
CHECK(html_extract_text::find_tag(nullptr, L"width", 5, true) == nullptr);
CHECK(html_extract_text::find_tag(text, nullptr, 0, true) == nullptr);
CHECK(html_extract_text::find_tag(text, L"body", 4, true) == text);
}
SECTION("Read Element As String")
{
const wchar_t* text = L"<h1>My header</H1>";
Expand Down

0 comments on commit 685892d

Please sign in to comment.