diff --git a/src/import/html_extract_text.cpp b/src/import/html_extract_text.cpp
index 0ff74da7..908ad0a8 100644
--- a/src/import/html_extract_text.cpp
+++ b/src/import/html_extract_text.cpp
@@ -121,6 +121,12 @@ namespace lily_of_the_valley
//------------------------------------------------------------------
void html_extract_text::parse_raw_text(const wchar_t* text, size_t textSize)
{
+ /* Note about superscripts and subscripts:
+ Some pages apply these to entire paragraphs to make them appear in
+ a smaller font, which is how browsers then render them.
+ There is no way to tell what the author's intent was, but if a run is longer than
+ four characters, then it's probably not really meant to be a superscript or subscript.*/
+ constexpr size_t maxSubscriptLength{ 4 };
if (textSize > 0)
{
size_t currentStartPosition{ 0 };
@@ -344,14 +350,29 @@ namespace lily_of_the_valley
if (m_superscript_stack > 0)
{
// convert what we can along the way
- for (size_t i = 0; i < index; ++i)
- { add_character(string_util::to_superscript(text[i])); }
+ if (index <= maxSubscriptLength)
+ {
+ for (size_t i = 0; i < index; ++i)
+ { add_character(string_util::to_superscript(text[i])); }
+ }
+ else
+ {
+ add_characters({ text, index });
+ }
}
else if (m_subscript_stack > 0)
{
// convert what we can along the way
- for (size_t i = 0; i < index; ++i)
- { add_character(string_util::to_subscript(text[i])); }
+ // (if really a subscript)
+ if (index <= maxSubscriptLength)
+ {
+ for (size_t i = 0; i < index; ++i)
+ { add_character(string_util::to_subscript(text[i])); }
+ }
+ else
+ {
+ add_characters({ text, index });
+ }
}
else
{ add_characters(text, index); }
@@ -376,14 +397,29 @@ namespace lily_of_the_valley
if (m_superscript_stack > 0)
{
// convert what we can along the way
+ if (textSize <= maxSubscriptLength)
+ {
for (size_t i = 0; i < textSize; ++i)
{ add_character(string_util::to_superscript(text[i])); }
+ }
+ else
+ {
+ add_characters({ text, textSize });
+ }
}
else if (m_subscript_stack > 0)
{
// convert what we can along the way
- for (size_t i = 0; i < textSize; ++i)
- { add_character(string_util::to_subscript(text[i])); }
+ // (if really a subscript)
+ if (textSize <= maxSubscriptLength)
+ {
+ for (size_t i = 0; i < textSize; ++i)
+ { add_character(string_util::to_subscript(text[i])); }
+ }
+ else
+ {
+ add_characters({ text, textSize });
+ }
}
else
{ add_characters(text, textSize); }
diff --git a/tests/htmlimporttests.cpp b/tests/htmlimporttests.cpp
index d3bf52f5..f49cfe9f 100644
--- a/tests/htmlimporttests.cpp
+++ b/tests/htmlimporttests.cpp
@@ -191,6 +191,76 @@ TEST_CASE("Str Chr Not Quoted", "[html import]")
}
}
+TEST_CASE("HTML parser subscripts", "[html import]") // short script runs are converted, long ones pass through
+ {
+ SECTION("Superscript")
+ {
+ html_extract_text filter_html;
+ const wchar_t* text = L"H2O37i";
+ std::wstring p = filter_html(text, std::wcslen(text), true, false);
+ CHECK(std::wstring(L"H²O³⁷ⁱ") == p); // expected output uses Unicode superscript characters
+ text = L"H2O37Zi";
+ p = filter_html(text, std::wcslen(text), true, false);
+ CHECK(std::wstring(L"H²O³⁷Zⁱ") == p); // Z is expected to remain a plain letter
+ }
+ SECTION("Subscript")
+ {
+ html_extract_text filter_html;
+ const wchar_t* text = L"H2O37h";
+ std::wstring p = filter_html(text, std::wcslen(text), true, false);
+ CHECK(std::wstring(L"H₂O₃₇ₕ") == p); // expected output uses Unicode subscript characters
+ text = L"H2O37Zh";
+ p = filter_html(text, std::wcslen(text), true, false);
+ CHECK(std::wstring(L"H₂O₃₇Zₕ") == p); // Z is expected to remain a plain letter
+ }
+ SECTION("Not really a script") // the text is expected to come back unchanged
+ {
+ html_extract_text filter_html;
+ const wchar_t* text = L"Hello22 some text Hello2"; // string literals are const; pointer is const like in the other sections
+ std::wstring p = filter_html(text, std::wcslen(text), true, false);
+ CHECK(std::wstring(L"Hello22 some text Hello2") == p);
+
+ text = L"Hello2";
+ p = filter_html(text, std::wcslen(text), true, false);
+ CHECK(std::wstring(L"Hello2") == p);
+ }
+ }
+
+TEST_CASE("HTML parser tags", "[html import]") // exercises html_extract_text::find_tag
+ {
+ SECTION("Find Tag") // last argument is false in every call of this section
+ {
+ const wchar_t* text = L"body bgcolor=\'#FF0000\' color=\'#FF0000\'>there
world
!";
+ CHECK(html_extract_text::find_tag(text, L"bgcolor", 7, false) == text+5); // bgcolor starts at offset 5
+ CHECK(html_extract_text::find_tag(text, L"BGCOLOR", 7, false) == text+5); // upper-case name is expected to match too
+ CHECK(html_extract_text::find_tag(text, L"color", 5, false) == text+23); // not the color embedded in bgcolor at offset 7
+ CHECK(html_extract_text::find_tag(text, L"width", 5, false) == nullptr); // absent name yields nullptr
+ CHECK(html_extract_text::find_tag(nullptr, L"width", 5, false) == nullptr); // null text yields nullptr
+ CHECK(html_extract_text::find_tag(text, nullptr, 0, false) == nullptr); // null name yields nullptr
+ CHECK(html_extract_text::find_tag(text, L"body", 4, false) == text); // leading element name is found at offset 0
+ }
+ SECTION("Find Tag 2") // names that occur only inside the quoted style value
+ {
+ const wchar_t* text = L"body style=\"color=#FF0000 width=250\">there
world
!";
+ CHECK(html_extract_text::find_tag(text, L"STYLE", 5, false) == text+5); // style starts at offset 5
+ CHECK(html_extract_text::find_tag(text, L"color", 5, false) == nullptr); // occurrence inside the quotes is not reported
+ CHECK(html_extract_text::find_tag(text, L"width", 5, false) == nullptr); // occurrence inside the quotes is not reported
+ CHECK(html_extract_text::find_tag(nullptr, L"width", 5, false) == nullptr); // null text yields nullptr
+ CHECK(html_extract_text::find_tag(text, nullptr, 0, false) == nullptr); // null name yields nullptr
+ CHECK(html_extract_text::find_tag(text, L"body", 4, false) == text); // leading element name is found at offset 0
+ }
+ SECTION("Find Tag Quotable") // same text as above, last argument is true
+ {
+ const wchar_t* text = L"body style=\"color=#FF0000 width=250\">there
world
!";
+ CHECK(html_extract_text::find_tag(text, L"STYLE", 5, true) == text+5); // style starts at offset 5
+ CHECK(html_extract_text::find_tag(text, L"color", 5, true) == text+12); // now found inside the quotes, at offset 12
+ CHECK(html_extract_text::find_tag(text, L"width", 5, true) == text+26); // now found inside the quotes, at offset 26
+ CHECK(html_extract_text::find_tag(nullptr, L"width", 5, true) == nullptr); // null text yields nullptr
+ CHECK(html_extract_text::find_tag(text, nullptr, 0, true) == nullptr); // null name yields nullptr
+ CHECK(html_extract_text::find_tag(text, L"body", 4, true) == text); // leading element name is found at offset 0
+ }
+ }
+
TEST_CASE("HTML Parser", "[html import]")
{
SECTION("Find Bookmark")
@@ -345,26 +415,6 @@ TEST_CASE("HTML Parser", "[html import]")
p = filter_html(text, std::wcslen(text), true, false);
CHECK(std::wstring(L"hello\n\n\n\nItem 1:\tThe definition\n\n") == p);
}
- SECTION("Superscript")
- {
- html_extract_text filter_html;
- const wchar_t* text = L"H2O37i";
- std::wstring p = filter_html(text, std::wcslen(text), true, false);
- CHECK(std::wstring(L"H²O³⁷ⁱ") == p);
- text = L"H2O37Zi";
- p = filter_html(text, std::wcslen(text), true, false);
- CHECK(std::wstring(L"H²O³⁷Zⁱ") == p);
- }
- SECTION("Subscript")
- {
- html_extract_text filter_html;
- const wchar_t* text = L"H2O37h";
- std::wstring p = filter_html(text, std::wcslen(text), true, false);
- CHECK(std::wstring(L"H₂O₃₇ₕ") == p);
- text = L"H2O37Zh";
- p = filter_html(text, std::wcslen(text), true, false);
- CHECK(std::wstring(L"H₂O₃₇Zₕ") == p);
- }
SECTION("Table")
{
html_extract_text filter_html;
@@ -462,37 +512,6 @@ TEST_CASE("HTML Parser", "[html import]")
p = filter_html(text, std::wcslen(text), true, false);
CHECK(std::wcscmp(p, L"A comprehensive (4 pages long) review of") == 0);
}
- SECTION("Find Tag")
- {
- const wchar_t* text = L"body bgcolor=\'#FF0000\' color=\'#FF0000\'>there
world
!";
- CHECK(html_extract_text::find_tag(text, L"bgcolor", 7, false) == text+5);
- CHECK(html_extract_text::find_tag(text, L"BGCOLOR", 7, false) == text+5);
- CHECK(html_extract_text::find_tag(text, L"color", 5, false) == text+23);
- CHECK(html_extract_text::find_tag(text, L"width", 5, false) == nullptr);
- CHECK(html_extract_text::find_tag(nullptr, L"width", 5, false) == nullptr);
- CHECK(html_extract_text::find_tag(text, nullptr, 0, false) == nullptr);
- CHECK(html_extract_text::find_tag(text, L"body", 4, false) == text);
- }
- SECTION("Find Tag 2")
- {
- const wchar_t* text = L"body style=\"color=#FF0000 width=250\">there
world
!";
- CHECK(html_extract_text::find_tag(text, L"STYLE", 5, false) == text+5);
- CHECK(html_extract_text::find_tag(text, L"color", 5, false) == nullptr);
- CHECK(html_extract_text::find_tag(text, L"width", 5, false) == nullptr);
- CHECK(html_extract_text::find_tag(nullptr, L"width", 5, false) == nullptr);
- CHECK(html_extract_text::find_tag(text, nullptr, 0, false) == nullptr);
- CHECK(html_extract_text::find_tag(text, L"body", 4, false) == text);
- }
- SECTION("Find Tag Quotable")
- {
- const wchar_t* text = L"body style=\"color=#FF0000 width=250\">there
world
!";
- CHECK(html_extract_text::find_tag(text, L"STYLE", 5, true) == text+5);
- CHECK(html_extract_text::find_tag(text, L"color", 5, true) == text+12);
- CHECK(html_extract_text::find_tag(text, L"width", 5, true) == text+26);
- CHECK(html_extract_text::find_tag(nullptr, L"width", 5, true) == nullptr);
- CHECK(html_extract_text::find_tag(text, nullptr, 0, true) == nullptr);
- CHECK(html_extract_text::find_tag(text, L"body", 4, true) == text);
- }
SECTION("Read Element As String")
{
const wchar_t* text = L"