diff --git a/src/import/html_extract_text.cpp b/src/import/html_extract_text.cpp index 0ff74da7..908ad0a8 100644 --- a/src/import/html_extract_text.cpp +++ b/src/import/html_extract_text.cpp @@ -121,6 +121,12 @@ namespace lily_of_the_valley //------------------------------------------------------------------ void html_extract_text::parse_raw_text(const wchar_t* text, size_t textSize) { + /* Note about superscripts and subscripts. + Some pages apply super/subscript formatting to entire paragraphs to make them appear in + a smaller font, which is how browsers then render them. + There is no way to tell what the author's intent is, but a run of more than + four characters is probably not meant to be a super or subscript, so it is copied as-is. This limit applies to both superscripts and subscripts.*/ + constexpr size_t maxSubscriptLength{ 4 }; if (textSize > 0) { size_t currentStartPosition{ 0 }; @@ -344,14 +350,29 @@ namespace lily_of_the_valley if (m_superscript_stack > 0) { // convert what we can along the way - for (size_t i = 0; i < index; ++i) - { add_character(string_util::to_superscript(text[i])); } + if (index <= maxSubscriptLength) + { + for (size_t i = 0; i < index; ++i) + { add_character(string_util::to_superscript(text[i])); } + } + else + { + add_characters(text, index); + } } else if (m_subscript_stack > 0) { // convert what we can along the way - for (size_t i = 0; i < index; ++i) - { add_character(string_util::to_subscript(text[i])); } + // (if really a subscript) + if (index <= maxSubscriptLength) + { + for (size_t i = 0; i < index; ++i) + { add_character(string_util::to_subscript(text[i])); } + } + else + { + add_characters(text, index); + } } else { add_characters(text, index); } @@ -376,14 +397,29 @@ namespace lily_of_the_valley if (m_superscript_stack > 0) { // convert what we can along the way + if (textSize <= maxSubscriptLength) + { for (size_t i = 0; i < textSize; ++i) { add_character(string_util::to_superscript(text[i])); } + } + else + { + add_characters(text, textSize); + } } else if (m_subscript_stack > 0) { // 
convert what we can along the way - for (size_t i = 0; i < textSize; ++i) - { add_character(string_util::to_subscript(text[i])); } + // (if really a subscript) + if (textSize <= maxSubscriptLength) + { + for (size_t i = 0; i < textSize; ++i) + { add_character(string_util::to_subscript(text[i])); } + } + else + { + add_characters(text, textSize); + } } else { add_characters(text, textSize); } diff --git a/tests/htmlimporttests.cpp b/tests/htmlimporttests.cpp index d3bf52f5..f49cfe9f 100644 --- a/tests/htmlimporttests.cpp +++ b/tests/htmlimporttests.cpp @@ -191,6 +191,76 @@ TEST_CASE("Str Chr Not Quoted", "[html import]") } } +TEST_CASE("HTML parser subscripts", "[html import]") + { + SECTION("Superscript") + { + html_extract_text filter_html; + const wchar_t* text = L"H2O37i"; + std::wstring p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"H²O³⁷ⁱ") == p); + text = L"H2O37Zi"; + p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"H²O³⁷Zⁱ") == p); + } + SECTION("Subscript") + { + html_extract_text filter_html; + const wchar_t* text = L"H2O37h"; + std::wstring p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"H₂O₃₇ₕ") == p); + text = L"H2O37Zh"; + p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"H₂O₃₇Zₕ") == p); + } + SECTION("Not really a script") + { + html_extract_text filter_html; + const wchar_t* text = L"Hello22 some text Hello2"; + std::wstring p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"Hello22 some text Hello2") == p); + + text = L"Hello2"; + p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"Hello2") == p); + } + } + +TEST_CASE("HTML parser tags", "[html import]") + { + SECTION("Find Tag") + { + const wchar_t* text = L"body bgcolor=\'#FF0000\' color=\'#FF0000\'>there
world
!"; + CHECK(html_extract_text::find_tag(text, L"bgcolor", 7, false) == text+5); + CHECK(html_extract_text::find_tag(text, L"BGCOLOR", 7, false) == text+5); + CHECK(html_extract_text::find_tag(text, L"color", 5, false) == text+23); + CHECK(html_extract_text::find_tag(text, L"width", 5, false) == nullptr); + CHECK(html_extract_text::find_tag(nullptr, L"width", 5, false) == nullptr); + CHECK(html_extract_text::find_tag(text, nullptr, 0, false) == nullptr); + CHECK(html_extract_text::find_tag(text, L"body", 4, false) == text); + } + SECTION("Find Tag 2") + { + const wchar_t* text = L"body style=\"color=#FF0000 width=250\">there
world
!"; + CHECK(html_extract_text::find_tag(text, L"STYLE", 5, false) == text+5); + CHECK(html_extract_text::find_tag(text, L"color", 5, false) == nullptr); + CHECK(html_extract_text::find_tag(text, L"width", 5, false) == nullptr); + CHECK(html_extract_text::find_tag(nullptr, L"width", 5, false) == nullptr); + CHECK(html_extract_text::find_tag(text, nullptr, 0, false) == nullptr); + CHECK(html_extract_text::find_tag(text, L"body", 4, false) == text); + } + SECTION("Find Tag Quotable") + { + const wchar_t* text = L"body style=\"color=#FF0000 width=250\">there
world
!"; + CHECK(html_extract_text::find_tag(text, L"STYLE", 5, true) == text+5); + CHECK(html_extract_text::find_tag(text, L"color", 5, true) == text+12); + CHECK(html_extract_text::find_tag(text, L"width", 5, true) == text+26); + CHECK(html_extract_text::find_tag(nullptr, L"width", 5, true) == nullptr); + CHECK(html_extract_text::find_tag(text, nullptr, 0, true) == nullptr); + CHECK(html_extract_text::find_tag(text, L"body", 4, true) == text); + } + } + TEST_CASE("HTML Parser", "[html import]") { SECTION("Find Bookmark") @@ -345,26 +415,6 @@ TEST_CASE("HTML Parser", "[html import]") p = filter_html(text, std::wcslen(text), true, false); CHECK(std::wstring(L"hello\n\n\n\nItem 1:\tThe definition\n\n") == p); } - SECTION("Superscript") - { - html_extract_text filter_html; - const wchar_t* text = L"H2O37i"; - std::wstring p = filter_html(text, std::wcslen(text), true, false); - CHECK(std::wstring(L"H²O³⁷ⁱ") == p); - text = L"H2O37Zi"; - p = filter_html(text, std::wcslen(text), true, false); - CHECK(std::wstring(L"H²O³⁷Zⁱ") == p); - } - SECTION("Subscript") - { - html_extract_text filter_html; - const wchar_t* text = L"H2O37h"; - std::wstring p = filter_html(text, std::wcslen(text), true, false); - CHECK(std::wstring(L"H₂O₃₇ₕ") == p); - text = L"H2O37Zh"; - p = filter_html(text, std::wcslen(text), true, false); - CHECK(std::wstring(L"H₂O₃₇Zₕ") == p); - } SECTION("Table") { html_extract_text filter_html; @@ -462,37 +512,6 @@ TEST_CASE("HTML Parser", "[html import]") p = filter_html(text, std::wcslen(text), true, false); CHECK(std::wcscmp(p, L"A comprehensive (4 pages long) review of") == 0); } - SECTION("Find Tag") - { - const wchar_t* text = L"body bgcolor=\'#FF0000\' color=\'#FF0000\'>there
world
!"; - CHECK(html_extract_text::find_tag(text, L"bgcolor", 7, false) == text+5); - CHECK(html_extract_text::find_tag(text, L"BGCOLOR", 7, false) == text+5); - CHECK(html_extract_text::find_tag(text, L"color", 5, false) == text+23); - CHECK(html_extract_text::find_tag(text, L"width", 5, false) == nullptr); - CHECK(html_extract_text::find_tag(nullptr, L"width", 5, false) == nullptr); - CHECK(html_extract_text::find_tag(text, nullptr, 0, false) == nullptr); - CHECK(html_extract_text::find_tag(text, L"body", 4, false) == text); - } - SECTION("Find Tag 2") - { - const wchar_t* text = L"body style=\"color=#FF0000 width=250\">there
world
!"; - CHECK(html_extract_text::find_tag(text, L"STYLE", 5, false) == text+5); - CHECK(html_extract_text::find_tag(text, L"color", 5, false) == nullptr); - CHECK(html_extract_text::find_tag(text, L"width", 5, false) == nullptr); - CHECK(html_extract_text::find_tag(nullptr, L"width", 5, false) == nullptr); - CHECK(html_extract_text::find_tag(text, nullptr, 0, false) == nullptr); - CHECK(html_extract_text::find_tag(text, L"body", 4, false) == text); - } - SECTION("Find Tag Quotable") - { - const wchar_t* text = L"body style=\"color=#FF0000 width=250\">there
world
!"; - CHECK(html_extract_text::find_tag(text, L"STYLE", 5, true) == text+5); - CHECK(html_extract_text::find_tag(text, L"color", 5, true) == text+12); - CHECK(html_extract_text::find_tag(text, L"width", 5, true) == text+26); - CHECK(html_extract_text::find_tag(nullptr, L"width", 5, true) == nullptr); - CHECK(html_extract_text::find_tag(text, nullptr, 0, true) == nullptr); - CHECK(html_extract_text::find_tag(text, L"body", 4, true) == text); - } SECTION("Read Element As String") { const wchar_t* text = L"

My header

";