diff --git a/CHANGES.rst b/CHANGES.rst index 8b78a93..8c4e986 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,6 +6,14 @@ lxml_html_clean changelog Unreleased ========== +0.3.1 (2024-10-09) +================== + +Features added +-------------- + +* Do not parse URL addresses when it is not necessary, i.e. when ``host_whitelist`` is empty. + 0.3.0 (2024-10-09) ================== diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index b410382..a71da81 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -524,6 +524,8 @@ def allow_embedded_url(self, el, url): """ if self.whitelist_tags is not None and el.tag not in self.whitelist_tags: return False + if not self.host_whitelist: + return False parts = urlsplit(url) if parts.scheme not in ('http', 'https'): return False diff --git a/setup.cfg b/setup.cfg index 71c787b..0fd6453 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = lxml_html_clean -version = 0.3.0 +version = 0.3.1 description = HTML cleaner from lxml project long_description = file:README.md long_description_content_type = text/markdown diff --git a/tests/test_clean.py b/tests/test_clean.py index 85692a1..8c9bc20 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -348,7 +348,7 @@ def test_memory_usage_many_elements_with_long_tails(self): self.assertTrue(mem < 10, f"Used {mem} MiB memory, expected at most 10 MiB") - def test_possibly_invalid_url(self): + def test_possibly_invalid_url_with_whitelist(self): cleaner = Cleaner(host_whitelist=['google.com']) with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") @@ -359,3 +359,12 @@ def test_possibly_invalid_url(self): self.assertIn("impossible to parse the hostname", str(w[-1].message)) self.assertNotIn("google.com", result) self.assertNotIn("example.com", result) + + def test_possibly_invalid_url_without_whitelist(self): + cleaner = Cleaner() + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = cleaner.clean_html(r"

") + self.assertEqual(len(w), 0) + self.assertNotIn("google.com", result) + self.assertNotIn("example.com", result)