When host_whitelist is empty, don't bother parsing the URLs in allow_embedded_url
fedora-python · Oct 9, 2024 · bbfe14d · bbfe14d
1 parent 88973ec
commit bbfe14d
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 1 deletion.
diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py
@@ -524,6 +524,8 @@ def allow_embedded_url(self, el, url):
         """
         if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
             return False
+        if not self.host_whitelist:
+            return False
         parts = urlsplit(url)
         if parts.scheme not in ('http', 'https'):
             return False

diff --git a/tests/test_clean.py b/tests/test_clean.py
@@ -348,7 +348,7 @@ def test_memory_usage_many_elements_with_long_tails(self):
 
         self.assertTrue(mem < 10, f"Used {mem} MiB memory, expected at most 10 MiB")
 
-    def test_possibly_invalid_url(self):
+    def test_possibly_invalid_url_with_whitelist(self):
         cleaner = Cleaner(host_whitelist=['google.com'])
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter("always")
@@ -359,3 +359,12 @@ def test_possibly_invalid_url(self):
             self.assertIn("impossible to parse the hostname", str(w[-1].message))
         self.assertNotIn("google.com", result)
         self.assertNotIn("example.com", result)
+
+    def test_possibly_invalid_url_without_whitelist(self):
+        cleaner = Cleaner()  # default cleaner: no host_whitelist configured
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            result = cleaner.clean_html(r"<p><iframe src='http://example.com:\@google.com'>  </iframe></p>")
+            self.assertEqual(len(w), 0)  # URL is rejected before urlsplit, so no hostname-parse warning
+        self.assertNotIn("google.com", result)
+        self.assertNotIn("example.com", result)