Skip to content

Commit

Permalink
When host_whitelist is empty, don't bother parsing the URLs in allow_…
Browse files Browse the repository at this point in the history
…embedded_url
  • Loading branch information
hroncok committed Oct 9, 2024
1 parent 88973ec commit bbfe14d
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 1 deletion.
2 changes: 2 additions & 0 deletions lxml_html_clean/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,8 @@ def allow_embedded_url(self, el, url):
"""
if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
return False
if not self.host_whitelist:
return False
parts = urlsplit(url)
if parts.scheme not in ('http', 'https'):
return False
Expand Down
11 changes: 10 additions & 1 deletion tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ def test_memory_usage_many_elements_with_long_tails(self):

self.assertTrue(mem < 10, f"Used {mem} MiB memory, expected at most 10 MiB")

def test_possibly_invalid_url(self):
def test_possibly_invalid_url_with_whitelist(self):
cleaner = Cleaner(host_whitelist=['google.com'])
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
Expand All @@ -359,3 +359,12 @@ def test_possibly_invalid_url(self):
self.assertIn("impossible to parse the hostname", str(w[-1].message))
self.assertNotIn("google.com", result)
self.assertNotIn("example.com", result)

def test_possibly_invalid_url_without_whitelist(self):
    """Check that a malformed iframe URL is dropped silently when no host whitelist is set.

    With a default ``Cleaner()`` (no ``host_whitelist`` argument), cleaning an
    iframe whose ``src`` has an ambiguous hostname must not emit any warning,
    and neither hostname from the URL may appear in the cleaned output.
    """
    cleaner = Cleaner()
    # Record every warning (none filtered out) so the count can be asserted.
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        result = cleaner.clean_html(r"<p><iframe src='http://example.com:\@google.com'> </iframe></p>")
        # assertEqual, not the deprecated assertEquals alias, which was
        # removed from unittest.TestCase in Python 3.12.
        self.assertEqual(len(w), 0)
        self.assertNotIn("google.com", result)
        self.assertNotIn("example.com", result)

0 comments on commit bbfe14d

Please sign in to comment.