From 860c7fb122157af72baf6f8b121f228f1e39a6f1 Mon Sep 17 00:00:00 2001 From: Lumir Balhar <lbalhar@redhat.com> Date: Wed, 2 Oct 2024 16:22:47 +0200 Subject: [PATCH 1/2] Improve documentation about parsing URLs in lxml_html_clean. --- README.md | 4 ++++ docs/index.rst | 8 ++++++++ lxml_html_clean/clean.py | 3 +++ 3 files changed, 15 insertions(+) diff --git a/README.md b/README.md index b49ece6..7e4a931 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,10 @@ This project was initially a part of [lxml](https://github.com/lxml/lxml). Because HTML cleaner is designed as blocklist-based, many reports about possible security vulnerabilities were filed for lxml and that make the project problematic for security-sensitive environments. Therefore we decided to extract the problematic part to a separate project. +**Important**: the HTML Cleaner in ``lxml_html_clean`` is **not** considered appropriate **for security sensitive environments**. See e.g. [bleach](https://pypi.org/project/bleach/) for an alternative. + +This project uses functions from Python's `urllib.parse` for URL parsing which **do not validate inputs**. For more information on potential security risks, refer to the [URL parsing security](https://docs.python.org/3/library/urllib.parse.html#url-parsing-security) documentation. A maliciously crafted URL could potentially bypass the allowed hosts check in `Cleaner`. + ## Installation You can install this project directly via `pip install lxml_html_clean` or as an extra of lxml diff --git a/docs/index.rst b/docs/index.rst index 415dfd9..bbc9a55 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,6 +8,14 @@ This project was initially a part of `lxml <https://github.com/lxml/lxml>`_. Bec many reports about possible security vulnerabilities were filed for lxml and that make the project problematic for security-sensitive environments. Therefore we decided to extract the problematic part to a separate project. 
+**Important**: the HTML Cleaner in ``lxml_html_clean`` is **not** considered appropriate **for security sensitive environments**. +See e.g. `bleach <https://pypi.org/project/bleach/>`_ for an alternative. + +This project uses functions from Python's ``urllib.parse`` for URL parsing which **do not validate inputs**. +For more information on potential security risks, refer to the +`URL parsing security <https://docs.python.org/3/library/urllib.parse.html#url-parsing-security>`_ documentation. +A maliciously crafted URL could potentially bypass the allowed hosts check in ``Cleaner``. + Security -------- diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index 1c63cfa..31c4936 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -185,6 +185,9 @@ class Cleaner: Note that you may also need to set ``whitelist_tags``. + Note that URLs are parsed via functions from ``urllib.parse`` and + no input validation is performed. + ``whitelist_tags``: A set of tags that can be included with ``host_whitelist``. The default is ``iframe`` and ``embed``; you may wish to From 5cd2a104a338e9c11ae0e5d45acb01e9e6d9e1d2 Mon Sep 17 00:00:00 2001 From: Lumir Balhar <lbalhar@redhat.com> Date: Wed, 2 Oct 2024 16:43:06 +0200 Subject: [PATCH 2/2] Raise warning for unstable URL parsing `urllib.parse` does not perform any input validation so its output might be invalid as well. Because host_whitelist functionality relies on hostnames parsed from URLs, the result of `urlsplit` is newly compared with the result of simple regex parser and if they differ, a warning is raised. 
--- lxml_html_clean/__init__.pyi | 4 +++- lxml_html_clean/clean.py | 38 +++++++++++++++++++++++++++++++++++- lxml_html_clean/clean.pyi | 11 +++++++++++ tests/test_clean.py | 15 +++++++++++++- 4 files changed, 65 insertions(+), 3 deletions(-) diff --git a/lxml_html_clean/__init__.pyi b/lxml_html_clean/__init__.pyi index f42a110..07e344b 100644 --- a/lxml_html_clean/__init__.pyi +++ b/lxml_html_clean/__init__.pyi @@ -5,5 +5,7 @@ from .clean import ( autolink as autolink, autolink_html as autolink_html, word_break as word_break, - word_break_html as word_break_html + word_break_html as word_break_html, + LXMLHTMLCleanWarning as LXMLHTMLCleanWarning, + AmbiguousURLWarning as AmbiguousURLWarning, ) diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index 31c4936..b410382 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -10,6 +10,7 @@ import re from collections import deque from urllib.parse import urlsplit, unquote_plus +import warnings from lxml import etree from lxml.html import defs @@ -18,7 +19,7 @@ __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', - 'word_break', 'word_break_html'] + 'word_break', 'word_break_html', 'LXMLHTMLCleanWarning', 'AmbiguousURLWarning'] # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl # Particularly the CSS cleaning; most of the tag cleaning is integrated now @@ -100,6 +101,33 @@ def fromstring(string): return lxml_fromstring(_ascii_control_characters.sub("", string)) +# This regular expression is inspired by the one in urllib3. +_URI_RE = re.compile( + r"^(?:(?P<scheme>[a-zA-Z][a-zA-Z0-9+.-]*[a-zA-Z0-9]):)?" + r"(?://(?P<authority>[^\\/?#]*))?" + r"(?P<path>[^?#]*)" + r"(?:\?(?P<query>[^#]*))?" 
+ r"(?:#(?P<fragment>.*))?$", + re.UNICODE, +) + + +def _get_authority_from_url(url): + match = _URI_RE.match(url) + if match: + return match.group("authority") + else: + return None + + +class LXMLHTMLCleanWarning(Warning): + pass + + +class AmbiguousURLWarning(LXMLHTMLCleanWarning): + pass + + class Cleaner: """ Instances cleans the document of each of the possible offending @@ -499,6 +527,14 @@ def allow_embedded_url(self, el, url): parts = urlsplit(url) if parts.scheme not in ('http', 'https'): return False + + authority = _get_authority_from_url(url) + if (parts.netloc or authority) and parts.netloc != authority: + warnings.warn(f"It's impossible to parse the hostname from URL: '{url}'! " + "URL is not allowed because parsers returned ambiguous results.", + AmbiguousURLWarning) + return False + if parts.hostname in self.host_whitelist: return True return False diff --git a/lxml_html_clean/clean.pyi b/lxml_html_clean/clean.pyi index 4547afa..8f0707f 100644 --- a/lxml_html_clean/clean.pyi +++ b/lxml_html_clean/clean.pyi @@ -8,6 +8,17 @@ _DT = TypeVar("_DT", str, bytes, HtmlElement) _ET_DT = TypeVar("_ET_DT", str, bytes, HtmlElement, _ElementTree[HtmlElement]) +def _get_authority_from_url(url: str) -> str | None: ... 
+ + +class LXMLHTMLCleanWarning(Warning): + pass + + +class AmbiguousURLWarning(LXMLHTMLCleanWarning): + pass + + class Cleaner: @overload # allow_tags present, remove_unknown_tags must be False def __init__( diff --git a/tests/test_clean.py b/tests/test_clean.py index 701cab8..85692a1 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -2,9 +2,10 @@ import gzip import io import unittest +import warnings import lxml.html -from lxml_html_clean import Cleaner, clean_html +from lxml_html_clean import AmbiguousURLWarning, Cleaner, clean_html, LXMLHTMLCleanWarning from .utils import peak_memory_usage @@ -346,3 +347,15 @@ def test_memory_usage_many_elements_with_long_tails(self): mem = peak_memory_usage(cleaner.clean_html, html) self.assertTrue(mem < 10, f"Used {mem} MiB memory, expected at most 10 MiB") + + def test_possibly_invalid_url(self): + cleaner = Cleaner(host_whitelist=['google.com']) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = cleaner.clean_html(r"
<p><iframe src='https://example.com\@google.com'></iframe></p>") + self.assertGreaterEqual(len(w), 1) + self.assertIs(w[-1].category, AmbiguousURLWarning) + self.assertTrue(issubclass(w[-1].category, LXMLHTMLCleanWarning)) + self.assertIn("impossible to parse the hostname", str(w[-1].message)) + self.assertNotIn("google.com", result) + self.assertNotIn("example.com", result)