From 860c7fb122157af72baf6f8b121f228f1e39a6f1 Mon Sep 17 00:00:00 2001 From: Lumir Balhar <lbalhar@redhat.com> Date: Wed, 2 Oct 2024 16:22:47 +0200 Subject: [PATCH 1/2] Improve documentation about parsing URLs in lxml_html_clean. --- README.md | 4 ++++ docs/index.rst | 8 ++++++++ lxml_html_clean/clean.py | 3 +++ 3 files changed, 15 insertions(+) diff --git a/README.md b/README.md index b49ece6..7e4a931 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,10 @@ This project was initially a part of [lxml](https://github.com/lxml/lxml). Because HTML cleaner is designed as blocklist-based, many reports about possible security vulnerabilities were filed for lxml and that make the project problematic for security-sensitive environments. Therefore we decided to extract the problematic part to a separate project. +**Important**: the HTML Cleaner in ``lxml_html_clean`` is **not** considered appropriate **for security sensitive environments**. See e.g. [bleach](https://pypi.org/project/bleach/) for an alternative. + +This project uses functions from Python's `urllib.parse` for URL parsing which **do not validate inputs**. For more information on potential security risks, refer to the [URL parsing security](https://docs.python.org/3/library/urllib.parse.html#url-parsing-security) documentation. A maliciously crafted URL could potentially bypass the allowed hosts check in `Cleaner`. + ## Installation You can install this project directly via `pip install lxml_html_clean` or as an extra of lxml diff --git a/docs/index.rst b/docs/index.rst index 415dfd9..bbc9a55 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,6 +8,14 @@ This project was initially a part of `lxml <https://github.com/lxml/lxml>`_. Bec many reports about possible security vulnerabilities were filed for lxml and that make the project problematic for security-sensitive environments. Therefore we decided to extract the problematic part to a separate project. 
+**Important**: the HTML Cleaner in ``lxml_html_clean`` is **not** considered appropriate **for security sensitive environments**. +See e.g. `bleach <https://pypi.org/project/bleach/>`_ for an alternative. + +This project uses functions from Python's ``urllib.parse`` for URL parsing which **do not validate inputs**. +For more information on potential security risks, refer to the +`URL parsing security <https://docs.python.org/3/library/urllib.parse.html#url-parsing-security>`_ documentation. +A maliciously crafted URL could potentially bypass the allowed hosts check in ``Cleaner``. + Security -------- diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index 1c63cfa..31c4936 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -185,6 +185,9 @@ class Cleaner: Note that you may also need to set ``whitelist_tags``. + Note that URLs are parsed via functions from ``urllib.parse`` and + no input validation is performed. + ``whitelist_tags``: A set of tags that can be included with ``host_whitelist``. The default is ``iframe`` and ``embed``; you may wish to From 5cd2a104a338e9c11ae0e5d45acb01e9e6d9e1d2 Mon Sep 17 00:00:00 2001 From: Lumir Balhar <lbalhar@redhat.com> Date: Wed, 2 Oct 2024 16:43:06 +0200 Subject: [PATCH 2/2] Raise warning for unstable URL parsing `urllib.parse` does not perform any input validation so its output might be invalid as well. Because host_whitelist functionality relies on hostnames parsed from URLs, the result of `urlsplit` is newly compared with the result of simple regex parser and if they differ, a warning is raised. 
--- lxml_html_clean/__init__.pyi | 4 +++- lxml_html_clean/clean.py | 38 +++++++++++++++++++++++++++++++++++- lxml_html_clean/clean.pyi | 11 +++++++++++ tests/test_clean.py | 15 +++++++++++++- 4 files changed, 65 insertions(+), 3 deletions(-) diff --git a/lxml_html_clean/__init__.pyi b/lxml_html_clean/__init__.pyi index f42a110..07e344b 100644 --- a/lxml_html_clean/__init__.pyi +++ b/lxml_html_clean/__init__.pyi @@ -5,5 +5,7 @@ from .clean import ( autolink as autolink, autolink_html as autolink_html, word_break as word_break, - word_break_html as word_break_html + word_break_html as word_break_html, + LXMLHTMLCleanWarning as LXMLHTMLCleanWarning, + AmbiguousURLWarning as AmbiguousURLWarning, ) diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index 31c4936..b410382 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -10,6 +10,7 @@ import re from collections import deque from urllib.parse import urlsplit, unquote_plus +import warnings from lxml import etree from lxml.html import defs @@ -18,7 +19,7 @@ __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', - 'word_break', 'word_break_html'] + 'word_break', 'word_break_html', 'LXMLHTMLCleanWarning', 'AmbiguousURLWarning'] # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl # Particularly the CSS cleaning; most of the tag cleaning is integrated now @@ -100,6 +101,33 @@ def fromstring(string): return lxml_fromstring(_ascii_control_characters.sub("", string)) +# This regular expression is inspired by the one in urllib3. +_URI_RE = re.compile( + r"^(?:(?P<scheme>[a-zA-Z][a-zA-Z0-9+.-]*[a-zA-Z0-9]):)?" + r"(?://(?P<authority>[^\\/?#]*))?" + r"(?P<path>[^?#]*)" + r"(?:\?(?P<query>[^#]*))?" 
+ r"(?:#(?P<fragment>.*))?$", + re.UNICODE, +) + + +def _get_authority_from_url(url): + match = _URI_RE.match(url) + if match: + return match.group("authority") + else: + return None + + +class LXMLHTMLCleanWarning(Warning): + pass + + +class AmbiguousURLWarning(LXMLHTMLCleanWarning): + pass + + class Cleaner: """ Instances cleans the document of each of the possible offending @@ -499,6 +527,14 @@ def allow_embedded_url(self, el, url): parts = urlsplit(url) if parts.scheme not in ('http', 'https'): return False + + authority = _get_authority_from_url(url) + if (parts.netloc or authority) and parts.netloc != authority: + warnings.warn(f"It's impossible to parse the hostname from URL: '{url}'! " + "URL is not allowed because parsers returned ambiguous results.", + AmbiguousURLWarning) + return False + if parts.hostname in self.host_whitelist: return True return False diff --git a/lxml_html_clean/clean.pyi b/lxml_html_clean/clean.pyi index 4547afa..8f0707f 100644 --- a/lxml_html_clean/clean.pyi +++ b/lxml_html_clean/clean.pyi @@ -8,6 +8,17 @@ _DT = TypeVar("_DT", str, bytes, HtmlElement) _ET_DT = TypeVar("_ET_DT", str, bytes, HtmlElement, _ElementTree[HtmlElement]) +def _get_authority_from_url(url: str) -> str | None: ... 
+ + +class LXMLHTMLCleanWarning(Warning): + pass + + +class AmbiguousURLWarning(LXMLHTMLCleanWarning): + pass + + class Cleaner: @overload # allow_tags present, remove_unknown_tags must be False def __init__( diff --git a/tests/test_clean.py b/tests/test_clean.py index 701cab8..85692a1 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -2,9 +2,10 @@ import gzip import io import unittest +import warnings import lxml.html -from lxml_html_clean import Cleaner, clean_html +from lxml_html_clean import AmbiguousURLWarning, Cleaner, clean_html, LXMLHTMLCleanWarning from .utils import peak_memory_usage @@ -346,3 +347,15 @@ def test_memory_usage_many_elements_with_long_tails(self): mem = peak_memory_usage(cleaner.clean_html, html) self.assertTrue(mem < 10, f"Used {mem} MiB memory, expected at most 10 MiB") + + def test_possibly_invalid_url(self): + cleaner = Cleaner(host_whitelist=['google.com']) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = cleaner.clean_html(r"
<p><iframe src='https://example.com\@google.com'></iframe></p>") + self.assertGreaterEqual(len(w), 1) + self.assertIs(w[-1].category, AmbiguousURLWarning) + self.assertTrue(issubclass(w[-1].category, LXMLHTMLCleanWarning)) + self.assertIn("impossible to parse the hostname", str(w[-1].message)) + self.assertNotIn("google.com", result) + self.assertNotIn("example.com", result)