diff --git a/courlan/filters.py b/courlan/filters.py
index 5f1ea4d..ef39536 100644
--- a/courlan/filters.py
+++ b/courlan/filters.py
@@ -16,6 +16,8 @@
 
 LOGGER = logging.getLogger(__name__)
 
+PROTOCOLS = {"http", "https"}
+
 # domain/host names
 IP_SET = {
     ".",
@@ -51,27 +53,27 @@
     re.IGNORECASE,
 )
 
-UNSUITABLE_DOMAIN = re.compile(r"[0-9]+\.")
-
 # content filters
 SITE_STRUCTURE = re.compile(
     # wordpress
-    r"/(?:paged?|seite|search|suche|gall?er[a-z]{1,2}|labels|archives|uploads|modules|attachment|wp-admin|wp-content|wp-includes|wp-json|wp-themes|oembed)/|"
+    r"/(?:wp-(?:admin|content|includes|json|themes)|"
+    r"paged?|seite|search|suche|gall?er[a-z]{1,2}|labels|"
+    r"archives|uploads|modules|attachment|oembed)/|"
     # wordpress + short URL
     r"[/_-](?:tags?|schlagwort|[ck]ategor[a-z]{1,2}|[ck]at|auth?or|user)/[^/]+/?$|"
     # mixed/blogspot
-    r"[^0-9]/[0-9]+/[0-9]+/$|[^0-9]/[0-9]{4}/$|"
-    # blogspot
-    r"_archive\.html$",
+    r"[^0-9]/[0-9]+/[0-9]+/$|[^0-9]/[0-9]{4}/$",
     re.IGNORECASE,
 )
 
 FILE_TYPE = re.compile(
-    r"\.(atom|json|css|xml|js|jpg|jpeg|png|svg|gif|tiff|pdf|ogg|mp3|m4a|aac|avi|mp4|mov|web[mp]|flv|ico|pls|zip|tar|gz|iso|swf|woff|eot|ttf)\b|"
+    r"\.(atom|json|css|xml|js|jpg|jpeg|png|svg|gif|tiff|pdf|ogg|mp3|m4a|aac|"
+    r"avi|mp4|mov|web[mp]|flv|ico|pls|zip|tar|gz|iso|swf|woff|eot|ttf)\b|"
     r"[/-](img|jpg|png)(\b|_)",
     re.IGNORECASE,
 )  # (?=[&?])
 
 ADULT_AND_VIDEOS = re.compile(
-    r"[/_-](?:bild-?kontakte|fick|gangbang|incest|live-?cams?|live-?chat|porno?|sexcam|sexyeroti[ck]|swinger|x{3})\b",
+    r"[/_-](?:bild-?kontakte|fick|gangbang|incest|live-?cams?|live-?chat|"
+    r"porno?|sexcam|sexyeroti[ck]|swinger|x{3})\b",
     re.IGNORECASE,
 )
@@ -132,15 +134,6 @@
     ".xml",
 }
 
-# territories whitelist
-# see also: https://babel.pocoo.org/en/latest/api/languages.html
-# get_official_languages('ch')
-LANGUAGE_MAPPINGS = {
-    "de": {"at", "ch", "de", "li"},  # 'be', 'it'
-    "en": {"au", "ca", "en", "gb", "ie", "nz", "us"},
-    "fr": {"be", "ca", "ch", "fr", "tn"},  # , 'lu', ...
-}
-
 
 def basic_filter(url: str) -> bool:
     "Filter URLs based on basic formal characteristics."
@@ -158,14 +151,15 @@ def domain_filter(domain: str) -> bool:
         return True
 
     # malformed domains
-    try:
-        if not VALID_DOMAIN.match(domain.encode("idna").decode("utf-8")):
+    if not VALID_DOMAIN.match(domain):
+        try:
+            if not VALID_DOMAIN.match(domain.encode("idna").decode("utf-8")):
+                return False
+        except UnicodeError:
             return False
-    except UnicodeError:
-        return False
 
     # unsuitable content
-    if UNSUITABLE_DOMAIN.match(domain) or FILE_TYPE.search(domain):
+    if domain.split(".")[0].isdigit() or FILE_TYPE.search(domain):
         return False
 
     # extensions
@@ -220,12 +214,11 @@ def lang_filter(
         score = langcodes_score(language, occurrence, score)
     # don't perform the test if there are too many candidates: > 2
     # second test: prepended language cues
-    if strict and language in LANGUAGE_MAPPINGS:
+    if strict:
         match = HOST_LANG_FILTER.match(url)
         if match:
             candidate = match[1].lower()
-            LOGGER.debug("candidate lang %s found in URL", candidate)
-            if candidate in LANGUAGE_MAPPINGS[language]:
+            if candidate == language:
                 score += 1
             else:
                 score -= 1
@@ -244,8 +237,8 @@ def type_filter(url: str, strict: bool = False, with_nav: bool = False) -> bool:
     """Make sure the target URL is from a suitable type (HTML page with primarily text).
     Strict: Try to filter out other document types, spam, video and adult websites."""
     try:
-        # feeds
-        if url.endswith(("/feed", "/rss")):
+        # feeds + blogspot
+        if url.endswith(("/feed", "/rss", "_archive.html")):
             raise ValueError
         # website structure
         if SITE_STRUCTURE.search(url) and (not with_nav or not is_navigation_page(url)):
@@ -265,19 +258,16 @@ def validate_url(url: Optional[str]) -> Tuple[bool, Any]:
         parsed_url = urlsplit(url)
     except ValueError:
         return False, None
-    if not bool(parsed_url.scheme) or parsed_url.scheme not in (
-        "http",
-        "https",
-    ):
+
+    if not bool(parsed_url.scheme) or parsed_url.scheme not in PROTOCOLS:
        return False, None
-    # fmt: off
+
     if len(parsed_url.netloc) < 5 or (
         parsed_url.netloc.startswith("www.")  # type: ignore
         and len(parsed_url.netloc) < 8
     ):
         return False, None
-    # fmt: on
-    # default
+
     return True, parsed_url
 
 
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 6529557..307fa43 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -249,6 +249,9 @@ def test_spam_filter():
 
 def test_type_filter():
     assert type_filter("http://www.example.org/feed") is False
+    # wp
+    assert type_filter("http://www.example.org/wp-admin/") is False
+    assert type_filter("http://www.example.org/wp-includes/this") is False
     # straight category
     assert type_filter("http://www.example.org/category/123") is False
     assert type_filter("http://www.example.org/product-category/123") is False
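Not part of the patch: a minimal sanity-check snippet, assuming this branch of courlan is importable locally; the expected values simply restate the behaviour shown in the hunks above.

# hedged example, not included in the diff itself
from courlan.filters import domain_filter, type_filter, validate_url

# validate_url() now checks the scheme against the PROTOCOLS set
assert validate_url("https://www.example.org/page")[0] is True
assert validate_url("ftp://example.org/file") == (False, None)

# the UNSUITABLE_DOMAIN regex is replaced by a check on a purely numeric first label
assert domain_filter("123.example.org") is False
assert domain_filter("www.example.org") is True

# Blogspot archive pages are now rejected in type_filter() instead of SITE_STRUCTURE
assert type_filter("http://www.example.org/2021_01_01_archive.html") is False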