Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

filters: simplify code #94

Merged
merged 4 commits into from
Apr 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 24 additions & 34 deletions courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
LOGGER = logging.getLogger(__name__)


PROTOCOLS = {"http", "https"}

# domain/host names
IP_SET = {
".",
Expand Down Expand Up @@ -51,27 +53,27 @@
re.IGNORECASE,
)

UNSUITABLE_DOMAIN = re.compile(r"[0-9]+\.")

# content filters
SITE_STRUCTURE = re.compile(
# wordpress
r"/(?:paged?|seite|search|suche|gall?er[a-z]{1,2}|labels|archives|uploads|modules|attachment|wp-admin|wp-content|wp-includes|wp-json|wp-themes|oembed)/|"
r"/(?:wp-(?:admin|content|includes|json|themes)|"
r"paged?|seite|search|suche|gall?er[a-z]{1,2}|labels|"
r"archives|uploads|modules|attachment|oembed)/|"
# wordpress + short URL
r"[/_-](?:tags?|schlagwort|[ck]ategor[a-z]{1,2}|[ck]at|auth?or|user)/[^/]+/?$|"
# mixed/blogspot
r"[^0-9]/[0-9]+/[0-9]+/$|[^0-9]/[0-9]{4}/$|"
# blogspot
r"_archive\.html$",
r"[^0-9]/[0-9]+/[0-9]+/$|[^0-9]/[0-9]{4}/$",
re.IGNORECASE,
)
FILE_TYPE = re.compile(
r"\.(atom|json|css|xml|js|jpg|jpeg|png|svg|gif|tiff|pdf|ogg|mp3|m4a|aac|avi|mp4|mov|web[mp]|flv|ico|pls|zip|tar|gz|iso|swf|woff|eot|ttf)\b|"
r"\.(atom|json|css|xml|js|jpg|jpeg|png|svg|gif|tiff|pdf|ogg|mp3|m4a|aac|"
r"avi|mp4|mov|web[mp]|flv|ico|pls|zip|tar|gz|iso|swf|woff|eot|ttf)\b|"
r"[/-](img|jpg|png)(\b|_)",
re.IGNORECASE,
) # (?=[&?])
ADULT_AND_VIDEOS = re.compile(
r"[/_-](?:bild-?kontakte|fick|gangbang|incest|live-?cams?|live-?chat|porno?|sexcam|sexyeroti[ck]|swinger|x{3})\b",
r"[/_-](?:bild-?kontakte|fick|gangbang|incest|live-?cams?|live-?chat|"
r"porno?|sexcam|sexyeroti[ck]|swinger|x{3})\b",
re.IGNORECASE,
)

Expand Down Expand Up @@ -132,15 +134,6 @@
".xml",
}

# territories whitelist
# see also: https://babel.pocoo.org/en/latest/api/languages.html
# get_official_languages('ch')
LANGUAGE_MAPPINGS = {
"de": {"at", "ch", "de", "li"}, # 'be', 'it'
"en": {"au", "ca", "en", "gb", "ie", "nz", "us"},
"fr": {"be", "ca", "ch", "fr", "tn"}, # , 'lu', ...
}


def basic_filter(url: str) -> bool:
"Filter URLs based on basic formal characteristics."
Expand All @@ -158,14 +151,15 @@ def domain_filter(domain: str) -> bool:
return True

# malformed domains
try:
if not VALID_DOMAIN.match(domain.encode("idna").decode("utf-8")):
if not VALID_DOMAIN.match(domain):
try:
if not VALID_DOMAIN.match(domain.encode("idna").decode("utf-8")):
return False
except UnicodeError:
return False
except UnicodeError:
return False

# unsuitable content
if UNSUITABLE_DOMAIN.match(domain) or FILE_TYPE.search(domain):
if domain.split(".")[0].isdigit() or FILE_TYPE.search(domain):
return False

# extensions
Expand Down Expand Up @@ -220,12 +214,11 @@ def lang_filter(
score = langcodes_score(language, occurrence, score)
# don't perform the test if there are too many candidates: > 2
# second test: prepended language cues
if strict and language in LANGUAGE_MAPPINGS:
if strict:
match = HOST_LANG_FILTER.match(url)
if match:
candidate = match[1].lower()
LOGGER.debug("candidate lang %s found in URL", candidate)
if candidate in LANGUAGE_MAPPINGS[language]:
if candidate == language:
score += 1
else:
score -= 1
Expand All @@ -244,8 +237,8 @@ def type_filter(url: str, strict: bool = False, with_nav: bool = False) -> bool:
"""Make sure the target URL is from a suitable type (HTML page with primarily text).
Strict: Try to filter out other document types, spam, video and adult websites."""
try:
# feeds
if url.endswith(("/feed", "/rss")):
# feeds + blogspot
if url.endswith(("/feed", "/rss", "_archive.html")):
raise ValueError
# website structure
if SITE_STRUCTURE.search(url) and (not with_nav or not is_navigation_page(url)):
Expand All @@ -265,19 +258,16 @@ def validate_url(url: Optional[str]) -> Tuple[bool, Any]:
parsed_url = urlsplit(url)
except ValueError:
return False, None
if not bool(parsed_url.scheme) or parsed_url.scheme not in (
"http",
"https",
):

if not bool(parsed_url.scheme) or parsed_url.scheme not in PROTOCOLS:
return False, None
# fmt: off

if len(parsed_url.netloc) < 5 or (
parsed_url.netloc.startswith("www.") # type: ignore
and len(parsed_url.netloc) < 8
):
return False, None
# fmt: on
# default

return True, parsed_url


Expand Down
3 changes: 3 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,9 @@ def test_spam_filter():

def test_type_filter():
assert type_filter("http://www.example.org/feed") is False
# wp
assert type_filter("http://www.example.org/wp-admin/") is False
assert type_filter("http://www.example.org/wp-includes/this") is False
# straight category
assert type_filter("http://www.example.org/category/123") is False
assert type_filter("http://www.example.org/product-category/123") is False
Expand Down
Loading