deprecate base_url parameter in extract_links() #121

Merged
merged 3 commits on Oct 22, 2024
courlan/core.py (25 changes: 16 additions & 9 deletions)
@@ -5,7 +5,6 @@
 # import locale
 import logging
 import re
-import warnings

 from typing import List, Optional, Set, Tuple
 from urllib.robotparser import RobotFileParser
@@ -136,15 +135,16 @@ def check_url(
 def extract_links(
     pagecontent: str,
     url: Optional[str] = None,
-    base_url: Optional[str] = None,
     external_bool: bool = False,
+    *,
     no_filter: bool = False,
     language: Optional[str] = None,
     strict: bool = True,
     trailing_slash: bool = True,
     with_nav: bool = False,
     redirects: bool = False,
     reference: Optional[str] = None,
+    base_url: Optional[str] = None,
 ) -> Set[str]:
     """Filter links in a HTML document using a series of heuristics
     Args:
@@ -167,17 +167,17 @@
         Nothing.
     """
     if base_url:
-        warnings.warn(
-            "'base_url' will soon be deprecated, use 'url'.", PendingDeprecationWarning
-        )
+        raise ValueError("'base_url' is deprecated, use 'url' instead.")

-    base_url = base_url or get_base_url(url)
+    base_url = get_base_url(url)
     url = url or base_url
     candidates, validlinks = set(), set()  # type: Set[str], Set[str]
     if not pagecontent:
         return validlinks
+
     # define host reference
     reference = reference or base_url
+
     # extract links
     for link in (m[0] for m in FIND_LINKS_REGEX.finditer(pagecontent)):
         if "rel" in link and "nofollow" in link:
@@ -196,6 +196,7 @@
             linkmatch = LINK_REGEX.search(link)
             if linkmatch:
                 candidates.add(linkmatch[1])
+
     # filter candidates
     for link in candidates:
         # repair using base
@@ -222,24 +223,29 @@
         if is_known_link(link, validlinks):
             continue
         validlinks.add(link)
-    # return
+
     LOGGER.info("%s links found – %s valid links", len(candidates), len(validlinks))
     return validlinks


 def filter_links(
     htmlstring: str,
     url: Optional[str],
-    base_url: Optional[str] = None,
+    *,
     lang: Optional[str] = None,
     rules: Optional[RobotFileParser] = None,
     external: bool = False,
     strict: bool = False,
     with_nav: bool = True,
+    base_url: Optional[str] = None,
 ) -> Tuple[List[str], List[str]]:
     "Find links in a HTML document, filter and prioritize them for crawling purposes."
+
+    if base_url:
+        raise ValueError("'base_url' is deprecated, use 'url' instead.")
+
     links, links_priority = [], []
-    url = url or base_url
+
     for link in extract_links(
         pagecontent=htmlstring,
         url=url,
@@ -258,4 +264,5 @@
             links_priority.append(link)
         else:
             links.append(link)
+
     return links, links_priority
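For downstream callers, the practical effect is that the old keyword now fails immediately instead of emitting a PendingDeprecationWarning. Below is a minimal migration sketch, assuming extract_links is importable from the package top level (as the test suite uses it) and using illustrative markup and URLs:

from courlan import extract_links

# illustrative page content
pagecontent = '<html><a href="https://example.org/page/1"/></html>'

# old call style, now rejected:
# extract_links(pagecontent, base_url="https://example.org")  # raises ValueError

# new call style: pass the document URL; the base URL is derived internally
links = extract_links(pagecontent, url="https://example.org/page/1")
print(links)

Keeping base_url as a trailing keyword-only parameter means existing keyword call sites fail with this explicit message rather than the generic TypeError they would get if the parameter were removed outright.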
tests/unit_tests.py (14 changes: 11 additions & 3 deletions)
@@ -862,7 +862,9 @@ def test_external():

 def test_extraction():
     """test link comparison in HTML"""
-    assert len(extract_links(None, "https://test.com/", False)) == 0
+    with pytest.raises(ValueError):
+        extract_links(None, base_url="https://test.com/", external_bool=False)
+    assert len(extract_links(None, url="https://test.com/", external_bool=False)) == 0
     assert len(extract_links("", "https://test.com/", False)) == 0
     # link known under another form
     pagecontent = '<html><a href="https://test.org/example"/><a href="https://test.org/example/&"/></html>'
@@ -933,7 +935,7 @@ def test_extraction():
         "https://httpbin.org/links/2/1",
     ]
     links = extract_links(
-        pagecontent, base_url="https://httpbin.org", external_bool=False, with_nav=True
+        pagecontent, url="https://httpbin.org", external_bool=False, with_nav=True
     )
     assert sorted(links) == [
         "https://httpbin.org/links/2/0",
@@ -1033,11 +1035,17 @@ def test_extraction():
         "https://test.com/example",
         "https://test.com/page/2",
     ]
+
     # link filtering
     base_url = "https://example.org"
     htmlstring = '<html><body><a href="https://example.org/page1"/><a href="https://example.org/page1/"/><a href="https://test.org/page1"/></body></html>'
-    links, links_priority = filter_links(htmlstring, base_url)
+
+    with pytest.raises(ValueError):
+        filter_links(htmlstring, url=None, base_url=base_url)
+
+    links, links_priority = filter_links(htmlstring, url=base_url)
     assert len(links) == 1 and not links_priority
+
     # link filtering with relative URLs
     url = "https://example.org/page1.html"
     htmlstring = '<html><body><a href="/subpage1"/><a href="/subpage1/"/><a href="https://test.org/page1"/></body></html>'
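filter_links follows the same pattern, as the updated test above exercises. A short sketch under the same assumptions (top-level import, illustrative HTML):

from courlan import filter_links

# illustrative page content
htmlstring = '<html><body><a href="https://example.org/page1"/><a href="https://test.org/page1"/></body></html>'

# old call style, now rejected:
# filter_links(htmlstring, url=None, base_url="https://example.org")  # raises ValueError

# new call style: pass the page URL directly
links, links_priority = filter_links(htmlstring, url="https://example.org")
print(links, links_priority)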