Skip to content

Commit

Permalink
remove superfluous lines
Browse files: browse the repository at this point in the history
  • Loading branch information
adbar committed Oct 22, 2024
1 parent 52b4042 commit 5444d1d
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions courlan/core.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -169,13 +169,15 @@ def extract_links(
if base_url:
raise ValueError("'base_url' is deprecated, use 'url' instead.")

base_url = base_url or get_base_url(url)
base_url = get_base_url(url)
url = url or base_url
candidates, validlinks = set(), set() # type: Set[str], Set[str]
if not pagecontent:
return validlinks

# define host reference
reference = reference or base_url

# extract links
for link in (m[0] for m in FIND_LINKS_REGEX.finditer(pagecontent)):
if "rel" in link and "nofollow" in link:
Expand All @@ -194,6 +196,7 @@ def extract_links(
linkmatch = LINK_REGEX.search(link)
if linkmatch:
candidates.add(linkmatch[1])

# filter candidates
for link in candidates:
# repair using base
Expand All @@ -220,7 +223,7 @@ def extract_links(
if is_known_link(link, validlinks):
continue
validlinks.add(link)
# return

LOGGER.info("%s links found – %s valid links", len(candidates), len(validlinks))
return validlinks

Expand All @@ -242,7 +245,7 @@ def filter_links(
raise ValueError("'base_url' is deprecated, use 'url' instead.")

links, links_priority = [], []
url = url

for link in extract_links(
pagecontent=htmlstring,
url=url,
Expand All @@ -261,4 +264,5 @@ def filter_links(
links_priority.append(link)
else:
links.append(link)

return links, links_priority

0 comments on commit 5444d1d

Please sign in to comment.