UrlStore: use bytes instead of str for url paths #88

Merged 9 commits on May 30, 2024
Changes from 1 commit
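
At a glance, the commit moves the internal urldict keys from str to bytes while keeping the public interface string-based: domains are encoded to UTF-8 at the boundary before being used as dictionary keys, and decoded again wherever full URLs are handed back to callers. A minimal sketch of that boundary pattern (illustrative only, the class and method names below are made up and are not part of courlan):

# Hypothetical sketch of the encode/decode boundary adopted in this commit
from collections import defaultdict
from typing import DefaultDict, List

class _BytesKeyedStore:
    def __init__(self) -> None:
        # keys become bytes, e.g. b"https://example.org"
        self.urldict: DefaultDict[bytes, List[str]] = defaultdict(list)

    def add(self, domain: str, path: str) -> None:
        # encode once at the boundary, keep str paths as before
        self.urldict[domain.encode("utf-8")].append(path)

    def known_urls(self, domain: str) -> List[str]:
        key = domain.encode("utf-8")
        # decode the key again when rebuilding full URLs for callers
        return [key.decode("utf-8") + p for p in self.urldict[key]]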
2 changes: 1 addition & 1 deletion courlan/sampling.py
@@ -42,7 +42,7 @@ def _make_sample(
mysample = sorted(sample(urlpaths, k=samplesize))
else:
mysample = urlpaths
output_urls.extend([domain + p for p in mysample])
output_urls.extend([domain.decode("utf-8") + p for p in mysample])
LOGGER.debug(
"%s\t\turls: %s\tprop.: %s",
domain,
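Side note on the sampling change above: once the domain keys are bytes, they can no longer be concatenated directly with the str url paths, which is why _make_sample decodes the domain first. A quick illustration of the underlying Python behaviour:

domain = b"https://example.org"   # key type after this change
path = "/page"                    # url paths remain str
# domain + path would raise TypeError: can't concat str to bytes
url = domain.decode("utf-8") + path
print(url)  # https://example.org/page
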
88 changes: 48 additions & 40 deletions courlan/urlstore.py
@@ -91,7 +91,7 @@ def __init__(
self.language: Optional[str] = language
self.strict: bool = strict
self.trailing_slash: bool = trailing
self.urldict: DefaultDict[str, DomainEntry] = defaultdict(DomainEntry)
self.urldict: DefaultDict[bytes, DomainEntry] = defaultdict(DomainEntry)
self._lock: Lock = Lock()

def dump_unvisited_urls(num: Any, frame: Any) -> None:
@@ -141,33 +141,35 @@ def _buffer_urls(
LOGGER.warning("Discarding URL: %s", url)
return inputdict

def _load_urls(self, domain: str) -> Deque[UrlPathTuple]:
def _load_urls(self, domain: bytes) -> Deque[UrlPathTuple]:
#domain = domain.encode("utf-8")
if domain in self.urldict:
if self.compressed:
return pickle.loads(bz2.decompress(self.urldict[domain].tuples)) # type: ignore
return self.urldict[domain].tuples
return deque()

def _set_done(self) -> None:
if not self.done and all(self.is_exhausted_domain(d) for d in self.urldict):
if not self.done and all(self.is_exhausted_domain(d.decode("utf-8")) for d in self.urldict):
with self._lock:
self.done = True

def _store_urls(
self,
domain: str,
domain: bytes,
to_right: Optional[Deque[UrlPathTuple]] = None,
timestamp: Optional[datetime] = None,
to_left: Optional[Deque[UrlPathTuple]] = None,
) -> None:

# http/https switch
if domain.startswith("http://"):
candidate = "https" + domain[4:]
if domain.startswith(b"http://"):
candidate = b"https" + domain[4:]
# switch
if candidate in self.urldict:
domain = candidate
elif domain.startswith("https://"):
candidate = "http" + domain[5:]
elif domain.startswith(b"https://"):
candidate = b"http" + domain[5:]
# replace entry
if candidate in self.urldict:
self.urldict[domain] = self.urldict[candidate]
@@ -213,16 +215,17 @@ def _search_urls(
self, urls: List[str], switch: Optional[int] = None
) -> List[Union[Any, str]]:
# init
last_domain: Optional[str] = None
last_domain: Optional[bytes] = None
known_paths: Dict[str, Optional[bool]] = {}
remaining_urls = {u: None for u in urls}
# iterate
for url in sorted(remaining_urls):
hostinfo, urlpath = get_host_and_path(url)
bytesdom = hostinfo.encode("utf-8")
# examine domain
if hostinfo != last_domain:
last_domain = hostinfo
known_paths = {u.urlpath: u.visited for u in self._load_urls(hostinfo)}
if bytesdom != last_domain:
last_domain = bytesdom
known_paths = {u.urlpath: u.visited for u in self._load_urls(bytesdom)}
# run checks: case 1: the path matches, case 2: visited URL
if urlpath in known_paths and (
switch == 1 or (switch == 2 and known_paths[urlpath])
@@ -231,7 +234,7 @@
# preserve input order
return list(remaining_urls)

def _timestamp(self, domain: str) -> Optional[datetime]:
def _timestamp(self, domain: bytes) -> Optional[datetime]:
return self.urldict[domain].timestamp

# ADDITIONS AND DELETIONS
@@ -247,10 +250,10 @@ def add_urls(
specify if the URLs have already been visited."""
if urls:
for host, urltuples in self._buffer_urls(urls, visited).items():
self._store_urls(host, to_right=urltuples)
self._store_urls(host.encode("utf-8"), to_right=urltuples)
if appendleft:
for host, urltuples in self._buffer_urls(appendleft, visited).items():
self._store_urls(host, to_left=urltuples)
self._store_urls(host.encode("utf-8"), to_left=urltuples)

def add_from_html(
self,
@@ -278,7 +281,7 @@ def add_from_html(
def discard(self, domains: List[str]) -> None:
"Declare domains void and prune the store."
with self._lock:
for d in domains:
for d in (dom.encode("utf-8") for dom in domains):
self.urldict[d] = DomainEntry()
self.urldict[d].state = State.BUSTED
self._set_done()
@@ -297,22 +300,23 @@ def reset(self) -> None:

def get_known_domains(self) -> List[str]:
"Return all known domains as a list."
return list(self.urldict)
return [d.decode("utf-8") for d in self.urldict.keys()]

def get_unvisited_domains(self) -> List[str]:
"""Find all domains for which there are unvisited URLs
and potentially adjust done meta-information."""
unvisited = []
if not self.done:
unvisited = [d for d in self.urldict if not self.is_exhausted_domain(d)]
unvisited = [d for d in self.urldict if self.urldict[d].state == State.OPEN]
if not unvisited:
self._set_done()
return unvisited
return [d.decode("utf-8") for d in unvisited]

def is_exhausted_domain(self, domain: str) -> bool:
"Tell if all known URLs for the website have been visited."
if domain in self.urldict:
return self.urldict[domain].state in (State.ALL_VISITED, State.BUSTED)
test_domain = domain.encode("utf-8")
if test_domain in self.urldict:
return self.urldict[test_domain].state in (State.ALL_VISITED, State.BUSTED)
return False
# raise KeyError("website not in store")

@@ -324,13 +328,14 @@ def unvisited_websites_number(self) -> int:

def find_known_urls(self, domain: str) -> List[str]:
"""Get all already known URLs for the given domain (ex. "https://example.org")."""
return [domain + u.urlpath for u in self._load_urls(domain)]
test_domain = domain.encode("utf-8")
return [test_domain.decode("utf-8") + u.urlpath for u in self._load_urls(test_domain)]

def find_unvisited_urls(self, domain: str) -> List[str]:
"Get all unvisited URLs for the given domain."
if not self.is_exhausted_domain(domain):
return [
domain + u.urlpath for u in self._load_urls(domain) if not u.visited
domain + u.urlpath for u in self._load_urls(domain.encode("utf-8")) if not u.visited
]
return []

@@ -350,28 +355,29 @@ def is_known(self, url: str) -> bool:
"Check if the given URL has already been stored."
hostinfo, urlpath = get_host_and_path(url)
# returns False if domain or URL is new
return urlpath in {u.urlpath for u in self._load_urls(hostinfo)}
return urlpath in {u.urlpath for u in self._load_urls(hostinfo.encode("utf-8"))}

# DOWNLOADS

def get_url(self, domain: str, as_visited: bool = True) -> Optional[str]:
"Retrieve a single URL and consider it to be visited (with corresponding timestamp)."
# not fully used
if not self.is_exhausted_domain(domain):
url_tuples = self._load_urls(domain)
bytesdom = domain.encode("utf-8")
url_tuples = self._load_urls(bytesdom)
# get first non-seen url
for url in url_tuples:
if not url.visited:
# store information
if as_visited:
url.visited = True
with self._lock:
self.urldict[domain].count += 1
self._store_urls(domain, url_tuples, timestamp=datetime.now())
self.urldict[bytesdom].count += 1
self._store_urls(bytesdom, url_tuples, timestamp=datetime.now())
return domain + url.urlpath
# nothing to draw from
with self._lock:
self.urldict[domain].state = State.ALL_VISITED
self.urldict[domain.encode("utf-8")].state = State.ALL_VISITED
self._set_done()
return None

@@ -381,7 +387,7 @@ def get_download_urls(self, timelimit: int = 10) -> Optional[List[str]]:
potential = self.get_unvisited_domains()
targets = []
for domain in potential:
timestamp = self._timestamp(domain)
timestamp = self._timestamp(domain.encode("utf-8"))
if (
timestamp is None
or (datetime.now() - timestamp).total_seconds() > timelimit
@@ -404,8 +410,9 @@ def establish_download_schedule(
targets: List[Tuple[float, str]] = []
# iterate potential domains
for domain in potential:
bytesdom = domain.encode("utf-8")
# load urls
url_tuples = self._load_urls(domain)
url_tuples = self._load_urls(bytesdom)
urlpaths: List[str] = []
# get first non-seen urls
for url in url_tuples:
@@ -418,10 +425,10 @@
urlpaths.append(url.urlpath)
url.visited = True
with self._lock:
self.urldict[domain].count += 1
self.urldict[bytesdom].count += 1
# determine timestamps
now = datetime.now()
original_timestamp = self._timestamp(domain)
original_timestamp = self._timestamp(bytesdom)
if (
original_timestamp is None
or (now - original_timestamp).total_seconds() > time_limit
@@ -437,7 +444,7 @@
# calculate difference and offset last addition
total_diff = now + timedelta(0, schedule_secs - time_limit)
# store new info
self._store_urls(domain, url_tuples, timestamp=total_diff)
self._store_urls(bytesdom, url_tuples, timestamp=total_diff)
# sort by first tuple element (time in secs)
self._set_done()
return sorted(targets, key=lambda x: x[0]) # type: ignore[arg-type]
@@ -450,14 +457,15 @@ def store_rules(self, website: str, rules: Optional[RobotFileParser]) -> None:
rules = zlib.compress( # type: ignore[assignment]
pickle.dumps(rules, protocol=4)
)
self.urldict[website].rules = rules
self.urldict[website.encode("utf-8")].rules = rules

def get_rules(self, website: str) -> Optional[RobotFileParser]:
"Return the stored crawling rules for the given website."
if website in self.urldict:
bytesdom = website.encode("utf-8")
if bytesdom in self.urldict:
if self.compressed:
return pickle.loads(zlib.decompress(self.urldict[website].rules)) # type: ignore
return self.urldict[website].rules
return pickle.loads(zlib.decompress(self.urldict[bytesdom].rules)) # type: ignore
return self.urldict[bytesdom].rules
return None

def get_crawl_delay(self, website: str, default: float = 5) -> float:
@@ -489,21 +497,21 @@ def dump_urls(self) -> List[str]:
"Return a list of all known URLs."
urls = []
for domain in self.urldict:
urls.extend(self.find_known_urls(domain))
urls.extend(self.find_known_urls(domain.decode("utf-8")))
return urls

def print_unvisited_urls(self) -> None:
"Print all unvisited URLs in store."
for domain in self.urldict:
print("\n".join(self.find_unvisited_urls(domain)), flush=True)
print("\n".join(self.find_unvisited_urls(domain.decode("utf-8"))), flush=True)

def print_urls(self) -> None:
"Print all URLs in store (URL + TAB + visited or not)."
for domain in self.urldict:
print(
"\n".join(
[
domain + u.urlpath + "\t" + str(u.visited)
domain.decode("utf-8") + u.urlpath + "\t" + str(u.visited)
for u in self._load_urls(domain)
]
),
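
From the caller's perspective the diff keeps the string-based interface: methods such as add_urls, is_known, find_known_urls and get_known_domains still accept and return str, and the UTF-8 conversion happens inside UrlStore. A short usage sketch based on the signatures visible above (the import path is assumed, and the printed values are indicative):

from courlan import UrlStore  # assumed import path

store = UrlStore()
store.add_urls(["https://example.org/page1", "https://example.org/page2"])

print(store.get_known_domains())                     # e.g. ['https://example.org']
print(store.is_known("https://example.org/page1"))   # True
print(store.find_known_urls("https://example.org"))  # full URLs, returned as str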