Skip to content

Commit

Permalink
UrlStore compression: make bz2 & zlib optional and update protocol (#113)
Browse files Browse the repository at this point in the history

* UrlStore compression: make bz2 & zlib optional

* fix coverage
  • Loading branch information
adbar authored Aug 29, 2024
1 parent a48713a commit 47e7e59
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 12 deletions.
3 changes: 2 additions & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ omit =
[report]
exclude_lines =
pragma: no cover
if __name__ == .__main__.:
if __name__ == .__main__.:
ImportError:
62 changes: 52 additions & 10 deletions courlan/urlstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,26 @@
Defines a URL store which holds URLs along with relevant information and entails crawling helpers.
"""

import bz2
import gc
import logging
import pickle
import signal
import sys
import zlib

try:
import bz2

HAS_BZ2 = True
except ImportError:
HAS_BZ2 = False

try:
import zlib

HAS_ZLIB = True
except ImportError:
HAS_ZLIB = False


from collections import defaultdict, deque
from datetime import datetime, timedelta
Expand Down Expand Up @@ -38,6 +51,39 @@
LOGGER = logging.getLogger(__name__)


class Compressor:
    """Pickle helper that transparently uses the best available compression.

    Backend preference: bz2, then zlib, then no compression at all (data is
    only pickled).  Availability comes from the module-level HAS_BZ2 /
    HAS_ZLIB flags set by the guarded imports at the top of the file.
    """

    __slots__ = ("compressor", "decompressor")

    def __init__(self, compression: bool = True) -> None:
        # Select the compress/decompress functions as one pair so the two
        # callables can never come from different backends (the original
        # duplicated nested conditional made a mismatch possible on edit).
        if compression and HAS_BZ2:
            pair = (bz2.compress, bz2.decompress)
        elif compression and HAS_ZLIB:
            pair = (zlib.compress, zlib.decompress)
        else:
            # no backend available or compression disabled: store raw pickles
            pair = (self._identical, self._identical)
        self.compressor: Any = pair[0]
        self.decompressor: Any = pair[1]

    @staticmethod
    def _identical(data: Any) -> Any:
        "Return unchanged data (no-op stand-in for a compression function)."
        return data

    def compress(self, data: Any) -> Any:
        "Pickle the data and compress it if a method is available."
        # protocol 5 requires Python >= 3.8 on both writer and reader
        return self.compressor(pickle.dumps(data, protocol=5))

    def decompress(self, data: bytes) -> Any:
        "Decompress the data if a method is available and load the object."
        return pickle.loads(self.decompressor(data))


# Shared module-level instance used by UrlStore for tuples and robots rules.
COMPRESSOR = Compressor()


class State(Enum):
"Record state information about a domain or host."
OPEN = 1
Expand Down Expand Up @@ -149,7 +195,7 @@ def _buffer_urls(
def _load_urls(self, domain: str) -> Deque[UrlPathTuple]:
    "Return the URL tuples stored for the domain, decompressing them if necessary."
    # removed stale pre-commit bz2.decompress line left as an unreachable
    # duplicate before the COMPRESSOR call in the shown text
    if domain in self.urldict:
        if self.compressed:
            # COMPRESSOR knows which backend (bz2/zlib/none) was used
            return COMPRESSOR.decompress(self.urldict[domain].tuples)  # type: ignore
        return self.urldict[domain].tuples
    # unknown domain: empty deque keeps callers' iteration code uniform
    return deque()

Expand Down Expand Up @@ -197,9 +243,7 @@ def _store_urls(

with self._lock:
if self.compressed:
self.urldict[domain].tuples = bz2.compress( # type: ignore[assignment]
pickle.dumps(urls, protocol=4)
)
self.urldict[domain].tuples = COMPRESSOR.compress(urls)
else:
self.urldict[domain].tuples = urls
self.urldict[domain].total = len(urls)
Expand Down Expand Up @@ -453,16 +497,14 @@ def establish_download_schedule(
def store_rules(self, website: str, rules: Optional[RobotFileParser]) -> None:
    "Store crawling rules for a given website, compressed when the store is."
    # removed stale pre-commit zlib.compress/pickle.dumps lines that preceded
    # the COMPRESSOR call in the shown text
    if self.compressed:
        # rebind to compressed bytes; decompressed again by get_rules()
        rules = COMPRESSOR.compress(rules)
    self.urldict[website].rules = rules

def get_rules(self, website: str) -> Optional[RobotFileParser]:
    "Return the stored crawling rules for the given website, or None if unknown."
    # removed stale pre-commit zlib.decompress line left as an unreachable
    # duplicate before the COMPRESSOR call in the shown text
    if website in self.urldict:
        if self.compressed:
            # symmetric with store_rules(): COMPRESSOR restores the object
            return COMPRESSOR.decompress(self.urldict[website].rules)  # type: ignore
        return self.urldict[website].rules
    return None

Expand Down
12 changes: 11 additions & 1 deletion tests/urlstore_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,17 @@
import pytest

from courlan import UrlStore
from courlan.urlstore import State, load_store
from courlan.urlstore import Compressor, State, load_store, HAS_BZ2, HAS_ZLIB


def test_compressor():
    "Round-trip data through the Compressor with and without compression."
    # the test environment must provide at least one compression backend
    assert HAS_BZ2 or HAS_ZLIB

    payload = 1234
    for enabled in (True, False):
        codec = Compressor(compression=enabled)
        restored = codec.decompress(codec.compress(payload))
        assert restored == payload


def test_urlstore():
Expand Down

0 comments on commit 47e7e59

Please sign in to comment.