From 47e7e595872729399ce4e208b30137891fe1f6e9 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Thu, 29 Aug 2024 13:06:38 +0200
Subject: [PATCH] UrlStore compression: make bz2 & zlib optional and update
 protocol (#113)

* UrlStore compression: make bz2 & zlib optional

* fix coverage
---
 .coveragerc             |  3 +-
 courlan/urlstore.py     | 62 ++++++++++++++++++++++++++++++++++-------
 tests/urlstore_tests.py | 12 +++++++-
 3 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/.coveragerc b/.coveragerc
index 84c79509..c7053869 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -8,4 +8,5 @@ omit =
 [report]
 exclude_lines =
     pragma: no cover
-    if __name__ == .__main__.:
\ No newline at end of file
+    if __name__ == .__main__.:
+    ImportError:
diff --git a/courlan/urlstore.py b/courlan/urlstore.py
index b19a6000..b96d8458 100644
--- a/courlan/urlstore.py
+++ b/courlan/urlstore.py
@@ -2,13 +2,26 @@
 Defines a URL store which holds URLs along with relevant information and entails crawling helpers.
 """
 
-import bz2
 import gc
 import logging
 import pickle
 import signal
 import sys
-import zlib
+
+try:
+    import bz2
+
+    HAS_BZ2 = True
+except ImportError:
+    HAS_BZ2 = False
+
+try:
+    import zlib
+
+    HAS_ZLIB = True
+except ImportError:
+    HAS_ZLIB = False
+
 
 from collections import defaultdict, deque
 from datetime import datetime, timedelta
@@ -38,6 +51,39 @@
 LOGGER = logging.getLogger(__name__)
 
 
+class Compressor:
+    "Use system information on available compression modules and define corresponding methods."
+    __slots__ = ("compressor", "decompressor")
+
+    def __init__(self, compression: bool = True) -> None:
+        self.compressor: Any = (
+            bz2.compress
+            if compression and HAS_BZ2
+            else zlib.compress if compression and HAS_ZLIB else self._identical
+        )
+        self.decompressor: Any = (
+            bz2.decompress
+            if compression and HAS_BZ2
+            else zlib.decompress if compression and HAS_ZLIB else self._identical
+        )
+
+    @staticmethod
+    def _identical(data: Any) -> Any:
+        "Return unchanged data."
+        return data
+
+    def compress(self, data: Any) -> Any:
+        "Pickle the data and compress it if a method is available."
+        return self.compressor(pickle.dumps(data, protocol=5))
+
+    def decompress(self, data: bytes) -> Any:
+        "Decompress the data if a method is available and load the object."
+        return pickle.loads(self.decompressor(data))
+
+
+COMPRESSOR = Compressor()
+
+
 class State(Enum):
     "Record state information about a domain or host."
     OPEN = 1
@@ -149,7 +195,7 @@ def _buffer_urls(
     def _load_urls(self, domain: str) -> Deque[UrlPathTuple]:
         if domain in self.urldict:
             if self.compressed:
-                return pickle.loads(bz2.decompress(self.urldict[domain].tuples))  # type: ignore
+                return COMPRESSOR.decompress(self.urldict[domain].tuples)  # type: ignore
             return self.urldict[domain].tuples
         return deque()
 
@@ -197,9 +243,7 @@ def _store_urls(
 
         with self._lock:
             if self.compressed:
-                self.urldict[domain].tuples = bz2.compress(  # type: ignore[assignment]
-                    pickle.dumps(urls, protocol=4)
-                )
+                self.urldict[domain].tuples = COMPRESSOR.compress(urls)
             else:
                 self.urldict[domain].tuples = urls
             self.urldict[domain].total = len(urls)
@@ -453,16 +497,14 @@ def establish_download_schedule(
     def store_rules(self, website: str, rules: Optional[RobotFileParser]) -> None:
         "Store crawling rules for a given website."
         if self.compressed:
-            rules = zlib.compress(  # type: ignore[assignment]
-                pickle.dumps(rules, protocol=4)
-            )
+            rules = COMPRESSOR.compress(rules)
         self.urldict[website].rules = rules
 
     def get_rules(self, website: str) -> Optional[RobotFileParser]:
         "Return the stored crawling rules for the given website."
         if website in self.urldict:
             if self.compressed:
-                return pickle.loads(zlib.decompress(self.urldict[website].rules))  # type: ignore
+                return COMPRESSOR.decompress(self.urldict[website].rules)  # type: ignore
             return self.urldict[website].rules
         return None
 
diff --git a/tests/urlstore_tests.py b/tests/urlstore_tests.py
index 193ab7aa..ab171b18 100644
--- a/tests/urlstore_tests.py
+++ b/tests/urlstore_tests.py
@@ -16,7 +16,17 @@
 import pytest
 
 from courlan import UrlStore
-from courlan.urlstore import State, load_store
+from courlan.urlstore import Compressor, State, load_store, HAS_BZ2, HAS_ZLIB
+
+
+def test_compressor():
+    "Test compression class."
+    assert HAS_BZ2 or HAS_ZLIB
+    data = 1234
+
+    for setting in (True, False):
+        comp = Compressor(compression=setting)
+        assert comp.decompress(comp.compress(data)) == data
 
 
 def test_urlstore():
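
Usage note (not part of the patch): a minimal sketch of how the Compressor introduced above behaves once the change is applied. Data is pickled with protocol 5 (the "update protocol" part of the title, available since Python 3.8), then compressed with bz2 if importable, falling back to zlib, and finally to storing the pickled bytes unchanged. Compressor, HAS_BZ2 and HAS_ZLIB are defined in the patch itself; the sample payload and the print line are illustrative only.

import pickle

from courlan.urlstore import Compressor, HAS_BZ2, HAS_ZLIB

# Any picklable object works, since Compressor pickles before compressing.
payload = {"urls": ["https://example.org/page"] * 100}

# Default setting: bz2 if importable, else zlib, else identity.
comp = Compressor()
blob = comp.compress(payload)
assert comp.decompress(blob) == payload  # round-trips regardless of backend

# compression=False bypasses both modules and only (un)pickles.
plain = Compressor(compression=False)
assert plain.compress(payload) == pickle.dumps(payload, protocol=5)
assert plain.decompress(plain.compress(payload)) == payload

print(f"bz2: {HAS_BZ2}, zlib: {HAS_ZLIB}, stored size: {len(blob)} bytes")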