diff --git a/courlan/urlstore.py b/courlan/urlstore.py
index f59e213..75205bc 100644
--- a/courlan/urlstore.py
+++ b/courlan/urlstore.py
@@ -509,3 +509,29 @@ def print_urls(self) -> None:
                 ),
                 flush=True,
             )
+
+    # PERSISTENCE
+
+    def write(self, filename: str) -> None:
+        "Write the URL store to disk."
+        # The lock is detached before pickling (presumably because lock
+        # objects cannot be pickled) and restored afterwards, so that the
+        # store remains usable after writing, even if the file operation fails.
+        lock = self._lock
+        del self._lock
+        try:
+            with open(filename, "wb") as output:
+                pickle.dump(self, output)
+        finally:
+            self._lock = lock
+
+
+def load_store(filename: str) -> UrlStore:
+    "Load a URL store from disk."
+    # NOTE: unpickling can execute arbitrary code, only load files
+    # that come from a trusted source.
+    with open(filename, "rb") as input_file:
+        url_store = pickle.load(input_file)
+    # the lock is not part of the stored state, create a fresh one
+    url_store._lock = Lock()
+    return url_store  # type: ignore[no-any-return]
diff --git a/tests/urlstore_tests.py b/tests/urlstore_tests.py
index 0a1f408..eb0ae5f 100644
--- a/tests/urlstore_tests.py
+++ b/tests/urlstore_tests.py
@@ -7,6 +7,7 @@
 import pickle
 import signal
 import sys
+import tempfile
 import uuid
 
 from datetime import datetime
@@ -16,7 +17,7 @@
 
 from courlan import UrlStore
 from courlan.core import filter_links
-from courlan.urlstore import State
+from courlan.urlstore import State, load_store
 
 
 def test_urlstore():
@@ -414,3 +415,35 @@ def test_from_html():
     htmlstring = ''
     url_store.add_from_html(htmlstring, base_url)
     assert not url_store.find_known_urls(base_url)
+
+
+def test_persistance():
+    "Test writing and loading to/from disk."
+    url_store = UrlStore(
+        compressed=True, language="de", strict=True, trailing=True, verbose=True
+    )
+    example_urls = [f"https://www.example.org/{str(a)}" for a in range(100)]
+    test_urls = [f"https://test.org/{str(uuid.uuid4())[:20]}" for _ in range(100)]
+    url_store.add_urls(example_urls + test_urls)
+
+    # mkstemp() also returns an open file descriptor: close it so that it
+    # does not leak and does not keep the file locked (Windows)
+    handle, tmp = tempfile.mkstemp()
+    os.close(handle)
+    url_store.write(tmp)
+    new_store = load_store(tmp)
+    try:
+        os.remove(tmp)
+    except PermissionError:
+        pass  # Windows
+
+    # the original store keeps its lock after being written
+    assert hasattr(url_store, "_lock")
+
+    assert new_store.compressed is True
+    assert new_store.language == "de"
+    assert new_store.strict is True
+    assert new_store.trailing_slash is True
+    urls = set(new_store.dump_urls())
+    assert new_store.total_url_number() == len(urls) == 200
+    assert "https://www.example.org/99" in urls