Skip to content

Commit

Permalink
UrlStore: add write and load functions (#83)
Browse files Browse the repository at this point in the history
* persist UrlStore to disk

* fix Windows tests

* more tests
  • Loading branch information
adbar authored Jan 31, 2024
1 parent 9a7cf63 commit d260675
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 1 deletion.
16 changes: 16 additions & 0 deletions courlan/urlstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,3 +509,19 @@ def print_urls(self) -> None:
),
flush=True,
)

# PERSISTENCE

def write(self, filename: str) -> None:
    """Write the URL store to disk.

    The store is serialized with pickle into ``filename``, which is opened
    in binary write mode (an existing file is overwritten).

    The ``_lock`` attribute is detached while the object is dumped so that
    it is not part of the pickled data (presumably because lock objects
    cannot be pickled), and it is put back afterwards. The store therefore
    stays usable after the call and can be written more than once.

    Args:
        filename: Path of the file to write to.

    Raises:
        OSError: If the file cannot be opened or written.
        pickle.PicklingError: If the store cannot be serialized.
    """
    # Keep a reference to the lock so the same object can be restored.
    lock = self._lock
    del self._lock
    try:
        with open(filename, "wb") as output:
            pickle.dump(self, output)
    finally:
        # Restore the lock even if opening the file or dumping failed,
        # otherwise the live store would be left without its lock.
        self._lock = lock


def load_store(filename: str) -> UrlStore:
    """Load a URL store from disk.

    The file is opened in binary mode and its content is unpickled. A new
    ``Lock`` is then attached to the resulting object as ``_lock`` before
    it is returned.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.

    Args:
        filename: Path of the pickled store to read.

    Returns:
        The unpickled object with a fresh ``_lock`` attribute.
    """
    with open(filename, "rb") as infile:
        restored = pickle.load(infile)
    # The file can be closed before the lock is attached.
    restored._lock = Lock()
    return restored  # type: ignore[no-any-return]
29 changes: 28 additions & 1 deletion tests/urlstore_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pickle
import signal
import sys
import tempfile
import uuid

from datetime import datetime
Expand All @@ -16,7 +17,7 @@

from courlan import UrlStore
from courlan.core import filter_links
from courlan.urlstore import State
from courlan.urlstore import State, load_store


def test_urlstore():
Expand Down Expand Up @@ -414,3 +415,29 @@ def test_from_html():
htmlstring = '<html><body><a href="https://example.org/en/page2"/><a href="https://example.org/imprint.html"/></body></html>'
url_store.add_from_html(htmlstring, base_url)
assert not url_store.find_known_urls(base_url)


def test_persistance():
    """Test writing a URL store to disk and loading it back.

    A store holding 200 URLs is written to a file and reloaded, then the
    settings and the URLs of the reloaded store are checked.
    """
    url_store = UrlStore(
        compressed=True, language="de", strict=True, trailing=True, verbose=True
    )
    example_urls = [f"https://www.example.org/{a}" for a in range(100)]
    test_urls = [f"https://test.org/{str(uuid.uuid4())[:20]}" for _ in range(100)]
    url_store.add_urls(example_urls + test_urls)

    # A temporary directory avoids the open file descriptor that
    # tempfile.mkstemp() returns (leaving it open leaks the descriptor and
    # can prevent deleting the file on Windows); the directory and the file
    # inside it are removed when the context manager exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        tmp = os.path.join(tmpdir, "urlstore.pickle")
        url_store.write(tmp)
        new_store = load_store(tmp)

    assert new_store.compressed is True
    assert new_store.language == "de"
    assert new_store.strict is True
    assert new_store.trailing_slash is True
    urls = set(new_store.dump_urls())
    assert new_store.total_url_number() == len(urls) == 200
    assert "https://www.example.org/99" in urls

0 comments on commit d260675

Please sign in to comment.