diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1cb7739..6a4abf5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-json
- exclude: ".vscode(.dist)?/.*"
+ exclude: "(.vscode(.dist)?/.*)|(tests/test_dict.json)"
- id: check-yaml
- id: check-builtin-literals
- id: check-case-conflict
diff --git a/src/consts.py b/src/consts.py
index ac423cc..ea884c2 100644
--- a/src/consts.py
+++ b/src/consts.py
@@ -1,3 +1,21 @@
+import dataclasses
+from pathlib import Path
+
+from ankiutils.consts import AddonConsts as BaseAddonConsts
from ankiutils.consts import get_consts
-consts = get_consts(__name__)
+
+@dataclasses.dataclass
+class AddonConsts(BaseAddonConsts):
+ userfiles_dir: Path
+ dicts_dir: Path
+ icons_dir: Path
+
+
+base_consts = get_consts(__name__)
+consts = AddonConsts(
+ **dataclasses.asdict(base_consts),
+ userfiles_dir=base_consts.dir / "user_files",
+ dicts_dir=base_consts.dir / "user_files" / "dictionaries",
+ icons_dir=base_consts.dir / "icons"
+)
diff --git a/src/fetcher.py b/src/fetcher.py
index 4095d86..00964b5 100644
--- a/src/fetcher.py
+++ b/src/fetcher.py
@@ -1,9 +1,10 @@
from __future__ import annotations
-import functools
import json
+import shutil
+import sqlite3
from pathlib import Path
-from typing import Callable
+from typing import Any, Callable
class WiktionaryError(Exception):
@@ -16,10 +17,34 @@ class WordNotFoundError(WiktionaryError):
class WiktionaryFetcher:
def __init__(self, dictionary: str, base_dir: Path):
- self.dict_dir = base_dir / dictionary
+ self.db_path = base_dir / f"{dictionary}.db"
+ self._connection = sqlite3.connect(self.db_path, check_same_thread=False)
+ self._connection.executescript(
+ """
+ CREATE TABLE IF NOT EXISTS words (
+ word text,
+ data text
+ );
+ CREATE INDEX IF NOT EXISTS index_word ON words(word);
+ """
+ )
+
+ def close(self) -> None:
+ self._connection.close()
+
+ def __enter__(self) -> WiktionaryFetcher:
+ return self
+
+ def __exit__(self, exc_type: Any, exc_value: Any, exc_tb: Any) -> None:
+ self.close()
+
+ def _add_word(self, word: str, data: str) -> None:
+ self._connection.execute(
+ "INSERT INTO words(word, data) values(?, ?)", (word, data)
+ )
@classmethod
- def dump_kaikki_dict(
+ def import_kaikki_dict(
cls,
filename: str | Path,
dictionary: str,
@@ -28,44 +53,42 @@ def dump_kaikki_dict(
base_dir: Path,
) -> int:
"""Dumps a JSON file downloaded from https://kaikki.org/dictionary/{lang}/
- to separate files for each entry in 'dictionary'"""
- outdir = base_dir / dictionary
- outdir.mkdir(exist_ok=True)
+ to a SQLite database"""
+ base_dir.mkdir(exist_ok=True)
count = 0
with open(filename, encoding="utf-8") as file:
- for i, line in enumerate(file):
- entry = json.loads(line)
- word = entry["word"]
- try:
- with open(
- outdir / f"{word}.json",
- mode="w",
- encoding="utf-8",
- ) as outfile:
- outfile.write(line)
+ with WiktionaryFetcher(dictionary, base_dir) as fetcher:
+ for i, line in enumerate(file):
+ try:
+ entry = json.loads(line)
+ word = entry["word"]
+ fetcher._add_word(word, line)
count += 1
- except Exception as exc:
- on_error(word, exc)
- if i % 50 == 0:
- if not on_progress(i + 1):
- break
+ except Exception as exc:
+ print(f"{exc=}")
+ on_error(word, exc)
+ if i % 50 == 0:
+ if not on_progress(i + 1):
+ break
+ fetcher._connection.commit()
return count
- @staticmethod
- @functools.lru_cache
- def _get_word_json(dict_dir: Path, word: str) -> dict:
- # TODO: handle words with multiple word senses
-
- try:
- with open(dict_dir / f"{word}.json", encoding="utf-8") as file:
- return json.load(file)
- except FileNotFoundError as exc:
- raise WordNotFoundError(
- f'"{word}" was not found in the dictionary.'
- ) from exc
+ @classmethod
+ def migrate_dict_to_sqlite(cls, dictionary_dir: Path, new_dir: Path) -> None:
+ with WiktionaryFetcher(dictionary_dir.name, new_dir) as fetcher:
+ for file in dictionary_dir.iterdir():
+ fetcher._add_word(file.stem, file.read_text(encoding="utf-8"))
+ fetcher._connection.commit()
+ shutil.rmtree(dictionary_dir)
def get_word_json(self, word: str) -> dict:
- return self._get_word_json(self.dict_dir, word)
+ # TODO: handle words with multiple word senses
+ row = self._connection.execute(
+ "SELECT data FROM words WHERE word = ?", (word,)
+ ).fetchone()
+ if not row:
+ raise WordNotFoundError(f'"{word}" was not found in the dictionary.')
+ return json.loads(row[0])
def get_senses(self, word: str) -> list[str]:
data = self.get_word_json(word)
diff --git a/src/gui/importer.py b/src/gui/importer.py
index 964ec50..6bc6791 100644
--- a/src/gui/importer.py
+++ b/src/gui/importer.py
@@ -2,6 +2,8 @@
import os
import re
+import sys
+import traceback
from concurrent.futures import Future
from typing import TYPE_CHECKING
@@ -89,6 +91,7 @@ def on_done(future: Future) -> None:
try:
count = future.result()
except Exception as exc:
+ traceback.print_exception(None, exc, exc.__traceback__, file=sys.stdout)
showWarning(str(exc), parent=self, title=consts.name)
return
tooltip(f"Successfully imported {count} words", parent=self.mw)
@@ -101,13 +104,13 @@ def on_done(future: Future) -> None:
return
self.mw.progress.start(label="Starting importing...", parent=self)
self.mw.progress.set_title(f"{consts.name} - Importing a dictionary")
- # TODO: handle exceptions
self.mw.taskman.run_in_background(
- lambda: WiktionaryFetcher.dump_kaikki_dict(
+ lambda: WiktionaryFetcher.import_kaikki_dict(
filename,
name,
on_progress=on_progress,
on_error=on_error,
+ base_dir=consts.dicts_dir,
),
on_done=on_done,
)
diff --git a/src/gui/main.py b/src/gui/main.py
index c0a05b9..34e7346 100644
--- a/src/gui/main.py
+++ b/src/gui/main.py
@@ -2,7 +2,7 @@
import os
import time
-from typing import TYPE_CHECKING, Any, Callable, cast
+from typing import TYPE_CHECKING, Any, Callable
from urllib.parse import unquote
import requests
@@ -21,6 +21,7 @@
from ..consts import consts
from ..fetcher import WiktionaryFetcher, WordNotFoundError
+from ..utils import get_dict_names
if TYPE_CHECKING or qtmajor > 5:
from ..forms.main_qt6 import Ui_Dialog
@@ -31,10 +32,6 @@
PROGRESS_LABEL = "Updated {count} out of {total} note(s)"
-def get_available_dicts() -> list[str]:
- return [p.name for p in (consts.dir / "user_files").iterdir() if p.is_dir()]
-
-
class WiktionaryFetcherDialog(QDialog):
def __init__(
self,
@@ -61,9 +58,9 @@ def __init__(
]
self.setWindowTitle(consts.name)
self.form.icon.setPixmap(
- QPixmap(os.path.join(consts.dir, "icons", "enwiktionary-1.5x.png"))
+ QPixmap(os.path.join(consts.icons_dir, "enwiktionary-1.5x.png"))
)
- self.form.dictionaryComboBox.addItems(get_available_dicts())
+ self.form.dictionaryComboBox.addItems(get_dict_names())
self.downloader: WiktionaryFetcher | None = None
qconnect(self.form.addButton.clicked, self.on_add)
self.form.addButton.setShortcut(QKeySequence("Ctrl+Return"))
@@ -153,8 +150,7 @@ def on_add(self) -> None:
textFormat="rich",
)
return
- dictionary = self.form.dictionaryComboBox.currentText()
- self.downloader = WiktionaryFetcher(dictionary, consts.dir / "user_files")
+ dictionary_name = self.form.dictionaryComboBox.currentText()
word_field = self.form.wordFieldComboBox.currentText()
definition_field_i = self.form.definitionFieldComboBox.currentIndex()
example_field_i = self.form.exampleFieldComboBox.currentIndex()
@@ -186,6 +182,7 @@ def on_failure(exc: Exception) -> None:
op = QueryOp(
parent=self,
op=lambda col: self._fill_notes(
+ dictionary_name,
word_field,
field_tuples,
),
@@ -203,6 +200,7 @@ def on_failure(exc: Exception) -> None:
def _fill_notes(
self,
+ dictionary_name: str,
word_field: str,
field_tuples: tuple[tuple[int, Callable[[str], str]], ...],
) -> None:
@@ -222,33 +220,33 @@ def on_progress() -> None:
max=len(self.notes),
)
- for note in self.notes:
- word = strip_html(note[word_field]).strip()
- if not word:
- continue
- need_updating = False
- try:
- for field_tuple in field_tuples:
- if not field_tuple[0]:
- continue
- contents = field_tuple[1](word)
- note[self.field_names[field_tuple[0]]] = contents
- need_updating = True
- except WordNotFoundError as exc:
- self.errors.append(str(exc))
- finally:
- if need_updating:
- self.updated_notes.append(note)
- if time.time() - last_progress >= 0.01:
- self.mw.taskman.run_on_main(on_progress)
- last_progress = time.time()
- if want_cancel:
- break
+ with WiktionaryFetcher(dictionary_name, consts.dicts_dir) as fetcher:
+ for note in self.notes:
+ word = strip_html(note[word_field]).strip()
+ if not word:
+ continue
+ need_updating = False
+ try:
+ for field_tuple in field_tuples:
+ if not field_tuple[0]:
+ continue
+ contents = field_tuple[1](fetcher, word)
+ note[self.field_names[field_tuple[0]]] = contents
+ need_updating = True
+ except WordNotFoundError as exc:
+ self.errors.append(str(exc))
+ finally:
+ if need_updating:
+ self.updated_notes.append(note)
+ if time.time() - last_progress >= 0.01:
+ self.mw.taskman.run_on_main(on_progress)
+ last_progress = time.time()
+ if want_cancel:
+ break
self.mw.taskman.run_on_main(self.mw.progress.finish)
- def _get_definitions(self, word: str) -> str:
- downloader = cast(WiktionaryFetcher, self.downloader)
- defs = downloader.get_senses(word)
+ def _get_definitions(self, fetcher: WiktionaryFetcher, word: str) -> str:
+ defs = fetcher.get_senses(word)
if len(defs) == 0:
return ""
if len(defs) == 1:
@@ -259,9 +257,8 @@ def _get_definitions(self, word: str) -> str:
formatted += ""
return formatted
- def _get_examples(self, word: str) -> str:
- downloader = cast(WiktionaryFetcher, self.downloader)
- examples = downloader.get_examples(word)
+ def _get_examples(self, fetcher: WiktionaryFetcher, word: str) -> str:
+ examples = fetcher.get_examples(word)
if len(examples) == 0:
return ""
if len(examples) == 1:
@@ -272,21 +269,17 @@ def _get_examples(self, word: str) -> str:
formatted += ""
return formatted
- def _get_gender(self, word: str) -> str:
- downloader = cast(WiktionaryFetcher, self.downloader)
- return downloader.get_gender(word)
+ def _get_gender(self, fetcher: WiktionaryFetcher, word: str) -> str:
+ return fetcher.get_gender(word)
- def _get_part_of_speech(self, word: str) -> str:
- downloader = cast(WiktionaryFetcher, self.downloader)
- return downloader.get_part_of_speech(word)
+ def _get_part_of_speech(self, fetcher: WiktionaryFetcher, word: str) -> str:
+ return fetcher.get_part_of_speech(word)
- def _get_ipa(self, word: str) -> str:
- downloader = cast(WiktionaryFetcher, self.downloader)
- return downloader.get_ipa(word)
+ def _get_ipa(self, fetcher: WiktionaryFetcher, word: str) -> str:
+ return fetcher.get_ipa(word)
- def _get_audio(self, word: str) -> str:
- downloader = cast(WiktionaryFetcher, self.downloader)
- url = downloader.get_audio_url(word)
+ def _get_audio(self, fetcher: WiktionaryFetcher, word: str) -> str:
+ url = fetcher.get_audio_url(word)
http_session = requests.Session()
# https://meta.wikimedia.org/wiki/User-Agent_policy
@@ -302,13 +295,11 @@ def _get_audio(self, word: str) -> str:
filename = self.mw.col.media.write_data(unquote(os.path.basename(url)), data)
return "[sound:" + filename + "]"
- def _get_etymology(self, word: str) -> str:
- downloader = cast(WiktionaryFetcher, self.downloader)
- return downloader.get_etymology(word)
+ def _get_etymology(self, fetcher: WiktionaryFetcher, word: str) -> str:
+ return fetcher.get_etymology(word)
- def _get_declension(self, word: str) -> str:
- downloader = cast(WiktionaryFetcher, self.downloader)
- declensions = downloader.get_declension(word)
+ def _get_declension(self, fetcher: WiktionaryFetcher, word: str) -> str:
+ declensions = fetcher.get_declension(word)
if len(declensions) == 0:
return ""
formatted = "
"
diff --git a/src/main.py b/src/main.py
index ac364ff..75f321a 100644
--- a/src/main.py
+++ b/src/main.py
@@ -16,6 +16,7 @@
from .gui.importer import ImportDictionaryDialog
from .gui.main import WiktionaryFetcherDialog
from .log import logger
+from .migration import migrate_legacy_dicts
def on_bulk_updated_notes(
@@ -82,7 +83,7 @@ def on_editor_did_init_buttons(buttons: list[str], editor: Editor) -> None:
QKeySequence.SequenceFormat.NativeText
)
button = editor.addButton(
- icon=os.path.join(consts.dir, "icons", "en.ico"),
+ icon=os.path.join(consts.icons_dir, "en.ico"),
cmd="wiktionary",
tip=f"{consts.name} ({shortcut})" if shortcut else consts.name,
func=on_editor_button_clicked,
@@ -115,3 +116,4 @@ def add_wiktionary_menu() -> None:
browser_menus_did_init.append(on_browser_menus_did_init)
editor_did_init_buttons.append(on_editor_did_init_buttons)
add_wiktionary_menu()
+migrate_legacy_dicts()
diff --git a/src/migration.py b/src/migration.py
new file mode 100644
index 0000000..241c5d8
--- /dev/null
+++ b/src/migration.py
@@ -0,0 +1,28 @@
+from anki.collection import Collection
+from anki.utils import pointVersion
+from aqt import mw
+from aqt.operations import QueryOp
+from aqt.utils import tooltip
+
+from .consts import consts
+from .fetcher import WiktionaryFetcher
+from .utils import get_legacy_dict_dirs
+
+
+def migrate_legacy_dicts() -> None:
+ legacy_dicts = get_legacy_dict_dirs()
+
+ def op(col: Collection) -> None:
+ for dict_dir in legacy_dicts:
+ WiktionaryFetcher.migrate_dict_to_sqlite(dict_dir, consts.dicts_dir)
+
+ def success(_: None) -> None:
+ tooltip("Migrated Wiktionary dictionaries successfully")
+
+ if legacy_dicts:
+ query_op = QueryOp(parent=mw, op=op, success=success)
+ if pointVersion() >= 50:
+ query_op = query_op.with_progress(label="Migrating Wiktionary dictionaries")
+ if pointVersion() >= 231000:
+ query_op = query_op.without_collection()
+ query_op.run_in_background()
diff --git a/src/user_files/README.txt b/src/user_files/README.txt
index f8e500f..0036ced 100644
--- a/src/user_files/README.txt
+++ b/src/user_files/README.txt
@@ -1,2 +1 @@
-Your imported dictionaries are processsed and stored here in subfolders.
-A file is generated for each dictionary entry, so the folders may contain a lot of files.
+Your imported dictionaries are processed and stored here inside the "dictionaries" subfolder as SQLite databases.
diff --git a/src/utils.py b/src/utils.py
new file mode 100644
index 0000000..4fe8b86
--- /dev/null
+++ b/src/utils.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from .consts import consts
+
+
+def get_legacy_dict_dirs() -> list[Path]:
+ return [
+ p
+ for p in consts.userfiles_dir.iterdir()
+ if p.is_dir() and p.name not in ("logs", consts.dicts_dir.name)
+ ]
+
+
+def get_dicts() -> list[Path]:
+ return [p for p in (consts.dicts_dir).iterdir() if p.is_file()]
+
+
+def get_dict_names() -> list[str]:
+ return [p.stem for p in get_dicts()]
diff --git a/tests/test_dict.json b/tests/test_dict.json
index 11f9160..0b96d27 100644
--- a/tests/test_dict.json
+++ b/tests/test_dict.json
@@ -1,3 +1,2 @@
{"pos": "noun", "forms": [{"form": "ко́шка", "tags": ["animate", "canonical", "feminine", "inanimate"]}, {"form": "kóška", "tags": ["romanization"]}, {"form": "ко́шки", "tags": ["genitive"]}, {"form": "ко́шки", "tags": ["nominative", "plural"]}, {"form": "ко́шек", "tags": ["genitive", "plural"]}, {"form": "коша́чий", "tags": ["adjective"]}, {"form": "ко́шечка", "tags": ["diminutive"]}, {"form": "", "source": "Declension", "tags": ["table-tags"]}, {"form": "velar-stem", "source": "Declension", "tags": ["class"]}, {"form": "accent-a", "source": "Declension", "tags": ["class"]}, {"form": "ко́шка", "tags": ["nominative", "singular"], "source": "Declension", "roman": "kóška"}, {"form": "ко́шки", "tags": ["nominative", "plural"], "source": "Declension", "roman": "kóški"}, {"form": "ко́шки", "tags": ["genitive", "singular"], "source": "Declension", "roman": "kóški"}, {"form": "ко́шек", "tags": ["genitive", "plural"], "source": "Declension", "roman": "kóšek"}, {"form": "ко́шке", "tags": ["dative", "singular"], "source": "Declension", "roman": "kóške"}, {"form": "ко́шкам", "tags": ["dative", "plural"], "source": "Declension", "roman": "kóškam"}, {"form": "ко́шку", "tags": ["accusative", "singular"], "source": "Declension", "roman": "kóšku"}, {"form": "ко́шек", "tags": ["accusative", "plural"], "source": "Declension", "roman": "kóšek"}, {"form": "ко́шки", "tags": ["accusative", "plural"], "source": "Declension", "roman": "kóški"}, {"form": "ко́шкой", "tags": ["instrumental", "singular"], "source": "Declension", "roman": "kóškoj"}, {"form": "ко́шкою", "tags": ["instrumental", "singular"], "source": "Declension", "roman": "kóškoju"}, {"form": "ко́шками", "tags": ["instrumental", "plural"], "source": "Declension", "roman": "kóškami"}, {"form": "ко́шке", "tags": ["prepositional", "singular"], "source": "Declension", "roman": "kóške"}, {"form": "ко́шках", "tags": ["plural", "prepositional"], "source": "Declension", "roman": "kóškax"}, {"form": "", "source": "Declension", "tags": 
["table-tags"]}, {"form": "velar-stem", "source": "Declension", "tags": ["class"]}, {"form": "accent-a", "source": "Declension", "tags": ["class"]}, {"form": "ко́шка", "tags": ["dated", "nominative", "singular"], "source": "Declension", "roman": "kóška"}, {"form": "ко́шки", "tags": ["dated", "nominative", "plural"], "source": "Declension", "roman": "kóški"}, {"form": "ко́шки", "tags": ["dated", "genitive", "singular"], "source": "Declension", "roman": "kóški"}, {"form": "ко́шекъ", "tags": ["dated", "genitive", "plural"], "source": "Declension", "roman": "kóšek"}, {"form": "ко́шкѣ", "tags": ["dated", "dative", "singular"], "source": "Declension", "roman": "kóškě"}, {"form": "ко́шкамъ", "tags": ["dated", "dative", "plural"], "source": "Declension", "roman": "kóškam"}, {"form": "ко́шку", "tags": ["accusative", "dated", "singular"], "source": "Declension", "roman": "kóšku"}, {"form": "ко́шекъ", "tags": ["accusative", "dated", "plural"], "source": "Declension", "roman": "kóšek"}, {"form": "ко́шки", "tags": ["accusative", "dated", "plural"], "source": "Declension", "roman": "kóški"}, {"form": "ко́шкой", "tags": ["dated", "instrumental", "singular"], "source": "Declension", "roman": "kóškoj"}, {"form": "ко́шкою", "tags": ["dated", "instrumental", "singular"], "source": "Declension", "roman": "kóškoju"}, {"form": "ко́шками", "tags": ["dated", "instrumental", "plural"], "source": "Declension", "roman": "kóškami"}, {"form": "ко́шкѣ", "tags": ["dated", "prepositional", "singular"], "source": "Declension", "roman": "kóškě"}, {"form": "ко́шкахъ", "tags": ["dated", "plural", "prepositional"], "source": "Declension", "roman": "kóškax"}], "etymology_text": "From unattested Old East Slavic *ко́чька (*kóčĭka), from Proto-Slavic *kòťьka, from *kòťь, from *kòtъ. 
Cognate with Old Ruthenian ко́шка (kóška), Ukrainian кі́шка (kíška), Russian ко́шка (kóška).", "etymology_templates": [{"name": "inh", "args": {"1": "ru", "2": "orv", "3": "*кочька", "4": "*ко́чька"}, "expansion": "Old East Slavic *ко́чька (*kóčĭka)"}, {"name": "inh", "args": {"1": "ru", "2": "sla-pro", "3": "*kòťьka"}, "expansion": "Proto-Slavic *kòťьka"}, {"name": "m", "args": {"1": "sla-pro", "2": "*kòťь"}, "expansion": "*kòťь"}, {"name": "m", "args": {"1": "sla-pro", "2": "*kòtъ"}, "expansion": "*kòtъ"}, {"name": "cog", "args": {"1": "zle-ort", "2": "ко́шка"}, "expansion": "Old Ruthenian ко́шка (kóška)"}, {"name": "cog", "args": {"1": "uk", "2": "кі́шка"}, "expansion": "Ukrainian кі́шка (kíška)"}, {"name": "cog", "args": {"1": "ru", "2": "ко́шка"}, "expansion": "Russian ко́шка (kóška)"}], "sounds": [{"ipa": "[ˈkoʂkə]"}, {"audio": "Ru-кошка.ogg", "text": "Audio", "ogg_url": "https://upload.wikimedia.org/wikipedia/commons/f/f3/Ru-%D0%BA%D0%BE%D1%88%D0%BA%D0%B0.ogg", "mp3_url": "https://upload.wikimedia.org/wikipedia/commons/transcoded/f/f3/Ru-%D0%BA%D0%BE%D1%88%D0%BA%D0%B0.ogg/Ru-%D0%BA%D0%BE%D1%88%D0%BA%D0%B0.ogg.mp3"}], "word": "кошка", "lang": "Russian", "lang_code": "ru", "senses": [{"raw_glosses": ["cat"], "examples": [{"text": "жить как ко́шка с соба́кой", "english": "to lead a cat-and-dog life", "type": "example", "roman": "žitʹ kak kóška s sobákoj"}, {"text": "игра́ть в ко́шки-мышки", "english": "play cat-and-mouse", "type": "example", "roman": "igrátʹ v kóški-myški"}, {"text": "но́чью все ко́шки се́ры", "english": "at night all cats are gray", "type": "example", "roman": "nóčʹju vse kóški séry"}, {"text": "у него́ ко́шки скребу́т на се́рдце", "english": "he is sick at heart (very upset)", "type": "example", "roman": "u nevó kóški skrebút na sérdce"}], "glosses": ["cat"], "id": "кошка-ru-noun-d693i1Gr", "categories": [{"name": "Cats", "kind": "lifeform", "parents": ["Felids", "List of sets", "Carnivores", "All sets", "Mammals", "Fundamental", 
"Vertebrates", "Chordates", "Animals", "Lifeforms", "Nature", "All topics"], "source": "w+disamb", "orig": "ru:Cats", "langcode": "ru", "_dis": "49 8 9 7 9 9 9"}]}, {"raw_glosses": ["(inanimate) cat-o'-nine-tails"], "tags": ["inanimate"], "glosses": ["cat-o'-nine-tails"], "id": "кошка-ru-noun-SnCks4De", "categories": [{"name": "Russian nouns with accent pattern a", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "4 15 17 17 17 17 13"}, {"name": "Russian nouns with multiple animacies", "kind": "other", "parents": ["Nouns with multiple animacies", "Nouns", "Lemmas"], "source": "w+disamb", "_dis": "7 16 14 13 13 20 17"}, {"name": "Russian nouns with reducible stem", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "7 18 14 13 13 21 15"}]}, {"raw_glosses": ["(technical, inanimate) grapnel, drag"], "topics": ["engineering", "natural-sciences", "physical-sciences", "technical"], "tags": ["inanimate"], "glosses": ["grapnel, drag"], "id": "кошка-ru-noun-hC.xx8kl", "categories": [{"name": "Russian nouns with accent pattern a", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "4 15 17 17 17 17 13"}, {"name": "Russian nouns with multiple animacies", "kind": "other", "parents": ["Nouns with multiple animacies", "Nouns", "Lemmas"], "source": "w+disamb", "_dis": "7 16 14 13 13 20 17"}, {"name": "Russian nouns with reducible stem", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "7 18 14 13 13 21 15"}]}, {"raw_glosses": ["(technical, inanimate) grapple fork"], "topics": ["engineering", "natural-sciences", "physical-sciences", "technical"], "tags": ["inanimate"], "glosses": ["grapple fork"], "id": "кошка-ru-noun-XIx0azoz", "categories": [{"name": "Russian nouns with accent pattern a", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "4 15 17 17 17 17 13"}, {"name": "Russian nouns with multiple animacies", "kind": "other", "parents": ["Nouns with multiple animacies", "Nouns", "Lemmas"], "source": "w+disamb", "_dis": 
"7 16 14 13 13 20 17"}, {"name": "Russian nouns with reducible stem", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "7 18 14 13 13 21 15"}]}, {"raw_glosses": ["(cranes, inanimate) car, trolley, carriage"], "topics": ["cranes", "engineering", "natural-sciences", "physical-sciences", "tools"], "tags": ["inanimate"], "glosses": ["car, trolley, carriage"], "id": "кошка-ru-noun-p2aeDHoL", "categories": [{"name": "Russian nouns with accent pattern a", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "4 15 17 17 17 17 13"}, {"name": "Russian nouns with multiple animacies", "kind": "other", "parents": ["Nouns with multiple animacies", "Nouns", "Lemmas"], "source": "w+disamb", "_dis": "7 16 14 13 13 20 17"}, {"name": "Russian nouns with reducible stem", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "7 18 14 13 13 21 15"}]}, {"raw_glosses": ["(geology, inanimate) spit, bar"], "topics": ["geography", "geology", "natural-sciences"], "tags": ["inanimate"], "glosses": ["spit, bar"], "id": "кошка-ru-noun-qoyfZb02", "categories": [{"name": "Geology", "kind": "topical", "parents": ["Earth sciences", "Sciences", "All topics", "Fundamental"], "source": "w", "orig": "ru:Geology", "langcode": "ru"}, {"name": "Russian nouns with accent pattern a", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "4 15 17 17 17 17 13"}, {"name": "Russian nouns with multiple animacies", "kind": "other", "parents": ["Nouns with multiple animacies", "Nouns", "Lemmas"], "source": "w+disamb", "_dis": "7 16 14 13 13 20 17"}, {"name": "Russian nouns with reducible stem", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "7 18 14 13 13 21 15"}]}, {"raw_glosses": ["(climbing, in the plural, inanimate) crampons, climbing irons, climbing grapplers"], "topics": ["climbing", "hobbies", "lifestyle", "sports"], "tags": ["in-plural", "inanimate"], "glosses": ["crampons, climbing irons, climbing grapplers"], "id": "кошка-ru-noun-gSMwNFin", 
"categories": [{"name": "Climbing", "kind": "topical", "parents": ["Sports", "Human activity", "Human behaviour", "Human", "All topics", "Fundamental"], "source": "w", "orig": "ru:Climbing", "langcode": "ru"}, {"name": "Footwear", "kind": "topical", "parents": ["Clothing", "Human", "All topics", "Fundamental"], "source": "w", "orig": "ru:Footwear", "langcode": "ru"}, {"name": "Russian nouns with accent pattern a", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "4 15 17 17 17 17 13"}, {"name": "Russian nouns with multiple animacies", "kind": "other", "parents": ["Nouns with multiple animacies", "Nouns", "Lemmas"], "source": "w+disamb", "_dis": "7 16 14 13 13 20 17"}, {"name": "Russian nouns with reducible stem", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "7 18 14 13 13 21 15"}]}]}
{"pos": "noun", "forms": [{"form": "соба́ка", "tags": ["animate", "canonical", "feminine"]}, {"form": "sobáka", "tags": ["romanization"]}, {"form": "соба́ки", "tags": ["genitive"]}, {"form": "соба́ки", "tags": ["nominative", "plural"]}, {"form": "соба́к", "tags": ["genitive", "plural"]}, {"form": "соба́чий", "tags": ["adjective"]}, {"form": "соба́чка", "tags": ["diminutive"]}, {"form": "", "source": "Declension", "tags": ["table-tags"]}, {"form": "velar-stem", "source": "Declension", "tags": ["class"]}, {"form": "accent-a", "source": "Declension", "tags": ["class"]}, {"form": "соба́ка", "tags": ["nominative", "singular"], "source": "Declension", "roman": "sobáka"}, {"form": "соба́ки", "tags": ["nominative", "plural"], "source": "Declension", "roman": "sobáki"}, {"form": "соба́ки", "tags": ["genitive", "singular"], "source": "Declension", "roman": "sobáki"}, {"form": "соба́к", "tags": ["genitive", "plural"], "source": "Declension", "roman": "sobák"}, {"form": "соба́ке", "tags": ["dative", "singular"], "source": "Declension", "roman": "sobáke"}, {"form": "соба́кам", "tags": ["dative", "plural"], "source": "Declension", "roman": "sobákam"}, {"form": "соба́ку", "tags": ["accusative", "singular"], "source": "Declension", "roman": "sobáku"}, {"form": "соба́к", "tags": ["accusative", "plural"], "source": "Declension", "roman": "sobák"}, {"form": "соба́кой", "tags": ["instrumental", "singular"], "source": "Declension", "roman": "sobákoj"}, {"form": "соба́кою", "tags": ["instrumental", "singular"], "source": "Declension", "roman": "sobákoju"}, {"form": "соба́ками", "tags": ["instrumental", "plural"], "source": "Declension", "roman": "sobákami"}, {"form": "соба́ке", "tags": ["prepositional", "singular"], "source": "Declension", "roman": "sobáke"}, {"form": "соба́ках", "tags": ["plural", "prepositional"], "source": "Declension", "roman": "sobákax"}, {"form": "", "source": "Declension", "tags": ["table-tags"]}, {"form": "velar-stem", "source": "Declension", "tags": ["class"]}, 
{"form": "accent-a", "source": "Declension", "tags": ["class"]}, {"form": "соба́ка", "tags": ["dated", "nominative", "singular"], "source": "Declension", "roman": "sobáka"}, {"form": "соба́ки", "tags": ["dated", "nominative", "plural"], "source": "Declension", "roman": "sobáki"}, {"form": "соба́ки", "tags": ["dated", "genitive", "singular"], "source": "Declension", "roman": "sobáki"}, {"form": "соба́къ", "tags": ["dated", "genitive", "plural"], "source": "Declension", "roman": "sobák"}, {"form": "соба́кѣ", "tags": ["dated", "dative", "singular"], "source": "Declension", "roman": "sobákě"}, {"form": "соба́камъ", "tags": ["dated", "dative", "plural"], "source": "Declension", "roman": "sobákam"}, {"form": "соба́ку", "tags": ["accusative", "dated", "singular"], "source": "Declension", "roman": "sobáku"}, {"form": "соба́къ", "tags": ["accusative", "dated", "plural"], "source": "Declension", "roman": "sobák"}, {"form": "соба́кой", "tags": ["dated", "instrumental", "singular"], "source": "Declension", "roman": "sobákoj"}, {"form": "соба́кою", "tags": ["dated", "instrumental", "singular"], "source": "Declension", "roman": "sobákoju"}, {"form": "соба́ками", "tags": ["dated", "instrumental", "plural"], "source": "Declension", "roman": "sobákami"}, {"form": "соба́кѣ", "tags": ["dated", "prepositional", "singular"], "source": "Declension", "roman": "sobákě"}, {"form": "соба́кахъ", "tags": ["dated", "plural", "prepositional"], "source": "Declension", "roman": "sobákax"}], "etymology_text": "Inherited from Old East Slavic собака (sobaka), derived from Middle Iranian *sabāka-, from West Iranian *spaka, from Proto-Iranian *cwā́; compare Zoroastrian Dari [script needed] (sabah), Old Median σπάκα (spā́kəʰ) [the source of Old Armenian ասպակ (aspak, “dog”)], Avestan 𐬯𐬞𐬀𐬐𐬀 (spaka, “dog-like”). 
Cognates include Ukrainian соба́ка (sobáka), Belarusian саба́ка (sabáka), Polish sobaka (dialectal), Kashubian sobaka (“bitch (female dog); dissolute man”), Sanskrit शुनक (śunaka).", "etymology_templates": [{"name": "glossary", "args": {"1": "Inherited"}, "expansion": "Inherited"}, {"name": "inh", "args": {"1": "ru", "2": "orv", "3": "собака", "4": "", "5": "", "lit": "", "pos": "", "tr": "", "ts": "", "id": "", "sc": "", "g": "", "g2": "", "g3": "", "nocat": "", "sort": ""}, "expansion": "Old East Slavic собака (sobaka)"}, {"name": "inh+", "args": {"1": "ru", "2": "orv", "3": "собака"}, "expansion": "Inherited from Old East Slavic собака (sobaka)"}, {"name": "der", "args": {"1": "ru", "2": "MIr."}, "expansion": "Middle Iranian"}, {"name": "m", "args": {"1": "und", "2": "*sabāka-"}, "expansion": "*sabāka-"}, {"name": "der", "args": {"1": "ru", "2": "ira-pro", "3": "*cwā́"}, "expansion": "Proto-Iranian *cwā́"}, {"name": "noncog", "args": {"1": "gbz", "tr": "sabah"}, "expansion": "Zoroastrian Dari [script needed] (sabah)"}, {"name": "noncog", "args": {"1": "xme-old", "2": "σπάκα", "tr": "spā́kəʰ"}, "expansion": "Old Median σπάκα (spā́kəʰ)"}, {"name": "noncog", "args": {"1": "xcl", "2": "ասպակ", "3": "", "4": "dog"}, "expansion": "Old Armenian ասպակ (aspak, “dog”)"}, {"name": "cog", "args": {"1": "ae", "2": "𐬯𐬞𐬀𐬐𐬀", "t": "dog-like"}, "expansion": "Avestan 𐬯𐬞𐬀𐬐𐬀 (spaka, “dog-like”)"}, {"name": "cog", "args": {"1": "uk", "2": "соба́ка"}, "expansion": "Ukrainian соба́ка (sobáka)"}, {"name": "cog", "args": {"1": "be", "2": "саба́ка"}, "expansion": "Belarusian саба́ка (sabáka)"}, {"name": "cog", "args": {"1": "pl", "2": "sobaka"}, "expansion": "Polish sobaka"}, {"name": "i", "args": {"1": "dialectal"}, "expansion": "(dialectal)"}, {"name": "cog", "args": {"1": "csb", "2": "sobaka", "3": "", "4": "bitch (female dog); dissolute man"}, "expansion": "Kashubian sobaka (“bitch (female dog); dissolute man”)"}, {"name": "cog", "args": {"1": "sa", "2": "शुनक", "tr": "śunaka"}, 
"expansion": "Sanskrit शुनक (śunaka)"}], "sounds": [{"ipa": "[sɐˈbakə]"}, {"audio": "Ru-собака.ogg", "text": "Audio", "ogg_url": "https://upload.wikimedia.org/wikipedia/commons/7/76/Ru-%D1%81%D0%BE%D0%B1%D0%B0%D0%BA%D0%B0.ogg", "mp3_url": "https://upload.wikimedia.org/wikipedia/commons/transcoded/7/76/Ru-%D1%81%D0%BE%D0%B1%D0%B0%D0%BA%D0%B0.ogg/Ru-%D1%81%D0%BE%D0%B1%D0%B0%D0%BA%D0%B0.ogg.mp3"}], "word": "собака", "lang": "Russian", "lang_code": "ru", "wikipedia": ["ru:собака"], "senses": [{"raw_glosses": ["dog"], "examples": [{"text": "сторожева́я соба́ка", "english": "watchdog", "type": "example", "roman": "storoževája sobáka"}, {"text": "дворо́вая соба́ка", "english": "cur, mongrel, mutt", "type": "example", "roman": "dvoróvaja sobáka"}, {"text": "Вот где соба́ка зарыта!", "english": "Now I see it!", "type": "example", "roman": "Vot gde sobáka zaryta!"}, {"text": "Его́ ка́ждая соба́ка зна́ет", "english": "Everyone knows him. (literally, “Every dog knows him”)", "type": "example", "roman": "Jevó káždaja sobáka znájet"}, {"text": "голо́дный как соба́ка ― golódnyj kak sobáka ― as hungry as a dog; wolfish, rapacious", "type": "example"}, {"text": "замёрзнуть как соба́ка", "english": "to be chilled to the marrow", "type": "example", "roman": "zamjórznutʹ kak sobáka"}, {"text": "злой как соба́ка", "english": "mad as hell (literally, “vicious as a dog”)", "type": "example", "roman": "zloj kak sobáka"}, {"text": "ну́жный как соба́ке пя́тая нога́", "english": "needed like a hole in the head (literally, “needed like a dog needs a fifth leg”)", "type": "example", "roman": "núžnyj kak sobáke pjátaja nogá"}, {"text": "соба́ка на се́не ― sobáka na séne ― dog in the manger", "type": "example"}, {"text": "соба́ку съесть", "english": "to know something inside out", "type": "example", "roman": "sobáku sʺjestʹ"}, {"text": "уста́ть как соба́ка", "english": "to be dog-tired", "type": "example", "roman": "ustátʹ kak sobáka"}], "synonyms": [{"word": "пёс"}], "glosses": ["dog"], "id": 
"собака-ru-noun-zWNX792W", "categories": [{"name": "Dogs", "kind": "lifeform", "parents": ["Canids", "List of sets", "Carnivores", "All sets", "Mammals", "Fundamental", "Vertebrates", "Chordates", "Animals", "Lifeforms", "Nature", "All topics"], "source": "w+disamb", "orig": "ru:Dogs", "langcode": "ru", "_dis": "43 27 7 10 3 10"}]}, {"raw_glosses": ["hound"], "examples": [{"text": "соба́ка-ище́йка ― sobáka-iščéjka ― bloodhound", "type": "example"}], "glosses": ["hound"], "id": "собака-ru-noun-WPwH6ZXh", "categories": []}, {"raw_glosses": ["(derogatory, figuratively) mongrel, cur, bastard (a detestable person)"], "tags": ["derogatory", "figuratively"], "glosses": ["mongrel, cur, bastard (a detestable person)"], "id": "собака-ru-noun-eQXjec05", "categories": []}, {"raw_glosses": ["(colloquial, figuratively) fox (a clever, capable person)"], "tags": ["colloquial", "figuratively"], "glosses": ["fox (a clever, capable person)"], "id": "собака-ru-noun-U2e.K2bV", "categories": []}, {"raw_glosses": ["(Internet) @ (at sign)"], "tags": ["Internet"], "glosses": ["@ (at sign)"], "id": "собака-ru-noun--nZeLGPv", "categories": [{"name": "Internet", "kind": "topical", "parents": ["Computing", "Networking", "Technology", "All topics", "Fundamental"], "source": "w", "orig": "ru:Internet", "langcode": "ru"}]}, {"raw_glosses": ["(computing slang) watchdog timer"], "topics": ["computing", "engineering", "mathematics", "natural-sciences", "physical-sciences", "sciences"], "tags": ["slang"], "glosses": ["watchdog timer"], "id": "собака-ru-noun-yY4oKkJD", "categories": [{"name": "Computing", "kind": "topical", "parents": ["Technology", "All topics", "Fundamental"], "source": "w", "orig": "ru:Computing", "langcode": "ru"}, {"name": "Russian nouns with accent pattern a", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "5 5 20 22 7 40"}]}]}
-{"pos": "noun", "word": "FAIL", "lang": "Russian", "lang_code": "ru", "senses": [{"raw_glosses": ["Dummy definition"]}]}
diff --git a/tests/test_importer.py b/tests/test_importer.py
index 023c90a..9d0bbd3 100644
--- a/tests/test_importer.py
+++ b/tests/test_importer.py
@@ -1,64 +1,28 @@
from __future__ import annotations
-import builtins
import tempfile
from pathlib import Path
-from typing import Any, Callable
-from unittest.mock import patch
from src.fetcher import WiktionaryFetcher
-def mock_open(file: str | Path, *args: Any, **kwargs: Any) -> Any:
- # Make importing artifically fail for a certain file
- if file == "FAIL.json" or getattr(file, "name", "") == "FAIL.json":
- raise Exception("FAIL")
- return builtins.open(file, *args, **kwargs) # pylint: disable=unspecified-encoding
-
-
-DICT_NAME = "dict"
-
-
def test_importing() -> None:
- patcher = patch("src.fetcher.open", side_effect=mock_open)
- patcher.start()
- failed_words = []
-
- def on_error(word: str, exc: Exception) -> None:
- assert str(exc) == "FAIL"
- failed_words.append(word)
-
tests_dir = Path(__file__).parent
with tempfile.TemporaryDirectory() as tmp_dir_s:
tmp_dir = Path(tmp_dir_s)
- count = WiktionaryFetcher.dump_kaikki_dict(
+ count = WiktionaryFetcher.import_kaikki_dict(
tests_dir / "test_dict.json",
- DICT_NAME,
- on_progress=lambda _: True,
- on_error=on_error,
+ "dict",
+ on_progress=lambda *args, **kwargs: None,
+ on_error=lambda *args, **kwargs: None,
base_dir=tmp_dir,
)
assert count == 2
- assert len(failed_words) == 1
- assert failed_words[0] == "FAIL"
- patcher.stop()
- fetcher = WiktionaryFetcher(DICT_NAME, base_dir=tmp_dir)
- assert fetcher.get_gender("кошка") == "feminine"
- assert fetcher.get_senses("кошка")[0] == "cat"
- assert fetcher.get_part_of_speech("кошка") == "noun"
- assert (
- fetcher.get_examples("кошка")[0]
- == "жить как ко́шка с соба́кой / to lead a cat-and-dog life"
- )
- methods: list[Callable[[str], Any]] = [
- fetcher.get_examples,
- fetcher.get_gender,
- fetcher.get_part_of_speech,
- fetcher.get_senses,
- ]
- for method in methods:
- try:
- method("FAIL")
- assert False
- except:
- assert True
+ with WiktionaryFetcher("dict", base_dir=tmp_dir) as fetcher:
+ assert fetcher.get_gender("кошка") == "feminine"
+ assert fetcher.get_senses("кошка")[0] == "cat"
+ assert fetcher.get_part_of_speech("кошка") == "noun"
+ assert (
+ fetcher.get_examples("кошка")[0]
+ == "жить как ко́шка с соба́кой / to lead a cat-and-dog life"
+ )