Skip to content

Commit

Permalink
Polish Wiktionary kaikki.org data use "nb" code for Norwegian Bokmål
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Sep 8, 2024
1 parent 2a4e14a commit 8d9063f
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 16 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
strategy:
fail-fast: false
matrix:
gloss_lang: ['de', 'el', 'en', 'es', 'fi', 'fr', 'he', 'hr', 'it', 'ja', 'lt', 'nl', 'no', 'pl', 'pt', 'ru', 'sv', 'zh']
gloss_lang: ['de', 'el', 'en', 'es', 'fi', 'fr', 'he', 'hr', 'it', 'ja', 'lt', 'nl', 'nb', 'pl', 'pt', 'ru', 'sv', 'zh']
steps:
- uses: actions/checkout@v4

Expand Down
20 changes: 12 additions & 8 deletions src/proficiency/extract_dbnary.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ def download_dbnary_files(gloss_lang: str) -> None:

base_url = "https://kaiko.getalp.org/static/ontolex/latest"
lang_key = gloss_lang
if gloss_lang == "hr":
gloss_lang = "sh"
gloss_lang = convert_lang_code(gloss_lang)
download_dbnary_file(f"{base_url}/{gloss_lang}_dbnary_ontolex.ttl.bz2")
if DBNARY_LANGS[lang_key]["has_exolex"]:
download_dbnary_file(f"{base_url}/{gloss_lang}_dbnary_exolex_ontolex.ttl.bz2")
Expand Down Expand Up @@ -237,8 +236,7 @@ def insert_senses(


def init_oxigraph_store(gloss_lang: str) -> tuple[Store, bool]:
if gloss_lang == "hr":
gloss_lang = "sh"
gloss_lang = convert_lang_code(gloss_lang)
store = Store(f"build/ttl/{gloss_lang}_store")
store.bulk_load(f"build/ttl/{gloss_lang}_dbnary_ontolex.ttl", "text/turtle")
exolex_path = Path(f"build/ttl/{gloss_lang}_dbnary_exolex_ontolex.ttl")
Expand All @@ -256,11 +254,9 @@ def create_lemmas_db_from_dbnary(
store: Store, lemma_lang: str, gloss_lang: str, has_morphology: bool
) -> list[Path]:
db_path = wiktionary_db_path(lemma_lang, gloss_lang)
if lemma_lang == "hr":
lemma_lang = "sh"
if gloss_lang == "hr":
gloss_lang = "sh"
conn = init_db(db_path, lemma_lang, False, False)
lemma_lang = convert_lang_code(lemma_lang)
gloss_lang = convert_lang_code(gloss_lang)
lemma_ids = insert_lemmas(store, conn, lemma_lang)
if has_morphology and lemma_lang == gloss_lang:
insert_forms(store, conn, lemma_lang, lemma_ids)
Expand All @@ -284,6 +280,14 @@ def dbnary_to_kaikki_pos(pos: str) -> str:
return "other"


def convert_lang_code(code: str) -> str:
    """Map a modern language code to the legacy MediaWiki code used by Dbnary.

    Croatian ("hr") becomes Serbo-Croatian ("sh") and Norwegian Bokmål
    ("nb") becomes Norwegian ("no") — both are the deprecated MediaWiki
    codes that the upstream data still uses. Any other code is returned
    unchanged.
    """
    legacy_codes = {
        "hr": "sh",  # Croatian -> Serbo-Croatian, MediaWiki old code
        "nb": "no",  # Norwegian Bokmål -> Norwegian, MediaWiki old code
    }
    return legacy_codes.get(code, code)


if __name__ == "__main__":
gloss_lang = "fr"
store = Store()
Expand Down
2 changes: 0 additions & 2 deletions src/proficiency/extract_kaikki.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,6 @@ def download_kaikki_json(lemma_lang: str, gloss_lang: str) -> None:


def load_data(lemma_lang: str, gloss_lang: str) -> tuple[Path, dict[str, int]]:
if lemma_lang == "hr":
lemma_lang = "sh"
kaikki_json_path = Path(f"build/{lemma_lang}/{lemma_lang}_{gloss_lang}.jsonl")
if gloss_lang in KAIKKI_TRANSLATED_GLOSS_LANGS:
kaikki_json_path = Path(f"build/{lemma_lang}/{lemma_lang}_{lemma_lang}.jsonl")
Expand Down
4 changes: 2 additions & 2 deletions src/proficiency/languages.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"lt", # Lithuanian
"mk", # Macedonian
"nl", # Dutch
"no", # Norwegian Bokmål
"nb", # Norwegian Bokmål
"pl", # Polish
"pt", # Portuguese
"ro", # Romanian
Expand Down Expand Up @@ -67,7 +67,7 @@
"lt": {"has_exolex": False, "has_morphology": False},
"mg": {"has_exolex": True, "has_morphology": False},
"nl": {"has_exolex": False, "has_morphology": False},
"no": {"has_exolex": False, "has_morphology": False},
"nb": {"has_exolex": False, "has_morphology": False},
"pl": {"has_exolex": False, "has_morphology": False},
"pt": {"has_exolex": False, "has_morphology": False},
"ru": {"has_exolex": False, "has_morphology": False},
Expand Down
15 changes: 12 additions & 3 deletions src/proficiency/split_jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@ def split_kaikki_jsonl(
gloss_code = lemma_code
else:
lemma_codes = KAIKKI_LEMMA_LANGS
lemma_codes.remove("hr") # Croatian
# Wiktionary still uses the deprecated language code
lemma_codes.add("sh")

out_file_paths = {
l_code: Path(f"build/{l_code}/{l_code}_{gloss_code}.jsonl")
Expand All @@ -43,7 +40,19 @@ def split_kaikki_jsonl(
elif lang_code == "mul":
for out_f in out_files.values():
out_f.write(line.decode("utf-8"))
else:
new_lang_code = convert_lang_code(lang_code)
if new_lang_code in lemma_codes:
out_files[new_lang_code].write(line.decode("utf-8"))

for out_f in out_files.values():
out_f.close()
logging.info("Split JSONL file completed")


def convert_lang_code(code: str) -> str:
    """Map a legacy Wiktionary language code to its modern equivalent.

    Serbo-Croatian ("sh") maps to Croatian ("hr") and Norwegian ("no")
    maps to Norwegian Bokmål ("nb"). Any other code yields "".

    NOTE(review): the "" fallback means unmapped codes never match the
    caller's ``in lemma_codes`` membership test — presumably those codes
    are handled by an earlier branch in the caller; confirm upstream.
    """
    if code == "sh":  # Serbo-Croatian -> Croatian
        return "hr"
    if code == "no":  # Norwegian -> Norwegian Bokmål
        return "nb"
    return ""

0 comments on commit 8d9063f

Please sign in to comment.