Skip to content

Commit

Permalink
Polish Wiktionary kaikki.org data use "nb" code for Norwegian Bokmål
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Sep 8, 2024
1 parent 2a4e14a commit 8d9063f
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 16 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
strategy:
fail-fast: false
matrix:
gloss_lang: ['de', 'el', 'en', 'es', 'fi', 'fr', 'he', 'hr', 'it', 'ja', 'lt', 'nl', 'no', 'pl', 'pt', 'ru', 'sv', 'zh']
gloss_lang: ['de', 'el', 'en', 'es', 'fi', 'fr', 'he', 'hr', 'it', 'ja', 'lt', 'nl', 'nb', 'pl', 'pt', 'ru', 'sv', 'zh']
steps:
- uses: actions/checkout@v4

Expand Down
20 changes: 12 additions & 8 deletions src/proficiency/extract_dbnary.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ def download_dbnary_files(gloss_lang: str) -> None:

base_url = "https://kaiko.getalp.org/static/ontolex/latest"
lang_key = gloss_lang
if gloss_lang == "hr":
gloss_lang = "sh"
gloss_lang = convert_lang_code(gloss_lang)
download_dbnary_file(f"{base_url}/{gloss_lang}_dbnary_ontolex.ttl.bz2")
if DBNARY_LANGS[lang_key]["has_exolex"]:
download_dbnary_file(f"{base_url}/{gloss_lang}_dbnary_exolex_ontolex.ttl.bz2")
Expand Down Expand Up @@ -237,8 +236,7 @@ def insert_senses(


def init_oxigraph_store(gloss_lang: str) -> tuple[Store, bool]:
if gloss_lang == "hr":
gloss_lang = "sh"
gloss_lang = convert_lang_code(gloss_lang)
store = Store(f"build/ttl/{gloss_lang}_store")
store.bulk_load(f"build/ttl/{gloss_lang}_dbnary_ontolex.ttl", "text/turtle")
exolex_path = Path(f"build/ttl/{gloss_lang}_dbnary_exolex_ontolex.ttl")
Expand All @@ -256,11 +254,9 @@ def create_lemmas_db_from_dbnary(
store: Store, lemma_lang: str, gloss_lang: str, has_morphology: bool
) -> list[Path]:
db_path = wiktionary_db_path(lemma_lang, gloss_lang)
if lemma_lang == "hr":
lemma_lang = "sh"
if gloss_lang == "hr":
gloss_lang = "sh"
conn = init_db(db_path, lemma_lang, False, False)
lemma_lang = convert_lang_code(lemma_lang)
gloss_lang = convert_lang_code(gloss_lang)
lemma_ids = insert_lemmas(store, conn, lemma_lang)
if has_morphology and lemma_lang == gloss_lang:
insert_forms(store, conn, lemma_lang, lemma_ids)
Expand All @@ -284,6 +280,14 @@ def dbnary_to_kaikki_pos(pos: str) -> str:
return "other"


def convert_lang_code(code: str) -> str:
    """Map a modern language code to the legacy MediaWiki code used by Dbnary.

    Croatian ("hr") becomes Serbo-Croatian ("sh") and Norwegian Bokmål
    ("nb") becomes Norwegian ("no") — both are the deprecated MediaWiki
    codes that the upstream data still uses. Any other code is returned
    unchanged.
    """
    legacy_codes = {
        "hr": "sh",  # Croatian -> Serbo-Croatian, MediaWiki old code
        "nb": "no",  # Norwegian Bokmål -> Norwegian, MediaWiki old code
    }
    return legacy_codes.get(code, code)


if __name__ == "__main__":
gloss_lang = "fr"
store = Store()
Expand Down
2 changes: 0 additions & 2 deletions src/proficiency/extract_kaikki.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,6 @@ def download_kaikki_json(lemma_lang: str, gloss_lang: str) -> None:


def load_data(lemma_lang: str, gloss_lang: str) -> tuple[Path, dict[str, int]]:
if lemma_lang == "hr":
lemma_lang = "sh"
kaikki_json_path = Path(f"build/{lemma_lang}/{lemma_lang}_{gloss_lang}.jsonl")
if gloss_lang in KAIKKI_TRANSLATED_GLOSS_LANGS:
kaikki_json_path = Path(f"build/{lemma_lang}/{lemma_lang}_{lemma_lang}.jsonl")
Expand Down
4 changes: 2 additions & 2 deletions src/proficiency/languages.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"lt", # Lithuanian
"mk", # Macedonian
"nl", # Dutch
"no", # Norwegian Bokmål
"nb", # Norwegian Bokmål
"pl", # Polish
"pt", # Portuguese
"ro", # Romanian
Expand Down Expand Up @@ -67,7 +67,7 @@
"lt": {"has_exolex": False, "has_morphology": False},
"mg": {"has_exolex": True, "has_morphology": False},
"nl": {"has_exolex": False, "has_morphology": False},
"no": {"has_exolex": False, "has_morphology": False},
"nb": {"has_exolex": False, "has_morphology": False},
"pl": {"has_exolex": False, "has_morphology": False},
"pt": {"has_exolex": False, "has_morphology": False},
"ru": {"has_exolex": False, "has_morphology": False},
Expand Down
15 changes: 12 additions & 3 deletions src/proficiency/split_jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@ def split_kaikki_jsonl(
gloss_code = lemma_code
else:
lemma_codes = KAIKKI_LEMMA_LANGS
lemma_codes.remove("hr") # Croatian
# Wiktionary still uses the deprecated language code
lemma_codes.add("sh")

out_file_paths = {
l_code: Path(f"build/{l_code}/{l_code}_{gloss_code}.jsonl")
Expand All @@ -43,7 +40,19 @@ def split_kaikki_jsonl(
elif lang_code == "mul":
for out_f in out_files.values():
out_f.write(line.decode("utf-8"))
else:
new_lang_code = convert_lang_code(lang_code)
if new_lang_code in lemma_codes:
out_files[new_lang_code].write(line.decode("utf-8"))

for out_f in out_files.values():
out_f.close()
logging.info("Split JSONL file completed")


def convert_lang_code(code: str) -> str:
    """Map a legacy Wiktionary language code to its modern equivalent.

    Serbo-Croatian ("sh") maps to Croatian ("hr") and Norwegian ("no")
    maps to Norwegian Bokmål ("nb"). Any other code yields "".

    NOTE(review): the "" fallback means unmapped codes never match the
    caller's ``in lemma_codes`` membership test — presumably those codes
    are handled by an earlier branch in the caller; confirm upstream.
    """
    if code == "sh":  # Serbo-Croatian -> Croatian
        return "hr"
    if code == "no":  # Norwegian -> Norwegian Bokmål
        return "nb"
    return ""

0 comments on commit 8d9063f

Please sign in to comment.