From ee2d3d83062a75437ba648bb001ddf0168571537 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Wed, 7 Aug 2024 22:44:16 +0800
Subject: [PATCH] Fix disk-out-of-space error for Hebrew files

Also extract the kaikki JSONL file to stdout for translated gloss languages.
---
 src/proficiency/extract_kaikki.py | 35 ++++++++++++++---------------------
 src/proficiency/main.py           |  4 ++--
 src/proficiency/split_jsonl.py    | 26 ++++++++++++++++----------
 3 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/src/proficiency/extract_kaikki.py b/src/proficiency/extract_kaikki.py
index 3525605..f5d2c57 100644
--- a/src/proficiency/extract_kaikki.py
+++ b/src/proficiency/extract_kaikki.py
@@ -5,7 +5,7 @@
 from collections import defaultdict
 from dataclasses import dataclass
 from pathlib import Path
-from shutil import copyfileobj, which
+from shutil import which
 from typing import Any
 
 from .database import create_indexes_then_close, init_db, wiktionary_db_path
@@ -43,11 +43,13 @@ class Sense:
     example: str = ""
 
 
-def download_kaikki_json(gloss_lang: str, split_files: bool = True) -> None:
+def download_kaikki_json(lemma_lang: str, gloss_lang: str) -> None:
     from .split_jsonl import split_kaikki_jsonl
 
     url = "https://kaikki.org/"
-    if gloss_lang == "en":
+    if gloss_lang == "en" or (
+        gloss_lang in KAIKKI_TRANSLATED_GLOSS_LANGS and lemma_lang == "en"
+    ):
         url += "dictionary/"
     else:
         url += f"{gloss_lang}wiktionary/"
@@ -67,25 +69,16 @@ def download_kaikki_json(gloss_lang: str, split_files: bool = True) -> None:
         import gzip
 
         with gzip.open(gz_path, "rb") as gz_f:
-            if split_files:
-                split_kaikki_jsonl(gz_f, gloss_lang)
-            else:
-                with open(gz_path.with_suffix(".json"), "w", encoding="utf-8") as f:
-                    copyfileobj(gz_f, f)  # type: ignore
+            split_kaikki_jsonl(gz_f, lemma_lang, gloss_lang)
     else:
-        command_args = ["pigz" if which("pigz") is not None else "gzip", "-d"]
-        if split_files:
-            command_args.append("-c")
+        command_args = ["pigz" if which("pigz") is not None else "gzip", "-d", "-c"]
         command_args.append(str(gz_path))
-        if split_files:
-            sub_p = subprocess.Popen(command_args, stdout=subprocess.PIPE)
-            if sub_p.stdout is not None:
-                with sub_p.stdout as f:
-                    split_kaikki_jsonl(f, gloss_lang)
-            sub_p.wait()
-            gz_path.unlink()
-        else:
-            subprocess.run(command_args, check=True, text=True)
+        sub_p = subprocess.Popen(command_args, stdout=subprocess.PIPE)
+        if sub_p.stdout is not None:
+            with sub_p.stdout as f:
+                split_kaikki_jsonl(f, lemma_lang, gloss_lang)
+        sub_p.wait()
+        gz_path.unlink()
 
 
 def load_data(lemma_lang: str, gloss_lang: str) -> tuple[Path, dict[str, int]]:
@@ -93,7 +86,7 @@ def load_data(lemma_lang: str, gloss_lang: str) -> tuple[Path, dict[str, int]]:
         lemma_lang = "sh"
     kaikki_json_path = Path(f"build/{lemma_lang}/{lemma_lang}_{gloss_lang}.jsonl")
     if gloss_lang in KAIKKI_TRANSLATED_GLOSS_LANGS:
-        kaikki_json_path = Path(f"build/{lemma_lang}.jsonl")
+        kaikki_json_path = Path(f"build/{lemma_lang}/{lemma_lang}_{lemma_lang}.jsonl")
     difficulty_data = load_difficulty_data(lemma_lang)
     return kaikki_json_path, difficulty_data
 
diff --git a/src/proficiency/main.py b/src/proficiency/main.py
index d3eb5e0..868b74f 100644
--- a/src/proficiency/main.py
+++ b/src/proficiency/main.py
@@ -58,7 +58,7 @@ def create_wiktionary_files_from_kaikki(
     lemma_lang: str, gloss_lang: str = "en"
 ) -> None:
     if gloss_lang in KAIKKI_TRANSLATED_GLOSS_LANGS:
-        download_kaikki_json(lemma_lang, False)
+        download_kaikki_json(lemma_lang, gloss_lang)
     for db_path in create_lemmas_db_from_kaikki(lemma_lang, gloss_lang):
         compress(db_path)
 
@@ -138,7 +138,7 @@
         logging.info("Creating Wiktionary files")
         if args.gloss_lang in KAIKKI_GLOSS_LANGS | KAIKKI_TRANSLATED_GLOSS_LANGS.keys():
             if args.gloss_lang in KAIKKI_GLOSS_LANGS:
-                download_kaikki_json(args.gloss_lang)
+                download_kaikki_json("", args.gloss_lang)
             for _ in executor.map(
                 partial(
                     create_wiktionary_files_from_kaikki, gloss_lang=args.gloss_lang
diff --git a/src/proficiency/split_jsonl.py b/src/proficiency/split_jsonl.py
index 241374b..428c3ae 100644
--- a/src/proficiency/split_jsonl.py
+++ b/src/proficiency/split_jsonl.py
@@ -5,27 +5,33 @@
 from typing import IO
 
 
-def split_kaikki_jsonl(jsonl_f: IO[bytes] | GzipFile, gloss_code: str) -> None:
+def split_kaikki_jsonl(
+    jsonl_f: IO[bytes] | GzipFile, lemma_code: str, gloss_code: str
+) -> None:
     """
     Split extracted jsonl file created by wiktextract to each language file.
     """
-    from .languages import KAIKKI_LEMMA_LANGS
+    from .languages import KAIKKI_LEMMA_LANGS, KAIKKI_TRANSLATED_GLOSS_LANGS
 
     logging.info("Start splitting JSONL file")
-    lemma_codes = KAIKKI_LEMMA_LANGS
-    lemma_codes.remove("hr")  # Croatian
-    # Wiktionary still uses the deprecated language code
-    lemma_codes.add("sh")
+    if gloss_code in KAIKKI_TRANSLATED_GLOSS_LANGS:
+        lemma_codes = {lemma_code}
+        gloss_code = lemma_code
+    else:
+        lemma_codes = KAIKKI_LEMMA_LANGS
+        lemma_codes.remove("hr")  # Croatian
+        # Wiktionary still uses the deprecated language code
+        lemma_codes.add("sh")
     out_file_paths = {
-        lemma_code: Path(f"build/{lemma_code}/{lemma_code}_{gloss_code}.jsonl")
-        for lemma_code in lemma_codes
+        l_code: Path(f"build/{l_code}/{l_code}_{gloss_code}.jsonl")
+        for l_code in lemma_codes
     }
     for out_file_path in out_file_paths.values():
         out_file_path.parent.mkdir(parents=True, exist_ok=True)
     out_files = {
-        lemma_code: out_file_path.open("w", encoding="utf-8")
-        for lemma_code, out_file_path in zip(lemma_codes, out_file_paths.values())
+        l_code: out_file_path.open("w", encoding="utf-8")
+        for l_code, out_file_path in zip(lemma_codes, out_file_paths.values())
     }
     for line in iter(jsonl_f.readline, b""):
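
Note on the fix: before this patch, for gloss languages in KAIKKI_TRANSLATED_GLOSS_LANGS the dump was fully decompressed to a .json file on disk (copyfileobj, or gzip -d without -c), which is presumably what ran out of disk space for the large Hebrew files; afterwards the pigz/gzip output is always piped to stdout and consumed by split_kaikki_jsonl(), so only the per-language build/{lang}/{lang}_{gloss}.jsonl files are written. Below is a minimal sketch of that streaming pattern, assuming only that pigz or gzip is on PATH; the helper names decompress_and_split() and handle_line() and the output path are illustrative stand-ins, not code from this repository.

    import subprocess
    from pathlib import Path
    from shutil import which
    from typing import IO


    def handle_line(line: bytes, out_f: IO[bytes]) -> None:
        # Stand-in for the per-line routing done by split_kaikki_jsonl():
        # each JSONL line goes straight to a per-language output file, so the
        # whole decompressed dump never exists on disk at once.
        out_f.write(line)


    def decompress_and_split(gz_path: Path, out_path: Path) -> None:
        # Decompress to a pipe ("-d -c") and consume the stream line by line,
        # mirroring the patched download_kaikki_json() flow.
        command_args = ["pigz" if which("pigz") is not None else "gzip", "-d", "-c"]
        command_args.append(str(gz_path))
        sub_p = subprocess.Popen(command_args, stdout=subprocess.PIPE)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        if sub_p.stdout is not None:
            with sub_p.stdout as pipe, out_path.open("wb") as out_f:
                for line in iter(pipe.readline, b""):
                    handle_line(line, out_f)
        sub_p.wait()

Keeping the decompression on a pipe bounds peak disk usage by the size of the split outputs rather than by the size of the full decompressed dump.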