Fix disk out-of-space error for Hebrew files
Also extract the kaikki JSONL file to stdout for translated files.
xxyzz committed Aug 7, 2024
1 parent 8bdf68f commit ee2d3d8
Showing 3 changed files with 32 additions and 33 deletions.
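
The disk-space fix comes down to streaming decompression: instead of optionally writing the full decompressed JSON to disk before splitting, the dump is now always decompressed to stdout and the splitter consumes the pipe line by line. A minimal sketch of the pattern used in extract_kaikki.py below (the stream_decompress wrapper and consume callback are illustrative names, not part of this repository):

import subprocess
from shutil import which

def stream_decompress(gz_path: str, consume) -> None:
    # Prefer pigz (parallel gzip) when installed; "-d -c" decompresses to
    # stdout, so the full JSON never lands on disk.
    args = ["pigz" if which("pigz") is not None else "gzip", "-d", "-c", gz_path]
    sub_p = subprocess.Popen(args, stdout=subprocess.PIPE)
    if sub_p.stdout is not None:
        with sub_p.stdout as f:
            consume(f)  # e.g. split_kaikki_jsonl reading the pipe line by line
    sub_p.wait()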
35 changes: 14 additions & 21 deletions src/proficiency/extract_kaikki.py
@@ -5,7 +5,7 @@
 from collections import defaultdict
 from dataclasses import dataclass
 from pathlib import Path
-from shutil import copyfileobj, which
+from shutil import which
 from typing import Any

 from .database import create_indexes_then_close, init_db, wiktionary_db_path
@@ -43,11 +43,13 @@ class Sense:
     example: str = ""


-def download_kaikki_json(gloss_lang: str, split_files: bool = True) -> None:
+def download_kaikki_json(lemma_lang: str, gloss_lang: str) -> None:
     from .split_jsonl import split_kaikki_jsonl

     url = "https://kaikki.org/"
-    if gloss_lang == "en":
+    if gloss_lang == "en" or (
+        gloss_lang in KAIKKI_TRANSLATED_GLOSS_LANGS and lemma_lang == "en"
+    ):
         url += "dictionary/"
     else:
         url += f"{gloss_lang}wiktionary/"
@@ -67,33 +69,24 @@ def download_kaikki_json(gloss_lang: str, split_files: bool = True) -> None:
         import gzip

         with gzip.open(gz_path, "rb") as gz_f:
-            if split_files:
-                split_kaikki_jsonl(gz_f, gloss_lang)
-            else:
-                with open(gz_path.with_suffix(".json"), "w", encoding="utf-8") as f:
-                    copyfileobj(gz_f, f)  # type: ignore
+            split_kaikki_jsonl(gz_f, lemma_lang, gloss_lang)
     else:
-        command_args = ["pigz" if which("pigz") is not None else "gzip", "-d"]
-        if split_files:
-            command_args.append("-c")
+        command_args = ["pigz" if which("pigz") is not None else "gzip", "-d", "-c"]
         command_args.append(str(gz_path))
-        if split_files:
-            sub_p = subprocess.Popen(command_args, stdout=subprocess.PIPE)
-            if sub_p.stdout is not None:
-                with sub_p.stdout as f:
-                    split_kaikki_jsonl(f, gloss_lang)
-            sub_p.wait()
-            gz_path.unlink()
-        else:
-            subprocess.run(command_args, check=True, text=True)
+        sub_p = subprocess.Popen(command_args, stdout=subprocess.PIPE)
+        if sub_p.stdout is not None:
+            with sub_p.stdout as f:
+                split_kaikki_jsonl(f, lemma_lang, gloss_lang)
+        sub_p.wait()
+        gz_path.unlink()


 def load_data(lemma_lang: str, gloss_lang: str) -> tuple[Path, dict[str, int]]:
     if lemma_lang == "hr":
         lemma_lang = "sh"
     kaikki_json_path = Path(f"build/{lemma_lang}/{lemma_lang}_{gloss_lang}.jsonl")
     if gloss_lang in KAIKKI_TRANSLATED_GLOSS_LANGS:
-        kaikki_json_path = Path(f"build/{lemma_lang}.jsonl")
+        kaikki_json_path = Path(f"build/{lemma_lang}/{lemma_lang}_{lemma_lang}.jsonl")

     difficulty_data = load_difficulty_data(lemma_lang)
     return kaikki_json_path, difficulty_data
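
With the new signature both languages are explicit, and for translated gloss languages the main English dump is selected when the lemma language is English. A hedged usage sketch (treating Hebrew "he" as one of KAIKKI_TRANSLATED_GLOSS_LANGS, which the commit title suggests but this diff does not show):

# English gloss data: the lemma language does not affect URL selection.
download_kaikki_json("", "en")
# Translated gloss language with English lemmas: the clause
# "gloss_lang in KAIKKI_TRANSLATED_GLOSS_LANGS and lemma_lang == 'en'"
# also routes to the main kaikki.org "dictionary/" dump.
download_kaikki_json("en", "he")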
4 changes: 2 additions & 2 deletions src/proficiency/main.py
@@ -58,7 +58,7 @@ def create_wiktionary_files_from_kaikki(
     lemma_lang: str, gloss_lang: str = "en"
 ) -> None:
     if gloss_lang in KAIKKI_TRANSLATED_GLOSS_LANGS:
-        download_kaikki_json(lemma_lang, False)
+        download_kaikki_json(lemma_lang, gloss_lang)

     for db_path in create_lemmas_db_from_kaikki(lemma_lang, gloss_lang):
         compress(db_path)
@@ -138,7 +138,7 @@ def main() -> None:
         logging.info("Creating Wiktionary files")
         if args.gloss_lang in KAIKKI_GLOSS_LANGS | KAIKKI_TRANSLATED_GLOSS_LANGS.keys():
             if args.gloss_lang in KAIKKI_GLOSS_LANGS:
-                download_kaikki_json(args.gloss_lang)
+                download_kaikki_json("", args.gloss_lang)
             for _ in executor.map(
                 partial(
                     create_wiktionary_files_from_kaikki, gloss_lang=args.gloss_lang
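
A note on the placeholder argument: for gloss languages that have their own full dump, lemma_lang only feeds the URL check inside download_kaikki_json, so passing "" is safe. Roughly:

# "" is never "en", so the translated-gloss clause is False and the
# else branch builds f"{gloss_lang}wiktionary/", the same URL as before.
download_kaikki_json("", args.gloss_lang)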
26 changes: 16 additions & 10 deletions src/proficiency/split_jsonl.py
@@ -5,27 +5,33 @@
 from typing import IO


-def split_kaikki_jsonl(jsonl_f: IO[bytes] | GzipFile, gloss_code: str) -> None:
+def split_kaikki_jsonl(
+    jsonl_f: IO[bytes] | GzipFile, lemma_code: str, gloss_code: str
+) -> None:
     """
     Split extracted jsonl file created by wiktextract to each language file.
     """
-    from .languages import KAIKKI_LEMMA_LANGS
+    from .languages import KAIKKI_LEMMA_LANGS, KAIKKI_TRANSLATED_GLOSS_LANGS

     logging.info("Start splitting JSONL file")
-    lemma_codes = KAIKKI_LEMMA_LANGS
-    lemma_codes.remove("hr")  # Croatian
-    # Wiktionary still uses the deprecated language code
-    lemma_codes.add("sh")
+    if gloss_code in KAIKKI_TRANSLATED_GLOSS_LANGS:
+        lemma_codes = {lemma_code}
+        gloss_code = lemma_code
+    else:
+        lemma_codes = KAIKKI_LEMMA_LANGS
+        lemma_codes.remove("hr")  # Croatian
+        # Wiktionary still uses the deprecated language code
+        lemma_codes.add("sh")

     out_file_paths = {
-        lemma_code: Path(f"build/{lemma_code}/{lemma_code}_{gloss_code}.jsonl")
-        for lemma_code in lemma_codes
+        l_code: Path(f"build/{l_code}/{l_code}_{gloss_code}.jsonl")
+        for l_code in lemma_codes
     }
     for out_file_path in out_file_paths.values():
         out_file_path.parent.mkdir(parents=True, exist_ok=True)
     out_files = {
-        lemma_code: out_file_path.open("w", encoding="utf-8")
-        for lemma_code, out_file_path in zip(lemma_codes, out_file_paths.values())
+        l_code: out_file_path.open("w", encoding="utf-8")
+        for l_code, out_file_path in zip(lemma_codes, out_file_paths.values())
     }

     for line in iter(jsonl_f.readline, b""):
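
The hunk is truncated just as the reading loop begins; the pattern it implements routes each JSON line to the output file of its language. A minimal self-contained sketch (the "lang_code" field name follows wiktextract's output format and is an assumption here, since the loop body is outside this diff):

import json
from typing import IO

def route_lines(jsonl_f: IO[bytes], out_files) -> None:
    # iter(readline, b"") reads until EOF and works for both GzipFile
    # objects and subprocess pipes.
    for line in iter(jsonl_f.readline, b""):
        data = json.loads(line)
        code = data.get("lang_code", "")  # assumed wiktextract field
        if code in out_files:
            out_files[code].write(line.decode("utf-8"))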
