Merge pull request #15 from Vuizur/windows-fix
Make it work on Windows
xxyzz authored Feb 21, 2024
2 parents 7aa6f8c + 789d0a9 · commit 4fbc03e
Showing 4 changed files with 70 additions and 33 deletions.
README.md (18 changes: 17 additions & 1 deletion)
@@ -29,14 +29,30 @@ Wiktionary data come from kaikki.org and [Dbnary](https://kaiko.getalp.org/about
- pigz or gzip

## Create files

**Unix**:
```
$ python -m venv .venv
$ source .venv/bin/activate.fish
$ python -m pip install .
$ proficiency en
```

**Windows**:

First install wget with
```
winget install -e --id JernejSimoncic.Wget
```
and re-open the console window.

Then activate Python and run the program:
```
python -m venv .venv
.\.venv\Scripts\activate
python -m pip install .
proficiency en
```

## License

This work is licensed under GPL version 3 or later.
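For context, wget is needed on Windows because the download step shells out to it rather than using a Python HTTP client; the `text=True` fragment at the top of the extract_kaikki.py hunk further down is the tail of that subprocess call. A rough sketch of such an invocation, where the flags and URL are illustrative assumptions and not the project's exact command:

```python
# Illustrative only: fetch a kaikki.org dump by shelling out to wget,
# which is why the README asks Windows users to install it.
# The URL is a placeholder, not the project's actual download address.
import subprocess

subprocess.run(
    ["wget", "-nv", "https://kaikki.org/dictionary/<lang>/kaikki.org-dictionary-<lang>.jsonl.gz"],
    check=True,
    capture_output=True,
    text=True,
)
```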
pyproject.toml (23 changes: 10 additions & 13 deletions)
```diff
@@ -5,26 +5,23 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "Proficiency"
 version = "0.5.12"
-authors = [
-    {name = "xxyzz"}
-]
+authors = [{ name = "xxyzz" }]
 description = "Create language files for WordDumb."
 readme = "README.md"
 requires-python = ">=3.11"
-license = {text = "GNU General Public License v3 or later (GPLv3+)"}
+license = { text = "GNU General Public License v3 or later (GPLv3+)" }
 dependencies = [
     "lemminflect",
     "OpenCC",
     "wordfreq[mecab]",
     "wiktextract-lemmatization @ git+https://github.com/Vuizur/wiktextract-lemmatization@37f438eb973364de4d5e70959ee1c2aa26bf5ba5",
     "pyoxigraph",
+    # mecab with prerelease version for 3.12 wheels
+    "mecab-python3>=1.0.9.dev4 ; platform_system == 'Windows'",
 ]
 
 [project.optional-dependencies]
-dev = [
-    "mypy",
-    "ruff",
-]
+dev = ["mypy", "ruff"]
 
 [project.scripts]
 proficiency = "proficiency.main:main"
```
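Two details in the new dependency line are worth unpacking: `platform_system == 'Windows'` is a PEP 508 environment marker, so the requirement only applies on Windows, and because the specifier `>=1.0.9.dev4` itself names a pre-release, pip will consider pre-releases for this one package without a global `--pre` flag. A quick way to see how such a marker evaluates on the current machine (an illustration using the third-party `packaging` library, which is not one of this project's dependencies):

```python
# Evaluate a PEP 508 environment marker against the current interpreter.
from packaging.markers import Marker

marker = Marker("platform_system == 'Windows'")
print(marker.evaluate())  # True on Windows, False on Linux/macOS
```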
```diff
@@ -51,12 +48,12 @@ ignore_missing_imports = true
 [tool.typos]
 type.csv.check-file = false
 type.json.check-file = false
-default.extend-words = {"Formes" = "Formes"}
+default.extend-words = { "Formes" = "Formes" }
 
 [tool.ruff.lint]
 select = [
-    "E", # pycodestyle error
-    "F", # Pyflakes
-    "I", # isort
-    "W", # pycodestyle warning
+    "E", # pycodestyle error
+    "F", # Pyflakes
+    "I", # isort
+    "W", # pycodestyle warning
 ]
```
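The `[project.scripts]` entry above is also what makes the bare `proficiency en` command in the README work: on `pip install .`, pip generates a `proficiency` executable that imports `proficiency.main` and calls `main()`. The generated wrapper is roughly equivalent to the following sketch (argument handling inside `main()` is assumed, since it is not shown in this diff):

```python
# Roughly what the generated `proficiency` console script does.
import sys

from proficiency.main import main

if __name__ == "__main__":
    sys.exit(main())
```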
src/proficiency/extract_kaikki.py (35 changes: 22 additions & 13 deletions)
```diff
@@ -78,16 +78,23 @@ def download_kaikki_non_en_json(gloss_lang: str) -> Path:
         text=True,
     )
     if gz_path.exists() and not jsonl_path.exists():
-        subprocess.run(
-            [
-                "pigz" if which("pigz") is not None else "gzip",
-                "-d",
-                str(gz_path),
-            ],
-            check=True,
-            capture_output=True,
-            text=True,
-        )
+        if which("pigz") is None and which("gzip") is None:
+            import gzip
+            import shutil
+
+            with gzip.open(gz_path, "rb") as f_in, open(jsonl_path, "wb") as f_out:
+                shutil.copyfileobj(f_in, f_out)
+        else:
+            subprocess.run(
+                [
+                    "pigz" if which("pigz") is not None else "gzip",
+                    "-d",
+                    str(gz_path),
+                ],
+                check=True,
+                capture_output=True,
+                text=True,
+            )
     split_kaikki_non_en_jsonl(jsonl_path, gloss_lang)
 
     return jsonl_path
```
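The new branch keeps the fast external tools when they exist and falls back to the standard library only when both are missing, which is what lets a stock Windows machine run this step. The same pattern as a self-contained sketch (the function and file names are hypothetical, and unlike the external tools, the fallback leaves the original `.gz` file in place):

```python
# Sketch of the commit's fallback pattern: prefer the pigz/gzip binaries,
# fall back to Python's gzip module when neither is on PATH.
import gzip
import shutil
import subprocess
from pathlib import Path
from shutil import which


def decompress(gz_path: Path, out_path: Path) -> None:
    tool = "pigz" if which("pigz") is not None else "gzip"
    if which(tool) is None:
        # Pure-Python path: stream so a large dump never sits in memory.
        with gzip.open(gz_path, "rb") as f_in, open(out_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    else:
        # The external tool replaces foo.jsonl.gz with foo.jsonl in place.
        subprocess.run([tool, "-d", str(gz_path)], check=True)
```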
```diff
@@ -202,9 +209,11 @@ def create_lemmas_db_from_kaikki(lemma_lang: str, gloss_lang: str) -> list[Path]
                         enabled,
                         converter.convert(short_gloss),
                         converter.convert(gloss),
-                        converter.convert(example_sent)
-                        if example_sent is not None
-                        else None,
+                        (
+                            converter.convert(example_sent)
+                            if example_sent is not None
+                            else None
+                        ),
                     )
                 )
                 enabled = False
```
src/proficiency/main.py (27 changes: 21 additions & 6 deletions)
```diff
@@ -27,12 +27,27 @@
 def compress(file_path: Path) -> None:
     compressed_path = file_path.with_suffix(file_path.suffix + ".bz2")
     compressed_path.unlink(missing_ok=True)
-    subprocess.run(
-        ["lbzip2" if which("lbzip2") is not None else "bzip2", "-k", str(file_path)],
-        check=True,
-        capture_output=True,
-        text=True,
-    )
+
+    if which("lbzip2") is None and which("bzip2") is None:
+        import bz2
+
+        # Use pure python implementation of bzip2 compression
+        with open(file_path, "rb") as input_file:
+            data = input_file.read()
+        compressed_data = bz2.compress(data)
+        with open(compressed_path, "wb") as output_file:
+            output_file.write(compressed_data)
+    else:
+        subprocess.run(
+            [
+                "lbzip2" if which("lbzip2") is not None else "bzip2",
+                "-k",
+                str(file_path),
+            ],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
 
 
 def create_wiktionary_files_from_kaikki(
```
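One design note on the pure-Python branch: `bz2.compress` loads the whole file into memory before compressing, which is acceptable for these dictionary files but not strictly necessary. A streaming variant (a sketch of an alternative, not what this commit does) would bound memory use:

```python
# Sketch: streaming bzip2 compression via bz2.open, constant memory use.
import bz2
import shutil
from pathlib import Path


def compress_streaming(file_path: Path) -> None:
    compressed_path = file_path.with_suffix(file_path.suffix + ".bz2")
    with open(file_path, "rb") as f_in, bz2.open(compressed_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
```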
