Merge pull request #15 from Vuizur/windows-fix
Make it work on Windows
xxyzz authored Feb 21, 2024
2 parents 7aa6f8c + 789d0a9 · commit 4fbc03e
Showing 4 changed files with 70 additions and 33 deletions.
README.md (18 changes: 17 additions & 1 deletion)
@@ -29,14 +29,30 @@ Wiktionary data come from kaikki.org and [Dbnary](https://kaiko.getalp.org/about
- pigz or gzip

## Create files

**Unix**:
```
$ python -m venv .venv
$ source .venv/bin/activate.fish
$ python -m pip install .
$ proficiency en
```

**Windows**:

First install wget with
```
winget install -e --id JernejSimoncic.Wget
```
and re-open the console window.

Then activate Python and run the program:
```
python -m venv .venv
.\.venv\Scripts\activate
python -m pip install .
proficiency en
```

## License

This work is licensed under GPL version 3 or later.
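For context, wget is needed on Windows because the download step shells out to it rather than using a Python HTTP client; the `text=True` fragment at the top of the extract_kaikki.py hunk further down is the tail of that subprocess call. A rough sketch of such an invocation, where the flags and URL are illustrative assumptions and not the project's exact command:

```python
# Illustrative only: fetch a kaikki.org dump by shelling out to wget,
# which is why the README asks Windows users to install it.
# The URL is a placeholder, not the project's actual download address.
import subprocess

subprocess.run(
    ["wget", "-nv", "https://kaikki.org/dictionary/<lang>/kaikki.org-dictionary-<lang>.jsonl.gz"],
    check=True,
    capture_output=True,
    text=True,
)
```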
pyproject.toml (23 changes: 10 additions & 13 deletions)
```diff
@@ -5,26 +5,23 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "Proficiency"
 version = "0.5.12"
-authors = [
-    {name = "xxyzz"}
-]
+authors = [{ name = "xxyzz" }]
 description = "Create language files for WordDumb."
 readme = "README.md"
 requires-python = ">=3.11"
-license = {text = "GNU General Public License v3 or later (GPLv3+)"}
+license = { text = "GNU General Public License v3 or later (GPLv3+)" }
 dependencies = [
     "lemminflect",
     "OpenCC",
     "wordfreq[mecab]",
     "wiktextract-lemmatization @ git+https://github.com/Vuizur/wiktextract-lemmatization@37f438eb973364de4d5e70959ee1c2aa26bf5ba5",
     "pyoxigraph",
+    # mecab with prerelease version for 3.12 wheels
+    "mecab-python3>=1.0.9.dev4 ; platform_system == 'Windows'",
 ]
 
 [project.optional-dependencies]
-dev = [
-    "mypy",
-    "ruff",
-]
+dev = ["mypy", "ruff"]
 
 [project.scripts]
 proficiency = "proficiency.main:main"
```
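Two details in the new dependency line are worth unpacking: `platform_system == 'Windows'` is a PEP 508 environment marker, so the requirement only applies on Windows, and because the specifier `>=1.0.9.dev4` itself names a pre-release, pip will consider pre-releases for this one package without a global `--pre` flag. A quick way to see how such a marker evaluates on the current machine (an illustration using the third-party `packaging` library, which is not one of this project's dependencies):

```python
# Evaluate a PEP 508 environment marker against the current interpreter.
from packaging.markers import Marker

marker = Marker("platform_system == 'Windows'")
print(marker.evaluate())  # True on Windows, False on Linux/macOS
```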
```diff
@@ -51,12 +48,12 @@ ignore_missing_imports = true
 [tool.typos]
 type.csv.check-file = false
 type.json.check-file = false
-default.extend-words = {"Formes" = "Formes"}
+default.extend-words = { "Formes" = "Formes" }
 
 [tool.ruff.lint]
 select = [
-    "E", # pycodestyle error
-    "F", # Pyflakes
-    "I", # isort
-    "W", # pycodestyle warning
+    "E", # pycodestyle error
+    "F", # Pyflakes
+    "I", # isort
+    "W", # pycodestyle warning
 ]
```
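The `[project.scripts]` entry above is also what makes the bare `proficiency en` command in the README work: on `pip install .`, pip generates a `proficiency` executable that imports `proficiency.main` and calls `main()`. The generated wrapper is roughly equivalent to the following sketch (argument handling inside `main()` is assumed, since it is not shown in this diff):

```python
# Roughly what the generated `proficiency` console script does.
import sys

from proficiency.main import main

if __name__ == "__main__":
    sys.exit(main())
```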
src/proficiency/extract_kaikki.py (35 changes: 22 additions & 13 deletions)
```diff
@@ -78,16 +78,23 @@ def download_kaikki_non_en_json(gloss_lang: str) -> Path:
         text=True,
     )
     if gz_path.exists() and not jsonl_path.exists():
-        subprocess.run(
-            [
-                "pigz" if which("pigz") is not None else "gzip",
-                "-d",
-                str(gz_path),
-            ],
-            check=True,
-            capture_output=True,
-            text=True,
-        )
+        if which("pigz") is None and which("gzip") is None:
+            import gzip
+            import shutil
+
+            with gzip.open(gz_path, "rb") as f_in, open(jsonl_path, "wb") as f_out:
+                shutil.copyfileobj(f_in, f_out)
+        else:
+            subprocess.run(
+                [
+                    "pigz" if which("pigz") is not None else "gzip",
+                    "-d",
+                    str(gz_path),
+                ],
+                check=True,
+                capture_output=True,
+                text=True,
+            )
     split_kaikki_non_en_jsonl(jsonl_path, gloss_lang)
 
     return jsonl_path
```
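The new branch keeps the fast external tools when they exist and falls back to the standard library only when both are missing, which is what lets a stock Windows machine run this step. The same pattern as a self-contained sketch (the function and file names are hypothetical, and unlike the external tools, the fallback leaves the original `.gz` file in place):

```python
# Sketch of the commit's fallback pattern: prefer the pigz/gzip binaries,
# fall back to Python's gzip module when neither is on PATH.
import gzip
import shutil
import subprocess
from pathlib import Path
from shutil import which


def decompress(gz_path: Path, out_path: Path) -> None:
    tool = "pigz" if which("pigz") is not None else "gzip"
    if which(tool) is None:
        # Pure-Python path: stream so a large dump never sits in memory.
        with gzip.open(gz_path, "rb") as f_in, open(out_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    else:
        # The external tool replaces foo.jsonl.gz with foo.jsonl in place.
        subprocess.run([tool, "-d", str(gz_path)], check=True)
```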
```diff
@@ -202,9 +209,11 @@ def create_lemmas_db_from_kaikki(lemma_lang: str, gloss_lang: str) -> list[Path]
                         enabled,
                         converter.convert(short_gloss),
                         converter.convert(gloss),
-                        converter.convert(example_sent)
-                        if example_sent is not None
-                        else None,
+                        (
+                            converter.convert(example_sent)
+                            if example_sent is not None
+                            else None
+                        ),
                     )
                 )
                 enabled = False
```
src/proficiency/main.py (27 changes: 21 additions & 6 deletions)
```diff
@@ -27,12 +27,27 @@
 def compress(file_path: Path) -> None:
     compressed_path = file_path.with_suffix(file_path.suffix + ".bz2")
     compressed_path.unlink(missing_ok=True)
-    subprocess.run(
-        ["lbzip2" if which("lbzip2") is not None else "bzip2", "-k", str(file_path)],
-        check=True,
-        capture_output=True,
-        text=True,
-    )
+
+    if which("lbzip2") is None and which("bzip2") is None:
+        import bz2
+
+        # Use pure python implementation of bzip2 compression
+        with open(file_path, "rb") as input_file:
+            data = input_file.read()
+        compressed_data = bz2.compress(data)
+        with open(compressed_path, "wb") as output_file:
+            output_file.write(compressed_data)
+    else:
+        subprocess.run(
+            [
+                "lbzip2" if which("lbzip2") is not None else "bzip2",
+                "-k",
+                str(file_path),
+            ],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
 
 
 def create_wiktionary_files_from_kaikki(
```
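One design note on the pure-Python branch: `bz2.compress` loads the whole file into memory before compressing, which is acceptable for these dictionary files but not strictly necessary. A streaming variant (a sketch of an alternative, not what this commit does) would bound memory use:

```python
# Sketch: streaming bzip2 compression via bz2.open, constant memory use.
import bz2
import shutil
from pathlib import Path


def compress_streaming(file_path: Path) -> None:
    compressed_path = file_path.with_suffix(file_path.suffix + ".bz2")
    with open(file_path, "rb") as f_in, bz2.open(compressed_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
```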
