From 6e9f7e2452a53cb01f394be8cccc0292f1dc3d0a Mon Sep 17 00:00:00 2001
From: Vuizur <Vuizur@users.noreply.github.com>
Date: Wed, 21 Feb 2024 14:11:49 +0100
Subject: [PATCH 1/2] Remove nested parentheses in gloss

---
 .github/workflows/build.yml |  3 +++
 src/proficiency/util.py     | 20 ++++++++++++++++++--
 tests/test_util.py          |  8 ++++++++
 3 files changed, 29 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_util.py

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index de91699..3798ea6 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -37,6 +37,9 @@ jobs:
           sudo apt install -y lbzip2 mecab libmecab-dev
           python -m pip install .
 
+      - name: Run tests
+        run: python -m unittest discover -b -s tests
+
       - name: Create files
         run: proficiency ${{ matrix.gloss_lang }}
         env:
diff --git a/src/proficiency/util.py b/src/proficiency/util.py
index 6363fd1..bf8b268 100644
--- a/src/proficiency/util.py
+++ b/src/proficiency/util.py
@@ -16,14 +16,30 @@ def remove_full_stop(text: str) -> str:
     return text.removesuffix(".").removesuffix("。")
 
 
+def remove_parentheses(text: str) -> str:
+    """Strip ASCII "(...)" groups, nested ones included; a stray ")" is dropped."""
+    depth = 0  # number of "(" currently open
+    kept: list[str] = []
+    for char in text:
+        if char == "(":
+            depth += 1
+        elif char == ")":
+            depth = max(depth - 1, 0)  # stray ")" must not hide later text
+        elif depth == 0:
+            kept.append(char)
+    return re.sub(r" {2,}", " ", "".join(kept))  # collapse gaps left by groups
+
+
 def get_short_def(gloss: str, gloss_lang: str) -> str:
     gloss = remove_full_stop(gloss)
+    if "(" in gloss:
+        gloss = remove_parentheses(gloss)
     gloss = re.sub(
-        r"\([^)]+\)|（[^）]+）|〈[^〉]+〉|\[[^]]+\]|［[^］]+］|【[^】]+】|﹝[^﹞]+﹞|「[^」]+」",
+        r"（[^）]+）|〈[^〉]+〉|\[[^]]+\]|［[^］]+］|【[^】]+】|﹝[^﹞]+﹞|「[^」]+」",
         "",
         gloss,
     )
-    gloss = min(re.split(";|；", gloss), key=len)
+    gloss = min(re.split(";|；|,", gloss), key=len)
     gloss = min(gloss.split("/"), key=len)
     if gloss_lang == "zh":
         gloss = min(gloss.split("、"), key=len)
diff --git a/tests/test_util.py b/tests/test_util.py
new file mode 100644
index 0000000..b5b48c1
--- /dev/null
+++ b/tests/test_util.py
@@ -0,0 +1,8 @@
+from unittest import TestCase
+
+
+class TestUtil(TestCase):
+    def test_nested_parentheses(self) -> None:
+        from proficiency.util import remove_parentheses
+
+        self.assertEqual(remove_parentheses("a (b (c)) d"), "a d")

From 12cc2e9c0731ecf1d4b50384990ea07e9b9dc5cf Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Thu, 22 Feb 2024 19:26:45 +0800
Subject: [PATCH 2/2] Split semicolon first then split comma

Only keep the text before the first comma; the text after a comma is
usually not a complete sentence.
---
 .github/workflows/build.yml | 25 +++++++++++++++++++++----
 src/proficiency/util.py     |  5 +++--
 tests/test_util.py          | 34 ++++++++++++++++++++++++++++++++--
 3 files changed, 56 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 3798ea6..7bf3ea1 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -16,8 +16,29 @@ permissions:
   pull-requests: read
 
 jobs:
+  tests:
+    name: Run tests
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+          cache: 'pip'
+
+      - name: Install package
+        run: |
+          # remove mecab once "mecab-python3" releases 3.12 wheels
+          sudo apt install -y mecab libmecab-dev
+          python -m pip install .
+
+      - name: Run tests
+        run: python -m unittest discover -b -s tests
+
   create_files:
     name: Create files
+    needs: tests
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
@@ -33,13 +54,9 @@ jobs:
 
       - name: Install package
         run: |
-          # remove mecab once "mecab-python3" releases 3.12 wheels
           sudo apt install -y lbzip2 mecab libmecab-dev
           python -m pip install .
 
-      - name: Run tests
-        run: python -m unittest discover -b -s tests
-
       - name: Create files
         run: proficiency ${{ matrix.gloss_lang }}
         env:
diff --git a/src/proficiency/util.py b/src/proficiency/util.py
index bf8b268..d04e237 100644
--- a/src/proficiency/util.py
+++ b/src/proficiency/util.py
@@ -39,13 +39,14 @@ def get_short_def(gloss: str, gloss_lang: str) -> str:
         "",
         gloss,
     )
-    gloss = min(re.split(";|；|,", gloss), key=len)
+    gloss = min(re.split(";|；", gloss), key=len)
+    gloss = re.split(r",|，", gloss, 1)[0]
     gloss = min(gloss.split("/"), key=len)
     if gloss_lang == "zh":
         gloss = min(gloss.split("、"), key=len)
     if gloss_lang == "es" and "|" in gloss:
         gloss = gloss.split("|", 1)[1]
-    return gloss.strip()
+    return remove_full_stop(gloss.strip())
 
 
 def load_difficulty_data(lemma_lang: str) -> dict[str, int]:
diff --git a/tests/test_util.py b/tests/test_util.py
index b5b48c1..3e22139 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,8 +1,38 @@
 from unittest import TestCase
 
+from proficiency.util import get_short_def
+
 
 class TestUtil(TestCase):
     def test_nested_parentheses(self) -> None:
-        from proficiency.util import remove_parentheses
+        # https://en.wiktionary.org/wiki/cadet_house
+        self.assertEqual(
+            get_short_def(
+                "Synonym of cadet branch (“house (dynasty) descended from one of "
+                "the patriarch's younger sons”)",
+                "en",
+            ),
+            "Synonym of cadet branch",
+        )
+
+    def test_get_short_def_stop(self) -> None:
+        # https://en.wiktionary.org/wiki/jack_jumper
+        self.assertEqual(
+            get_short_def(
+                "Any of various small species of ant of the genus Myrmecia, "
+                "often capable of jumping and having a painful sting; a jumper. "
+                "(Also used attributively.)",
+                "en",
+            ),
+            "a jumper",
+        )
 
-        self.assertEqual(remove_parentheses("a (b (c)) d"), "a d")
+    def test_mastodonian(self) -> None:
+        # https://en.wiktionary.org/wiki/mastodonian
+        self.assertEqual(
+            get_short_def(
+                "Of, related to, or characteristic of a mastodon; large; powerful.",
+                "en",
+            ),
+            "large",
+        )