diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index de91699..7bf3ea1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,8 +16,29 @@ permissions: pull-requests: read jobs: + tests: + name: Run tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + + - name: Install package + run: | + # remove mecab once "mecab-python3" releases 3.12 wheels + sudo apt install -y mecab libmecab-dev + python -m pip install . + + - name: Run tests + run: python -m unittest discover -b -s tests + create_files: name: Create files + needs: tests runs-on: ubuntu-latest strategy: fail-fast: false @@ -33,7 +54,6 @@ jobs: - name: Install package run: | - # remove mecab once "mecab-python3" releases 3.12 wheels sudo apt install -y lbzip2 mecab libmecab-dev python -m pip install . diff --git a/src/proficiency/util.py b/src/proficiency/util.py index 6363fd1..d04e237 100644 --- a/src/proficiency/util.py +++ b/src/proficiency/util.py @@ -16,20 +16,37 @@ def remove_full_stop(text: str) -> str: return text.removesuffix(".").removesuffix("。") +def remove_parentheses(text: str) -> str: + # supports nested parentheses + left_bracket_count = 0 + result = "" + for char in text: + if char == "(": + left_bracket_count += 1 + elif char == ")": + left_bracket_count -= 1 + elif left_bracket_count == 0: + result += char + return result.replace(" ", " ") + + def get_short_def(gloss: str, gloss_lang: str) -> str: gloss = remove_full_stop(gloss) + if "(" in gloss: + gloss = remove_parentheses(gloss) gloss = re.sub( - r"\([^)]+\)|([^)]+)|〈[^〉]+〉|\[[^]]+\]|[[^]]+]|【[^】]+】|﹝[^﹞]+﹞|「[^」]+」", + r"([^)]+)|〈[^〉]+〉|\[[^]]+\]|[[^]]+]|【[^】]+】|﹝[^﹞]+﹞|「[^」]+」", "", gloss, ) gloss = min(re.split(";|;", gloss), key=len) + gloss = re.split(r",|,", gloss, 1)[0] gloss = min(gloss.split("/"), key=len) if gloss_lang == "zh": gloss = min(gloss.split("、"), key=len) if gloss_lang == "es" and "|" in gloss: gloss = gloss.split("|", 1)[1] - return gloss.strip() + return remove_full_stop(gloss.strip()) def load_difficulty_data(lemma_lang: str) -> dict[str, int]: diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000..3e22139 --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,38 @@ +from unittest import TestCase + +from proficiency.util import get_short_def + + +class TestUtil(TestCase): + def test_nested_parentheses(self) -> None: + # https://en.wiktionary.org/wiki/cadet_house + self.assertEqual( + get_short_def( + "Synonym of cadet branch (“house (dynasty) descended from one of " + "the patriarch's younger sons”)", + "en", + ), + "Synonym of cadet branch", + ) + + def test_get_short_def_stop(self) -> None: + # https://en.wiktionary.org/wiki/jack_jumper + self.assertEqual( + get_short_def( + "Any of various small species of ant of the genus Myrmecia, " + "often capable of jumping and having a painful sting; a jumper. " + "(Also used attributively.)", + "en", + ), + "a jumper", + ) + + def test_mastodonian(self) -> None: + # https://en.wiktionary.org/wiki/mastodonian + self.assertEqual( + get_short_def( + "Of, related to, or characteristic of a mastodon; large; powerful.", + "en", + ), + "large", + )