From 6e9f7e2452a53cb01f394be8cccc0292f1dc3d0a Mon Sep 17 00:00:00 2001 From: Vuizur Date: Wed, 21 Feb 2024 14:11:49 +0100 Subject: [PATCH 1/2] Remove nested parentheses in gloss --- .github/workflows/build.yml | 3 +++ src/proficiency/util.py | 20 ++++++++++++++++++-- tests/test_util.py | 8 ++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 tests/test_util.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index de91699..3798ea6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -37,6 +37,9 @@ jobs: sudo apt install -y lbzip2 mecab libmecab-dev python -m pip install . + - name: Run tests + run: python -m unittest discover -b -s tests + - name: Create files run: proficiency ${{ matrix.gloss_lang }} env: diff --git a/src/proficiency/util.py b/src/proficiency/util.py index 6363fd1..bf8b268 100644 --- a/src/proficiency/util.py +++ b/src/proficiency/util.py @@ -16,14 +16,30 @@ def remove_full_stop(text: str) -> str: return text.removesuffix(".").removesuffix("。") +def remove_parentheses(text: str) -> str: + # supports nested parentheses + left_bracket_count = 0 + result = "" + for char in text: + if char == "(": + left_bracket_count += 1 + elif char == ")": + left_bracket_count -= 1 + elif left_bracket_count == 0: + result += char + return result.replace(" ", " ") + + def get_short_def(gloss: str, gloss_lang: str) -> str: gloss = remove_full_stop(gloss) + if "(" in gloss: + gloss = remove_parentheses(gloss) gloss = re.sub( - r"\([^)]+\)|([^)]+)|〈[^〉]+〉|\[[^]]+\]|[[^]]+]|【[^】]+】|﹝[^﹞]+﹞|「[^」]+」", + r"([^)]+)|〈[^〉]+〉|\[[^]]+\]|[[^]]+]|【[^】]+】|﹝[^﹞]+﹞|「[^」]+」", "", gloss, ) - gloss = min(re.split(";|;", gloss), key=len) + gloss = min(re.split(";|;|,", gloss), key=len) gloss = min(gloss.split("/"), key=len) if gloss_lang == "zh": gloss = min(gloss.split("、"), key=len) diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000..b5b48c1 --- /dev/null +++ 
b/tests/test_util.py @@ -0,0 +1,8 @@ +from unittest import TestCase + + +class TestUtil(TestCase): + def test_nested_parentheses(self) -> None: + from proficiency.util import remove_parentheses + + self.assertEqual(remove_parentheses("a (b (c)) d"), "a d") From 12cc2e9c0731ecf1d4b50384990ea07e9b9dc5cf Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 22 Feb 2024 19:26:45 +0800 Subject: [PATCH 2/2] Split semicolon first then split comma Only keep the text before the first comma; the text after a comma is usually not a complete sentence. --- .github/workflows/build.yml | 25 +++++++++++++++++++++---- src/proficiency/util.py | 5 +++-- tests/test_util.py | 34 ++++++++++++++++++++++++++++++++-- 3 files changed, 56 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3798ea6..7bf3ea1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,8 +16,29 @@ permissions: pull-requests: read jobs: + tests: + name: Run tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + + - name: Install package + run: | + # remove mecab once "mecab-python3" releases 3.12 wheels + sudo apt install -y mecab libmecab-dev + python -m pip install . + + - name: Run tests + run: python -m unittest discover -b -s tests + create_files: name: Create files + needs: tests runs-on: ubuntu-latest strategy: fail-fast: false @@ -33,13 +54,9 @@ jobs: - name: Install package run: | - # remove mecab once "mecab-python3" releases 3.12 wheels sudo apt install -y lbzip2 mecab libmecab-dev python -m pip install .
- - name: Run tests - run: python -m unittest discover -b -s tests - - name: Create files run: proficiency ${{ matrix.gloss_lang }} env: diff --git a/src/proficiency/util.py b/src/proficiency/util.py index bf8b268..d04e237 100644 --- a/src/proficiency/util.py +++ b/src/proficiency/util.py @@ -39,13 +39,14 @@ def get_short_def(gloss: str, gloss_lang: str) -> str: "", gloss, ) - gloss = min(re.split(";|;|,", gloss), key=len) + gloss = min(re.split(";|;", gloss), key=len) + gloss = re.split(r",|,", gloss, 1)[0] gloss = min(gloss.split("/"), key=len) if gloss_lang == "zh": gloss = min(gloss.split("、"), key=len) if gloss_lang == "es" and "|" in gloss: gloss = gloss.split("|", 1)[1] - return gloss.strip() + return remove_full_stop(gloss.strip()) def load_difficulty_data(lemma_lang: str) -> dict[str, int]: diff --git a/tests/test_util.py b/tests/test_util.py index b5b48c1..3e22139 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,8 +1,38 @@ from unittest import TestCase +from proficiency.util import get_short_def + class TestUtil(TestCase): def test_nested_parentheses(self) -> None: - from proficiency.util import remove_parentheses + # https://en.wiktionary.org/wiki/cadet_house + self.assertEqual( + get_short_def( + "Synonym of cadet branch (“house (dynasty) descended from one of " + "the patriarch's younger sons”)", + "en", + ), + "Synonym of cadet branch", + ) + + def test_get_short_def_stop(self) -> None: + # https://en.wiktionary.org/wiki/jack_jumper + self.assertEqual( + get_short_def( + "Any of various small species of ant of the genus Myrmecia, " + "often capable of jumping and having a painful sting; a jumper. " + "(Also used attributively.)", + "en", + ), + "a jumper", + ) - self.assertEqual(remove_parentheses("a (b (c)) d"), "a d") + def test_mastodonian(self) -> None: + # https://en.wiktionary.org/wiki/mastodonian + self.assertEqual( + get_short_def( + "Of, related to, or characteristic of a mastodon; large; powerful.", + "en", + ), + "large", + )