Skip to content

Commit

Permalink
Merge pull request #16 from Vuizur/split-by-comma
Browse files Browse the repository at this point in the history
Split by comma + handle nested parentheses
  • Loading branch information
xxyzz authored Feb 22, 2024
2 parents fa9ef01 + 12cc2e9 commit cdec50a
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 3 deletions.
22 changes: 21 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,29 @@ permissions:
pull-requests: read

jobs:
tests:
# Test job: installs the package and runs the unittest suite found under tests/.
name: Run tests
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

# Set up Python 3.12 with the action's pip cache enabled.
- uses: actions/setup-python@v5
with:
python-version: '3.12'
cache: 'pip'

- name: Install package
run: |
# remove mecab once "mecab-python3" releases 3.12 wheels
sudo apt install -y mecab libmecab-dev
python -m pip install .
- name: Run tests
# -b buffers stdout/stderr during the run; -s tests starts discovery in the tests directory.
run: python -m unittest discover -b -s tests

create_files:
name: Create files
needs: tests
runs-on: ubuntu-latest
strategy:
fail-fast: false
Expand All @@ -33,7 +54,6 @@ jobs:

- name: Install package
run: |
# remove mecab once "mecab-python3" releases 3.12 wheels
sudo apt install -y lbzip2 mecab libmecab-dev
python -m pip install .
Expand Down
21 changes: 19 additions & 2 deletions src/proficiency/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,37 @@ def remove_full_stop(text: str) -> str:
return text.removesuffix(".").removesuffix("。")


def remove_parentheses(text: str) -> str:
    """Return *text* with every ASCII-parenthesised span removed.

    Nested parentheses are supported: a character is kept only while no
    "(" is open. The parentheses themselves are always dropped.

    Args:
        text: The string to clean.

    Returns:
        The text outside all "(...)" spans, with runs of spaces collapsed
        to a single space. Leading and trailing spaces are not stripped.
    """
    depth = 0
    kept: list[str] = []
    for char in text:
        if char == "(":
            depth += 1
        elif char == ")":
            # Ignore an unmatched ")" instead of letting the depth go
            # negative, which would silently drop the rest of the text.
            if depth > 0:
                depth -= 1
        elif depth == 0:
            kept.append(char)
    result = "".join(kept)
    # Removing "(x)" from "a (x) b" leaves two adjacent spaces; adjacent
    # removals can leave more, so repeat until no double space remains.
    while "  " in result:
        result = result.replace("  ", " ")
    return result


def get_short_def(gloss: str, gloss_lang: str) -> str:
    """Reduce a dictionary gloss to a short definition.

    Parenthesised and bracketed asides are removed first. The shortest
    semicolon-separated part is then kept, cut at its first comma, and the
    shortest "/"-separated alternative of what remains is selected.

    Args:
        gloss: The full gloss text.
        gloss_lang: Language code of the gloss; "zh" and "es" receive
            extra splitting rules.

    Returns:
        The shortened gloss with surrounding whitespace and a trailing
        full stop removed.
    """
    gloss = remove_full_stop(gloss)
    if "(" in gloss:
        # ASCII parentheses may be nested, so they are handled by the
        # dedicated helper rather than by the regex below.
        gloss = remove_parentheses(gloss)
    # Strip fullwidth/CJK bracket pairs and ASCII square brackets.
    gloss = re.sub(
        r"（[^）]+）|〈[^〉]+〉|\[[^]]+\]|［[^］]+］|【[^】]+】|﹝[^﹞]+﹞|「[^」]+」",
        "",
        gloss,
    )
    # Semicolons (ASCII or fullwidth) separate alternatives: keep the shortest.
    gloss = min(re.split(";|；", gloss), key=len)
    # Keep only the text before the first comma (ASCII or fullwidth).
    gloss = re.split(",|，", gloss, maxsplit=1)[0]
    gloss = min(gloss.split("/"), key=len)
    if gloss_lang == "zh":
        gloss = min(gloss.split("、"), key=len)
    if gloss_lang == "es" and "|" in gloss:
        # Keep the text after the first "|".
        gloss = gloss.split("|", 1)[1]
    # The selected part may itself end with a full stop, so strip it again.
    return remove_full_stop(gloss.strip())


def load_difficulty_data(lemma_lang: str) -> dict[str, int]:
Expand Down
38 changes: 38 additions & 0 deletions tests/test_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from unittest import TestCase

from proficiency.util import get_short_def


class TestUtil(TestCase):
    """Checks of ``get_short_def`` against glosses from English Wiktionary."""

    def test_nested_parentheses(self) -> None:
        # Gloss source: https://en.wiktionary.org/wiki/cadet_house
        gloss = (
            "Synonym of cadet branch (“house (dynasty) descended from one of "
            "the patriarch's younger sons”)"
        )
        short_def = get_short_def(gloss, "en")
        self.assertEqual(short_def, "Synonym of cadet branch")

    def test_get_short_def_stop(self) -> None:
        # Gloss source: https://en.wiktionary.org/wiki/jack_jumper
        gloss = (
            "Any of various small species of ant of the genus Myrmecia, "
            "often capable of jumping and having a painful sting; a jumper. "
            "(Also used attributively.)"
        )
        short_def = get_short_def(gloss, "en")
        self.assertEqual(short_def, "a jumper")

    def test_mastodonian(self) -> None:
        # Gloss source: https://en.wiktionary.org/wiki/mastodonian
        gloss = "Of, related to, or characteristic of a mastodon; large; powerful."
        short_def = get_short_def(gloss, "en")
        self.assertEqual(short_def, "large")

0 comments on commit cdec50a

Please sign in to comment.