Merge branch 'master' of ssh://github.com/himkt/konoha
Makoto Hiramatsu committed Jun 6, 2021
2 parents d99f40c + 04f663c commit 4e00e0c
Showing 5 changed files with 32 additions and 68 deletions.
README.md (1 addition, 1 deletion)
@@ -47,7 +47,7 @@ cd konoha && docker-compose up --build # build and launch container
 ```
 
 Tokenization is done by posting a json object to `localhost:8000/api/v1/tokenize`.
-You can also batch tokenize by passing `texts: ["1つ目の入力", "2つ目の入力"]` to the server.
+You can also batch tokenize by passing `texts: ["1つ目の入力", "2つ目の入力"]` to `localhost:8000/api/v1/batch_tokenize`.
 
 (API documentation is available on `localhost:8000/redoc`, you can check it using your web browser)

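For reference, a minimal sketch of calling the two endpoints described in the README excerpt above. The `tokenizer` and `text` request fields are assumptions made for illustration; only the `texts` field for batch tokenization appears in the diff.

```python
# Minimal sketch of the tokenize / batch_tokenize endpoints mentioned above.
# The "tokenizer" and "text" field names are assumptions; only "texts" is
# shown in the README excerpt.
import requests

single = requests.post(
    "http://localhost:8000/api/v1/tokenize",
    json={"tokenizer": "mecab", "text": "1つ目の入力"},
)
print(single.json())

batch = requests.post(
    "http://localhost:8000/api/v1/batch_tokenize",
    json={"tokenizer": "mecab", "texts": ["1つ目の入力", "2つ目の入力"]},
)
print(batch.json())
```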
konoha/integrations/allennlp.py (1 addition, 1 deletion)
@@ -48,7 +48,7 @@ def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
     @overrides
     def tokenize(self, text: str) -> List[Token]:
         konoha_tokens = self._tokenizer.tokenize(text)
-        tokens = [Token(text=token.surface, lemma_=token.base_form, pos_=token.postag,) for token in konoha_tokens]
+        tokens = [Token(text=token.surface, lemma_=token.base_form, pos_=token.postag) for token in konoha_tokens]
 
         for start_token in self._start_tokens:
             tokens.insert(0, Token(start_token, 0))
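The hunk above only drops a trailing comma; the surrounding method maps konoha tokens onto AllenNLP `Token` objects with `lemma_` and `pos_` filled in. A minimal usage sketch, assuming AllenNLP and MeCab are installed; the printed lemma/POS values depend on the installed dictionary and are indicative only.

```python
# Sketch of the KonohaTokenizer integration touched above.
# Assumes AllenNLP and MeCab are installed.
from konoha.integrations.allennlp import KonohaTokenizer

tokenizer = KonohaTokenizer(tokenizer_name="mecab")
for token in tokenizer.tokenize("吾輩は猫である"):
    # Each AllenNLP Token carries the surface form, lemma, and POS tag.
    print(token.text, token.lemma_, token.pos_)
```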
konoha/word_tokenizer.py (2 additions, 2 deletions)
@@ -57,7 +57,7 @@ def _setup_tokenizer(self) -> None:
             if self._model_path is None:
                 raise ValueError("`model_path` must be specified for sentencepiece.")
 
-            self._tokenizer = word_tokenizers.SentencepieceTokenizer(model_path=self._model_path,)
+            self._tokenizer = word_tokenizers.SentencepieceTokenizer(model_path=self._model_path)
 
         if self._tokenizer_name == "mecab":
             self._tokenizer = word_tokenizers.MeCabTokenizer(
@@ -114,7 +114,7 @@ def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
         return [self._tokenizer.tokenize(text) for text in texts]
 
     @staticmethod
-    def _tokenize_with_remote_host(endpoint: str, payload: Dict, headers: Dict,) -> List[Dict]:
+    def _tokenize_with_remote_host(endpoint: str, payload: Dict, headers: Dict) -> List[Dict]:
         return requests.post(endpoint, json=payload, headers=headers).json()["tokens"]
 
     @staticmethod
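Both hunks above are trailing-comma removals; behaviour is unchanged. For context, a minimal sketch of the code path guarded by the `ValueError`, assuming konoha's top-level `WordTokenizer` API and a local `data/model.spm` model file (neither is shown in this diff).

```python
# Sketch of the sentencepiece path in _setup_tokenizer above.
# Assumes a trained sentencepiece model exists at data/model.spm.
from konoha import WordTokenizer

tokenizer = WordTokenizer("sentencepiece", model_path="data/model.spm")
print([t.surface for t in tokenizer.tokenize("吾輩は猫である")])

# Constructing WordTokenizer("sentencepiece") without model_path raises:
#   ValueError: `model_path` must be specified for sentencepiece.
```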
konoha/word_tokenizers/mecab_tokenizer.py (1 addition, 1 deletion)
@@ -7,7 +7,7 @@
 
 def parse_feature_for_ipadic(elem) -> Token:
     surface, feature = elem.split("\t")
-    (postag, postag2, postag3, postag4, inflection, conjugation, base_form, *other,) = feature.split(",")
+    (postag, postag2, postag3, postag4, inflection, conjugation, base_form, *other) = feature.split(",")
 
     # For words not in a dictionary
     if len(other) == 2:
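Again a trailing-comma removal. To make the unpacking concrete, here is a sketch with a typical IPADIC-style MeCab output line; the sample line is illustrative, not taken from the diff.

```python
# Illustrates the tuple unpacking in parse_feature_for_ipadic above.
# IPADIC dictionary entries carry 9 comma-separated features; the last two
# (reading and pronunciation) end up in `other`. Unknown words omit them,
# which is what the `len(other)` check above distinguishes.
elem = "猫\t名詞,一般,*,*,*,*,猫,ネコ,ネコ"  # illustrative: surface TAB features

surface, feature = elem.split("\t")
(postag, postag2, postag3, postag4, inflection, conjugation, base_form, *other) = feature.split(",")

print(surface, postag, base_form, other)  # 猫 名詞 猫 ['ネコ', 'ネコ']
```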
tests/integrations/test_allennlp_integration.py (27 additions, 63 deletions)
@@ -1,81 +1,45 @@
 import tempfile
+from typing import List, Optional
 
+import allennlp.commands.train
+from allennlp.models.basic_classifier import BasicClassifier
+
 import pytest
 
 from konoha.integrations.allennlp import KonohaTokenizer
 
 
-def test_allennlp_mecab():
-    try:
-        import allennlp  # NOQA
-        import natto  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or MeCab is not installed.")
-
-    tokenizer = KonohaTokenizer(tokenizer_name="mecab")
-    tokens_konoha = tokenizer.tokenize("吾輩は猫である")
-    token_surfaces = "吾輩 は 猫 で ある".split()
-    assert token_surfaces == list(t.text for t in tokens_konoha)
-
-
-def test_allennlp_janome():
-    try:
-        import allennlp  # NOQA
-        import janome  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or Janome is not installed.")
-
-    tokenizer = KonohaTokenizer(tokenizer_name="janome")
-    tokens_konoha = tokenizer.tokenize("吾輩は猫である")
-    token_surfaces = "吾輩 は 猫 で ある".split()
-    assert token_surfaces == list(t.text for t in tokens_konoha)
+@pytest.fixture
+def raw_text():
+    return "吾輩は猫である"
 
 
-def test_allennlp_kytea():
-    try:
-        import allennlp  # NOQA
-        import Mykytea  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or KyTea is not installed.")
-    tokenizer = KonohaTokenizer(tokenizer_name="kytea")
-    tokens_konoha = tokenizer.tokenize("吾輩は猫である")
-    token_surfaces = "吾輩 は 猫 で あ る".split()
-    assert token_surfaces == list(t.text for t in tokens_konoha)
-
-
-def test_allennlp_sentencepiece():
-    try:
-        import allennlp  # NOQA
-        import sentencepiece  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or Sentencepiece is not installed.")
+@pytest.mark.parametrize(
+    "token_surfaces,tokenizer_name,mode,model_path", (
+        ("吾輩 は 猫 で ある".split(" "), "mecab", None, None),
+        ("吾輩 は 猫 で ある".split(" "), "janome", None, None),
+        ("吾輩 は 猫 で あ る".split(" "), "kytea", None, None),
+        ("▁ 吾 輩 は 猫 である".split(" "), "sentencepiece", None, "data/model.spm"),
+        ("吾輩 は 猫 で ある".split(" "), "sudachi", "A", None),
+    )
+)
+def test_allennlp(
+    raw_text: str,
+    token_surfaces: List[str],
+    tokenizer_name: str,
+    mode: Optional[str],
+    model_path: Optional[str],
+) -> None:
     tokenizer = KonohaTokenizer(
-        tokenizer_name="sentencepiece", model_path="data/model.spm"
+        tokenizer_name=tokenizer_name,
+        mode=mode,
+        model_path=model_path,
     )
-    tokens_konoha = tokenizer.tokenize("吾輩は猫である")
-    token_surfaces = "▁ 吾 輩 は 猫 である".split()
-    assert token_surfaces == list(t.text for t in tokens_konoha)
-
-
-def test_allennlp_sudachi():
-    try:
-        import allennlp  # NOQA
-        import sudachipy  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or SudachiPy is not installed.")
-    tokenizer = KonohaTokenizer(tokenizer_name="sudachi", mode="A",)
-    tokens_konoha = tokenizer.tokenize("医薬品安全管理責任者")
-    token_surfaces = "医薬 品 安全 管理 責任 者".split()
+    tokens_konoha = tokenizer.tokenize(raw_text)
     assert token_surfaces == list(t.text for t in tokens_konoha)
 
 
 def test_allennlp_training():
-    try:
-        import allennlp.commands.train
-        from allennlp.models.basic_classifier import BasicClassifier
-    except ImportError:
-        pytest.skip("AllenNLP or Konoha (with Janome) is not installed.")
-
     with tempfile.TemporaryDirectory() as serialization_dir:
         model = allennlp.commands.train.train_model_from_file(
             "test_fixtures/classifier.jsonnet",
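The rewrite above folds five per-tokenizer tests into one parametrized test; pytest expands the `parametrize` decorator into five independent cases. A minimal sketch of running the updated module (one possible invocation, not something the diff prescribes):

```python
# Run the updated integration test module; pytest expands the parametrize
# decorator above into five cases plus test_allennlp_training.
import pytest

pytest.main(["-v", "tests/integrations/test_allennlp_integration.py"])
```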
