From 04f663cff59d9b8a52eda8ac3e725a28bf84a911 Mon Sep 17 00:00:00 2001
From: Makoto Hiramatsu <himkt@klis.tsukuba.ac.jp>
Date: Sun, 6 Jun 2021 13:40:10 +0900
Subject: [PATCH] Cleanup codebase (#145)

* Update README

* Cleanup tests using pytest fixture

* Cleanup code
---
 README.md                                      |  2 +-
 konoha/integrations/allennlp.py                |  2 +-
 konoha/word_tokenizer.py                       |  4 +-
 konoha/word_tokenizers/mecab_tokenizer.py      |  2 +-
 .../integrations/test_allennlp_integration.py  | 90 ++++++-------------
 5 files changed, 32 insertions(+), 68 deletions(-)

diff --git a/README.md b/README.md
index 4caeb02..b8795dc 100644
--- a/README.md
+++ b/README.md
@@ -47,7 +47,7 @@ cd konoha && docker-compose up --build # build and launch container
 ```
 
 Tokenization is done by posting a json object to `localhost:8000/api/v1/tokenize`.
-You can also batch tokenize by passing `texts: ["1つ目の入力", "2つ目の入力"]` to the server.
+You can also batch tokenize by passing `texts: ["1つ目の入力", "2つ目の入力"]` to `localhost:8000/api/v1/batch_tokenize`.
 
 (API documentation is available on `localhost:8000/redoc`, you can check it using your web browser)
 
diff --git a/konoha/integrations/allennlp.py b/konoha/integrations/allennlp.py
index 02d7c55..f9d2a96 100644
--- a/konoha/integrations/allennlp.py
+++ b/konoha/integrations/allennlp.py
@@ -48,7 +48,7 @@ def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
     @overrides
     def tokenize(self, text: str) -> List[Token]:
         konoha_tokens = self._tokenizer.tokenize(text)
-        tokens = [Token(text=token.surface, lemma_=token.base_form, pos_=token.postag,) for token in konoha_tokens]
+        tokens = [Token(text=token.surface, lemma_=token.base_form, pos_=token.postag) for token in konoha_tokens]
 
         for start_token in self._start_tokens:
             tokens.insert(0, Token(start_token, 0))
diff --git a/konoha/word_tokenizer.py b/konoha/word_tokenizer.py
index c7a3db6..9ee2552 100644
--- a/konoha/word_tokenizer.py
+++ b/konoha/word_tokenizer.py
@@ -57,7 +57,7 @@ def _setup_tokenizer(self) -> None:
             if self._model_path is None:
                 raise ValueError("`model_path` must be specified for sentencepiece.")
 
-            self._tokenizer = word_tokenizers.SentencepieceTokenizer(model_path=self._model_path,)
+            self._tokenizer = word_tokenizers.SentencepieceTokenizer(model_path=self._model_path)
 
         if self._tokenizer_name == "mecab":
             self._tokenizer = word_tokenizers.MeCabTokenizer(
@@ -114,7 +114,7 @@ def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
         return [self._tokenizer.tokenize(text) for text in texts]
 
     @staticmethod
-    def _tokenize_with_remote_host(endpoint: str, payload: Dict, headers: Dict,) -> List[Dict]:
+    def _tokenize_with_remote_host(endpoint: str, payload: Dict, headers: Dict) -> List[Dict]:
         return requests.post(endpoint, json=payload, headers=headers).json()["tokens"]
 
     @staticmethod
diff --git a/konoha/word_tokenizers/mecab_tokenizer.py b/konoha/word_tokenizers/mecab_tokenizer.py
index 316167d..50eb520 100644
--- a/konoha/word_tokenizers/mecab_tokenizer.py
+++ b/konoha/word_tokenizers/mecab_tokenizer.py
@@ -7,7 +7,7 @@ def parse_feature_for_ipadic(elem) -> Token:
     surface, feature = elem.split("\t")
-    (postag, postag2, postag3, postag4, inflection, conjugation, base_form, *other,) = feature.split(",")
+    (postag, postag2, postag3, postag4, inflection, conjugation, base_form, *other) = feature.split(",")
 
     # For words not in a dictionary
     if len(other) == 2:
diff --git a/tests/integrations/test_allennlp_integration.py b/tests/integrations/test_allennlp_integration.py
index a23ce03..f7fa563 100644
--- a/tests/integrations/test_allennlp_integration.py
+++ b/tests/integrations/test_allennlp_integration.py
@@ -1,81 +1,45 @@
 import tempfile
+from typing import List, Optional
+
+import allennlp.commands.train
+from allennlp.models.basic_classifier import BasicClassifier
 
 import pytest
 
 from konoha.integrations.allennlp import KonohaTokenizer
 
 
-def test_allennlp_mecab():
-    try:
-        import allennlp  # NOQA
-        import natto  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or MeCab is not installed.")
-
-    tokenizer = KonohaTokenizer(tokenizer_name="mecab")
-    tokens_konoha = tokenizer.tokenize("吾輩は猫である")
-    token_surfaces = "吾輩 は 猫 で ある".split()
-    assert token_surfaces == list(t.text for t in tokens_konoha)
-
-
-def test_allennlp_janome():
-    try:
-        import allennlp  # NOQA
-        import janome  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or Janome is not installed.")
-
-    tokenizer = KonohaTokenizer(tokenizer_name="janome")
-    tokens_konoha = tokenizer.tokenize("吾輩は猫である")
-    token_surfaces = "吾輩 は 猫 で ある".split()
-    assert token_surfaces == list(t.text for t in tokens_konoha)
+@pytest.fixture
+def raw_text():
+    return "吾輩は猫である"
 
 
-def test_allennlp_kytea():
-    try:
-        import allennlp  # NOQA
-        import Mykytea  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or KyTea is not installed.")
-    tokenizer = KonohaTokenizer(tokenizer_name="kytea")
-    tokens_konoha = tokenizer.tokenize("吾輩は猫である")
-    token_surfaces = "吾輩 は 猫 で あ る".split()
-    assert token_surfaces == list(t.text for t in tokens_konoha)
-
-
-def test_allennlp_sentencepiece():
-    try:
-        import allennlp  # NOQA
-        import sentencepiece  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or Sentencepiece is not installed.")
+@pytest.mark.parametrize(
+    "token_surfaces,tokenizer_name,mode,model_path", (
+        ("吾輩 は 猫 で ある".split(" "), "mecab", None, None),
+        ("吾輩 は 猫 で ある".split(" "), "janome", None, None),
+        ("吾輩 は 猫 で あ る".split(" "), "kytea", None, None),
+        ("▁ 吾 輩 は 猫 である".split(" "), "sentencepiece", None, "data/model.spm"),
+        ("吾輩 は 猫 で ある".split(" "), "sudachi", "A", None),
+    )
+)
+def test_allennlp(
+    raw_text: str,
+    token_surfaces: List[str],
+    tokenizer_name: str,
+    mode: Optional[str],
+    model_path: Optional[str],
+) -> None:
     tokenizer = KonohaTokenizer(
-        tokenizer_name="sentencepiece", model_path="data/model.spm"
+        tokenizer_name=tokenizer_name,
+        mode=mode,
+        model_path=model_path,
     )
-    tokens_konoha = tokenizer.tokenize("吾輩は猫である")
-    token_surfaces = "▁ 吾 輩 は 猫 である".split()
-    assert token_surfaces == list(t.text for t in tokens_konoha)
-
-
-def test_allennlp_sudachi():
-    try:
-        import allennlp  # NOQA
-        import sudachipy  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or SudachiPy is not installed.")
-    tokenizer = KonohaTokenizer(tokenizer_name="sudachi", mode="A",)
-    tokens_konoha = tokenizer.tokenize("医薬品安全管理責任者")
-    token_surfaces = "医薬 品 安全 管理 責任 者".split()
+    tokens_konoha = tokenizer.tokenize(raw_text)
     assert token_surfaces == list(t.text for t in tokens_konoha)
 
 
 def test_allennlp_training():
-    try:
-        import allennlp.commands.train
-        from allennlp.models.basic_classifier import BasicClassifier
-    except ImportError:
-        pytest.skip("AllenNLP or Konoha (with Janome) is not installed.")
-
     with tempfile.TemporaryDirectory() as serialization_dir:
         model = allennlp.commands.train.train_model_from_file(
             "test_fixtures/classifier.jsonnet",
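As a quick illustration of the batch endpoint referenced in the README hunk above, here is a minimal request sketch. It assumes the konoha server started via `docker-compose` is reachable on `localhost:8000` and that the request body accepts a `tokenizer` field alongside `texts`; that field name is an assumption, so verify the actual schema at `localhost:8000/redoc`.

```python
# Illustrative sketch only: batch tokenization via the endpoint named in the README change.
# Assumes the konoha server is running on localhost:8000 (docker-compose up) and that the
# payload accepts a "tokenizer" field next to "texts"; check localhost:8000/redoc to confirm.
import requests

payload = {"tokenizer": "mecab", "texts": ["1つ目の入力", "2つ目の入力"]}
response = requests.post("http://localhost:8000/api/v1/batch_tokenize", json=payload)
response.raise_for_status()
print(response.json())  # tokenization result for each input text
```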