Merge branch 'master' of ssh://github.com/himkt/konoha
Makoto Hiramatsu committed Jun 6, 2021
2 parents d99f40c + 04f663c commit 4e00e0c
Showing 5 changed files with 32 additions and 68 deletions.
README.md (1 addition, 1 deletion)
@@ -47,7 +47,7 @@ cd konoha && docker-compose up --build # build and launch container
 ```
 
 Tokenization is done by posting a json object to `localhost:8000/api/v1/tokenize`.
-You can also batch tokenize by passing `texts: ["1つ目の入力", "2つ目の入力"]` to the server.
+You can also batch tokenize by passing `texts: ["1つ目の入力", "2つ目の入力"]` to `localhost:8000/api/v1/batch_tokenize`.
 
 (API documentation is available on `localhost:8000/redoc`, you can check it using your web browser)

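For reference, a minimal sketch of calling the two endpoints described in the README excerpt above. The `tokenizer` and `text` request fields are assumptions made for illustration; only the `texts` field for batch tokenization appears in the diff.

```python
# Minimal sketch of the tokenize / batch_tokenize endpoints mentioned above.
# The "tokenizer" and "text" field names are assumptions; only "texts" is
# shown in the README excerpt.
import requests

single = requests.post(
    "http://localhost:8000/api/v1/tokenize",
    json={"tokenizer": "mecab", "text": "1つ目の入力"},
)
print(single.json())

batch = requests.post(
    "http://localhost:8000/api/v1/batch_tokenize",
    json={"tokenizer": "mecab", "texts": ["1つ目の入力", "2つ目の入力"]},
)
print(batch.json())
```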
konoha/integrations/allennlp.py (1 addition, 1 deletion)
@@ -48,7 +48,7 @@ def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
     @overrides
     def tokenize(self, text: str) -> List[Token]:
         konoha_tokens = self._tokenizer.tokenize(text)
-        tokens = [Token(text=token.surface, lemma_=token.base_form, pos_=token.postag,) for token in konoha_tokens]
+        tokens = [Token(text=token.surface, lemma_=token.base_form, pos_=token.postag) for token in konoha_tokens]
 
         for start_token in self._start_tokens:
             tokens.insert(0, Token(start_token, 0))
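The hunk above only drops a trailing comma; the surrounding method maps konoha tokens onto AllenNLP `Token` objects with `lemma_` and `pos_` filled in. A minimal usage sketch, assuming AllenNLP and MeCab are installed; the printed lemma/POS values depend on the installed dictionary and are indicative only.

```python
# Sketch of the KonohaTokenizer integration touched above.
# Assumes AllenNLP and MeCab are installed.
from konoha.integrations.allennlp import KonohaTokenizer

tokenizer = KonohaTokenizer(tokenizer_name="mecab")
for token in tokenizer.tokenize("吾輩は猫である"):
    # Each AllenNLP Token carries the surface form, lemma, and POS tag.
    print(token.text, token.lemma_, token.pos_)
```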
konoha/word_tokenizer.py (2 additions, 2 deletions)
@@ -57,7 +57,7 @@ def _setup_tokenizer(self) -> None:
             if self._model_path is None:
                 raise ValueError("`model_path` must be specified for sentencepiece.")
 
-            self._tokenizer = word_tokenizers.SentencepieceTokenizer(model_path=self._model_path,)
+            self._tokenizer = word_tokenizers.SentencepieceTokenizer(model_path=self._model_path)
 
         if self._tokenizer_name == "mecab":
             self._tokenizer = word_tokenizers.MeCabTokenizer(
@@ -114,7 +114,7 @@ def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
         return [self._tokenizer.tokenize(text) for text in texts]
 
     @staticmethod
-    def _tokenize_with_remote_host(endpoint: str, payload: Dict, headers: Dict,) -> List[Dict]:
+    def _tokenize_with_remote_host(endpoint: str, payload: Dict, headers: Dict) -> List[Dict]:
         return requests.post(endpoint, json=payload, headers=headers).json()["tokens"]
 
     @staticmethod
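Both hunks above are trailing-comma removals; behaviour is unchanged. For context, a minimal sketch of the code path guarded by the `ValueError`, assuming konoha's top-level `WordTokenizer` API and a local `data/model.spm` model file (neither is shown in this diff).

```python
# Sketch of the sentencepiece path in _setup_tokenizer above.
# Assumes a trained sentencepiece model exists at data/model.spm.
from konoha import WordTokenizer

tokenizer = WordTokenizer("sentencepiece", model_path="data/model.spm")
print([t.surface for t in tokenizer.tokenize("吾輩は猫である")])

# Constructing WordTokenizer("sentencepiece") without model_path raises:
#   ValueError: `model_path` must be specified for sentencepiece.
```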
konoha/word_tokenizers/mecab_tokenizer.py (1 addition, 1 deletion)
@@ -7,7 +7,7 @@
 
 def parse_feature_for_ipadic(elem) -> Token:
     surface, feature = elem.split("\t")
-    (postag, postag2, postag3, postag4, inflection, conjugation, base_form, *other,) = feature.split(",")
+    (postag, postag2, postag3, postag4, inflection, conjugation, base_form, *other) = feature.split(",")
 
     # For words not in a dictionary
     if len(other) == 2:
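Again a trailing-comma removal. To make the unpacking concrete, here is a sketch with a typical IPADIC-style MeCab output line; the sample line is illustrative, not taken from the diff.

```python
# Illustrates the tuple unpacking in parse_feature_for_ipadic above.
# IPADIC dictionary entries carry 9 comma-separated features; the last two
# (reading and pronunciation) end up in `other`. Unknown words omit them,
# which is what the `len(other)` check above distinguishes.
elem = "猫\t名詞,一般,*,*,*,*,猫,ネコ,ネコ"  # illustrative: surface TAB features

surface, feature = elem.split("\t")
(postag, postag2, postag3, postag4, inflection, conjugation, base_form, *other) = feature.split(",")

print(surface, postag, base_form, other)  # 猫 名詞 猫 ['ネコ', 'ネコ']
```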
tests/integrations/test_allennlp_integration.py (27 additions, 63 deletions)
@@ -1,81 +1,45 @@
 import tempfile
+from typing import List, Optional
 
+import allennlp.commands.train
+from allennlp.models.basic_classifier import BasicClassifier
+
 import pytest
 
 from konoha.integrations.allennlp import KonohaTokenizer
 
 
-def test_allennlp_mecab():
-    try:
-        import allennlp  # NOQA
-        import natto  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or MeCab is not installed.")
-
-    tokenizer = KonohaTokenizer(tokenizer_name="mecab")
-    tokens_konoha = tokenizer.tokenize("吾輩は猫である")
-    token_surfaces = "吾輩 は 猫 で ある".split()
-    assert token_surfaces == list(t.text for t in tokens_konoha)
-
-
-def test_allennlp_janome():
-    try:
-        import allennlp  # NOQA
-        import janome  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or Janome is not installed.")
-
-    tokenizer = KonohaTokenizer(tokenizer_name="janome")
-    tokens_konoha = tokenizer.tokenize("吾輩は猫である")
-    token_surfaces = "吾輩 は 猫 で ある".split()
-    assert token_surfaces == list(t.text for t in tokens_konoha)
+@pytest.fixture
+def raw_text():
+    return "吾輩は猫である"
 
 
-def test_allennlp_kytea():
-    try:
-        import allennlp  # NOQA
-        import Mykytea  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or KyTea is not installed.")
-    tokenizer = KonohaTokenizer(tokenizer_name="kytea")
-    tokens_konoha = tokenizer.tokenize("吾輩は猫である")
-    token_surfaces = "吾輩 は 猫 で あ る".split()
-    assert token_surfaces == list(t.text for t in tokens_konoha)
-
-
-def test_allennlp_sentencepiece():
-    try:
-        import allennlp  # NOQA
-        import sentencepiece  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or Sentencepiece is not installed.")
+@pytest.mark.parametrize(
+    "token_surfaces,tokenizer_name,mode,model_path", (
+        ("吾輩 は 猫 で ある".split(" "), "mecab", None, None),
+        ("吾輩 は 猫 で ある".split(" "), "janome", None, None),
+        ("吾輩 は 猫 で あ る".split(" "), "kytea", None, None),
+        ("▁ 吾 輩 は 猫 である".split(" "), "sentencepiece", None, "data/model.spm"),
+        ("吾輩 は 猫 で ある".split(" "), "sudachi", "A", None),
+    )
+)
+def test_allennlp(
+    raw_text: str,
+    token_surfaces: List[str],
+    tokenizer_name: str,
+    mode: Optional[str],
+    model_path: Optional[str],
+) -> None:
     tokenizer = KonohaTokenizer(
-        tokenizer_name="sentencepiece", model_path="data/model.spm"
+        tokenizer_name=tokenizer_name,
+        mode=mode,
+        model_path=model_path,
     )
-    tokens_konoha = tokenizer.tokenize("吾輩は猫である")
-    token_surfaces = "▁ 吾 輩 は 猫 である".split()
-    assert token_surfaces == list(t.text for t in tokens_konoha)
-
-
-def test_allennlp_sudachi():
-    try:
-        import allennlp  # NOQA
-        import sudachipy  # NOQA
-    except ImportError:
-        pytest.skip("AllenNLP or SudachiPy is not installed.")
-    tokenizer = KonohaTokenizer(tokenizer_name="sudachi", mode="A",)
-    tokens_konoha = tokenizer.tokenize("医薬品安全管理責任者")
-    token_surfaces = "医薬 品 安全 管理 責任 者".split()
+    tokens_konoha = tokenizer.tokenize(raw_text)
     assert token_surfaces == list(t.text for t in tokens_konoha)
 
 
 def test_allennlp_training():
-    try:
-        import allennlp.commands.train
-        from allennlp.models.basic_classifier import BasicClassifier
-    except ImportError:
-        pytest.skip("AllenNLP or Konoha (with Janome) is not installed.")
-
     with tempfile.TemporaryDirectory() as serialization_dir:
         model = allennlp.commands.train.train_model_from_file(
             "test_fixtures/classifier.jsonnet",
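The rewrite above folds five per-tokenizer tests into one parametrized test; pytest expands the `parametrize` decorator into five independent cases. A minimal sketch of running the updated module (one possible invocation, not something the diff prescribes):

```python
# Run the updated integration test module; pytest expands the parametrize
# decorator above into five cases plus test_allennlp_training.
import pytest

pytest.main(["-v", "tests/integrations/test_allennlp_integration.py"])
```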
