
feat: add --endpoint option to example (#197)
* feat: introduce KonohaAPITokenizer

* feat: add --endpoint option to example

* chore: with_postag is no longer available
himkt authored Jan 13, 2024
1 parent f6c4fd0 commit f32f52f
Showing 4 changed files with 20 additions and 6 deletions.
13 changes: 7 additions & 6 deletions example/tokenize_demo.py
@@ -1,22 +1,23 @@
+import argparse
+
 from konoha import SentenceTokenizer
 from konoha import WordTokenizer
 
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--endpoint", type=str, default=None)
+    args = parser.parse_args()
+
     sentence_tokenizer = SentenceTokenizer()
     tokenizers = ["MeCab", "KyTea", "Janome", "nagisa", "Character"]
-    tokenizers_support_postag = ["MeCab", "KyTea", "Janome", "nagisa"]
 
     word_tokenizers = []
     for word_tokenizer_name in tokenizers:
         try:
-            _tokenizer = WordTokenizer(word_tokenizer_name)
+            _tokenizer = WordTokenizer(word_tokenizer_name, endpoint=args.endpoint)
             word_tokenizers.append(_tokenizer)
 
-            if word_tokenizer_name in tokenizers_support_postag:
-                _tokenizer = WordTokenizer(word_tokenizer_name)
-                word_tokenizers.append(_tokenizer)
-
         except (ImportError, RuntimeError):
             print("Skip: ", word_tokenizer_name)
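With this change the demo accepts an optional --endpoint flag. A hypothetical invocation against a locally running Konoha API server (the URL is illustrative, not part of this commit):

    python example/tokenize_demo.py --endpoint http://localhost:8000

When --endpoint is omitted, args.endpoint stays None, so WordTokenizer falls back to local tokenizer setup (see the isinstance check in word_tokenizer.py below).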
3 changes: 3 additions & 0 deletions src/konoha/word_tokenizer.py
@@ -10,6 +10,7 @@
 from konoha.data.resource import Resource
 from konoha.data.token import Token
 from konoha.word_tokenizers.tokenizer import BaseTokenizer
+from konoha.word_tokenizers import KonohaAPITokenizer
 
 
 class WordTokenizer:
@@ -42,6 +43,8 @@ def __init__(
 
         if not isinstance(endpoint, str):
             self._setup_tokenizer()
+        else:
+            self._tokenizer = KonohaAPITokenizer(tokenizer)
 
     def _setup_tokenizer(self) -> None:
         if self._tokenizer_name == "character":
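The new branch dispatches on the type of endpoint: passing any string routes construction to KonohaAPITokenizer, while the default None (or any non-string) keeps the existing local _setup_tokenizer path. A minimal sketch of both paths (the endpoint URL is illustrative):

    from konoha import WordTokenizer

    local = WordTokenizer("MeCab")  # endpoint is None -> local _setup_tokenizer()
    remote = WordTokenizer("MeCab", endpoint="http://localhost:8000")  # str -> KonohaAPITokenizer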
1 change: 1 addition & 0 deletions src/konoha/word_tokenizers/__init__.py
@@ -1,6 +1,7 @@
 from .character_tokenizer import CharacterTokenizer  # NOQA
 from .janome_tokenizer import JanomeTokenizer  # NOQA
 from .kytea_tokenizer import KyTeaTokenizer  # NOQA
+from .konoha_api_tokenizer import KonohaAPITokenizer  # NOQA
 from .mecab_tokenizer import MeCabTokenizer  # NOQA
 from .nagisa_tokenizer import NagisaTokenizer  # NOQA
 from .sentencepiece_tokenizer import SentencepieceTokenizer  # NOQA
9 changes: 9 additions & 0 deletions src/konoha/word_tokenizers/konoha_api_tokenizer.py
@@ -0,0 +1,9 @@
+from konoha.word_tokenizers.tokenizer import BaseTokenizer
+
+
+class KonohaAPITokenizer(BaseTokenizer):
+    def __init__(self, tokenizer: str):
+        super().__init__(name=f"{tokenizer} (remote)")
+
+    def tokenize(self, text: str):
+        pass
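As introduced in this commit, KonohaAPITokenizer is a stub: tokenize accepts text but returns None. For illustration only, a minimal sketch of how a remote tokenizer might delegate to an HTTP endpoint; the class name, endpoint argument, route, payload shape, and the requests dependency are all assumptions, not part of this commit:

    import requests

    from konoha.data.token import Token
    from konoha.word_tokenizers.tokenizer import BaseTokenizer


    class RemoteTokenizerSketch(BaseTokenizer):
        def __init__(self, tokenizer: str, endpoint: str):
            super().__init__(name=f"{tokenizer} (remote)")
            self._remote_name = tokenizer
            self._endpoint = endpoint

        def tokenize(self, text: str):
            # Hypothetical API contract: POST the tokenizer name and the text,
            # receive a JSON list of surface forms back.
            response = requests.post(
                f"{self._endpoint}/api/v1/tokenize",
                json={"tokenizer": self._remote_name, "text": text},
            )
            response.raise_for_status()
            return [Token(surface=surface) for surface in response.json()["tokens"]]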
