From 0c89ded7337db6c40641259bff5b0f0d2498bb53 Mon Sep 17 00:00:00 2001
From: Amir Hossein Kargaran
Date: Sat, 16 Apr 2022 19:16:07 +0430
Subject: [PATCH] fix tokenizer support punct.

---
 parstdex/utils/tokenizer.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/parstdex/utils/tokenizer.py b/parstdex/utils/tokenizer.py
index a719547..5c20400 100644
--- a/parstdex/utils/tokenizer.py
+++ b/parstdex/utils/tokenizer.py
@@ -1,4 +1,7 @@
+import re
+
+# persian-english word tokenizer
 def tokenize_words(text):
-    token_list = text.strip().split()
+    token_list = re.findall(r"[\w\u200c']+|[!\"#$%&\'()*+,-./:؛؟،;<=>?@[\\\]^_`{|}~]", text)
     token_list = [x.strip("\u200c") for x in token_list if len(x.strip("\u200c")) != 0]
-    return token_list
\ No newline at end of file
+    return token_list
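
Note: a minimal sketch of the behavior change (the example strings are
illustrative, not from the repository's test suite), assuming parstdex is
installed with this patch applied:

    from parstdex.utils.tokenizer import tokenize_words

    # Before this patch, text.strip().split() left punctuation glued to
    # adjacent words: "سلام، دنیا!" -> ['سلام،', 'دنیا!']

    # After this patch, Latin and Persian punctuation (، ؛ ؟ etc.) become
    # standalone tokens, while words joined by a zero-width non-joiner
    # (\u200c) are kept as single tokens:
    print(tokenize_words("سلام، دنیا!"))       # ['سلام', '،', 'دنیا', '!']
    print(tokenize_words("او می\u200cرود."))   # ['او', 'می\u200cرود', '.']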