Skip to content

Commit

Permalink
Fix tokenizer
Browse files Browse the repository at this point in the history
Support punctuation tokens.
  • Loading branch information
kargaranamir authored Apr 16, 2022
1 parent 05ab787 commit 0c89ded
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions parstdex/utils/tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import re

# persian-english word tokenizer
# persian-english word tokenizer
def tokenize_words(text):
    """Tokenize mixed Persian/English text into words and punctuation marks.

    Words are maximal runs of word characters, ZWNJ (U+200C), and
    apostrophes; each ASCII/Persian punctuation mark (؛ ؟ ، included)
    becomes its own token. Leading/trailing ZWNJ is stripped from every
    token and tokens that become empty are dropped.

    :param text: input string (may be empty)
    :return: list of token strings, in order of appearance
    """
    # One alternation: word-ish run first, otherwise a single punctuation char.
    # Note ",-." inside the class is the range ','..'.' (covers , - .).
    token_list = re.findall(
        r"[\w\u200c']+|[!\"#$%&\'()*+,-./:؛؟،;<=>?@[\\\]^_`{|}~]", text
    )
    # ZWNJ is a joiner, not content: trim it at token edges and drop
    # tokens that were ZWNJ-only.
    token_list = [x.strip("\u200c") for x in token_list if len(x.strip("\u200c")) != 0]
    return token_list

0 comments on commit 0c89ded

Please sign in to comment.