Skip to content

Commit

Permalink
Fix tokenizer
Browse files Browse the repository at this point in the history
Support punctuation tokens.
  • Loading branch information
kargaranamir authored Apr 16, 2022
1 parent 05ab787 commit 0c89ded
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions parstdex/utils/tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import re

# persian-english word tokenizer
# persian-english word tokenizer
def tokenize_words(text):
    """Tokenize mixed Persian/English text into words and punctuation marks.

    Words are maximal runs of word characters, ZWNJ (U+200C), and
    apostrophes; each ASCII/Persian punctuation mark (؛ ؟ ، included)
    becomes its own token. Leading/trailing ZWNJ is stripped from every
    token and tokens that become empty are dropped.

    :param text: input string (may be empty)
    :return: list of token strings, in order of appearance
    """
    # One alternation: word-ish run first, otherwise a single punctuation char.
    # Note ",-." inside the class is the range ','..'.' (covers , - .).
    token_list = re.findall(
        r"[\w\u200c']+|[!\"#$%&\'()*+,-./:؛؟،;<=>?@[\\\]^_`{|}~]", text
    )
    # ZWNJ is a joiner, not content: trim it at token edges and drop
    # tokens that were ZWNJ-only.
    token_list = [x.strip("\u200c") for x in token_list if len(x.strip("\u200c")) != 0]
    return token_list

0 comments on commit 0c89ded

Please sign in to comment.