utils.py
import os
import re
import stanza
import spacy_udpipe
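
# One-time model setup (a sketch, not part of the original file): both
# libraries ship download helpers for the Armenian ('hy') models the
# pipelines below expect. Left commented out so importing this module
# has no network side effects.
#   stanza.download('hy')
#   spacy_udpipe.download('hy')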

# External resources directory, resolved relative to this file.
EXTERNAL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'external_data')

# Armenian pipelines: UDPipe (via spaCy) for tokenization and sentence
# splitting, stanza for lemmatization and POS tagging.
nlp_udpipe = spacy_udpipe.load(lang="hy")
nlp_stanza = stanza.Pipeline(use_gpu=False, lang='hy', processors='tokenize,mwt,pos,lemma,depparse')

def lemmatizer(text: str):
    # Lemma of every word, flattened across sentences.
    doc = nlp_stanza(text)
    return [word.lemma for sentence in doc.sentences for word in sentence.words]

def pos_tagger(text: str):
    # Universal POS tag of every word, flattened across sentences.
    doc = nlp_stanza(text)
    return [word.pos for sentence in doc.sentences for word in sentence.words]

def word_tokenize(text: str, remove_punctuation=False):
    # Tokenize with UDPipe, optionally stripping punctuation first.
    text = remove_punct(text) if remove_punctuation else text
    doc = nlp_udpipe(text)
    return [token.text for token in doc]

def letter_tokenize(text: str):
    # Keep only Armenian letters (U+0531-U+0556 uppercase, U+0561-U+0587 lowercase).
    return list(re.sub(r'[^\u0561-\u0587\u0531-\u0556]', '', text))

def letters_and_numbers(text: str):
    # Keep only digits and Armenian letters, as a list of characters.
    return list(re.sub(r'[^\d\u0561-\u0587\u0531-\u0556]', '', text))

def remove_punct(text: str):
    # Replace everything except digits, whitespace, and Armenian letters with a space.
    return re.sub(r'[^\d\s\u0561-\u0587\u0531-\u0556]', ' ', text)

def remove_non_letters(text: str):
    # Replace everything except whitespace and Armenian letters with a space.
    return re.sub(r'[^\s\u0561-\u0587\u0531-\u0556]', ' ', text)

def sentence_tokenize(text: str):
    # Split into sentences with UDPipe. spaCy v3 removed Span.string;
    # Span.text is the supported accessor.
    doc = nlp_udpipe(text)
    return [sent.text for sent in doc.sents]
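

# Usage sketch (an illustration, not part of the original module): the
# Armenian sample text is arbitrary, and the guard keeps the demo from
# running on import. Assumes the 'hy' models have been downloaded.
if __name__ == '__main__':
    sample = 'Բարև ձեզ։ Ես սիրում եմ հայերեն։'
    print(sentence_tokenize(sample))                       # two sentences
    print(word_tokenize(sample, remove_punctuation=True))  # word tokens only
    print(lemmatizer(sample))                              # one lemma per word
    print(pos_tagger(sample))                              # one POS tag per word
    print(letter_tokenize('Բարև 123'))                     # digits and spaces dropped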