# nltk_utils.py
import pickle

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

def remove_stopwords(text, custom_stopwords):
    """Remove stopwords from text, using the pickled custom list when requested."""
    if custom_stopwords:
        stopwords_list = load_stopwords_custom_object()
    else:
        stopwords_list = stopwords.words('english')
    tokens = [word for word in word_tokenize(text) if word.lower() not in stopwords_list]
    return " ".join(tokens)
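
# Example usage (assumes the NLTK "punkt" and "stopwords" data are installed,
# e.g. via setup() below):
#   remove_stopwords("This is a simple example", custom_stopwords=False)
#   -> "simple example"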

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the corresponding WordNet POS constant."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # WordNetLemmatizer treats unknown tags as nouns, so default to NOUN.
        return wordnet.NOUN
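
# For example, pos_tag(word_tokenize("dogs ran quickly")) typically yields
# ('dogs', 'NNS'), ('ran', 'VBD'), ('quickly', 'RB'), which map to
# wordnet.NOUN, wordnet.VERB and wordnet.ADV respectively.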

def lemmatize_triplets(triplets):
    """Lemmatize every token of each triplet's "relation" string, in place."""
    lemmatizer = WordNetLemmatizer()
    for triplet in triplets:
        tagged = pos_tag(word_tokenize(str(triplet["relation"])))
        triplet["relation"] = " ".join(
            lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged
        )

def lemmatize_triplets_only_verbs(triplets):
    """Lemmatize only the verb tokens of each triplet's "relation", in place."""
    lemmatizer = WordNetLemmatizer()
    for triplet in triplets:
        tagged = pos_tag(word_tokenize(str(triplet["relation"])))
        tokens = []
        for word, tag in tagged:
            if get_wordnet_pos(tag) == wordnet.VERB:
                tokens.append(lemmatizer.lemmatize(word, wordnet.VERB))
            else:
                tokens.append(word)
        triplet["relation"] = " ".join(tokens)
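
# Example (tagger output on short fragments can vary):
#   triplets = [{"relation": "was running towards"}]
#   lemmatize_triplets(triplets)             # relation becomes "be run towards"
#   triplets = [{"relation": "runs tests"}]
#   lemmatize_triplets_only_verbs(triplets)  # "run tests": the noun keeps its plural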

def setup():
    """Download the NLTK data this module needs (run once per environment)."""
    for resource in ("punkt", "stopwords", "wordnet", "averaged_perceptron_tagger"):
        nltk.download(resource)

def print_stopwords():
    """Print the default English stopword list, one word per line."""
    for word in stopwords.words('english'):
        print(word)

def create_stopwords_custom_object(filename):
    """Read one stopword per line from filename and pickle the list to ./obj/."""
    with open(filename, "r") as f:
        custom_stopwords = f.read().splitlines()
    print("Custom stopwords loaded: " + str(len(custom_stopwords)))
    print(custom_stopwords)
    with open("./obj/custom_stopwords.obj", "wb") as f:
        pickle.dump(custom_stopwords, f)

def load_stopwords_custom_object():
    """Load the pickled custom stopword list written by create_stopwords_custom_object."""
    with open("./obj/custom_stopwords.obj", "rb") as f:
        return pickle.load(f)
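
# Minimal round trip (assumes an ./obj directory exists; my_stopwords.txt is a
# hypothetical file with one stopword per line):
#   create_stopwords_custom_object("my_stopwords.txt")
#   custom = load_stopwords_custom_object()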

def sentence_tokenize(text):
    """Split text into sentences (thin wrapper around nltk.sent_tokenize)."""
    return sent_tokenize(text)

# Contraction expansions, applied in order. A blunt heuristic rather than a
# full contraction resolver: "'d" is always read as "had", and forms such as
# "can't" expand to "ca not".
_CONTRACTIONS = [
    ("n't ", " not "),
    ("'ve ", " have "),
    (" what's ", " what is "),
    ("What's ", "What is "),
    (" where's ", " where is "),
    ("Where's ", "Where is "),
    (" how's ", " how is "),
    ("How's ", "How is "),
    (" he's ", " he is "),
    (" she's ", " she is "),
    (" it's ", " it is "),
    ("He's ", "He is "),
    ("She's ", "She is "),
    ("It's ", "It is "),
    ("'d ", " had "),
    ("'ll ", " will "),
    ("'m ", " am "),
    (" ma'am ", " madam "),
    (" o'clock ", " of the clock "),
    (" 're ", " are "),
    (" y'all ", " you all "),
]


def preprocess_text(text):
    """Expand common English contractions so downstream tokenization is cleaner."""
    for contraction, expansion in _CONTRACTIONS:
        text = text.replace(contraction, expansion)
    return text
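

if __name__ == "__main__":
    # Quick smoke test; "is not it?" shows the bluntness of the heuristic.
    print(preprocess_text("She's sure it's fine, isn't it?"))
    # -> She is sure it is fine, is not it?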