utils.py
import argparse
import string

import pandas as pd
import numpy as np
import gensim
import nltk
import spacy
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# The NLTK corpora 'stopwords', 'wordnet' and 'words' must be downloaded once,
# e.g. via nltk.download('stopwords'), before this module is used.
stopwords = set(stopwords.words('english'))
punctuation = set(string.punctuation)
lemmatize = WordNetLemmatizer()
nlp = spacy.load('en_core_web_sm')
words = set(nltk.corpus.words.words())


def preprocess(documents):
    """
    Processes sentences before passing them on to train the model.

    Arguments
    ---------
    documents: list of reviews split into sentences

    Returns
    ---------
    tokens: tokenized, de-accented and lowercased word lists
    filtered: word lists with numbers, symbols, stopwords etc. filtered out
    """
    # Simple tokenizer: de-accent and lowercase each document
    tokens = []
    for document in documents:
        tokens.append(gensim.utils.simple_preprocess(document,
                                                     deacc=True, min_len=3))

    # POS tagging and filtering of every sentence
    filtered = []
    for document in documents:
        doc = nlp(document)
        kept = []
        for tok in doc:
            if (not tok.is_stop and tok.pos_ not in ('SYM', 'NUM')
                    and tok.tag_ not in ('PRP', 'PRP$', '_SP')
                    and tok.dep_ not in ('aux', 'prep', 'det', 'cc')
                    and tok.lemma_ != 'frac' and len(tok) != 1
                    and tok.lemma_.lower() in words
                    and tok.lemma_.lower() not in stopwords
                    and tok.lemma_.lower() not in punctuation):
                kept.append(lemmatize.lemmatize(tok.lemma_.lower()))
        filtered.append(kept)
    return tokens, filtered
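
# A hedged sketch (not part of the original pipeline) of how preprocess() output
# could be turned into the dictionary and bag-of-words corpus that the LDA model
# loaded below is assumed to have been trained on; the sample reviews are made up.
#
#   docs = ["The battery life is great", "The screen cracked after two days"]
#   _, filtered = preprocess(docs)
#   dictionary = corpora.Dictionary(filtered)
#   corpus = [dictionary.doc2bow(text) for text in filtered]
#   ldamodel = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)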


def pre_new(doc, dictionary):
    """
    Preprocess a new document before inferring topics.

    Arguments
    ---------
    doc: new document to preprocess
    dictionary: dictionary of the corpus used to train the LDA model

    Returns
    ---------
    Bag-of-words representation of the preprocessed document
    """
    one, _ = preprocess([doc])
    return dictionary.doc2bow(one[0])
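
# Hedged example of pre_new's output (document text and ids are made up): the
# return value is a gensim bag-of-words list of (token_id, count) pairs.
#
#   bow = pre_new("The battery drains quickly", dictionary)
#   # bow might look like [(3, 1), (17, 1)], depending on the dictionary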


def inference(df, ldamodel, dictionary):
    """
    Run inference on new documents using a pretrained LDA model.

    Arguments
    ---------
    df: iterable of documents to infer topics for
    ldamodel: trained LDA model
    dictionary: dictionary of the corpus used to train the LDA model

    Returns
    ---------
    topic_list: list with the top five topic words per document
    values: flat list of all topic words across documents
    """
    topic_list = []
    values = []
    for item in df:
        # Topic distribution for the preprocessed document
        belong = ldamodel[pre_new(str(item), dictionary)]
        new = pd.DataFrame(belong, columns=['id', 'prob']).sort_values('prob', ascending=False)
        # Keep the five most probable entries, mapped back through the dictionary
        p = []
        for _, val in new.iterrows():
            p.append(dictionary.get(int(val['id'])))
        p = p[:5]
        values += p
        topic_list.append(p)
    return topic_list, values
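
# Minimal sketch of calling inference() directly (file names and texts are
# placeholders): a trained model and dictionary are assumed to exist on disk.
#
#   ldamodel = LdaModel.load("lda.model")
#   dictionary = corpora.Dictionary.load("lda.dict")
#   texts = pd.Series(["Great camera but poor battery", "Fast delivery, bad packaging"])
#   topic_list, values = inference(texts, ldamodel, dictionary)
#   # topic_list[0] then holds up to five topic words for the first text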


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--topic_model", type=str,
                        required=True, help="Topic model file")
    parser.add_argument("--dictionary", type=str,
                        required=True, help="Dictionary file")
    parser.add_argument("--corpus", type=str,
                        required=True, help="Input corpus file")
    parser.add_argument("--output_dict", type=str,
                        required=True, help="Output dictionary file")
    parser.add_argument("--output_data", type=str,
                        required=True, help="Output data file")
    parser.add_argument("--title", type=int,
                        default=1, help="1 if title column should be used")
    parser.add_argument("--description", type=int,
                        default=1, help="1 if description column should be used")
    args = parser.parse_args()

    # Load the pretrained LDA model and the dictionary of its training corpus
    ldamodel = LdaModel.load(args.topic_model, mmap='r')
    dictionary = corpora.Dictionary.load(args.dictionary)

    # Load the corpus and build one text per resource from title and description
    df = pd.read_csv(args.corpus)
    df = df.drop_duplicates(subset=['resource_id'], keep='first')
    df = df['title'] + " " + df['description']

    # Infer the topics of each document
    topic_list, values = inference(df, ldamodel, dictionary)

    # Save the unique topic words as the output dictionary
    unique_values = ['keys'] + list(pd.unique(values))
    np.savetxt(args.output_dict, unique_values, delimiter="\n", fmt='%s')

    # Save each document together with its top five topic words
    save_file = []
    for i, val in enumerate(df):
        st = " ".join(str(t) for t in topic_list[i][:5])
        save_file.append([st, val])
    save_file = pd.DataFrame(save_file, columns=['tk', 'val'])
    save_file.to_csv(args.output_data, header=True, index=False)
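
# Example invocation (all file names are placeholders, not taken from the repo):
#
#   python utils.py --topic_model lda.model --dictionary lda.dict \
#       --corpus resources.csv --output_dict topics_dict.txt --output_data topics_data.csv
#
# The input CSV is expected to contain 'resource_id', 'title' and 'description' columns.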