-
Notifications
You must be signed in to change notification settings - Fork 9
/
helper_functions.py
424 lines (310 loc) · 8.83 KB
/
helper_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
# -*- coding: utf-8 -*-
"""
Created by: Shaheen Syed
Data: August 2018
"""
# packages and modules
import logging, os, requests, textract, glob2, sys, csv
from datetime import datetime
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora, models
def set_logger(folder_name='logs'):
    """
    Set up logging to a timestamped log file and mirror records to the console.

    Parameters
    ----------
    folder_name : string, optional
        name of the folder where the logs can be saved to
    """
    # create the logging folder if it does not exist
    create_directory(folder_name)
    # name the log file after the current timestamp, e.g. 20180801123000.log
    log_file_name = os.path.join(folder_name, '{:%Y%m%d%H%M%S}.log'.format(datetime.now()))
    # log everything (NOTSET) to the file
    logging.basicConfig(filename=log_file_name, level=logging.NOTSET)
    # also send log records to the console with a compact layout
    console = logging.StreamHandler()
    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
def create_directory(name):
    """
    Create a directory if it does not yet exist.

    Parameters
    ----------
    name : string
        name of the folder to be created
    """
    try:
        if not os.path.exists(name):
            os.makedirs(name)
            logging.info('Created directory: {}'.format(name))
    except Exception as e:
        # log the failing function name together with the error, then abort
        logging.error("[{}] : {}".format(sys._getframe().f_code.co_name, e))
        exit(1)
def get_HTTPHeaders():
    """
    Build an http header so a crawler will be identified as a normal browser.

    Returns
    --------
    http_header : dictionary
        html headers
    """
    # assemble the header field by field for readability
    http_header = {}
    http_header["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome"
    http_header["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    http_header["Accept-Language"] = "en-US,en;q=0.5"
    return http_header
def return_html(url):
    """
    Scrape html content from a url.

    Parameters
    ---------
    url : string
        http link to a website

    Returns
    -------
    html : requests response object or None
        the full content of the html page, or None when the request failed
        or a non-OK status code was returned
    """
    try:
        # retrieve html content while presenting ourselves as a regular browser
        html = requests.get(url, headers=get_HTTPHeaders())
        # only accept an OK status code
        if html.status_code == requests.codes.ok:
            return html
        else:
            logging.error("[return_html] invalid status code: {}".format(html.status_code))
            return None
    except Exception as e:
        logging.error('[{}] : {}'.format(sys._getframe().f_code.co_name, e))
        return None
def save_pdf(url, folder, name, overwrite=True):
    """
    Save a PDF file from the web to disk.

    Parameters
    -----------
    url : string
        http link to PDF file
    folder : os.path
        location where to store the PDF file
    name : string
        name of the PDF file
    overwrite : Boolean (optional)
        if PDF already on disk, set to True if it needs to be overwritten, or False to skip
    """
    # create folder if it does not exist
    create_directory(folder)
    # full target location of the PDF on disk
    target = os.path.join(folder, name)
    # only download when overwriting is allowed or the file is not yet on disk
    if overwrite or not os.path.exists(target):
        try:
            # retrieve pdf content
            response = requests.get(url, headers=get_HTTPHeaders(), stream=True)
            # save to folder
            with open(target, 'wb') as f:
                f.write(response.content)
        except Exception as e:
            logging.error('[{}] : {}'.format(sys._getframe().f_code.co_name, e))
            exit(1)
def pdf_to_plain(pdf_file):
    """
    Read a PDF file and convert it to plain text.

    Parameters
    ----------
    pdf_file : string
        location of the pdf file

    Returns
    ---------
    plain_pdf : string or None
        plain text version of the PDF file, or None when conversion failed
    """
    try:
        # use textract to convert PDF to plain text
        return textract.process(pdf_file, encoding='utf8')
    except Exception as e:
        logging.error('[{}] : {}'.format(sys._getframe().f_code.co_name, e))
        return None
def read_directory(directory):
    """
    Read file names from a directory recursively.

    Parameters
    ----------
    directory : string
        directory/folder name where to read the file names from

    Returns
    ---------
    files : list of strings
        list of file names
    """
    try:
        # '**' makes glob2 recurse into subdirectories; '*.*' keeps files with an extension
        return glob2.glob(os.path.join(directory, '**', '*.*'))
    except Exception as e:
        logging.error("[{}] : {}".format(sys._getframe().f_code.co_name, e))
        exit(1)
def word_tokenizer(text):
    """
    Return the lemmas of the individual tokens of a document.

    Numbers, punctuation, stopwords and single-character tokens are excluded.

    Parameters
    ----------
    text : iterable of tokens (e.g. a spacy Doc)
        each token must expose lemma_, is_alpha, is_stop and support len()

    Returns
    -------
    tokens : list of strings
        lemmatized tokens without punctuation, numbers, stopwords and
        single-character words
    """
    try:
        # lemmatize tokens; drop punctuation/numbers (is_alpha), stopwords and 1-char words
        return [token.lemma_ for token in text if token.is_alpha and not token.is_stop and len(token) > 1]
    except Exception as e:
        logging.error('[{}] : {}'.format(sys._getframe().f_code.co_name, e))
        exit(1)
def get_bigrams(text):
    """
    Get all the bigrams (pairs of consecutive words) from a given text.

    Parameters
    ----------
    text : string
        whitespace separated text

    Returns
    -------
    bigrams : list of tuples
        consecutive word pairs, e.g. 'a b c' -> [('a', 'b'), ('b', 'c')]
    """
    try:
        words = text.split()
        # pair each word with its successor; same output as nltk.bigrams
        return list(zip(words, words[1:]))
    except Exception as e:
        logging.error('[{}] : {}'.format(sys._getframe().f_code.co_name, e))
        exit(1)
def named_entity_recognition(text):
    """
    Perform named entity recognition on a document and return the found entities.

    Only entities of more than two words are kept; entities containing digits
    or non-ASCII characters are filtered out.

    Parameters
    ----------
    text : spacy Doc (or object with an .ents attribute)
        document holding the detected entities

    Returns
    -------
    entities : list of strings
        lowercased, stripped entity strings of at least three words
    """
    try:
        # entities detected for the document
        ents = text.ents
        # keep lowercased entities that consist of more than two words
        entities = [str(entity).lower() for entity in ents if len(str(entity).split()) > 2]
        # discard entities containing digits or non-ASCII characters
        return [ent.strip() for ent in entities if not any(char.isdigit() for char in ent) and all(ord(char) < 128 for char in ent)]
    except Exception as e:
        logging.error('[{}] : {}'.format(sys._getframe().f_code.co_name, e))
        exit(1)
def get_dic_corpus(file_folder):
    """
    Read the dictionary and corpus for Gensim LDA from disk.

    Parameters
    -----------
    file_folder : os.path
        location of the dictionary and corpus

    Returns
    -------
    dictionary : gensim.corpora.Dictionary
        LDA dictionary
    corpus : gensim.corpora.MmCorpus
        LDA corpus

    Exits the process when either file is missing.
    """
    # create full path of dictionary
    dic_path = os.path.join(file_folder, 'dictionary.dict')
    # create full path of corpus
    corpus_path = os.path.join(file_folder, 'corpus.mm')
    # check if dictionary exists
    if os.path.exists(dic_path):
        dictionary = corpora.Dictionary.load(dic_path)
    else:
        logging.error('LDA dictionary not found')
        exit(1)
    # check if corpus exists
    if os.path.exists(corpus_path):
        corpus = corpora.MmCorpus(corpus_path)
    else:
        logging.error('LDA corpus not found')
        exit(1)
    return dictionary, corpus
def load_lda_model(model_location):
    """
    Load a trained gensim LDA model from disk.

    Parameters
    -----------
    model_location : os.path()
        location of the LDA model

    Returns
    -------
    model : gensim.models.LdaModel
        trained gensim lda model
    """
    # the model is always stored as 'lda.model' inside the given folder
    model_path = os.path.join(model_location, 'lda.model')
    # abort when the model file is missing, otherwise deserialize it
    if not os.path.exists(model_path):
        logging.error('LDA model not found')
        exit(1)
    return models.LdaModel.load(model_path)
def get_topic_label(k, labels_available=True):
    """
    Return a human readable label for an LDA topic id.

    Parameters
    -----------
    k : int
        topic id from the lda model
    labels_available : Boolean (optional)
        if set to True, then labels are present; otherwise return e.g. 'Topic 1'.
        Default is True

    Returns
    -------
    label : string
        label for the topic word distribution
    """
    if not labels_available:
        return 'Topic {}'.format(k)
    # manually assigned labels for the 10 topics of the trained model
    topics = {0: 'Convergence',
              1: 'State, Policy, Action',
              2: 'Linear Algebra',
              3: 'NLP',
              4: 'Inference',
              5: 'Computer Vision',
              6: 'Graphical Models',
              7: 'Neural Network Learning',
              8: 'Stimulus Response',
              9: 'Neural Network Structure'}
    # fall back to a generic label for unknown topic ids instead of raising KeyError
    return topics.get(k, 'Topic {}'.format(k))
def save_csv(data, name, folder):
    """
    Save a list of lists as a CSV (comma separated values) file.

    Parameters
    ----------
    data : list of list
        a list of lists that contains the data to be stored in CSV file format
    name : string
        the name of the file (the '.csv' extension is appended when missing)
    folder : string
        the folder location
    """
    try:
        # create folder name as directory if it does not exist
        create_directory(folder)
        # allow both 'name' and 'name.csv' to be handled correctly
        suffix = '.csv'
        if not name.endswith(suffix):
            name += suffix
        # create the full file path
        path = os.path.join(folder, name)
        # write all rows with a unix line terminator
        with open(path, "w") as f:
            writer = csv.writer(f, lineterminator='\n')
            writer.writerows(data)
    except Exception as e:
        logging.error('[{}] : {}'.format(sys._getframe().f_code.co_name, e))
        exit(1)
def read_csv(filename, folder=None):
    """
    Read a CSV file and return its rows as a list.

    Parameters
    ---------
    filename : string
        name of the csv file
    folder : string (optional)
        name of the folder where the csv file can be read

    Returns
    --------
    rows : list of lists
        one inner list per CSV row
    """
    if folder is not None:
        filename = os.path.join(folder, filename)
    try:
        # increase the CSV field size limit so very large fields do not raise
        csv.field_size_limit(sys.maxsize)
        # the csv module requires text mode ('r'); binary mode ('rb') fails on Python 3
        with open(filename, 'r') as f:
            # create the reader
            reader = csv.reader(f)
            # return csv as list
            return list(reader)
    except Exception as e:
        logging.error('[{}] : {}'.format(sys._getframe().f_code.co_name, e))
        exit(1)