-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
119 lines (93 loc) · 3.21 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
# Fetch the NLTK resources this module relies on. These calls run at import
# time, i.e. every time the module is imported.
# NOTE(review): the POS-tagger resource is not referenced in the visible part
# of this file - presumably needed by callers; confirm before removing.
nltk.download( 'averaged_perceptron_tagger' )
nltk.download( 'stopwords' )
# Shared stemmer instance, used by _clean_tokens.
port_stemmer = PorterStemmer()
# Rebinds the name `stopwords` from the object imported from nltk.corpus to a
# set of English stop words; _clean_tokens tests membership against this set.
stopwords = set( stopwords.words( 'english' ) )
def _get_text( row, attr ):
"""
Get the row from a dataframe
"""
return "" if type( row[ attr ] ) is float else str( row[ attr ] )
def _remove_special_chars( text ):
"""
Remove special characters in text
"""
return re.sub( '[^a-zA-Z0-9]', ' ', text )
def _remove_duplicate_file_types( tokens ):
"""
Remove duplicate files
"""
tokens_list = tokens.split( " " )
unique_tokens = ''
for item in tokens_list:
if item not in unique_tokens and item is not "":
unique_tokens = item if unique_tokens == "" else unique_tokens + " " + item
return unique_tokens
def _clean_tokens( text_list, stop_words ):
    """
    Filter, lower-case and stem a list of raw tokens

    text_list: iterable of token strings
    stop_words: caller-supplied collection of words to drop
    returns: list of Porter-stemmed tokens
    """
    stemmed = []
    for raw in text_list:
        # discard one letter words and anything _check_number accepts
        if len( raw ) <= 1 or _check_number( raw ):
            continue
        word = raw.lower()
        # drop caller-supplied stop words, then the module-level NLTK set
        if word in stop_words or word in stopwords:
            continue
        stemmed.append( port_stemmer.stem( word ) )
    return stemmed
def _restore_space( text ):
return " ".join( text.split( "," ) )
def _format_dict_string( dictionary ):
string = ''
for item in dictionary:
string += '%s: %s' % ( item, dictionary[ item ] ) + '\n'
return string
def _check_number( item ):
try:
int( item )
return True
except Exception:
return False
def l1_norm( vector ):
    """
    Scale a vector by its L1 norm (the sum of the absolute values of its entries)

    The absolute values are summed so that vectors with negative entries are
    scaled by their true L1 norm instead of by a signed sum that can cancel
    out; for non-negative vectors the result is the same as a plain sum.

    vector: numeric array
    returns: vector / norm, or the scalar 0 when the norm is 0
    """
    norm = np.sum( np.abs( vector ) )
    # a zero norm cannot be divided by, report 0 instead
    return 0 if norm == 0 else vector / norm
def _euclidean_distance( x, y ):
    """
    Euclidean distance between two vectors after each one has been
    rescaled with l1_norm
    """
    difference = l1_norm( x ) - l1_norm( y )
    return np.sqrt( np.sum( difference ** 2 ) )
def _cosine_angle_score( vector1, vector2 ):
"""
Get value of the cosine angle between two vectors
"""
# if either of the vectors is zero, then similarity is also 0
# which means the vectors cannot be compared
vec1_length = np.sqrt( np.dot( vector1, vector1 ) )
vec2_length = np.sqrt( np.dot( vector2, vector2 ) )
if vec1_length == 0 or vec2_length == 0:
return 0
else:
return np.dot( vector1, vector2 ) / ( vec1_length * vec2_length )
def _jaccard_score( vector1, vector2 ):
"""
Get jaccard score for two vectors
"""
dot_product = np.dot( vector1, vector2 )
jaccard_denominator = np.dot( vector1, vector1 ) + np.dot( vector2, vector2 ) - dot_product
if jaccard_denominator == 0:
return 0
else:
return dot_product / float( jaccard_denominator )
def _plot_singular_values_rank( rank_list, sum_singular_values_list ):
    """
    Draw and display a line plot of the summed singular values against
    the matrix rank

    rank_list: values for the x axis
    sum_singular_values_list: values for the y axis
    """
    plt.plot( rank_list, sum_singular_values_list )
    # decorate the axes that plt.plot just drew on
    plt.title( 'Variation of sum of singular values with matrix rank' )
    plt.xlabel( 'Matrix rank' )
    plt.ylabel( '% sum of singular values taken' )
    plt.grid()
    plt.show()