-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
98 lines (80 loc) · 3.42 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import re
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
# Hyperparameters
# Longest word n-gram length; used as the default upper bound when tokenizing.
ngram_len = 3
# CSV file the script loads its examples from.
file_name = 'data/goemotions_1.csv'
# Names of the label columns selected from the CSV.
labels = [
    'admiration',
    'amusement',
    'anger',
    'annoyance',
    'approval',
    'caring',
    'confusion',
    'curiosity',
    'desire',
    'disappointment',
    'disapproval',
    'disgust',
    'embarrassment',
    'excitement',
    'fear',
    'gratitude',
    'grief',
    'joy',
    'love',
    'nervousness',
    'optimism',
    'pride',
    'realization',
    'relief',
    'remorse',
    'sadness',
    'surprise',
    'neutral',
]
def ngram(token, n):
    """Return every contiguous n-gram of a token list as a space-joined string.

    Args:
        token: Sequence of string tokens (for example the result of
            ``str.split``).
        n: Number of consecutive tokens per n-gram.

    Returns:
        A list with one string per window of ``n`` consecutive tokens, in
        order of appearance. The list is empty when ``n`` is less than 1 or
        when there are fewer than ``n`` tokens.
    """
    if n < 1:
        # An n-gram needs at least one token. Without this guard the window
        # slicing below would emit empty or wrapped-around strings.
        return []
    # range() is empty when len(token) < n, so short inputs yield [].
    return [' '.join(token[start:start + n]) for start in range(len(token) - n + 1)]
def ngram_tokenize(text, nrange=(1, ngram_len)):
    """Turn a text into a bag of word n-gram and punctuation features.

    The text is lower-cased first. Word n-grams of every length from
    ``nrange[0]`` to ``nrange[1]`` (both inclusive) are taken from the text
    with every character other than ``a-z``, ``0-9`` and ``#`` replaced by a
    space. Punctuation features are the whitespace-separated runs that remain
    after ``a-z`` and ``0-9`` are replaced by spaces.

    Args:
        text: Input string.
        nrange: ``(min_n, max_n)`` pair of n-gram lengths.

    Returns:
        A ``collections.Counter`` mapping each feature string to its count.
    """
    lowered = text.lower()

    # Word view: anything outside a-z, 0-9 and '#' acts as a separator.
    words = re.sub('[^a-z0-9#]', ' ', lowered).split()
    # Punctuation view: letters and digits act as separators instead.
    symbols = re.sub('[a-z0-9]', ' ', lowered).split()

    features = Counter()
    for size in range(nrange[0], nrange[1] + 1):
        features.update(ngram(words, size))
    # Punctuation runs are only counted individually (unigrams).
    features.update(ngram(symbols, 1))
    return features
# Read the file
df = pd.read_csv(file_name)
# Keep only the rows whose rater_id is 9.
biased_df = df[df['rater_id'] == 9]
print(f'Biased_DF shape {biased_df.shape}')
print('Head 10 elements:')
print(biased_df[['text', 'author', 'rater_id']].head(10))

# The first CSV column is used as the example text.
text_examples = biased_df[biased_df.columns[0]].values.tolist()

# For every row: the names of the label columns whose value reads '1'.
raw_labels = biased_df[labels].astype(str).apply(lambda row: row[row == '1'].index, axis=1)

# Each example gets the first flagged label; rows with no flagged label are
# tagged 'unclear'. Series.iteritems() was removed in pandas 2.0, so the
# Series is walked with Series.items() instead.
text_labels = [
    flagged[0] if len(flagged) > 0 else 'unclear'
    for _, flagged in raw_labels.items()
]
print("File loaded")

# Split into test and train
y_all = text_labels
x_all = [ngram_tokenize(sample, nrange=(1, ngram_len)) for sample in text_examples]
# shuffle=False keeps the rows in file order for the split.
# NOTE(review): random_state presumably has no effect without shuffling -
# confirm against the scikit-learn documentation.
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, random_state=123, shuffle=False)
def train_test(clf, x_train, x_test, y_train, y_test):
    """Fit a classifier and measure its accuracy on both data splits.

    Args:
        clf: Estimator object providing ``fit(X, y)`` and ``predict(X)``.
        x_train: Features the classifier is fitted on.
        x_test: Features of the held-out split.
        y_train: Target labels matching ``x_train``.
        y_test: Target labels matching ``x_test``.

    Returns:
        A ``(train_acc, test_acc)`` tuple of ``accuracy_score`` results for
        the training split and the test split, in that order.
    """
    clf.fit(x_train, y_train)
    # Score the training split first, then the test split.
    splits = ((x_train, y_train), (x_test, y_test))
    train_acc, test_acc = [
        accuracy_score(expected, clf.predict(features))
        for features, expected in splits
    ]
    return train_acc, test_acc
# Candidate classifiers. Only the ones listed in `clifs` are actually trained.
svc = SVC()
lsvc = LinearSVC(random_state=123)
rforest = RandomForestClassifier(random_state=123)
dtree = DecisionTreeClassifier()
knn = KNeighborsClassifier(n_neighbors=10)
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=123,
    max_iter=400,
)
#clifs = [svc, lsvc, rforest, dtree, knn, clf]
clifs = [svc]

# Turn the per-example feature counts into a sparse matrix. The vectorizer is
# fitted on the training split only and then reused for the test split.
vectorizer = DictVectorizer(sparse=True)
x_train = vectorizer.fit_transform(x_train)
# Debug output for the first training row (presumably the column indices of
# its stored features - confirm against the sparse matrix type in use).
sample = x_train[0].indices
print(sample)
x_test = vectorizer.transform(x_test)

# train and test them
print("| {:25} | {} | {} |".format("Classifier", "Training Accuracy", "Test Accuracy"))
print("| {} | {} | {} |".format("-"*25, "-"*17, "-"*13))
# NOTE: the loop variable rebinds `clf`, which held the MLPClassifier until here.
for clf in clifs:
    clf_name = type(clf).__name__
    train_acc, test_acc = train_test(clf, x_train, x_test, y_train, y_test)
    print("| {:25} | {:17.7f} | {:13.7f} |".format(clf_name, train_acc, test_acc))

print('done')