-
Notifications
You must be signed in to change notification settings - Fork 11
/
classifying.py
62 lines (58 loc) · 2.88 KB
/
classifying.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from load_test_data import load
def classify(vectors, labels, train_text, task, type="DT"):
    """Fit the requested classifier and print its train/test scores.

    A random subset of ``vectors``/``labels`` (about two thirds, given
    ``test_size=0.333``) is used for fitting. The held-out part of that
    split is discarded: the evaluation set is whatever
    ``load(train_text, task)`` returns.

    ``type`` selects the model and must be one of ``"MNB"``, ``"KNN"``,
    ``"SVM"``, ``"DT"``, ``"RF"`` or ``"LR"``. (The parameter name shadows
    the builtin; it is kept because callers may pass it by keyword.)
    ``"MNB"`` is fitted directly; every other model goes through a 3-fold
    ``GridSearchCV`` and the best estimator found is kept.

    Nothing is returned. Training accuracy, test accuracy and the test
    confusion matrix are printed. An unrecognised ``type`` prints a
    message and returns early without fitting anything.
    """
    # Keep only the training side of the random split; the evaluation data
    # is loaded separately right below.
    fit_vectors, _, fit_labels, _ = train_test_split(
        vectors, labels, test_size=0.333
    )
    test_vectors, test_labels = load(train_text, task)

    # (type key, estimator factory, hyper-parameter grid or None).
    # Factories are lambdas so only the selected estimator is constructed.
    recipes = (
        ("MNB", lambda: MultinomialNB(alpha=0.7), None),
        (
            "KNN",
            lambda: KNeighborsClassifier(n_jobs=4),
            {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']},
        ),
        ("SVM", lambda: SVC(), {'C': [0.001, 0.01, 0.1, 1, 10]}),
        (
            "DT",
            lambda: DecisionTreeClassifier(max_depth=800, min_samples_split=5),
            {'criterion': ['gini', 'entropy']},
        ),
        (
            "RF",
            lambda: RandomForestClassifier(max_depth=800, min_samples_split=5),
            {
                'n_estimators': list(range(50, 200, 50)),
                'criterion': ['gini', 'entropy'],
            },
        ),
        (
            "LR",
            lambda: LogisticRegression(multi_class='auto', solver='newton-cg'),
            {"C": np.logspace(-3, 3, 7), "penalty": ["l2"]},
        ),
    )

    # Linear scan with ``==`` mirrors the comparison semantics of an
    # if/elif chain; the for/else handles the "no match" case.
    for key, make_estimator, search_space in recipes:
        if type == key:
            break
    else:
        print("Wrong Classifier Type!")
        return

    if search_space is None:
        # No tuning for this model: fit it as configured.
        model = make_estimator()
        model.fit(fit_vectors, fit_labels)
    else:
        search = GridSearchCV(make_estimator(), search_space, cv=3, n_jobs=4)
        search.fit(fit_vectors, fit_labels)
        model = search.best_estimator_

    # Report performance on the data the model was fitted on...
    train_predictions = model.predict(fit_vectors)
    print("Training Accuracy:", accuracy_score(fit_labels, train_predictions))

    # ...and on the separately loaded evaluation set.
    test_predictions = model.predict(test_vectors)
    print("Test Accuracy:", accuracy_score(test_labels, test_predictions))
    print("Confusion Matrix:")
    print(confusion_matrix(test_labels, test_predictions))