#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 30 22:34:12 2017

@author: fubao
"""

import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import kaggle                                        # local helper: formats predictions for Kaggle submission
from commons import read_tumor_data, writeToFile     # local helpers for data I/O


def svmSklearnCV(kfold=7, fileTestOutput="best_cv"):
    '''
    Train SVMs with different parameter combinations on the tumor data and
    use k-fold cross-validation to estimate the out-of-sample accuracy,
    then refit the best model on the full training set and predict the test set.
    '''
    train_x, train_y, test_x = read_tumor_data()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)

    cLst = [1, 0.01, 0.0001]
    gammaLst = [1, 0.01, 0.001]
    kernelParaLst = ['rbf', 'poly=3', 'poly=5', 'linear']

    accuracyLargest = -1.0          # CV accuracy lies in [0, 1], so -1 is a safe initial value
    parametersBest = None
    for c in cLst:
        for gamma in gammaLst:
            for kernel in kernelParaLst:
                degree = 3          # reset each iteration; only the 'poly' kernel uses it
                if kernel.split("=")[0] == "poly":
                    degree = int(kernel.split("=")[1])
                    kernel = kernel.split("=")[0]
                clf = SVC(C=c, kernel=kernel, degree=degree, gamma=gamma)
                accuracy = np.mean(cross_val_score(clf, train_x, train_y, cv=kfold, scoring="accuracy"))
                print("accuracy: ", c, gamma, kernel, degree, accuracy)
                if accuracy > accuracyLargest:
                    accuracyLargest = accuracy
                    parametersBest = [c, gamma, degree, kernel]

    print("best accuracy parameters: ", kfold, parametersBest, accuracyLargest)

    # retrain on the whole training set with the best parameters
    clf = SVC(C=parametersBest[0], gamma=parametersBest[1], degree=parametersBest[2], kernel=parametersBest[3])
    clf.fit(train_x, train_y)
    yPred = clf.predict(test_x)

    # output file
    if fileTestOutput != "":
        kaggle.kaggleize(yPred, fileTestOutput, False)


def trainSVMExtra(fileTestOutput, resultFile):
    '''
    Extra credit 1: grid-search over several SVM kernels
    (linear, rbf, sigmoid, poly) and hyperparameters to see how to
    select an effective kernel, trying several values of k for the CV folds.
    '''
    train_x, train_y, test_x = read_tumor_data()
    print('Train=', train_x.shape, type(train_x))
    print('Test=', test_x.shape)

    fd = open(resultFile, 'a')
    kfoldLst = range(3, 12)
    biggestAccuracy = -1.0          # CV accuracy lies in [0, 1]
    parametersBest = None
    for kfold in kfoldLst:
        parameters = {'kernel': ('linear', 'rbf', 'sigmoid', 'poly'),
                      'C': np.linspace(1, 5, 5),
                      'gamma': [0.001, 0.1, 20],
                      'degree': list(range(1, 6))}      # degree must be an integer
        clf = GridSearchCV(SVC(), parameters, cv=kfold, n_jobs=10)
        clf.fit(train_x, train_y)
        meanTestScore = clf.cv_results_['mean_test_score']
        bestPara = clf.best_estimator_
        if clf.best_score_ > biggestAccuracy:
            biggestAccuracy = clf.best_score_
            parametersBest = [bestPara.C, bestPara.gamma, bestPara.degree, bestPara.kernel, kfold, clf.best_score_]
        writeToFile(fd, [bestPara.C, bestPara.gamma, bestPara.degree, bestPara.kernel, kfold, clf.best_score_] + [meanTestScore])

        # refit the best model for this k on the whole training set and predict the test set
        clf = SVC(C=bestPara.C, gamma=bestPara.gamma, degree=bestPara.degree, kernel=bestPara.kernel)
        clf.fit(train_x, train_y)
        predY = clf.predict(test_x)

        # output to file, one prediction file per choice of k
        if fileTestOutput != "":
            kaggle.kaggleize(predY, fileTestOutput + str(kfold), False)

    fd.close()
    print("best final trainSVMExtra result: ", parametersBest)