-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMiniBatchSVM.py
executable file
·147 lines (124 loc) · 5.38 KB
/
MiniBatchSVM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python
"""
# Author: XU Kui xukui.cs@gmail.com
# Created Time : Wed 19 Apr 2017 09:46:16 PM CST
# File Name: MiniBatchSVM.py
# Description: Mini-batch SVM / Logistic Regression; online SVM training for large-scale data
"""
import numpy as np
import os, sys
import h5py
from optparse import OptionParser
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import HashingVectorizer
# Shared classifier handle used by the functions in this file through
# ``global model``.  The empty string is only a placeholder value until a real
# classifier object is assigned.
model = ""
def meanNorm(X):
    """Mean-normalize channel 0 of every sample in ``X``.

    Each ``X[i, 0]`` is rescaled to ``(x - mean(x)) / (max(x) - min(x))``.
    Samples whose channel 0 is constant (``max == min``, e.g. an all-zero
    sample) cannot be scaled; they are left untouched and their indices are
    reported so the caller can drop them.

    Args:
        X: ``numpy.ndarray`` indexed as ``X[sample, 0, ...]``.  Only index 0
            of the second axis is normalized.

    Returns:
        A tuple ``(X, ind)``.  ``X`` holds the normalized data and ``ind`` is
        the ascending list of indices of the constant samples.  A
        floating-point input is modified in place and returned as the same
        object; any other dtype is first converted to ``float64`` (a copy),
        because writing the fractional result back into an integer array
        would silently truncate it.
    """
    if not np.issubdtype(X.dtype, np.floating):
        X = X.astype(np.float64)

    ind = []
    for i, sample in enumerate(X):
        x = sample[0]
        # Value range of this sample; zero means there is nothing to scale by.
        span = np.max(x) - np.min(x)
        if span == 0:
            ind.append(i)
        else:
            X[i, 0] = (x - np.mean(x)) / span
    return X, ind
def loadH5file(filepath, opt):
    """Load one HDF5 file into a flat feature matrix and a label vector.

    The file must contain a ``'data'`` dataset (samples) and a ``'label'``
    dataset (one label per sample).

    Args:
        filepath: Path of the HDF5 file to read.
        opt: Options object.  ``opt.labelstart1`` (truthy) subtracts 1 from
            every label; ``opt.norm`` (truthy) runs ``meanNorm`` on the data
            and drops the samples it reports as constant.

    Returns:
        A tuple ``(X_train, y_train)``: ``X_train`` is reshaped to
        ``(n_samples, n_features)`` with all trailing axes flattened, and
        ``y_train`` is reshaped to ``(n_samples,)``.
    """
    # Read both datasets fully into memory so the file handle can be released
    # immediately instead of staying open for the life of the process.
    with h5py.File(filepath, 'r') as h5:
        y_train = np.asarray(h5['label'])
        X_train = np.asarray(h5['data'])

    if opt.labelstart1:
        y_train = y_train - 1

    if opt.norm:
        X_train, ind = meanNorm(X_train)
        if ind:
            # Drop the samples that could not be normalized.  The labels must
            # be dropped with them, otherwise X and y no longer line up.
            X_train = np.delete(X_train, ind, 0)
            y_train = np.delete(y_train, ind, 0)

    y_train = y_train.reshape(y_train.shape[0])
    # Flatten every axis after the first into a single feature axis.  The
    # product is computed explicitly (rather than reshape(-1)) so that an
    # empty sample axis still reshapes cleanly.
    vectorNum = int(np.prod(X_train.shape[1:]))
    X_train = X_train.reshape(X_train.shape[0], vectorNum)
    return X_train, y_train
#def train(model, X_train, y_train, nClasses=10, batchSize=256):
def train( X_train, y_train, nClasses=10, batchSize=256):
    """Run one shuffled mini-batch pass over the data on the global model.

    The samples are visited in a fresh random order, cut into consecutive
    batches of ``batchSize`` rows, and each batch is fed to
    ``model.partial_fit``.  The final batch may be shorter than ``batchSize``
    so that no sample is skipped (and so that an input smaller than one batch
    still trains the model).

    Args:
        X_train: Feature matrix with one row per sample.
        y_train: Labels, one per row of ``X_train``.
        nClasses: Number of classes; labels are declared to ``partial_fit``
            as ``0 .. nClasses - 1``.
        batchSize: Maximum number of samples per ``partial_fit`` call.

    Raises:
        ValueError: If ``batchSize`` is smaller than 1.
    """
    global model
    if batchSize < 1:
        raise ValueError("batchSize must be >= 1, got %r" % (batchSize,))

    X_count = X_train.shape[0]
    if X_count == 0:
        return

    # Apply one random permutation to samples and labels alike so that each
    # pass sees the data in a different order while pairs stay aligned.
    order = np.random.permutation(X_count)
    shuffledX = X_train[order]
    shuffledY = np.asarray(y_train)[order]

    # partial_fit needs the full label set up front, since a single batch may
    # not contain every class.
    classes = np.arange(nClasses)
    for start in range(0, X_count, batchSize):
        stop = start + batchSize
        # Update the classifier with the samples in the current mini-batch.
        model.partial_fit(shuffledX[start:stop], shuffledY[start:stop],
                          classes=classes)
#def test(model, X_test, y_test):
def test(X_test, y_test):
    """Score the global model on an evaluation set.

    Args:
        X_test: Evaluation samples, passed straight to ``model.score``.
        y_test: Expected labels for ``X_test``.

    Returns:
        Whatever ``model.score(X_test, y_test)`` returns.
    """
    global model
    return model.score(X_test, y_test)
def createModel(modelname ="svm"):
    """Create the global SGD-trained classifier.

    Args:
        modelname: ``"log"`` selects the logistic-regression loss; any other
            value selects the hinge loss (linear SVM).

    The new classifier is stored in the module-level ``model`` variable and a
    one-line notice naming the chosen model is printed.
    """
    from sklearn.linear_model import SGDClassifier
    global model
    if modelname == "log":
        # Logistic Regression classifier trained online with stochastic
        # gradient descent.
        # NOTE(review): recent scikit-learn releases spell this loss
        # "log_loss"; confirm the installed version still accepts "log".
        model = SGDClassifier(loss="log", penalty="l2")
        print("Using Logistic Regression...")
    else:
        # SVM classifier trained online with stochastic gradient descent.
        model = SGDClassifier(loss="hinge", penalty="l2")
        print("Using Hinge Loss SVM...")
if __name__ == "__main__":
    # Command-line driver: parse the options, build the model, then for every
    # epoch train on each file of the training list and score each file of
    # the test list, tracking the best score seen.
    usage = ("usage: %prog [options] [--trainlist path-to-training-data-list-file] \n"
             "test on mnist dataset, just type: ./MiniBatchSVM.py ")
    optParser = OptionParser(usage=usage)
    optParser.add_option("-m", "--model",
        action = "store", type = 'string', dest = "model", default = "svm",
        help = "svm, log")
    optParser.add_option("-t", "--trainlist", action = "store", type = 'string',
        dest = "trainlist", default = "data/mnist-h5/train.list", help = "trainlist file")
    optParser.add_option("-T", "--testlist", action = "store", type = 'string',
        dest = "testlist", default = "data/mnist-h5/test.list", help = "testlist file")
    optParser.add_option("-b", "--batchsize", action = "store", type = 'int',
        dest = "batchsize", default = 1000, help = "batch size")
    optParser.add_option("-e", "--epoch", action = "store", type = 'int',
        dest = "epoch", default = 20, help = "max epoch")
    optParser.add_option("-c", "--nclasses", action = "store", type = 'int',
        dest = "nclasses", default = 10, help = "num of the class")
    optParser.add_option("-n", "--norm",
        action = "store_true", dest = "norm", default = False,
        help = "do mean normalization")
    # store_true: with the default of False, a store_false action could never
    # switch this option on.
    optParser.add_option("-l", "--labelstart1",
        action = "store_true", dest = "labelstart1", default = False,
        help = "use this option when the label of your data is bengin at 1 ")
    (opt, args) = optParser.parse_args()

    # Each list file names one HDF5 file per line.  Blank lines are skipped
    # so they are not handed on as empty file names.
    with open(opt.trainlist, "r") as trainFile:
        trainFileLists = [line.strip() for line in trainFile if line.strip()]
    with open(opt.testlist, "r") as testFile:
        testFileLists = [line.strip() for line in testFile if line.strip()]

    best_score = 0
    createModel(opt.model)
    # Run exactly opt.epoch epochs, numbered 1..opt.epoch to match the
    # "Epoch k/epoch" progress line.
    for k in range(1, opt.epoch + 1):
        print("Epoch %s/%s" % (k, opt.epoch))
        for filename in trainFileLists:
            (X, Y) = loadH5file(filename, opt)
            train(X, Y, opt.nclasses, opt.batchsize)
        # Evaluate after every epoch and keep the best score seen so far.
        for filename in testFileLists:
            (X, Y) = loadH5file(filename, opt)
            score = test(X, Y)
            if score > best_score:
                best_score = score
            print((" Test Score %.4f BestScore %.4f") % (score, best_score))
    print(("Finished, The finally best score is: %.4f") % (best_score))