-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataset-a8a-kernel.py
51 lines (43 loc) · 1.44 KB
/
dataset-a8a-kernel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import sklearn.datasets
import numpy as np
from tqdm import tqdm
import scipy.sparse
import pickle
import time
# Progress marker: emitted once, before any data is loaded.
print("start")

# Root directory that the dataset path and the output pickle paths are
# formatted against further down in this script.
data_dir = "./datasets"
def build_kernel_matrix(A, B, kernel_type='Gaussian', kernel_opt=None):
    """Build the kernel matrix between the rows of ``B`` and the rows of ``A``.

    Only the Gaussian kernel is implemented:
    ``K[i, j] = exp(-||B[i] - A[j]||**2 / (2 * bandwidth))``.

    Args:
        A: 2-D array, one sample per row. Its rows index the columns of ``K``.
        B: 2-D array with the same number of columns as ``A``. Its rows index
            the rows of ``K``.
        kernel_type: Name of the kernel; only ``'Gaussian'`` is supported.
        kernel_opt: Optional dict of kernel options. ``'bandwidth'`` is the
            only key read. When it is missing or equal to ``-1`` the bandwidth
            is set to ``median(squared distances) / 2 / log(d + 1)``, where
            ``d`` is the number of columns of ``A``. The dict is not modified.

    Returns:
        A tuple ``(K, bandwidth)``: the kernel matrix of shape
        ``(B.shape[0], A.shape[0])`` and the bandwidth that was actually used.

    Raises:
        ValueError: If ``kernel_type`` is not ``'Gaussian'``.
    """
    if kernel_type != 'Gaussian':
        # Fail loudly instead of reaching the return with unbound names.
        raise ValueError(
            "Unsupported kernel_type {!r}; only 'Gaussian' is implemented."
            .format(kernel_type)
        )

    # Read the option without writing back into the caller's dict; -1 is the
    # sentinel meaning "pick the bandwidth from the data".
    bandwidth = -1 if kernel_opt is None else kernel_opt.get('bandwidth', -1)

    _, d = A.shape
    A_sum = np.sum(A**2, axis=1)
    B_sum = np.sum(B**2, axis=1)

    # Pairwise squared Euclidean distances via
    # ||b - a||^2 = ||b||^2 - 2 <b, a> + ||a||^2.
    dist_mat = (
        -2 * np.matmul(B, A.T)
        + B_sum.reshape([-1, 1])
        + A_sum.reshape([1, -1])
    )
    # Floating-point cancellation can leave tiny negative entries; squared
    # distances are non-negative, so clamp in place (no extra allocation).
    np.maximum(dist_mat, 0, out=dist_mat)

    if bandwidth == -1:
        bandwidth = np.median(dist_mat) / 2 / np.log(d + 1)

    K = np.exp(-dist_mat * 0.5 / bandwidth)
    return K, bandwidth
# Fixed seed so the shuffled train/test split is the same on every run.
np.random.seed(1)

# Load the a8a file (svmlight/LIBSVM format): sparse feature matrix and labels.
A, b = sklearn.datasets.load_svmlight_file('{}/a8a/a8a'.format(data_dir))
# Densify for the kernel computation. ``toarray()`` is used instead of the
# ``.A`` shorthand, which is deprecated/removed in recent SciPy releases.
A = A.toarray()

# Number of shuffled rows used for training; the remainder is the test set.
n_train = 10000

# Shuffle all row indices of the loaded file (size taken from the data rather
# than a hard-coded row count).
indn = np.arange(A.shape[0])
np.random.shuffle(indn)
Atrain = A[indn[:n_train], :]
Atest = A[indn[n_train:], :]
btest = b[indn[n_train:]]
btrain = b[indn[:n_train]]

# Train-vs-train Gaussian kernel with an explicit bandwidth of 10.
kernel_opt = {'bandwidth': 10}
Ktrain, bandwidth = build_kernel_matrix(Atrain, Atrain, kernel_opt=kernel_opt)
print(bandwidth)

# Test-vs-train kernel, reusing the bandwidth returned for the training kernel
# so both matrices come from the same kernel.
kernel_opt = {'bandwidth': bandwidth}
Ktest, _ = build_kernel_matrix(Atrain, Atest, kernel_opt=kernel_opt)

# Each pickle holds a two-element list: [kernel matrix, labels].
with open('{}/a8a/a8a_kernel_train.p'.format(data_dir), 'wb') as f:
    pickle.dump([Ktrain, btrain], f)

with open('{}/a8a/a8a_kernel_test.p'.format(data_dir), 'wb') as f:
    pickle.dump([Ktest, btest], f)