-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataset.py
173 lines (143 loc) · 6.1 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
from os.path import join
import pandas as pd
import numpy as np
import os
import time
location = os.path.dirname(os.path.abspath(__file__))
# print(location)
DATASET_ROOT = "data"
def load_toy() -> tuple:
    """Load the toy multiclass classification train/test split.

    The files are tab-separated with the class label in column 0 and the
    features in the remaining columns.

    Returns:
        (X_train, X_test, y_train, y_test) as numpy arrays.
    """
    print('Load data...')
    df_train = pd.read_csv(join(location, DATASET_ROOT, 'multiclass_classification', 'multiclass.train'), header=None,
                           sep='\t')
    df_test = pd.read_csv(join(location, DATASET_ROOT, 'multiclass_classification', 'multiclass.test'), header=None,
                          sep='\t')
    # Column 0 is the label; everything else is a feature.
    y_train = df_train[0].values
    y_test = df_test[0].values
    X_train = df_train.drop(0, axis=1).values
    X_test = df_test.drop(0, axis=1).values
    return X_train, X_test, y_train, y_test
def load_titanic():
    """Load and preprocess the Titanic train and test CSVs.

    Training data: missing Age is imputed with a per-class constant,
    Embarked/Sex are one-hot encoded (first level dropped), and id/text
    columns are removed. Test data additionally gets missing Fare imputed
    and rows with remaining NaNs dropped.

    Returns:
        (X, y, test_df1, test_data_id):
        X            - training feature DataFrame
        y            - training 'Survived' labels
        test_df1     - preprocessed test feature DataFrame
        test_data_id - 'PassengerId' Series of the retained test rows
    """
    def age_set(cols):
        # Fill missing Age with a per-Pclass fallback constant.
        # Use label-based access: positional indexing on a labeled Series
        # (cols[0]) is deprecated in pandas 2.1 and removed in pandas 3.0.
        age = cols['Age']
        clas = cols['Pclass']
        if pd.isnull(age):
            if clas == 1:
                return 37.0
            elif clas == 2:
                return 28.0
            else:
                return 24.0
        else:
            return age
    # load training data
    titanic_df = pd.read_csv(join(location, DATASET_ROOT, 'Titanic', 'train.csv'))
    titanic_df['Age'] = titanic_df[['Age', 'Pclass']].apply(age_set, axis=1)
    # One-hot encode the categoricals, dropping the first level of each.
    f_df = pd.get_dummies(titanic_df[['Embarked', 'Sex']], drop_first=True)
    titanic_df.drop(['Embarked', 'Sex'], axis=1, inplace=True)
    titanic_df = pd.concat([titanic_df, f_df], axis=1)
    X = titanic_df.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
    y = titanic_df['Survived']
    print(X.shape, y.shape)

    def set_fare(cols):
        # Fill missing Fare with a per-Pclass fallback constant
        # (label-based access, same rationale as age_set).
        pclass = cols['Pclass']
        fare = cols['Fare']
        if pd.isnull(fare):
            if pclass == 1:
                return 1098.22
            elif pclass == 2:
                return 1117.94
            else:
                return 1094.17
        else:
            return fare
    # load testing data
    test_df = pd.read_csv(join(location, DATASET_ROOT, 'Titanic', 'test.csv'))
    test_df['Age'] = test_df[['Age', 'Pclass']].apply(age_set, axis=1)
    # Filling the empty Fare rows and dropping the Cabin column
    test_df['Fare'] = test_df[['Pclass', 'Fare']].apply(set_fare, axis=1)
    test_df.drop('Cabin', axis=1, inplace=True)
    test_df.dropna(axis=0, inplace=True)
    f1_df = pd.get_dummies(test_df[['Embarked', 'Sex']], drop_first=True)
    test_df.drop(['Embarked', 'Sex'], axis=1, inplace=True)
    test_df = pd.concat([test_df, f1_df], axis=1)
    # Keep the ids of the rows that survived dropna, for submission files.
    test_data_id = test_df["PassengerId"]
    test_df1 = test_df.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
    print(test_df1.shape)
    return (X, y, test_df1, test_data_id)
def load_santander():
    """Return (X, y) for the Santander training set.

    Prints the raw frame shape and the positive/negative class counts.
    """
    train_frame = pd.read_csv(join(location, DATASET_ROOT, "Santander", "train.csv"), sep=",")
    y = train_frame["TARGET"].values
    X = train_frame.drop(labels=["ID", "TARGET"], axis=1).values
    print(train_frame.shape)
    positives = y[y == 1]
    negatives = y[y == 0]
    print(len(positives), len(negatives))
    return X, y
def load_santander_test():
    """Return the Santander test set as a raw numpy array (all columns)."""
    frame = pd.read_csv(join(location, DATASET_ROOT, "Santander", "test.csv"), sep=",")
    return frame.values
def load_safedriver():
    """Return (X, y) for the SafeDriver training set.

    Prints the raw frame shape and the positive/negative class counts.
    """
    frame = pd.read_csv(join(location, DATASET_ROOT, "SafeDriver", "train.csv"), sep=",")
    labels = frame["target"].values
    features = frame.drop(labels=["id", "target"], axis=1).values
    print(frame.shape)
    positives = labels[labels == 1]
    negatives = labels[labels == 0]
    print(len(positives), len(negatives))
    return features, labels
def load_fraud_detection(direct=True, sampled=False):
    """Load the credit-card fraud detection dataset.

    Args:
        direct: unused; kept for backward compatibility with existing callers.
        sampled: if True, load the pre-sampled CSV (no header, label in the
            last column) instead of the full file.

    Returns:
        (X, y): feature matrix and 0/1 labels (1 = fraud; the full file
        stores the legitimate class as the quoted string '"0"').
    """
    if sampled:
        data = np.loadtxt(join(location, DATASET_ROOT, 'FraudDetection', 'creditcard-sampled.csv'), delimiter=',')
        return data[:, :-1], data[:, -1]
    start = time.time()
    # Read raw bytes so the quoted label column ('"0"') survives intact.
    data = np.loadtxt(join(location, DATASET_ROOT, 'FraudDetection', 'creditcard.csv'), delimiter=',', skiprows=1,
                      dtype=bytes)
    # np.loadtxt already returns an ndarray — no extra np.array() copy needed.
    full_X = data[:, 0:30].astype(float)
    labels = data[:, 30]
    full_y = np.where(labels == b'"0"', 0, 1)
    print("fraud detection loaded in", time.time() - start, "s")
    return full_X, full_y
def load_synthetic_noise_uniform() -> (np.ndarray, np.ndarray, np.ndarray):
    """Load the uniform-noise synthetic dataset.

    :return:
        merged_data: X values stacked in the order minority, noise, majority
        merged_y: labels with noise folded into the minority class
                  (1 = minority/noise, 0 = majority)
        outlier_groundtruth: -1 for noise rows, 1 for normal rows
    :rtype:
    """
    folder = join(location, DATASET_ROOT, "synthetic_noise_uniform")
    minority = np.loadtxt(join(folder, "data_minority.csv"), delimiter=",")
    majority = np.loadtxt(join(folder, "data_majority.csv"), delimiter=",")
    noise = np.loadtxt(join(folder, "data_noise.csv"), delimiter=",")
    n_min, n_noise, n_maj = len(minority), len(noise), len(majority)
    merged_data = np.concatenate((minority, noise, majority), axis=0)
    merged_y = np.array([1] * (n_min + n_noise) + [0] * n_maj)
    outlier_groundtruth = np.array([1] * n_min + [-1] * n_noise + [1] * n_maj)
    return merged_data, merged_y, outlier_groundtruth
def load_synthetic_noise_gaussian() -> (np.ndarray, np.ndarray, np.ndarray):
    """Load the gaussian-noise synthetic dataset.

    :return:
        merged_data: X values stacked in the order minority, noise, majority
        merged_y: labels with noise folded into the minority class
                  (1 = minority/noise, 0 = majority)
        outlier_groundtruth: -1 for noise rows, 1 for normal rows
    :rtype:
    """
    folder = join(location, DATASET_ROOT, "synthetic_noise_gaussian")
    minority = np.loadtxt(join(folder, "data_minority_gaussian.csv"), delimiter=",")
    majority = np.loadtxt(join(folder, "data_majority_gaussian.csv"), delimiter=",")
    noise = np.loadtxt(join(folder, "data_noise_gaussian.csv"), delimiter=",")
    n_min, n_noise, n_maj = len(minority), len(noise), len(majority)
    merged_data = np.concatenate((minority, noise, majority), axis=0)
    merged_y = np.array([1] * (n_min + n_noise) + [0] * n_maj)
    outlier_groundtruth = np.array([1] * n_min + [-1] * n_noise + [1] * n_maj)
    return merged_data, merged_y, outlier_groundtruth
from collections import Counter

if __name__ == '__main__':
    # Smoke-test one of the loaders and report shapes and class balances.
    # X, y = load_fraud_detection(sampled=True)
    features, labels, ground_truth = load_synthetic_noise_gaussian()
    print(features.shape)
    print(Counter(labels))
    print(Counter(ground_truth))