-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtask2a.py
136 lines (117 loc) · 4.71 KB
/
task2a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import pandas as pd
import numpy as np
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from statistics import variance
def isfloat(value):
    """Return True when *value* can be converted with ``float()``.

    Used to tell numeric cells apart from placeholders such as '..'.

    Args:
        value: any object, typically a string or number read from a CSV cell.

    Returns:
        True if ``float(value)`` succeeds, False otherwise.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        # ValueError: a string that is not a number (e.g. '..').
        # TypeError: an object float() does not accept at all (e.g. None).
        return False
    return True
# Read world.csv and life.csv (both are opened as ISO-8859-1).
world = pd.read_csv('world.csv', encoding='ISO-8859-1')
life = pd.read_csv('life.csv', encoding='ISO-8859-1')

# Pre-process the 'Time' column of world: entries that cannot be parsed as a
# number become 0, then the whole column is cast to int so it can be used as
# a merge key.  The fill value is the integer 0 (not the string "0") so the
# column stays numeric instead of being turned into a mixed object column.
world_new = world.copy()
world_new['Time'] = (
    pd.to_numeric(world_new['Time'], errors='coerce').fillna(0).astype(int)
)

# Create a 'Time' column in life that mirrors the 'Year' column row by row,
# so both frames share the same merge key name and dtype.
life_new = life.copy()
life_new['Time'] = life_new['Year'].astype(int)

# Merge the two frames on 'Country Code' and 'Time'.  DataFrame.merge defaults
# to an inner join, so only rows present in both frames are kept.
merged = world_new.merge(life_new, on=['Country Code', 'Time'])
# Columns that must not be fed to the classifiers: identifiers, the merge
# keys and the class label itself.
data_no_using = [
    'Country',
    'Year',
    'Country Name',
    'Time',
    'Country Code',
    'Life expectancy at birth (years)',
]
# Every remaining column of the merged frame is treated as a feature.
data_using = [column for column in merged if column not in data_no_using]
datas = merged[data_using]

# The class label is the life-expectancy column.
classlabel = merged['Life expectancy at birth (years)']

# Randomly hold out one third of the instances for testing; the fixed
# random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    datas,
    classlabel,
    train_size=2/3,
    test_size=1/3,
    random_state=100,
)
def _as_float_or_nan(value):
    """Return *value* as a float, or NaN when it is not numeric (e.g. '..')."""
    return float(value) if isfloat(value) else np.nan


def _impute_with_medians(frame, medians):
    """Return a float copy of *frame* with missing cells replaced by medians.

    Args:
        frame: DataFrame whose cells are numbers, numeric strings or
            placeholders such as '..'.
        medians: mapping of column name -> list whose first item is the
            fill value for that column.

    Returns:
        A new DataFrame with the same index and columns, every column float.
    """
    imputed = frame.copy()
    for column in imputed.columns:
        numeric = imputed[column].map(_as_float_or_nan)
        # Assign the result back explicitly.  A chained
        # `frame[column].fillna(..., inplace=True)` only updates a temporary
        # object when pandas Copy-on-Write is active, leaving NaNs behind.
        imputed[column] = numeric.fillna(medians[column][0]).astype('float')
    return imputed


# Work out the median of every feature, used for imputation.  The medians
# come from the training split only and are reused for the test split.
dictionary_data_info = {}
for data in X_train:
    value_list = [float(value) for value in X_train[data] if isfloat(value)]
    # nanmedian: a cell that is already NaN passes isfloat(), and a single NaN
    # would otherwise turn the whole median into NaN.
    dictionary_data_info[data] = [np.nanmedian(value_list)]

# Replace every '..' (or other non-numeric cell) in X_train with the median
# of that column.
X_train = _impute_with_medians(X_train, dictionary_data_info)

# Replace every '..' (or other non-numeric cell) in X_test with the median
# of the corresponding training column.
X_test = _impute_with_medians(X_test, dictionary_data_info)
# Summarise every training feature as [median, mean, variance].  This runs
# after imputation, so the figures describe the filled-in training data and
# overwrite the median-only entries stored earlier in dictionary_data_info.
for feature in X_train:
    numbers = [float(cell) for cell in X_train[feature] if isfloat(cell)]
    average = sum(numbers) / len(numbers)
    middle = np.median(numbers)
    # statistics.variance is the sample variance.
    spread = variance(numbers)
    dictionary_data_info[feature] = [middle, average, spread]
# Standardise the features to zero mean and unit variance.  The scaler is
# fitted on the training split only and then applied to both splits.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
def _fit_and_score(model):
    """Fit *model* on the training split and score it on the test split.

    Returns:
        (predictions, accuracy) for the test split.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions, accuracy_score(y_test, predictions)


# k-NN classifier with k = 5.
knn5 = neighbors.KNeighborsClassifier(n_neighbors=5)
y_pred, accurate_knn5 = _fit_and_score(knn5)

# k-NN classifier with k = 10.
knn10 = neighbors.KNeighborsClassifier(n_neighbors=10)
y_pred, accurate_knn10 = _fit_and_score(knn10)

# Decision tree classifier with a maximum depth of 4; random_state is fixed
# so repeated runs build the same tree.
dt = DecisionTreeClassifier(random_state=100, max_depth=4)
y_pred, accurate_dt = _fit_and_score(dt)
# Flatten the per-feature statistics into rows of
# [feature, median, mean, variance], each figure rounded to 3 decimals.
total_info = [
    [feature] + [round(stat, 3) for stat in stats]
    for feature, stats in dictionary_data_info.items()
]

# Turn the rows into a dataframe and write it to 'task2a.csv' without the
# index column.
col_name = ['feature','median','mean','variance']
task2a = pd.DataFrame(total_info, columns=col_name)
task2a.to_csv('task2a.csv', index=False)
# Report the test accuracy of each classifier, rounded to 3 decimals.
for label, score in (
    ('Accuracy of decision tree:', accurate_dt),
    ('Accuracy of k-nn (k=5):', accurate_knn5),
    ('Accuracy of k-nn (k=10):', accurate_knn10),
):
    print(label + str(round(score, 3)))