-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpima.py
138 lines (107 loc) · 4.84 KB
/
pima.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# The python3 is available in your machine
# Make sure the 'pima_diabetes.csv' file is in the same directory as your Python script or notebook.
# a) Read the dataset using pandas' DataFrame:
data = pd.read_csv('diabetes.csv')
# extra settings for show all data in console
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
print(f"a) The dataset \n {data}, {type(data)} \n\n")
# ans: [768 rows x 9 columns], <class 'pandas.core.frame.DataFrame'>
# b) Find the number of instances and features:
num_instances = data.shape[0]
num_features = data.shape[1]
print("b) Number of instances:", num_instances) # ans: Number of instances: 768
print("b) Number of features:", num_features, end='\n\n') # ans: Number of features: 9
# c) Check for missing entries:
missing_entries = data.isnull().sum().sum()
if missing_entries == 0:
print("c) The dataset has no missing entries.")
else:
print("c) The dataset has", missing_entries, "missing entries.")
# ans: The dataset has no missing entries.
# d) Count the number of instances for each outcome value:
outcome_counts = data['Outcome'].value_counts()
print(f"\nd) Outcome_counts {outcome_counts}")
"""
Outcome
0 500
1 268
Name: count, dtype: int64
"""
# e) Show the first 5 and last 5 instances:
first_instances = data.head(5)
last_instances = data.tail(5)
print("\ne) First 5 instances:\n", first_instances, end='\n\n')
"""
First 5 instances:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
"""
print("e) Last 5 instances:\n", last_instances, end='\n\n')
"""
Last 5 instances:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
763 10 101 76 48 180 32.9 0.171 63 0
764 2 122 70 27 0 36.8 0.340 27 0
765 5 121 72 23 112 26.2 0.245 30 0
766 1 126 60 0 0 30.1 0.349 47 1
767 1 93 70 31 0 30.4 0.315 23 0
"""
# f) Check if missing entries are replaced by zeros:
zero_entries = (data == 0).sum().sum()
if zero_entries > 0:
print("\nf) The dataset contains", zero_entries, "zero entries.")
else:
print("\nf) The dataset does not have missing entries replaced by zeros.")
# ans: The dataset contains 1263 zero entries.
# g) Draw a histogram for each numerical feature:
# Select only the numerical features
numerical_features = data.select_dtypes(include=['int64', 'float64'])
numerical_features.hist(bins=10, figsize=(12, 10))
plt.tight_layout()
plt.show()
# h) Split the dataset into X and y:
x = data.drop('Outcome', axis=1)
y = data['Outcome']
print(f"h) X {x}")
print(f"h) y {y}")
# Here, X contains all the predictors/features, and y contains the target class entries.
# i) Split the dataset into train and test sets:
X_train, X_test, y_train, y_test = train_test_split(
x, y, test_size=0.2, random_state=42
)
"""
This code splits the dataset into an 80% train set and a 20% test set.
Adjust the test_size parameter if you want a different split ratio.
"""
"""
j) Apply the KNN classifier with the Euclidean distance metric and
determine the suitable value of the hyperparameter k:
"""
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Determine suitable k value
train_accuracy = []
test_accuracy = []
k_values = range(1, 31)
for k in k_values:
knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
knn.fit(X_train_scaled, y_train)
train_accuracy.append(knn.score(X_train_scaled, y_train))
test_accuracy.append(knn.score(X_test_scaled, y_test))
# Plot train and test accuracy with respect to k
plt.plot(k_values, train_accuracy, label='Train Accuracy')
plt.show() # result file test_accuracy_result.png