# -*- coding: utf-8 -*-
"""Sanket.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1uqIWnfMWsWqiB2YdB3ZIWnqxOoSgc5NS
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from yellowbrick.cluster import KElbowVisualizer
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')
# Load the data
df = pd.read_csv("/content/dataset.csv")
# Encode categorical columns
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'dataset']
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])
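# The fitted encoders are kept in label_encoders, so the original string labels
# can be recovered later if needed, e.g. (hypothetical column name for illustration):
# df['sex_label'] = label_encoders['sex'].inverse_transform(df['sex'])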
# Data information (wrapped in print() so output is visible when run as a script)
print(df.head())
df.info()
print(df.shape)
# Basic statistics
print(df['age'].describe())
# Distribution of age with mean, median, and mode marked
# (passing palette without hue is invalid in seaborn, so only color is used)
sns.histplot(df['age'], kde=True, color="#FF5733")
plt.axvline(df['age'].mean(), color='red', label='Mean')
plt.axvline(df['age'].median(), color='green', label='Median')
plt.axvline(df['age'].mode()[0], color='blue', label='Mode')
plt.legend()
plt.show()
# Plotly histogram
fig = px.histogram(data_frame=df, x='age', color='sex')
fig.show()
# Percentage calculations for 'sex' column
male_count = df['sex'].value_counts().get(1, 0)
female_count = df['sex'].value_counts().get(0, 0)
total_count = male_count + female_count
male_percentage = (male_count / total_count) * 100
female_percentage = (female_count / total_count) * 100
print(f'Male percentage in the data: {male_percentage:.2f}%')
print(f'Female percentage in the data: {female_percentage:.2f}%')
difference_percentage = ((male_count - female_count) / female_count) * 100
print(f'Males are {difference_percentage:.2f}% more than females in the data.')
# Group by 'sex' and 'age'
print(df.groupby('sex')['age'].value_counts())
# Counts for 'dataset' column
print(df['dataset'].value_counts())
# Plotly bar chart for 'dataset' column
fig = px.bar(df, x='dataset', color='sex')
fig.show()
# Histogram for 'age' column colored by 'dataset'
fig = px.histogram(data_frame=df, x='age', color='dataset')
fig.show()
# Summary statistics for 'trestbps' column
print(df['trestbps'].describe())
# Missing values in 'trestbps' column
print(f"Percentage of missing values in 'trestbps' column: {df['trestbps'].isnull().sum() / len(df) * 100:.2f}%")
# Impute missing values in 'trestbps' column using IterativeImputer
imputer1 = IterativeImputer(max_iter=10, random_state=42)
df['trestbps'] = imputer1.fit_transform(df[['trestbps']])
print(f"Missing values in 'trestbps' column after imputation: {df['trestbps'].isnull().sum()}")
# Impute other columns
imputer2 = IterativeImputer(max_iter=10, random_state=42)
df['ca'] = imputer2.fit_transform(df[['ca']])
df['oldpeak'] = imputer2.fit_transform(df[['oldpeak']])
df['chol'] = imputer2.fit_transform(df[['chol']])
df['thalch'] = imputer2.fit_transform(df[['thalch']])
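# Note: fitting IterativeImputer on a single column reduces it to simple mean
# imputation, since there are no other features to iterate over. A joint fit
# across the numeric columns (an alternative sketch, not what this script does)
# would let each column inform the others:
# numeric_cols = ['trestbps', 'ca', 'oldpeak', 'chol', 'thalch']
# df[numeric_cols] = IterativeImputer(max_iter=10, random_state=42).fit_transform(df[numeric_cols])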
# Check for missing values
print(df.isnull().sum().sort_values(ascending=False))
# Function to impute missing values in categorical columns: train a
# RandomForestClassifier on the rows where the column is present, then
# predict the missing entries (assumes the remaining feature columns are
# numeric and already fully imputed at this point)
def impute_categorical_missing_data(df, col):
    df_null = df[df[col].isnull()]
    df_not_null = df[df[col].notnull()]
    X = df_not_null.drop(col, axis=1)
    y = df_not_null[col]
    # Encode the target labels for the classifier
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    model = RandomForestClassifier(random_state=16)
    model.fit(X, y)
    if not df_null.empty:
        predicted = model.predict(df_null.drop(col, axis=1))
        df.loc[df[col].isnull(), col] = label_encoder.inverse_transform(predicted)
    return df
# Function to impute missing values in continuous columns: train a
# RandomForestRegressor on the rows where the column is present, then
# predict the missing entries
def impute_continuous_missing_data(df, col):
    df_null = df[df[col].isnull()]
    df_not_null = df[df[col].notnull()]
    X = df_not_null.drop(col, axis=1)
    y = df_not_null[col]
    model = RandomForestRegressor(random_state=16)
    model.fit(X, y)
    if not df_null.empty:
        df.loc[df[col].isnull(), col] = model.predict(df_null.drop(col, axis=1))
    return df
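# The two helpers above are defined but never invoked below; a sketch of how
# they could be applied to any columns that still contain missing values
# (assuming categorical_cols from earlier still names the categorical ones):
# for col in df.columns[df.isnull().any()]:
#     if col in categorical_cols:
#         df = impute_categorical_missing_data(df, col)
#     else:
#         df = impute_continuous_missing_data(df, col)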
# Check for missing values after imputation
print(df.isnull().sum().sort_values(ascending=False))
# Boxen plots for the continuous columns (iterating over every column would
# overflow the 3x2 subplot grid, so only the continuous ones are plotted)
sns.set(rc={"axes.facecolor": "#87CEEB", "figure.facecolor": "#EEE8AA"})
palette = ["#682F2F", "#9E726F", "#D6B2B1", "#B9C0C9", "#9F8A78", "#F3AB60"]
continuous_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']
plt.figure(figsize=(10, 8))
for i, col in enumerate(continuous_cols):
    plt.subplot(3, 2, i + 1)
    sns.boxenplot(x=df[col], color=palette[i % len(palette)])
    plt.title(col)
plt.tight_layout()
plt.show()
# Remove rows with 'trestbps' value of 0
df = df[df['trestbps'] != 0]
# Re-encode 'dataset' column (already label-encoded above, so this leaves the integer codes unchanged)
df['dataset'] = LabelEncoder().fit_transform(df['dataset'])
# Split data into X and y
X = df.drop('num', axis=1)
y = df['num']
# Re-encode categorical columns (already integer codes from the earlier pass, so effectively a no-op)
categorical_cols = ['thal', 'ca', 'slope', 'exang', 'restecg', 'fbs', 'cp', 'sex']
for col in categorical_cols:
    X[col] = LabelEncoder().fit_transform(X[col])
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
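# Because 'num' is a multi-class target, a stratified split (an alternative
# sketch) would keep class proportions similar across train and test:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.1, random_state=42, stratify=y)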
# Define models
models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('KNeighbors Classifier', KNeighborsClassifier()),
    ('Support Vector Machine', SVC(random_state=42)),
    ('Decision Tree Classifier', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('AdaBoost Classifier', AdaBoostClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
    ('LightGBM', LGBMClassifier(random_state=42)),
    ('GaussianNB', GaussianNB())
]
best_model = None
best_accuracy = 0.0
# Iterate over the models and evaluate their performance
for name, model in models:
    pipeline = Pipeline([
        ('model', model)
    ])
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    mean_accuracy = scores.mean()
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model: {name}")
    print(f"Cross Validation Accuracy: {mean_accuracy}")
    print(f"Test Accuracy: {accuracy}\n")
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline
print("Best Model:", best_model)
# Hyperparameter tuning
def hyperparameter_tuning(X, y, categorical_columns, models):
    results = {}
    X_encoded = X.copy()
    label_encoder = LabelEncoder()
    for col in categorical_columns:
        X_encoded[col] = label_encoder.fit_transform(X_encoded[col])
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
    for model_name, model in models.items():
        param_grid = {}
        if model_name == 'Logistic Regression':
            param_grid = {'C': [0.1, 1, 10, 100]}
        elif model_name == 'KNN':
            param_grid = {'n_neighbors': [3, 5, 7, 9]}
        elif model_name == 'Gaussian Naive Bayes':
            param_grid = {'var_smoothing': np.logspace(-9, 0, 10)}
        elif model_name == 'SVM':
            param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
        elif model_name == 'Decision Tree':
            param_grid = {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}
        elif model_name == 'Random Forest':
            param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]}
        elif model_name == 'AdaBoost':
            param_grid = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1]}
        elif model_name == 'Gradient Boosting':
            param_grid = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1]}
        elif model_name == 'XGBoost':
            param_grid = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1]}
        else:
            continue
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        best_estimator = grid_search.best_estimator_
        y_pred = best_estimator.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results[model_name] = {'best_params': best_params, 'accuracy': accuracy}
    return results
# Define models for tuning
models_for_tuning = {
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'Gaussian Naive Bayes': GaussianNB(),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier()
}
# Perform hyperparameter tuning
results = hyperparameter_tuning(X, y, categorical_cols, models_for_tuning)
print(results)
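# Optionally rank the tuned models by held-out accuracy (a small convenience
# sketch using the results dict built above):
for model_name, info in sorted(results.items(), key=lambda kv: kv[1]['accuracy'], reverse=True):
    print(f"{model_name}: accuracy={info['accuracy']:.4f}, best_params={info['best_params']}")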