Goal: predict whether a person is suffering from Diabetes Mellitus (DM).
We use a dataset from the National Institute of Diabetes and Digestive and Kidney Diseases. All patients in it are women of Pima heritage who are at least 21 years old.
The Pima are North American Indians who traditionally lived along the Gila and Salt rivers in Arizona, U.S., in what was the core area of the prehistoric Hohokam culture. They have the highest prevalence of diabetes ever recorded in the world [1].
We use logistic regression with a binary target: 0 = not suffering from DM; 1 = suffering from DM.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
# Column labels applied to the Pima diabetes CSV; "DM" is the 0/1 outcome
# column used as the prediction target further down.
column_names = ["Pregnant", "Glucose", "BP", "Skin", "Insulin", "BMI", "Pedigree", "Age", "DM"]

# Skip the file's first line (presumably its own header row -- confirm against
# the CSV) and apply the labels above instead. skiprows=1 drops exactly the
# same single line as the previous skiprows=(0, 0).
data_set = pd.read_csv("pima_diabetes.csv", header=None, names=column_names, skiprows=1)
data_set.head()

# Force these columns to a numeric dtype. pd.to_numeric raises by default if a
# value cannot be parsed, so bad rows fail loudly here rather than in the model.
covert_col = ["Pregnant", "Insulin", "BMI", "Age", "BP", "Pedigree"]
for col in covert_col:
    # The loop body must be indented; without it the file does not parse.
    data_set[col] = pd.to_numeric(data_set[col])
Select the independent variables (X) and the dependent variable (y).
# Independent variables (X): the predictor columns handed to the model.
# "Skin" is not part of this list.
feature_col = [
    "Pregnant",
    "Insulin",
    "BMI",
    "Age",
    "Glucose",
    "BP",
    "Pedigree",
]
X = data_set.loc[:, feature_col]

# Dependent variable (y): the "DM" outcome column.
y = data_set["DM"]
# Pairwise correlation matrix of the data set's columns (pandas' default
# method is Pearson). `corr` has to be computed here: it is not defined
# anywhere else, and using it undefined raises NameError.
corr = data_set.corr()

plt.figure(figsize=(40, 30))

# Keep only coefficients >= 0.3 or <= -0.1. Boolean-mask indexing turns every
# other cell into NaN, which the heatmap leaves blank.
coor_range = corr[(corr >= 0.3) | (corr <= -0.1)]
sns.heatmap(
    coor_range,
    vmax=0.8,
    linewidths=0.01,
    square=True,
    annot=True,
    cmap='GnBu',
    linecolor="white",
    cbar_kws={'label': 'Feature Correlation Color'},
)
plt.title('Correlation between features of Pima Datasets')
plt.ylabel("Feature Values on Y axis")
plt.xlabel("Feature Values on X axis")
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train a logistic-regression classifier with scikit-learn's default settings
# (fit() returns the estimator itself), then label the held-out rows.
logistic_function = LogisticRegression().fit(X_train, y_train)
y_prediction = logistic_function.predict(X_test)
from sklearn import metrics
# Confusion matrix of the held-out predictions. With scikit-learn's convention
# rows are the true classes and columns are the predicted classes.
cnf_matrix_evaluation = metrics.confusion_matrix(y_test, y_prediction)
cnf_matrix_evaluation

class_names = [0, 1]  # 0 = not suffering from DM, 1 = suffering from DM
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
# NOTE(review): seaborn's heatmap sets its own ticks when it draws, so these
# two calls are likely overridden -- confirm before relying on them.
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# Draw on the axes created above explicitly rather than on whichever axes
# happens to be current.
sns.heatmap(
    pd.DataFrame(cnf_matrix_evaluation),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    ax=ax,
)
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title("Confusion Matrix: Diabetes Patient", y=1.1)
plt.ylabel("Actual DMs")  # label typo fixed (was "Acutal DMs")
plt.xlabel("Predicted DMs")
# Predicted probability of the positive class (column 1 of predict_proba's
# output) for every held-out row.
y_prediction_probability = logistic_function.predict_proba(X_test)[:, 1]

# ROC curve points and the area under that curve.
fpr, tpr, _ = metrics.roc_curve(y_test, y_prediction_probability)
auc = metrics.roc_auc_score(y_test, y_prediction_probability)

# Start a fresh figure. Without it plt.plot draws on the current axes, which
# is the confusion-matrix heatmap when the file is run top to bottom.
plt.figure()
plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)  # loc=4 is the lower-right corner
plt.show()
[1] Gohdes D: Diabetes in North American Indians and Alaska natives. In Diabetes in America. Washington, DC, U.S. Govt. Printing Office, 1995, p. 1683–1701 (NIH publ. no. 95-1468).