-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhospital_readmission_model.py
140 lines (119 loc) · 5.65 KB
/
hospital_readmission_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""
model.py
Description:
This script is designed to facilitate the end-to-end process of training machine learning models specifically tailored for binary classification tasks. It incorporates several key functionalities:
1. Data Preprocessing: Handles missing values and encodes categorical variables to prepare data for modeling.
2. Model Training: Implements a machine learning pipeline using LightGBM through GridSearchCV for hyperparameter optimization. The script utilizes advanced techniques such as SMOTEENN for handling imbalanced datasets to enhance model performance.
3. Performance Visualization: Provides functions to plot ROC curves and calculate the accuracy specifically for 'Unadmitted' patients, helping in the evaluation of the model's performance on classification tasks.
The script is structured to be used with a dataset where users specify input features and target variables. It outputs optimal model parameters, evaluation metrics, and visualizations to assess the model's efficacy in predicting binary outcomes.
"""
import logging
import pandas as pd
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline as ImPipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
class BaseModel:
"""Base class for common functionalities in data preprocessing and modeling."""
def __init__(self):
logging.info(f"{self.__class__.__name__} initialized.")
def log_message(self, message: str):
"""Logs a given message."""
logging.info(message)
class DataPreprocessor(BaseModel):
"""Preprocesses data by filling missing values and encoding categorical variables."""
def __init__(self):
super().__init__()
def fit_transform(self, X: pd.DataFrame) -> pd.DataFrame:
"""
Identify numeric and categorical columns and transform the dataset.
Fills missing values, encodes categorical variables, and checks for invalid data.
"""
X = X.copy()
# Fill missing values for numeric columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
for col in numeric_cols:
X[col] = X[col].fillna(X[col].median())
# Fill missing values for categorical columns and encode
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
# Convert all columns to numeric if possible
X = X.apply(pd.to_numeric, errors='coerce')
# Check for NaNs after conversion and handle them
if X.isnull().values.any():
self.log_message("Data contains NaN values after preprocessing. Replacing NaNs with 0.")
X = X.fillna(0)
return X
class HospitalReadmissionModel(BaseModel):
"""A machine learning model for predicting hospital readmissions."""
def __init__(self):
super().__init__()
def train_model(self, X: pd.DataFrame, y: pd.Series) -> Pipeline:
"""
Trains a machine learning model using LightGBM with GridSearchCV for hyperparameter optimization.
Utilizes SMOTEENN for handling imbalanced datasets and 'force_col_wise' optimization.
"""
self.log_message("Starting model training with GridSearchCV using LightGBM with 'force_col_wise' optimization...")
pipeline = ImPipeline([
('scaler', StandardScaler()),
('smote', SMOTEENN(smote=SMOTE(sampling_strategy='auto'))),
('classifier', LGBMClassifier(
random_state=42,
n_jobs=-1,
class_weight='balanced',
force_col_wise=True
))
])
param_grid = {
'classifier__num_leaves': [31, 61],
'classifier__max_depth': [-1, 10],
'classifier__learning_rate': [0.05, 0.1],
'classifier__n_estimators': [100, 200]
}
model_search = GridSearchCV(
estimator=pipeline,
param_grid=param_grid,
scoring='roc_auc',
cv=3,
verbose=1
)
model_search.fit(X, y)
self.log_message(f"Optimal Parameters Found: {model_search.best_params_}")
return model_search.best_estimator_
def plot_roc_curve(
self, y_test: pd.Series,
predictions: pd.Series,
y_scores: pd.Series
) -> None:
"""
Plots ROC curve and accuracy for unadmitted patients.
"""
fpr, tpr, _ = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.subplot(1, 2, 2)
unadmitted_accuracy = accuracy_score(y_test == 0, predictions == 0)
plt.bar(['Unadmitted'], [unadmitted_accuracy], color='blue')
plt.ylim([0, 1])
plt.ylabel('Accuracy')
plt.title('Accuracy for Unadmitted Patients')
plt.show()