-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcateorical_encoder.py
115 lines (94 loc) · 5.24 KB
/
cateorical_encoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """
    Encode categorical and ordinal DataFrame columns as numeric codes.

    Categorical variables are encoded with one fitted ``LabelEncoder`` per
    column; ordinal variables are encoded with the dictionaries stored in
    ``self.mappings``. Each encoded column is written as ``<name>_encoded``
    and the object-dtype columns seen during ``fit`` are dropped from the
    transformed output.

    Values not seen during ``fit`` (categorical) or absent from the mapping
    (ordinal) are replaced by the encoded mode of the training column.

    Inherits from sklearn's BaseEstimator and TransformerMixin for
    compatibility with scikit-learn's pipeline.
    """

    def __init__(self, ordinal_variables, categorical_variables):
        """
        Initialize the CategoricalEncoder with the given variable lists.

        Args:
            ordinal_variables (list): Ordinal variables to encode through
                ``self.mappings``.
            categorical_variables (list): Categorical variables to encode
                using LabelEncoder.
        """
        self.ordinal_variables = ordinal_variables
        self.categorical_variables = categorical_variables
        self.label_encoders = {}  # feature -> fitted LabelEncoder
        self.modes = {}  # feature -> encoded mode, used as the fallback code
        # Custom mappings for ordinal variables 'Credit_Mix' and
        # 'Payment_of_Min_Amount'.
        self.mappings = {
            'Credit_Mix': {
                'Bad': 0,
                'Standard': 1,
                'Good': 2,
                '_': -1,
            },
            'Payment_of_Min_Amount': {
                'Yes': 1,
                'No': 0,
                'NM': -1,
            },
        }

    def fit(self, X, y=None):
        """
        Fit the label encoders and record the fallback mode of each column.

        Args:
            X (pd.DataFrame): Input dataset to fit the transformer on.
            y (pd.Series, optional): Target variable, not used here.

        Returns:
            self: Fitted transformer with initialized encoders and modes.

        Raises:
            KeyError: If a mapped ordinal column contains no value that is
                present in its mapping.
        """
        # Object-dtype columns are the ones transform() drops after encoding.
        self.original_features = X.select_dtypes(include=['object']).columns
        # Restrict both variable lists to columns that exist in X, so the
        # loop below needs no per-feature membership check.
        self.check_init_list(X)

        for feature in self.categorical_variables:
            encoder = LabelEncoder()
            encoder.fit(X[feature])
            self.label_encoders[feature] = encoder
            # Encoded code of the most frequent training value.
            self.modes[feature] = encoder.transform(X[feature].mode())[0]

        # Record a fallback code for every mapped column present in X
        # (not just the two built-in ones), so mappings added to
        # ``self.mappings`` before fitting are handled as well.
        for feature, mapping in self.mappings.items():
            if feature in X.columns:
                self.modes[feature] = self._ordinal_mode(
                    X[feature], mapping, feature
                )
        return self

    @staticmethod
    def _ordinal_mode(column, mapping, feature):
        """Return the mapped code used to fill unmapped values of *column*."""
        raw_modes = column.mode()
        if len(raw_modes) > 0 and raw_modes.iloc[0] in mapping:
            return mapping[raw_modes.iloc[0]]
        # The most frequent raw value has no mapping (or the column is all
        # missing): fall back to the most frequent *mapped* code instead of
        # failing on the dictionary lookup.
        mapped_modes = column.map(mapping).mode()
        if len(mapped_modes) == 0:
            # KeyError is kept as the failure type for backward compatibility.
            raise KeyError(
                f"No value of '{feature}' is present in its ordinal mapping"
            )
        return mapped_modes.iloc[0]

    def transform(self, X, y=None):
        """
        Transform the dataset by encoding categorical and ordinal variables.

        The input frame is not modified; a new DataFrame is returned.

        Args:
            X (pd.DataFrame): Input dataset to transform.
            y (pd.Series, optional): Target variable, not used here.

        Returns:
            pd.DataFrame: Copy of ``X`` with ``<name>_encoded`` columns added
            and the original categorical columns dropped.
        """
        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()

        for feature in self.categorical_variables:
            if feature not in X.columns:
                continue
            encoder = self.label_encoders[feature]
            fallback = self.modes[feature]
            # Build the class -> code table once per column instead of
            # calling encoder.transform() for every row. NaN classes are
            # left out: NaN never compares equal to itself, so missing
            # values always receive the fallback code.
            codes = {
                label: code
                for code, label in enumerate(encoder.classes_)
                if not (isinstance(label, float) and label != label)
            }
            X[f'{feature}_encoded'] = X[feature].apply(
                lambda value: codes.get(value, fallback)
            )

        for feature in self.ordinal_variables:
            if feature not in X.columns:
                continue
            # map() yields NaN for categories missing from the mapping.
            encoded = X[feature].map(self.mappings[feature])
            # Assign the filled result explicitly: an in-place fillna on
            # X[...] acts on a temporary and is a no-op under pandas
            # Copy-on-Write.
            X[f'{feature}_encoded'] = encoded.fillna(self.modes[feature])

        # Drop the original categorical columns after encoding.
        to_drop = [col for col in self.original_features if col in X.columns]
        return X.drop(columns=to_drop)

    def check_init_list(self, X):
        """
        Keep only the configured variables that are present in the dataset.

        Updates ``self.ordinal_variables`` and ``self.categorical_variables``
        in place.

        Args:
            X (pd.DataFrame): Input dataset.
        """
        # NOTE(review): this overwrites the constructor arguments, so a later
        # fit on a frame with more columns will not re-include variables that
        # were filtered out here.
        features = X.columns
        self.ordinal_variables = [
            c for c in self.ordinal_variables if c in features
        ]
        self.categorical_variables = [
            c for c in self.categorical_variables if c in features
        ]