Skip to content

Commit

Permalink
Merge pull request #87 from PythonPredictions/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
sandervh14 authored Aug 11, 2021
2 parents 7459b71 + 953e81b commit a44a693
Show file tree
Hide file tree
Showing 2 changed files with 241 additions and 74 deletions.
146 changes: 89 additions & 57 deletions cobra/preprocessing/target_encoder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""
Incidence Replacement Module. The implementation is inspired by
https://contrib.scikit-learn.org/categorical-encoding/index.html
https://github.com/scikit-learn-contrib/category_encoders.
Authors:
Expand All @@ -9,7 +9,6 @@
"""
import logging

#import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.base import BaseEstimator
Expand All @@ -20,47 +19,76 @@

class TargetEncoder(BaseEstimator):

"""Target encoding for categorical features.
"""Target encoding for categorical features, inspired by
http://contrib.scikit-learn.org/category_encoders/targetencoder.html.
Replace each value of the categorical feature with the average of the
target values (in case of a binary target, this is the incidence of the
group). This encoding scheme is also called Mean encoding.
Note that, when applying this target encoding, values of the categorical
feature that have not been seen during fit will be imputed according to the
configured imputation strategy: replacement with the mean, minimum or
maximum value of the categorical variable.
The main problem with Target encoding is overfitting; the fact that we are
encoding the feature based on target classes may lead to data leakage,
rendering the feature biased. This can be solved using some type of
regularization. A popular way to handle this is to use cross-validation
and compute the means in each out-of-fold. However, the approach
implemented here makes use of additive smoothing
(https://en.wikipedia.org/wiki/Additive_smoothing)
rendering the feature biased.
This can be solved using some type of regularization. A popular way to
handle this is to use cross-validation and compute the means in each
out-of-fold. However, the approach implemented here makes use of
additive smoothing (https://en.wikipedia.org/wiki/Additive_smoothing).
In summary:
- with a binary classification target, a value of a categorical variable is
replaced with:
[count(variable=value) * P(target=1|variable=value) + weight * P(target=1)]
/ [count(variable=value) + weight]
- with a regression target, a value of a categorical variable is replaced
with:
[count(variable=value) * E(target|variable=value) + weight * E(target)]
/ [count(variable=value) + weight]
Attributes
----------
imputation_strategy : str
in case there is a particular column which contains new categories,
the encoding will lead to NULL values which should be imputed.
Valid strategies are to replace with the global mean of the train
set or the min (resp. max) incidence of the categories of that
particular variable.
Valid strategies then are to replace the NULL values with the global
mean of the train set or the min (resp. max) incidence of the
categories of that particular variable.
weight : float
Smoothing parameters (non-negative). The higher the value of the
parameter, the bigger the contribution of the overall mean. When set to
zero, there is no smoothing (e.g. the pure target incidence is used).
Smoothing parameter (non-negative). The higher the value of the
parameter, the bigger the contribution of the overall mean of targets
learnt from all training data (prior) and the smaller the contribution
of the mean target learnt from data with the current categorical value
(posterior), so the bigger the smoothing (regularization) effect.
When set to zero, there is no smoothing (i.e. the mean target of the
current categorical value is used).
"""

valid_strategies = ("mean", "min", "max")
valid_imputation_strategies = ("mean", "min", "max")

def __init__(self, weight: float=0.0,
imputation_strategy: str="mean"):

if weight < 0:
raise ValueError("The value of weight cannot be smaller than zero")
elif imputation_strategy not in self.valid_strategies:
elif imputation_strategy not in self.valid_imputation_strategies:
raise ValueError("Valid options for 'imputation_strategy' are {}."
" Got imputation_strategy={!r} instead"
.format(self.valid_strategies,
" Got imputation_strategy={!r} instead."
.format(self.valid_imputation_strategies,
imputation_strategy))

if weight == 0:
log.warning("The target encoder's additive smoothing weight is "
"set to 0. This disables smoothing and may make the "
"encoding prone to overfitting.")

self.weight = weight
self.imputation_strategy = imputation_strategy

Expand All @@ -69,7 +97,7 @@ def __init__(self, weight: float=0.0,
self._global_mean = None

def attributes_to_dict(self) -> dict:
"""Return the attributes of TargetEncoder in a dictionary
"""Return the attributes of TargetEncoder in a dictionary.
Returns
-------
Expand Down Expand Up @@ -98,13 +126,11 @@ def set_attributes_from_dict(self, params: dict):
Contains the attributes of TargetEncoder with their
names as key.
"""

if "weight" in params and type(params["weight"]) == float:
self.weight = params["weight"]

if ("imputation_strategy" in params and
params["imputation_strategy"] in self.valid_strategies):

params["imputation_strategy"] in self.valid_imputation_strategies):
self.imputation_strategy = params["imputation_strategy"]

if "_global_mean" in params and type(params["_global_mean"]) == float:
Expand All @@ -128,7 +154,7 @@ def dict_to_series(key, value):

def fit(self, data: pd.DataFrame, column_names: list,
target_column: str):
"""Fit the TargetEncoder to the data
"""Fit the TargetEncoder to the data.
Parameters
----------
Expand All @@ -140,7 +166,6 @@ def fit(self, data: pd.DataFrame, column_names: list,
target_column : str
Column name of the target
"""

# compute global mean (target incidence in case of binary target)
y = data[target_column]
self._global_mean = y.sum() / y.count()
Expand All @@ -154,108 +179,113 @@ def fit(self, data: pd.DataFrame, column_names: list,
self._mapping[column] = self._fit_column(data[column], y)

def _fit_column(self, X: pd.Series, y: pd.Series) -> pd.Series:
"""Summary
"""Replace the values of a column, holding a categorical value,
with a new value reflecting the formulas mentioned in the docstring
of this class.
Parameters
----------
X : pd.Series
data used to compute the encoding mapping for an individual
categorical variable.
y : pd.Series
series containing the targets for each observation
series containing the targets for each observation (value) of
this categorical variable.
Returns
-------
pd.Series
Mapping containing the value to replace each group of the
categorical with.
Mapping containing the new value to replace each distinct value
of the categorical variable with.
"""
stats = y.groupby(X).agg(["mean", "count"])

# Note if self.weight = 0, we have the ordinary incidence replacement
numerator = (stats["count"]*stats["mean"]
# Note: if self.weight = 0, we have the ordinary incidence replacement
numerator = (stats["count"] * stats["mean"]
+ self.weight * self._global_mean)

denominator = stats["count"] + self.weight

return numerator/denominator
return numerator / denominator

def transform(self, data: pd.DataFrame,
              column_names: list) -> pd.DataFrame:
    """Encode the requested categorical columns of ``data``.

    Each categorical value is replaced by a new value (the corresponding
    average target value, optionally smoothed by a regularization
    weight) that was computed when the fit method was called.

    Parameters
    ----------
    data : pd.DataFrame
        the data to encode.
    column_names : list
        the name of the categorical columns in the data to be encoded.

    Returns
    -------
    pd.DataFrame
        the resulting transformed data.

    Raises
    ------
    NotFittedError
        Exception when TargetEncoder was not fitted before calling this
        method.
    """
    is_fitted = len(self._mapping) > 0 and self._global_mean is not None
    if not is_fitted:
        msg = ("This {} instance is not fitted yet. Call 'fit' with "
               "appropriate arguments before using this method.")
        raise NotFittedError(msg.format(self.__class__.__name__))

    for column in tqdm(column_names, desc="Applying target encoding..."):
        if column not in data.columns:
            # Column absent from the data altogether.
            log.warning("Unknown column '{}' will be skipped."
                        .format(column))
        elif column not in self._mapping:
            # Column present in the data but never seen during fit.
            log.warning("Column '{}' is not in fitted output "
                        "and will be skipped.".format(column))
        else:
            data = self._transform_column(data, column)

    return data

def _transform_column(self, data: pd.DataFrame,
column_name: str) -> pd.DataFrame:
"""Replace (e.g. encode) categories of each column with its average
incidence which was computed when the fit method was called
"""Replace (e.g. encode) values of a categorical column with a
new value (reflecting the corresponding average target value,
optionally smoothed by a regularization weight),
which was computed when the fit method was called.
Parameters
----------
X : pd.DataFrame
data to encode
data : pd.DataFrame
the data to encode.
column_name : str
Name of the column in data to be encoded
the name of the column in the data to be encoded.
Returns
-------
pd.DataFrame
transformed data
the resulting transformed data.
"""
new_column = TargetEncoder._clean_column_name(column_name)

# Convert dtype to float because when the original dtype
# is of type "category", the resulting dtype is also of type
# "category"
# Convert dtype to float, because when the original dtype
# is of type "category", the resulting dtype would otherwise also be of
# type "category":
data[new_column] = (data[column_name].map(self._mapping[column_name])
.astype("float"))

# In case of categorical data, it could be that new categories will
# emerge which were not present in the train set, so this will result
# in missing values (which should be replaced)
# in missing values, which should be replaced according to the
# configured imputation strategy:
if data[new_column].isnull().sum() > 0:
if self.imputation_strategy == "mean":
data[new_column].fillna(self._global_mean, inplace=True)
data[new_column].fillna(self._global_mean,
inplace=True)
elif self.imputation_strategy == "min":
data[new_column].fillna(data[new_column].min(),
inplace=True)
Expand All @@ -282,14 +312,16 @@ def fit_transform(self, data: pd.DataFrame,
Returns
-------
pd.DataFrame
data with additional discretized variables
data with additional columns, holding the target-encoded variables.
"""
self.fit(data, column_names, target_column)
return self.transform(data, column_names)

@staticmethod
def _clean_column_name(column_name: str) -> str:
"""Clean column name string by removing "_bin" and adding "_enc"
"""Generate a name for the new column that this target encoder
generates in the given data, by removing "_bin", "_processed" or
"_cleaned" from the original categorical column, and adding "_enc".
Parameters
----------
Expand Down
Loading

0 comments on commit a44a693

Please sign in to comment.