From fcd761e1f622a0af99c6982dc9fdfddf2bf8a02a Mon Sep 17 00:00:00 2001 From: Sander Vanden Hautte Date: Fri, 6 Aug 2021 09:28:26 +0200 Subject: [PATCH 1/4] Issue #67: Complete review of the target encoder code for linear regression: mainly docstring modifications, but verified the approach, it will work for regression without changes. --- cobra/preprocessing/target_encoder.py | 141 +++++++++++++++----------- 1 file changed, 84 insertions(+), 57 deletions(-) diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 0351049..a828545 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -1,6 +1,6 @@ """ Incidence Replacement Module. The implementation is inspired by -https://contrib.scikit-learn.org/categorical-encoding/index.html +https://github.com/scikit-learn-contrib/category_encoders. Authors: @@ -9,7 +9,6 @@ """ import logging -#import numpy as np import pandas as pd from tqdm.auto import tqdm from sklearn.base import BaseEstimator @@ -20,45 +19,69 @@ class TargetEncoder(BaseEstimator): - """Target encoding for categorical features. + """Target encoding for categorical features, inspired by + http://contrib.scikit-learn.org/category_encoders/targetencoder.html. Replace each value of the categorical feature with the average of the target values (in case of a binary target, this is the incidence of the group). This encoding scheme is also called Mean encoding. + Note that, when applying this target encoding, values of the categorical + feature that have not been seen during fit will be imputed according to the + configured imputation strategy: replacement with the mean, minimum or + maximum value of the categorical variable. + The main problem with Target encoding is overfitting; the fact that we are encoding the feature based on target classes may lead to data leakage, - rendering the feature biased. This can be solved using some type of - regularization. 
A popular way to handle this is to use cross-validation - and compute the means in each out-of-fold. However, the approach - implemented here makes use of additive smoothing - (https://en.wikipedia.org/wiki/Additive_smoothing) + rendering the feature biased. + This can be solved using some type of regularization. A popular way to + handle this is to use cross-validation and compute the means in each + out-of-fold. However, the approach implemented here makes use of + additive smoothing (https://en.wikipedia.org/wiki/Additive_smoothing). + + In summary: + + - with a binary classification target, a value of a categorical variable is + replaced with: + + [count(variable=value) * P(target=1|variable=value) + weight * P(target=1)] + / [count(variable=value) + weight] + + - with a regression target, a value of a categorical variable is replaced + with: + + [count(variable=value) * E(target|variable=value) + weight * E(target)] + / [count(variable=value) + weight] Attributes ---------- imputation_strategy : str in case there is a particular column which contains new categories, the encoding will lead to NULL values which should be imputed. - Valid strategies are to replace with the global mean of the train - set or the min (resp. max) incidence of the categories of that - particular variable. + Valid strategies then are to replace the NULL values with the global + mean of the train set or the min (resp. max) incidence of the + categories of that particular variable. weight : float - Smoothing parameters (non-negative). The higher the value of the - parameter, the bigger the contribution of the overall mean. When set to - zero, there is no smoothing (e.g. the pure target incidence is used). + Smoothing parameter (non-negative). 
The higher the value of the + parameter, the bigger the contribution of the overall mean of targets + learnt from all training data (prior) and the smaller the contribution + of the mean target learnt from data with the current categorical value + (posterior), so the bigger the smoothing (regularization) effect. + When set to zero, there is no smoothing (i.e. the mean target of the + current categorical value is used). """ - valid_strategies = ("mean", "min", "max") + valid_imputation_strategies = ("mean", "min", "max") def __init__(self, weight: float=0.0, imputation_strategy: str="mean"): if weight < 0: raise ValueError("The value of weight cannot be smaller than zero") - elif imputation_strategy not in self.valid_strategies: + elif imputation_strategy not in self.valid_imputation_strategies: raise ValueError("Valid options for 'imputation_strategy' are {}." - " Got imputation_strategy={!r} instead" - .format(self.valid_strategies, + " Got imputation_strategy={!r} instead." + .format(self.valid_imputation_strategies, imputation_strategy)) self.weight = weight @@ -69,7 +92,7 @@ def __init__(self, weight: float=0.0, self._global_mean = None def attributes_to_dict(self) -> dict: - """Return the attributes of TargetEncoder in a dictionary + """Return the attributes of TargetEncoder in a dictionary. Returns ------- @@ -98,13 +121,11 @@ def set_attributes_from_dict(self, params: dict): Contains the attributes of TargetEncoder with their names as key. 
""" - if "weight" in params and type(params["weight"]) == float: self.weight = params["weight"] if ("imputation_strategy" in params and - params["imputation_strategy"] in self.valid_strategies): - + params["imputation_strategy"] in self.valid_imputation_strategies): self.imputation_strategy = params["imputation_strategy"] if "_global_mean" in params and type(params["_global_mean"]) == float: @@ -128,7 +149,7 @@ def dict_to_series(key, value): def fit(self, data: pd.DataFrame, column_names: list, target_column: str): - """Fit the TargetEncoder to the data + """Fit the TargetEncoder to the data. Parameters ---------- @@ -140,7 +161,6 @@ def fit(self, data: pd.DataFrame, column_names: list, target_column : str Column name of the target """ - # compute global mean (target incidence in case of binary target) y = data[target_column] self._global_mean = y.sum() / y.count() @@ -154,7 +174,9 @@ def fit(self, data: pd.DataFrame, column_names: list, self._mapping[column] = self._fit_column(data[column], y) def _fit_column(self, X: pd.Series, y: pd.Series) -> pd.Series: - """Summary + """Replace the values of a column, holding a categorical value, + with a new value reflecting the formulas mentioned in the docstring + of this class. Parameters ---------- @@ -162,100 +184,103 @@ def _fit_column(self, X: pd.Series, y: pd.Series) -> pd.Series: data used to compute the encoding mapping for an individual categorical variable. y : pd.Series - series containing the targets for each observation + series containing the targets for each observation (value) of + this categorical variable. Returns ------- pd.Series - Mapping containing the value to replace each group of the - categorical with. + Mapping containing the new value to replace each distinct value + of the categorical variable with. 
""" stats = y.groupby(X).agg(["mean", "count"]) - # Note if self.weight = 0, we have the ordinary incidence replacement - numerator = (stats["count"]*stats["mean"] + # Note: if self.weight = 0, we have the ordinary incidence replacement + numerator = (stats["count"] * stats["mean"] + self.weight * self._global_mean) denominator = stats["count"] + self.weight - return numerator/denominator + return numerator / denominator def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: - """Replace (e.g. encode) categories of each column with its average - incidence which was computed when the fit method was called + """Replace (e.g. encode) values of each categorical column with a + new value (reflecting the corresponding average target value, + optionally smoothed by a regularization weight), + which was computed when the fit method was called. Parameters ---------- - X : pd.DataFrame - data to encode + data : pd.DataFrame + the data to encode. column_names : list - Columns of data to be encoded + the name of the categorical columns in the data to be encoded. Returns ------- pd.DataFrame - transformed data + the resulting transformed data. Raises ------ NotFittedError Exception when TargetEncoder was not fitted before calling this - method - + method. """ if (len(self._mapping) == 0) or (self._global_mean is None): msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") - raise NotFittedError(msg.format(self.__class__.__name__)) for column in tqdm(column_names, desc="Applying target encoding..."): - if column not in data.columns: - log.warning("Unknown column '{}' will be skipped" + log.warning("Unknown column '{}' will be skipped." 
.format(column)) continue elif column not in self._mapping: log.warning("Column '{}' is not in fitted output " - "and will be skipped".format(column)) + "and will be skipped.".format(column)) continue - data = self._transform_column(data, column) return data def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFrame: - """Replace (e.g. encode) categories of each column with its average - incidence which was computed when the fit method was called + """Replace (e.g. encode) values of a categorical column with a + new value (reflecting the corresponding average target value, + optionally smoothed by a regularization weight), + which was computed when the fit method was called. Parameters ---------- - X : pd.DataFrame - data to encode + data : pd.DataFrame + the data to encode. column_name : str - Name of the column in data to be encoded + the name of the column in the data to be encoded. Returns ------- pd.DataFrame - transformed data + the resulting transformed data. 
""" new_column = TargetEncoder._clean_column_name(column_name) - # Convert dtype to float because when the original dtype - # is of type "category", the resulting dtype is also of type - # "category" + # Convert dtype to float, because when the original dtype + # is of type "category", the resulting dtype would otherwise also be of + # type "category": data[new_column] = (data[column_name].map(self._mapping[column_name]) .astype("float")) # In case of categorical data, it could be that new categories will # emerge which were not present in the train set, so this will result - # in missing values (which should be replaced) + # in missing values, which should be replaced according to the + # configured imputation strategy: if data[new_column].isnull().sum() > 0: if self.imputation_strategy == "mean": - data[new_column].fillna(self._global_mean, inplace=True) + data[new_column].fillna(self._global_mean, + inplace=True) elif self.imputation_strategy == "min": data[new_column].fillna(data[new_column].min(), inplace=True) @@ -282,14 +307,16 @@ def fit_transform(self, data: pd.DataFrame, Returns ------- pd.DataFrame - data with additional discretized variables + data with additional columns, holding the target-encoded variables. """ self.fit(data, column_names, target_column) return self.transform(data, column_names) @staticmethod def _clean_column_name(column_name: str) -> str: - """Clean column name string by removing "_bin" and adding "_enc" + """Generate a name for the new column that this target encoder + generates in the given data, by removing "_bin", "_processed" or + "_cleaned" from the original categorical column, and adding "_enc". Parameters ---------- From 872da784966a02d6097f2389a26446caf2c71791 Mon Sep 17 00:00:00 2001 From: Sander Vanden Hautte Date: Fri, 6 Aug 2021 15:35:45 +0200 Subject: [PATCH 2/4] Issue #67: Unit testing target encoder for linear regression. 
--- tests/preprocessing/test_target_encoder.py | 169 ++++++++++++++++++--- 1 file changed, 152 insertions(+), 17 deletions(-) diff --git a/tests/preprocessing/test_target_encoder.py b/tests/preprocessing/test_target_encoder.py index 609f9b1..d6007c9 100644 --- a/tests/preprocessing/test_target_encoder.py +++ b/tests/preprocessing/test_target_encoder.py @@ -1,18 +1,22 @@ import pytest import pandas as pd +from sklearn.exceptions import NotFittedError from cobra.preprocessing.target_encoder import TargetEncoder class TestTargetEncoder: - def test_target_encoder_constructor_value_error(self): + def test_target_encoder_constructor_weight_value_error(self): with pytest.raises(ValueError): TargetEncoder(weight=-1) + def test_target_encoder_constructor_imputation_value_error(self): + with pytest.raises(ValueError): + TargetEncoder(imputation_strategy="median") + # Tests for attributes_attributes_to_dict and set_attributes_from_dict def test_target_encoder_attributes_to_dict(self): - encoder = TargetEncoder() mapping_data = pd.Series(data=[0.333333, 0.50000, 0.666667], @@ -40,7 +44,6 @@ def test_target_encoder_attributes_to_dict(self): ["weight", "mapping"], ids=["test_weight", "test_mapping"]) def test_target_encoder_set_attributes_from_dict_unfitted(self, attribute): - encoder = TargetEncoder() data = {"weight": 1.0} @@ -58,7 +61,6 @@ def test_target_encoder_set_attributes_from_dict_unfitted(self, attribute): assert expected == actual def test_target_encoder_set_attributes_from_dict(self): - encoder = TargetEncoder() data = {"weight": 0.0, @@ -79,9 +81,8 @@ def test_target_encoder_set_attributes_from_dict(self): pd.testing.assert_series_equal(actual, expected) - # Tests for _fit_column - def test_target_encoder_fit_column(self): - + # Tests for _fit_column: + def test_target_encoder_fit_column_binary_classification(self): df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', 'neutral', 'negative', 'positive', 'negative', 'neutral', 'neutral', @@ -98,8 
+99,24 @@ def test_target_encoder_fit_column(self): pd.testing.assert_series_equal(actual, expected) - def test_target_encoder_fit_column_global_mean(self): + def test_target_encoder_fit_column_linear_regression(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + + encoder = TargetEncoder() + encoder._global_mean = 0.454545 + actual = encoder._fit_column(X=df.variable, y=df.target) + + expected = pd.Series(data=[-4.666667, 0.250000, 4.500000], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + + pd.testing.assert_series_equal(actual, expected) + def test_target_encoder_fit_column_global_mean_binary_classification(self): df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', 'neutral', 'negative', 'positive', 'negative', 'neutral', 'neutral', @@ -117,9 +134,33 @@ def test_target_encoder_fit_column_global_mean(self): pd.testing.assert_series_equal(actual, expected) - # Tests for fit method - def test_target_encoder_fit(self): + def test_target_encoder_fit_column_global_mean_linear_regression(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + + encoder = TargetEncoder(weight=1) + encoder._global_mean = 0.454545 + + actual = encoder._fit_column(X=df.variable, y=df.target) + + # expected new value: + # [count of the value * its mean encoding + weight (= 1) * global mean] + # / [count of the value + weight (=1)]. 
+ expected = pd.Series(data=[(3 * -4.666667 + 1 * 0.454545) / (3 + 1), + (4 * 0.250000 + 1 * 0.454545) / (4 + 1), + (4 * 4.500000 + 1 * 0.454545) / (4 + 1)], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + + pd.testing.assert_series_equal(actual, expected) + # Tests for fit method + def test_target_encoder_fit_binary_classification(self): + # test_target_encoder_fit_column_binary_classification() tested on one + # column input as a pandas Series; this test runs on a dataframe input. df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', 'neutral', 'negative', 'positive', 'negative', 'neutral', 'neutral', @@ -136,9 +177,41 @@ def test_target_encoder_fit(self): pd.testing.assert_series_equal(actual, expected) + def test_target_encoder_fit_linear_regression(self): + # test_target_encoder_fit_column_linear_regression() tested on one + # column input as a pandas Series; this test runs on a dataframe input. + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + + encoder = TargetEncoder() + encoder.fit(data=df, column_names=["variable"], target_column="target") + + expected = pd.Series(data=[-4.666667, 0.250000, 4.500000], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + actual = encoder._mapping["variable"] + + pd.testing.assert_series_equal(actual, expected) + # Tests for transform method - def test_target_encoder_transform(self): + def test_target_encoder_transform_when_not_fitted(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + encoder = TargetEncoder() + with 
pytest.raises(NotFittedError): + encoder.transform(data=df, column_names=["variable"]) + + def test_target_encoder_transform_binary_classification(self): df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', 'neutral', 'negative', 'positive', 'negative', 'neutral', 'neutral', @@ -159,8 +232,28 @@ def test_target_encoder_transform(self): pd.testing.assert_frame_equal(actual, expected) - def test_target_encoder_transform_new_category(self): + def test_target_encoder_transform_linear_regression(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + + expected = df.copy() + expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000, + -4.666667, 4.500000, -4.666667, 0.250000, + 0.250000, 0.250000, 4.500000] + + encoder = TargetEncoder() + encoder.fit(data=df, column_names=["variable"], target_column="target") + actual = encoder.transform(data=df, column_names=["variable"]) + + pd.testing.assert_frame_equal(actual, expected) + def test_target_encoder_transform_new_category_binary_classification(self): df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', 'neutral', 'negative', 'positive', 'negative', 'neutral', 'neutral', @@ -185,10 +278,35 @@ def test_target_encoder_transform_new_category(self): pd.testing.assert_frame_equal(actual, expected) - # Tests for _clean_column_name - def test_target_encoder_clean_column_name(self): + def test_target_encoder_transform_new_category_linear_regression(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) - column_name = "test_column" + 
df_appended = df.append({"variable": "new", "target": 10}, + ignore_index=True) + + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + df_appended["variable"] = df_appended["variable"].astype("category") + + expected = df_appended.copy() + expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000, + -4.666667, 4.500000, -4.666667, 0.250000, + 0.250000, 0.250000, 4.500000, + -4.666667] # min imputation for new value + + encoder = TargetEncoder(imputation_strategy="min") + encoder.fit(data=df, column_names=["variable"], target_column="target") + actual = encoder.transform(data=df_appended, column_names=["variable"]) + + pd.testing.assert_frame_equal(actual, expected) + + # Tests for _clean_column_name: + def test_target_encoder_clean_column_name_binned_column(self): + column_name = "test_column_bin" expected = "test_column_enc" encoder = TargetEncoder() @@ -196,9 +314,26 @@ def test_target_encoder_clean_column_name(self): assert actual == expected - def test_target_encoder_clean_column_name_binned_column(self): + def test_target_encoder_clean_column_name_processed_column(self): + column_name = "test_column_processed" + expected = "test_column_enc" - column_name = "test_column_bin" + encoder = TargetEncoder() + actual = encoder._clean_column_name(column_name) + + assert actual == expected + + def test_target_encoder_clean_column_name_cleaned_column(self): + column_name = "test_column_cleaned" + expected = "test_column_enc" + + encoder = TargetEncoder() + actual = encoder._clean_column_name(column_name) + + assert actual == expected + + def test_target_encoder_clean_column_other_name(self): + column_name = "test_column" expected = "test_column_enc" encoder = TargetEncoder() From c48919147ff9be666ab5aa74af398437852a2c2b Mon Sep 17 00:00:00 2001 From: Sander Vanden Hautte Date: Fri, 6 Aug 2021 16:11:39 +0200 Subject: [PATCH 3/4] Issue #67: Adding warning about potential overfitting in target encoding when 
weight=0. --- cobra/preprocessing/target_encoder.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index a828545..603779b 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -78,6 +78,10 @@ def __init__(self, weight: float=0.0, if weight < 0: raise ValueError("The value of weight cannot be smaller than zero") + elif weight == 0: + log.warning("The target encoder's additive smoothing weight is " + "set to 0. This disables smoothing and may make the " + "encoding prone to overfitting.") elif imputation_strategy not in self.valid_imputation_strategies: raise ValueError("Valid options for 'imputation_strategy' are {}." " Got imputation_strategy={!r} instead." From 0165d4af9ade6c6bed756d597b6e3eb226693aad Mon Sep 17 00:00:00 2001 From: Sander Vanden Hautte Date: Fri, 6 Aug 2021 16:12:57 +0200 Subject: [PATCH 4/4] Issue #67: Adding warning about potential overfitting in target encoding when weight=0. (part 2, fixing an oopsie.) --- cobra/preprocessing/target_encoder.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 603779b..64cdfcd 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -78,16 +78,17 @@ def __init__(self, weight: float=0.0, if weight < 0: raise ValueError("The value of weight cannot be smaller than zero") - elif weight == 0: - log.warning("The target encoder's additive smoothing weight is " - "set to 0. This disables smoothing and may make the " - "encoding prone to overfitting.") elif imputation_strategy not in self.valid_imputation_strategies: raise ValueError("Valid options for 'imputation_strategy' are {}." " Got imputation_strategy={!r} instead." 
.format(self.valid_imputation_strategies, imputation_strategy)) + if weight == 0: + log.warning("The target encoder's additive smoothing weight is " + "set to 0. This disables smoothing and may make the " + "encoding prone to overfitting.") + self.weight = weight self.imputation_strategy = imputation_strategy