From b109798f737a59991b7d6b9ae2ea5166438e1a68 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 18 Sep 2019 10:48:34 +0200 Subject: [PATCH 01/98] Add KBinsDiscretizer to new preprocessing module Created a new preprocessing module and add KBinsDiscretizer as the first class to that module. Also included unittests for that class. --- cobra/preprocessing/__init__.py | 0 cobra/preprocessing/kbins_discretizer.py | 398 ++++++++++++++++++ tests/preprocessing/__init__.py | 0 tests/preprocessing/test_kbins_discretizer.py | 73 ++++ 4 files changed, 471 insertions(+) create mode 100644 cobra/preprocessing/__init__.py create mode 100644 cobra/preprocessing/kbins_discretizer.py create mode 100644 tests/preprocessing/__init__.py create mode 100644 tests/preprocessing/test_kbins_discretizer.py diff --git a/cobra/preprocessing/__init__.py b/cobra/preprocessing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py new file mode 100644 index 0000000..efd0e54 --- /dev/null +++ b/cobra/preprocessing/kbins_discretizer.py @@ -0,0 +1,398 @@ +""" +This class is a rework of +https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/preprocessing/_discretization.py +However, it is purely written in pandas instead of numpy because +it is more intuitive +""" +# standard lib imports +from copy import deepcopy +from typing import List +import numbers + +import logging +log = logging.getLogger(__name__) + +# third party imports +import numpy as np +import pandas as pd + +from sklearn.exceptions import NotFittedError + + +class KBinsDiscretizer: + + """Bin continuous data into intervals of predefined size + + Attributes + ---------- + auto_adapt_bins : bool + reduces the number of bins (starting from n_bins) as a function of + the number of missings + bins_by_column : dict + Placeholder for the fitted output + change_endpoint_format : bool + Whether or not to change the format of the 
lower and upper bins into + "< x" and "> y" resp. + closed : str + Whether to close the bins (intervals) from the left or right + label_format : str + format string to display the bin labels e.g. min - max, (min, max], ... + n_bins : int + Number of bins to produce. Raises ValueError if ``n_bins < 2``. + starting_precision : int + Initial precision for the bin edges to start from, + can also be negative. Given a list of bin edges, the class will + automatically choose the minimal precision required to have proper bins + e.g. [5.5555, 5.5744, ...] will be rounded to [5.56, 5.57, ...]. In + case of a negative number, an attempt will be made to round up the + numbers of the bin edges e.g. 5.55 -> 10, 146 -> 100, ... + strategy : str + Binning strategy. Currently only "uniform" and "quantile" + e.g. equifrequency is supported + """ + + valid_strategies = ("uniform", "quantile") + + def __init__(self, n_bins: int=10, strategy: str="quantile", + closed: str="right", + auto_adapt_bins: bool=False, + starting_precision: int=0, + label_format: str="{} - {}", + change_endpoint_format: bool=False): + + # validate number of bins + self._validate_n_bins(n_bins) + + self.n_bins = n_bins + self.strategy = strategy.lower() + self.closed = closed + self.auto_adapt_bins = auto_adapt_bins + self.starting_precision = starting_precision + self.label_format = label_format + self.change_endpoint_format = change_endpoint_format + + # dict to store fitted output in + self._bins_by_column = {} + + def _validate_n_bins(self, n_bins: int): + """Check if n_bins is of the proper type and if it is bigger than two + + Parameters + ---------- + n_bins : int + Number of bins KBinsDiscretizer has to produce for each variable + + Raises + ------ + ValueError + in case n_bins is not an integer or if n_bins < 2 + """ + if not isinstance(n_bins, numbers.Integral): + raise ValueError("{} received an invalid n_bins type. " + "Received {}, expected int." 
+ .format(KBinsDiscretizer.__name__, + type(n_bins).__name__)) + if n_bins < 2: + raise ValueError("{} received an invalid number " + "of bins. Received {}, expected at least 2." + .format(KBinsDiscretizer.__name__, n_bins)) + + def set_bins_by_columns(self, bins_by_column: List[tuple]): + # To do: add checks! + self._bins_by_column = bins_by_column + + def _compute_minimal_precision_of_cutpoints(self, cutpoints: list) -> int: + """Compute the minimal precision of a list of cutpoints so that we end + up with a strictly ascending sequence of numbers. + The starting_precision attribute will be used as the initial precision. + In case of a negative starting_precision, the bin edges will be rounded + to the nearest 10, 100, ... (e.g. 5.55 -> 10, 246 -> 200, ...) + + Parameters + ---------- + cutpoints : list + The bin edges for binning a continuous variable + + Returns + ------- + int + minimal precision for the bin edges + """ + + precision = self.starting_precision + while True: + cont = False + for a, b in zip(cutpoints, cutpoints[1:]): + if a != b and round(a, precision) == round(b, precision): + # precision is not high enough, so increase + precision += 1 + cont = True # set cont to True to keep looping + break # break out of the for loop + if not cont: + # if minimal precision was found, + # return to break out of while loop + return precision + + def _compute_bins_from_cutpoints(self, cutpoints: list) -> List[tuple]: + """Given a list of bin edges, compute the minimal precision for which + we can make meaningful bins and make those bins + + Parameters + ---------- + cutpoints : list + The bin edges for binning a continuous variable + + Returns + ------- + List[tuple] + A (sorted) list of bins as tuples + """ + # compute the minimal precision of the cutpoints + # this can be a negative number, which then + # rounds numbers to the nearest 10, 100, ... 
+ precision = self._compute_minimal_precision_of_cutpoints(cutpoints) + + bins = [] + for a, b in zip(cutpoints, cutpoints[1:]): + fmt_a = round(a, precision) + fmt_b = round(b, precision) + + bins.append((fmt_a, fmt_b)) + + return bins + + @staticmethod + def _create_index(intervals: List[tuple], + closed: str="right") -> pd.IntervalIndex: + """Create an pd.IntervalIndex based on a list of tuples. + This is basically a wrapper around pd.IntervalIndex.from_tuples + However, the lower bound of the first entry in the list (the lower bin) + is replaced by -np.inf. Similarly, the upper bound of the last entry in + the list (upper bin) is replaced by np.inf. + + Parameters + ---------- + intervals : List[tuple] + a list of tuples describing the intervals + closed : str, optional + Whether the intervals should be closed on the left-side, + right-side, both or neither. + + Returns + ------- + pd.IntervalIndex + Description + """ + # deepcopy variable because we do not want to modify the content + # of intervals (which is still used outside of this function) + _intervals = deepcopy(intervals) + # Modify min and max with -np.inf and np.inf resp. so that these values + # are guaranteed to be included when transforming the data + _intervals[0] = (-np.inf, _intervals[0][1]) + _intervals[-1] = (_intervals[-1][0], np.inf) + + return pd.IntervalIndex.from_tuples(_intervals, closed) + + def _create_bin_labels(self, bins: List[tuple]) -> list: + """Given a list of bins, create a list of string containing the bins + as a string with a specific format (e.g. bin labels) + + Parameters + ---------- + bins : List[tuple] + list of bins + + Returns + ------- + list + list of (formatted) bin labels + """ + bin_labels = [] + for interval in bins: + bin_labels.append(self.label_format.format(interval[0], + interval[1])) + + # Format first and last bin as < x and > y resp. 
+ if self.change_endpoint_format: + bin_labels[0] = "< {}".format(bins[0][1]) + bin_labels[-1] = "> {}".format(bins[-1][0]) + + return bin_labels + + def _fit_column(self, data: pd.DataFrame, + column_name: str) -> List[tuple]: + """Compute bins for a specific column in data + + Parameters + ---------- + data : pd.DataFrame + Description + column_name : str + Description + + Returns + ------- + List[tuple] + list of bins as tuples + """ + + col_min, col_max = data[column_name].min(), data[column_name].max() + + if col_min == col_max: + log.warning("Predictor {} is constant and " + "will be ignored in computation".format(column_name)) + return None + + n_bins = self.n_bins + if self.auto_adapt_bins: + size = len(data.index) + missing_pct = data[column_name].isnull().sum()/size + n_bins = int(max((1 - missing_pct) * n_bins), 2) + + cutpoints = [] + if self.strategy == "quantile": + cutpoints = list(data[column_name] + .quantile(np.linspace(0, 1, n_bins + 1), + interpolation='linear')) + elif self.strategy == "uniform": + cutpoints = list(np.linspace(col_min, col_max, n_bins + 1)) + + # Make sure the cutpoints are unique and sorted + cutpoints = sorted(list(set(cutpoints))) + + if len(cutpoints) < 3: + log.warning("Only 1 bin was found for predictor {} and will be " + "ignored in computation".format(column_name)) + return None + + if len(cutpoints) < n_bins + 1: + log.warning("The number of actual bins for column {} is {} " + "which is smaller than the requested number of bins " + "{}".format(column_name, len(cutpoints) - 1, n_bins)) + + return self._compute_bins_from_cutpoints(cutpoints) + + def fit(self, data: pd.DataFrame, column_names: list): + """Fits the estimator + + Parameters + ---------- + data : pd.DataFrame + Data to be discretized + column_names : list + Columns of data to be discretized + """ + + if self.strategy not in self.valid_strategies: + raise ValueError("{}: valid options for 'strategy' are {}. " + "Got strategy={!r} instead." 
+ .format(KBinsDiscretizer.__name__, + self.valid_strategies, self.strategy)) + + for column_name in column_names: + + bins = self._fit_column(data, column_name) + + # Add to bins_by_column for later use + self._bins_by_column[column_name] = bins + + def _transform_column(self, data: pd.DataFrame, + column_name: str, + bins: List[tuple]) -> pd.DataFrame: + """Given a DataFrame, a column name and a list of bins, + create an additional column which determines the bin in which the value + of column_name lies in. + + Parameters + ---------- + data : pd.DataFrame + Original data to be discretized + column_name : str + name of the column to discretize + bins : List[tuple] + bins to discretize the data into + + Returns + ------- + pd.DataFrame + original DataFrame with an added binned column + """ + + interval_idx = KBinsDiscretizer._create_index(bins, self.closed) + + column_name_bin = column_name + "_bin" + + # use pd.cut to compute bins + data[column_name_bin] = pd.cut(x=data[column_name], + bins=interval_idx) + + # Rename bins so that the output has a proper format + bin_labels = self._create_bin_labels(bins) + + data[column_name_bin] = (data[column_name_bin] + .cat.rename_categories(bin_labels)) + + if data[column_name_bin].isnull().sum() > 0: + + # Add an additional bin for missing values + data[column_name_bin].cat.add_categories(["Missing"], inplace=True) + + # Replace NULL with "Missing" + # Otherwise these will be ignored in groupby + data[column_name_bin].fillna("Missing", inplace=True) + + return data + + def transform(self, data: pd.DataFrame, + column_names: list) -> pd.DataFrame: + """Summary + + Parameters + ---------- + data : pd.DataFrame + Data to be discretized + column_names : list + Columns of data to be discretized + + Returns + ------- + pd.DataFrame + data with additional discretized variables + """ + if len(self._bins_by_column) == 0: + msg = ("{} instance is not fitted yet. 
Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + for column_name in column_names: + if column_name not in self._bins_by_column: + log.warning("Column {} is not in fitted output " + "and will be skipped".format(column_name)) + continue + + # can be None for a column with a constant value! + bins = self._bins_by_column[column_name] + if bins: + data = self._transform_column(data, column_name, bins) + + return data + + def fit_transform(self, data: pd.DataFrame, + column_names: list) -> pd.DataFrame: + """Summary + + Parameters + ---------- + data : pd.DataFrame + Data to be discretized + column_names : list + Columns of data to be discretized + + Returns + ------- + pd.DataFrame + data with additional discretized variables + """ + self.fit(data, column_names) + return self.transform(data, column_names) diff --git a/tests/preprocessing/__init__.py b/tests/preprocessing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/preprocessing/test_kbins_discretizer.py b/tests/preprocessing/test_kbins_discretizer.py new file mode 100644 index 0000000..1b09ae6 --- /dev/null +++ b/tests/preprocessing/test_kbins_discretizer.py @@ -0,0 +1,73 @@ +import pandas as pd +import pytest + +from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer + + +class TestKBinsDiscretizer: + + # tests for _validate_n_bins function + + def test_validate_n_bins_exception_1(self): + + with pytest.raises(ValueError): + KBinsDiscretizer()._validate_n_bins(n_bins=1) + + def test_validate_n_bins_exception_no_integral(self): + + with pytest.raises(ValueError): + KBinsDiscretizer()._validate_n_bins(n_bins=10.5) + + def test_validate_n_bins_valid_n_bins(self): + + KBinsDiscretizer()._validate_n_bins(n_bins=2) + + # tests for _compute_minimal_precision_of_cutpoints + + def test_compute_minimal_precision_of_cutpoints_less_precision(self): + # If starting precision is bigger than actual 
precision, should return + # starting precision + + cutpoints = [-10, 0, 1, 2] + discretizer = KBinsDiscretizer(starting_precision=1) + res = discretizer._compute_minimal_precision_of_cutpoints(cutpoints) + assert res == 1 + + def test_compute_minimal_precision_of_cutpoints_more_precision(self): + # If starting precision is smaller than actual precision, should return + # actual precision + + cutpoints = [-10, 0, 1, 1.01] + discretizer = KBinsDiscretizer() + res = discretizer._compute_minimal_precision_of_cutpoints(cutpoints) + assert res == 2 + + def test_compute_minimal_precision_of_cutpoints_equal_precision(self): + # If starting precision is equal to actual precision, should return + # starting precision + + cutpoints = [-10, 0, 1, 1.1] + discretizer = KBinsDiscretizer(starting_precision=1) + res = discretizer._compute_minimal_precision_of_cutpoints(cutpoints) + assert res == 1 + + def test_compute_minimal_precision_of_cutpoints_negative_start(self): + # Check if negative starting precision also leads to the correct result + + cutpoints = [-10, 0, 1, 2] + discretizer = KBinsDiscretizer(starting_precision=-1) + res = discretizer._compute_minimal_precision_of_cutpoints(cutpoints) + assert res == 0 + + def test_compute_minimal_precision_of_cutpoints_round_up(self): + # Check if negative starting precision leads to rounding up + # bin edges to the nearest multiple of 10 + + cutpoints = [-10, 0, 10, 21] + discretizer = KBinsDiscretizer(starting_precision=-1) + res = discretizer._compute_minimal_precision_of_cutpoints(cutpoints) + assert res == -1 + + # tests for _compute_bins_from_cutpoints + + # tests for _create_bin_labels From 41a79cb308a1df46238b8bb8b596b009ac02178d Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 18 Sep 2019 10:50:39 +0200 Subject: [PATCH 02/98] Add TargetEncoder to preprocessing module --- cobra/preprocessing/target_encoder.py | 205 ++++++++++++++++++++++++++ 1 file changed, 205 insertions(+) create mode 100644 
cobra/preprocessing/target_encoder.py diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py new file mode 100644 index 0000000..d4dad60 --- /dev/null +++ b/cobra/preprocessing/target_encoder.py @@ -0,0 +1,205 @@ +import numpy as np +import pandas as pd + +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.exceptions import NotFittedError + + +# Prototype of TargetEncoder +class TargetEncoder(BaseEstimator, TransformerMixin): + + """Target encoding for categorical features. + + Replace each value of the categorical feature with the average of the + target values (in case of a binary target, this is the incidence of the + group). This encoding scheme is also called Mean encoding. + + The main problem with Target encoding is overfitting; the fact that we are + encoding the feature based on target classes may lead to data leakage, + rendering the feature biased. This can be solved using some type of + regularization. A popular way to handle this is to use cross-validation + and compute the means in each out-of-fold. However, the approach + implemented makes use of additive smoothing + (https://en.wikipedia.org/wiki/Additive_smoothing) + + Attributes + ---------- + columns : list + A list of columns to encode, if None, all string columns will be + encoded. + weight : float + Smoothing parameters (non-negative). The higher the value of the + parameter, the bigger the contribution of the overall mean. When set to + zero, there is no smoothing (e.g. the pure target incidence is used). + """ + + def __init__(self, columns: list=None, weight: float=0.0): + """Constructor + + Parameters + ---------- + columns : list, optional + A list of columns to encode, if None, all string columns will be + encoded. + weight : float, optional + Smoothing parameters (non-negative). The higher the value of the + parameter, the bigger the contribution of the overall mean. When + set to zero, there is no smoothing + (e.g. 
the pure target incidence is used) + """ + if weight < 0: + raise ValueError("The value of weight cannot be smaller than zero") + + self.columns = columns + self.weight = weight + self._mapping = {} # placeholder for fitted output + + # not implemented yet! + # randomized: bool=False, sigma=0.05 + # self.randomized = randomized + # self.sigma = sigma + + @staticmethod + def _get_categorical_columns(data: pd.DataFrame) -> list: + """Get the columns containing categorical data + (dtype "object" or "category") + + Parameters + ---------- + data : pd.DataFrame + Description + + Returns + ------- + list + List of column names containing categorical data + """ + object_columns = data.dtypes[data.dtypes == object].index + categorical_columns = data.dtypes[data.dtypes == "category"].index + + return list(set(object_columns).union(categorical_columns)) + + def fit(self, X: pd.DataFrame, y: pd.Series): + """Fit the TargetEncoder to X and y + + Parameters + ---------- + X : pd.DataFrame + data used to compute the mapping to encode the categorical + variables with. + y : pd.Series + series containing the targets for each observation + + Raises + ------ + ValueError + if the length of X and y are not equal + """ + # The lengths must be equal + if len(X.index) != len(y.index): + raise ValueError("The length of X is {}, but the length of y is {}" + .format(len(X.index), len(y.index))) + + if self.columns is None: + self.columns = TargetEncoder._get_categorical_columns(X) + + # compute global mean (target incidence in case of binary target) + global_mean = y.sum() / y.count() + + for column in self.columns: + self._mapping[column] = self._fit_column(X[column], y, global_mean) + + def _fit_column(self, X: pd.Series, y: pd.Series, + global_mean: float) -> pd.Series: + """Summary + + Parameters + ---------- + X : pd.Series + data used to compute the encoding mapping for an individual + categorical variable. 
+ y : pd.Series + series containing the targets for each observation + global_mean : float + Global mean of the target + + Returns + ------- + pd.Series + Mapping containing the value to replace each group of the + categorical with. + """ + stats = y.groupby(X).agg(["mean", "count"]) + + # To do: add Gaussian noise to the estimate + # Q: do we need to do this here or during the transform phase??? + + # Note if self.weight = 0, we have the ordinary incidence replacement + numerator = stats["count"]*stats["mean"] + self.weight*global_mean + denominator = stats["count"] + self.weight + + return numerator/denominator + + @staticmethod + def _clean_column_name(column_name: str) -> str: + """Clean column name string by removing "_bin" and adding "_enc" + + Parameters + ---------- + column_name : str + column name to be cleaned + + Returns + ------- + str + cleaned column name + """ + if "_bin" in column_name: + return column_name.replace("_bin", "") + "_enc" + else: + return column_name + "_enc" + + def transform(self, X: pd.DataFrame, y: pd.Series=None) -> pd.DataFrame: + """Summary + + Parameters + ---------- + X : pd.DataFrame + data to encode + y : pd.Series, optional + Ignored (added for compatibility with scikit-learn) + + Returns + ------- + pd.DataFrame + transformed data + + Raises + ------ + NotFittedError + Exception when TargetEncoder was not fitted before calling this + method + + """ + if len(self._mapping) == 0: + msg = ("This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + transform_columns = list(X.columns) + new_columns = [] + for column in self.columns: + + if column not in transform_columns: + # skip if this column was not in X + # print WARNING here!!! 
+ continue + + new_column = TargetEncoder._clean_column_name(column) + + X[new_column] = X[column].map(self._mapping[column]) + + new_columns.append(new_column) + + return X[new_columns] From 0b89146e4811bd737e05f113ed9eb3563920dfa3 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 18 Sep 2019 15:24:13 +0200 Subject: [PATCH 03/98] Add a scripts module with a first example script The goal of the scripts module is to provide a set of ready-to-use functions for a set of concrete problems and situations. The first script for example generates a csv file containing the PIG tables for a given dataset, which can then be used to create PIG graphs for every predictor in the dataset. --- cobra/scripts/__init__.py | 0 cobra/scripts/export_pigs.py | 143 +++++++++++++++++++++++++++++++++++ cobra/utils.py | 69 +++++++++++++++++ 3 files changed, 212 insertions(+) create mode 100644 cobra/scripts/__init__.py create mode 100644 cobra/scripts/export_pigs.py create mode 100644 cobra/utils.py diff --git a/cobra/scripts/__init__.py b/cobra/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cobra/scripts/export_pigs.py b/cobra/scripts/export_pigs.py new file mode 100644 index 0000000..086f342 --- /dev/null +++ b/cobra/scripts/export_pigs.py @@ -0,0 +1,143 @@ +# third party lib imports +import pandas as pd +# custom imports +import cobra.utils as utils +from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer + + +def preprocess_categoricals(data: pd.DataFrame, + categorical_columns: list) -> pd.DataFrame: + + for column_name in categorical_columns: + + # change data to categorical + data[column_name] = data[column_name].astype("category") + + # check for null values + if data[column_name].isnull().sum() > 0: + + # Add an additional category + data[column_name].cat.add_categories(["Missing"], inplace=True) + + # Replace NULL with "Missing" + # Otherwise these will be ignored in groupby + data[column_name].fillna("Missing", inplace=True) + + 
return data + + +def compute_pig_table(data: pd.DataFrame, + column_name: str, + target_column_name: str, + id_column_name: str) -> pd.DataFrame: + """Compute the pig table of a given predictor for a given target + + Parameters + ---------- + data : pd.DataFrame + input data from which to compute the pig table + column_name : str + predictor name of which to compute the pig table + target_column_name : str + name of the target variable + id_column_name : str + name of the id column (used to count population size) + + Returns + ------- + pd.DataFrame + pig table as a DataFrame + """ + avg_incidence = data[target_column_name].mean() + + # group by the binned variable, compute the incidence + # (=mean of the target for the given bin) and compute the bin size + # (e.g. COUNT(id_column_name)). After that, rename the columns + res = (data.groupby(column_name) + .agg({target_column_name: "mean", id_column_name: "size"}) + .reset_index() + .rename(columns={column_name: "label", + target_column_name: "incidence", + id_column_name: "pop_size"})) + + # add the column name to a variable column + # add the average incidence + # replace population size by a percentage of total population + res["variable"] = column_name + res["avg_incidence"] = avg_incidence + res["pop_size"] = res["pop_size"]/len(data.index) + + # make sure to always return the data with the proper column order + column_order = ["variable", "label", "pop_size", + "avg_incidence", "incidence"] + + return res[column_order] + + +def generate_pig_tables(data: pd.DataFrame, + id_column_name: str, + target_column_name: str, + n_bins: int, + strategy: str, + label_format: str) -> pd.DataFrame: + """Summary + + Parameters + ---------- + data : pd.DataFrame + basetable to compute PIG tables of + id_column_name : str + column name of the id (e.g. customernumber) + target_column_name : str + column name of the target + n_bins : int + Number of bins to produce after discretization + strategy : str + Binning strategy. 
Currently only "uniform" and "quantile" + e.g. equifrequency is supported + label_format : str + format string to display the bin labels e.g. min - max, (min, max], ... + + Returns + ------- + pd.DataFrame + DataFrame containing a PIG table for all predictors + """ + + # Based on the data, get column names by datatype + # threshold to decide whether a numeric column should be considered + # a categorical variable (if the number of distinct values is smaller + # or equal to the number of requested bins) + categorical_threshold = n_bins + columns_by_type = utils.get_column_datatypes(data, id_column_name, + target_column_name, + categorical_threshold) + + # process continuous variables + discretizer = KBinsDiscretizer(n_bins=n_bins, + strategy=strategy, + label_format=label_format) + + # Transform the data + data = discretizer.fit_transform(data, + columns_by_type["numeric_variables"]) + + # Process categorical and dummy variables + categorical_vars = columns_by_type["categorical_variables"] + dummy_vars = columns_by_type["dummy_variables"] + relevant_columns = set(categorical_vars).union(set(dummy_vars)) + + data = preprocess_categoricals(data, list(relevant_columns)) + + # Get relevant columns, e.g. 
the ones that are transformed + # into categorical dtypes by the preprocessing steps + relevant_columns = set(data.dtypes[data.dtypes == "category"].index) + + pigs = [compute_pig_table(data, column_name, target_column_name, + id_column_name) + for column_name in sorted(relevant_columns) + if column_name not in [id_column_name, target_column_name]] + + output = pd.concat(pigs) + + return output diff --git a/cobra/utils.py b/cobra/utils.py new file mode 100644 index 0000000..b6f25ec --- /dev/null +++ b/cobra/utils.py @@ -0,0 +1,69 @@ +import numpy as np +import pandas as pd + + +def get_column_datatypes(data: pd.DataFrame, + target_column_name: str=None, + id_column_name: str=None, + numeric_is_categorical_threshold: int=10) -> dict: + """Get a list of column names by data type from a pandas DataFrame, + excluding the id column and the target_column if provided + + Parameters + ---------- + data : pd.DataFrame + data to extract columns by type from + target_column_name : str, optional + Description + id_column_name : str, optional + Description + numeric_is_categorical_threshold : int, optional + Threshold to decide whether a numeric variable is categorical based + on the number of unique values in that column + + Returns + ------- + dict + Description + """ + column_names = list(data.columns) + + # dummies variables: case they have only 2 values + vars_dummy = set([col for col in column_names + if len(data[col].unique()) == 2]) + + # categorical vars + vars_cat = (set(data.dtypes[data.dtypes == object].index) + .union(set(data.dtypes[data.dtypes == "category"].index))) + + # Numeric variables + is_number = np.vectorize(lambda x: np.issubdtype(x, np.number)) + bool_arr_is_numeric = is_number(data.dtypes) + vars_numeric = set(data.columns[bool_arr_is_numeric]) + + # remove dummy variables from set + vars_numeric = vars_numeric.difference(vars_dummy) + + # Remark: numeric variables can still be "categorical" + # i.e. when they only contain some distinct values! 
+ # We only consider a variable continuous if they have more distinct values + # than the requested number bins (using numeric_is_categorical_threshold) + + # continuous if more than numeric_is_categorical_threshold distinct values + threshold = numeric_is_categorical_threshold + vars_cat_numeric = set([col for col in vars_numeric + if len(data[col].unique()) < threshold]) + + # remove from numeric set + vars_numeric = vars_numeric.difference(vars_cat_numeric) + # add to categorical set + vars_cat = vars_cat.union(vars_cat_numeric) + + if id_column_name: + vars_cat = vars_cat.difference(set([id_column_name])) + if target_column_name: + vars_dummy = vars_dummy.difference(set([target_column_name])) + + return {"numeric_variables": vars_numeric, + "categorical_variables": vars_cat, + "dummy_variables": vars_dummy} From ed180799637ef83c93064199b5196518f07d68d1 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 25 Sep 2019 17:05:25 +0200 Subject: [PATCH 04/98] Refactor KBinsDiscretizer - Renamed cutpoints to bin_edges throughout code as this is easier to understand - Reorganised code so that public methods come first in the source code. These functions are then easier to trace in the source code and as they are the ones that will most likely be inspected. - Migrated the computation of bin edges to a separate (private) method. 
- Added already a snippet (in comment) for kmeans discretization, but this still needs work --- cobra/preprocessing/kbins_discretizer.py | 373 ++++++++++-------- tests/preprocessing/test_kbins_discretizer.py | 22 +- 2 files changed, 224 insertions(+), 171 deletions(-) diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index efd0e54..bc2a7fc 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -17,6 +17,7 @@ import pandas as pd from sklearn.exceptions import NotFittedError +from sklearn.cluster import KMeans class KBinsDiscretizer: @@ -101,123 +102,29 @@ def set_bins_by_columns(self, bins_by_column: List[tuple]): # To do: add checks! self._bins_by_column = bins_by_column - def _compute_minimal_precision_of_cutpoints(self, cutpoints: list) -> int: - """Compute the minimal precision of a list of cutpoints so that we end - up with a strictly ascending sequence of numbers. - The starting_precision attribute will be used as the initial precision. - In case of a negative starting_precision, the bin edges will be rounded - to the nearest 10, 100, ... (e.g. 5.55 -> 10, 246 -> 200, ...) 
- - Parameters - ---------- - cutpoints : list - The bin edges for binning a continuous variable - - Returns - ------- - int - minimal precision for the bin edges - """ - - precision = self.starting_precision - while True: - cont = False - for a, b in zip(cutpoints, cutpoints[1:]): - if a != b and round(a, precision) == round(b, precision): - # precision is not high enough, so increase - precision += 1 - cont = True # set cont to True to keep looping - break # break out of the for loop - if not cont: - # if minimal precision was found, - # return to break out of while loop - return precision - - def _compute_bins_from_cutpoints(self, cutpoints: list) -> List[tuple]: - """Given a list of bin edges, compute the minimal precision for which - we can make meaningful bins and make those bins + def fit(self, data: pd.DataFrame, column_names: list): + """Fits the estimator Parameters ---------- - cutpoints : list - The bin edges for binning a continuous variable - - Returns - ------- - List[tuple] - A (sorted) list of bins as tuples + data : pd.DataFrame + Data to be discretized + column_names : list + Columns of data to be discretized """ - # compute the minimal precision of the cutpoints - # this can be a negative number, which then - # rounds numbers to the nearest 10, 100, ... - precision = self._compute_minimal_precision_of_cutpoints(cutpoints) - - bins = [] - for a, b in zip(cutpoints, cutpoints[1:]): - fmt_a = round(a, precision) - fmt_b = round(b, precision) - bins.append((fmt_a, fmt_b)) - - return bins - - @staticmethod - def _create_index(intervals: List[tuple], - closed: str="right") -> pd.IntervalIndex: - """Create an pd.IntervalIndex based on a list of tuples. - This is basically a wrapper around pd.IntervalIndex.from_tuples - However, the lower bound of the first entry in the list (the lower bin) - is replaced by -np.inf. Similarly, the upper bound of the last entry in - the list (upper bin) is replaced by np.inf. 
- - Parameters - ---------- - intervals : List[tuple] - a list of tuples describing the intervals - closed : str, optional - Whether the intervals should be closed on the left-side, - right-side, both or neither. - - Returns - ------- - pd.IntervalIndex - Description - """ - # deepcopy variable because we do not want to modify the content - # of intervals (which is still used outside of this function) - _intervals = deepcopy(intervals) - # Modify min and max with -np.inf and np.inf resp. so that these values - # are guaranteed to be included when transforming the data - _intervals[0] = (-np.inf, _intervals[0][1]) - _intervals[-1] = (_intervals[-1][0], np.inf) - - return pd.IntervalIndex.from_tuples(_intervals, closed) - - def _create_bin_labels(self, bins: List[tuple]) -> list: - """Given a list of bins, create a list of string containing the bins - as a string with a specific format (e.g. bin labels) - - Parameters - ---------- - bins : List[tuple] - list of bins + if self.strategy not in self.valid_strategies: + raise ValueError("{}: valid options for 'strategy' are {}. " + "Got strategy={!r} instead." + .format(KBinsDiscretizer.__name__, + self.valid_strategies, self.strategy)) - Returns - ------- - list - list of (formatted) bin labels - """ - bin_labels = [] - for interval in bins: - bin_labels.append(self.label_format.format(interval[0], - interval[1])) + for column_name in column_names: - # Format first and last bin as < x and > y resp. 
- if self.change_endpoint_format: - bin_labels[0] = "< {}".format(bins[0][1]) - bin_labels[-1] = "> {}".format(bins[-1][0]) + bins = self._fit_column(data, column_name) - return bin_labels + # Add to bins_by_column for later use + self._bins_by_column[column_name] = bins def _fit_column(self, data: pd.DataFrame, column_name: str) -> List[tuple]: @@ -249,31 +156,24 @@ def _fit_column(self, data: pd.DataFrame, missing_pct = data[column_name].isnull().sum()/size n_bins = int(max((1 - missing_pct) * n_bins), 2) - cutpoints = [] - if self.strategy == "quantile": - cutpoints = list(data[column_name] - .quantile(np.linspace(0, 1, n_bins + 1), - interpolation='linear')) - elif self.strategy == "uniform": - cutpoints = list(np.linspace(col_min, col_max, n_bins + 1)) - - # Make sure the cutpoints are unique and sorted - cutpoints = sorted(list(set(cutpoints))) + bin_edges = self._compute_bin_edges(data, column_name, n_bins, + col_min, col_max) - if len(cutpoints) < 3: + if len(bin_edges) < 3: log.warning("Only 1 bin was found for predictor {} and will be " "ignored in computation".format(column_name)) return None - if len(cutpoints) < n_bins + 1: + if len(bin_edges) < n_bins + 1: log.warning("The number of actual bins for column {} is {} " "which is smaller than the requested number of bins " - "{}".format(column_name, len(cutpoints) - 1, n_bins)) + "{}".format(column_name, len(bin_edges) - 1, n_bins)) - return self._compute_bins_from_cutpoints(cutpoints) + return self._compute_bins_from_edges(bin_edges) - def fit(self, data: pd.DataFrame, column_names: list): - """Fits the estimator + def transform(self, data: pd.DataFrame, + column_names: list) -> pd.DataFrame: + """Summary Parameters ---------- @@ -281,20 +181,30 @@ def fit(self, data: pd.DataFrame, column_names: list): Data to be discretized column_names : list Columns of data to be discretized + + Returns + ------- + pd.DataFrame + data with additional discretized variables """ + if len(self._bins_by_column) == 0: + msg 
= ("{} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method.") - if self.strategy not in self.valid_strategies: - raise ValueError("{}: valid options for 'strategy' are {}. " - "Got strategy={!r} instead." - .format(KBinsDiscretizer.__name__, - self.valid_strategies, self.strategy)) + raise NotFittedError(msg.format(self.__class__.__name__)) for column_name in column_names: + if column_name not in self._bins_by_column: + log.warning("Column {} is not in fitted output " + "and will be skipped".format(column_name)) + continue - bins = self._fit_column(data, column_name) + # can be None for a column with a constant value! + bins = self._bins_by_column[column_name] + if bins: + data = self._transform_column(data, column_name, bins) - # Add to bins_by_column for later use - self._bins_by_column[column_name] = bins + return data def _transform_column(self, data: pd.DataFrame, column_name: str, @@ -343,8 +253,8 @@ def _transform_column(self, data: pd.DataFrame, return data - def transform(self, data: pd.DataFrame, - column_names: list) -> pd.DataFrame: + def fit_transform(self, data: pd.DataFrame, + column_names: list) -> pd.DataFrame: """Summary Parameters @@ -359,40 +269,183 @@ def transform(self, data: pd.DataFrame, pd.DataFrame data with additional discretized variables """ - if len(self._bins_by_column) == 0: - msg = ("{} instance is not fitted yet. 
Call 'fit' with " - "appropriate arguments before using this method.") + self.fit(data, column_names) + return self.transform(data, column_names) - raise NotFittedError(msg.format(self.__class__.__name__)) + def _compute_bin_edges(self, data: pd.DataFrame, column_name: str, + n_bins: int, col_min: float, + col_max: float) -> list: + """Compute the bin edges for a given column, a DataFrame and the number + of required bins - for column_name in column_names: - if column_name not in self._bins_by_column: - log.warning("Column {} is not in fitted output " - "and will be skipped".format(column_name)) - continue + Parameters + ---------- + data : pd.DataFrame + Data to be discretized + column_name : str + name of the column to discretize + n_bins : int + Number of bins to produce. + col_min : float + min value of the variable + col_max : float + max value of the variable - # can be None for a column with a constant value! - bins = self._bins_by_column[column_name] - if bins: - data = self._transform_column(data, column_name, bins) + Returns + ------- + list + list of bin edges from which to compute the bins + """ - return data + bin_edges = [] + if self.strategy == "quantile": + bin_edges = list(data[column_name] + .quantile(np.linspace(0, 1, n_bins + 1), + interpolation='linear')) + elif self.strategy == "uniform": + bin_edges = list(np.linspace(col_min, col_max, n_bins + 1)) - def fit_transform(self, data: pd.DataFrame, - column_names: list) -> pd.DataFrame: - """Summary + # elif self.strategy == "kmeans": + + # if data[column_name].isnull().sum() > 0: + # raise ValueError("{}: kmeans strategy cannot handle NULL " + # "values in the data." 
+ # .format(KBinsDiscretizer.__name__)) + + # # Deterministic initialization with uniform spacing + # uniform_edges = np.linspace(col_min, col_max, n_bins + 1) + # init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5 + + # # 1D k-means + # kmeans = KMeans(n_clusters=n_bins, init=init, n_init=1) + # centers = (kmeans.fit(data[column_name][:, None]) + # .cluster_centers_[:, 0]) + + # # Make sure to sort centers as they may be unsorted, + # # even with sorted init! + # centers.sort() + + # # compute bin_edges from centers + # bin_edges = (centers[1:] + centers[:-1]) * 0.5 + # bin_edges = np.r_[col_min, bin_edges, col_max] + + # Make sure the bin_edges are unique and sorted + return sorted(list(set(bin_edges))) + + def _compute_minimal_precision_of_bin_edges(self, bin_edges: list) -> int: + """Compute the minimal precision of a list of bin_edges so that we end + up with a strictly ascending sequence of numbers. + The starting_precision attribute will be used as the initial precision. + In case of a negative starting_precision, the bin edges will be rounded + to the nearest 10, 100, ... (e.g. 5.55 -> 10, 246 -> 200, ...) 
Parameters ---------- - data : pd.DataFrame - Data to be discretized - column_names : list - Columns of data to be discretized + bin_edges : list + The bin edges for binning a continuous variable Returns ------- - pd.DataFrame - data with additional discretized variables + int + minimal precision for the bin edges """ - self.fit(data, column_names) - return self.transform(data, column_names) + + precision = self.starting_precision + while True: + cont = False + for a, b in zip(bin_edges, bin_edges[1:]): + if a != b and round(a, precision) == round(b, precision): + # precision is not high enough, so increase + precision += 1 + cont = True # set cont to True to keep looping + break # break out of the for loop + if not cont: + # if minimal precision was found, + # return to break out of while loop + return precision + + def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]: + """Given a list of bin edges, compute the minimal precision for which + we can make meaningful bins and make those bins + + Parameters + ---------- + bin_edges : list + The bin edges for binning a continuous variable + + Returns + ------- + List[tuple] + A (sorted) list of bins as tuples + """ + # compute the minimal precision of the bin_edges + # this can be a negative number, which then + # rounds numbers to the nearest 10, 100, ... + precision = self._compute_minimal_precision_of_bin_edges(bin_edges) + + bins = [] + for a, b in zip(bin_edges, bin_edges[1:]): + fmt_a = round(a, precision) + fmt_b = round(b, precision) + + bins.append((fmt_a, fmt_b)) + + return bins + + @staticmethod + def _create_index(intervals: List[tuple], + closed: str="right") -> pd.IntervalIndex: + """Create an pd.IntervalIndex based on a list of tuples. + This is basically a wrapper around pd.IntervalIndex.from_tuples + However, the lower bound of the first entry in the list (the lower bin) + is replaced by -np.inf. Similarly, the upper bound of the last entry in + the list (upper bin) is replaced by np.inf. 
+ + Parameters + ---------- + intervals : List[tuple] + a list of tuples describing the intervals + closed : str, optional + Whether the intervals should be closed on the left-side, + right-side, both or neither. + + Returns + ------- + pd.IntervalIndex + Description + """ + # deepcopy variable because we do not want to modify the content + # of intervals (which is still used outside of this function) + _intervals = deepcopy(intervals) + # Replace min and max with -np.inf and np.inf resp. so that these + # values are guaranteed to be included when transforming the data + _intervals[0] = (-np.inf, _intervals[0][1]) + _intervals[-1] = (_intervals[-1][0], np.inf) + + return pd.IntervalIndex.from_tuples(_intervals, closed) + + def _create_bin_labels(self, bins: List[tuple]) -> list: + """Given a list of bins, create a list of string containing the bins + as a string with a specific format (e.g. bin labels) + + Parameters + ---------- + bins : List[tuple] + list of tuple containing for each bin the upper and lower bound + + Returns + ------- + list + list of (formatted) bin labels + """ + bin_labels = [] + for interval in bins: + bin_labels.append(self.label_format.format(interval[0], + interval[1])) + + # Format first and last bin as < x and > y resp. 
+ if self.change_endpoint_format: + bin_labels[0] = "< {}".format(bins[0][1]) + bin_labels[-1] = "> {}".format(bins[-1][0]) + + return bin_labels diff --git a/tests/preprocessing/test_kbins_discretizer.py b/tests/preprocessing/test_kbins_discretizer.py index 1b09ae6..8fd89c8 100644 --- a/tests/preprocessing/test_kbins_discretizer.py +++ b/tests/preprocessing/test_kbins_discretizer.py @@ -22,50 +22,50 @@ def test_validate_n_bins_valid_n_bins(self): KBinsDiscretizer()._validate_n_bins(n_bins=2) - # tests for _compute_minimal_precision_of_cutpoints + # tests for _compute_minimal_precision_of_bin_edges - def test_compute_minimal_precision_of_cutpoints_less_precision(self): + def test_compute_minimal_precision_of_bin_edges_less_precision(self): # If starting precision is bigger than actual precision, should return # starting precision cutpoints = [-10, 0, 1, 2] discretizer = KBinsDiscretizer(starting_precision=1) - res = discretizer._compute_minimal_precision_of_cutpoints(cutpoints) + res = discretizer._compute_minimal_precision_of_bin_edges(cutpoints) assert res == 1 - def test_compute_minimal_precision_of_cutpoints_more_precision(self): + def test_compute_minimal_precision_of_bin_edges_more_precision(self): # If starting precision is smaller than actual precision, should return # actual precision cutpoints = [-10, 0, 1, 1.01] discretizer = KBinsDiscretizer() - res = discretizer._compute_minimal_precision_of_cutpoints(cutpoints) + res = discretizer._compute_minimal_precision_of_bin_edges(cutpoints) assert res == 2 - def test_compute_minimal_precision_of_cutpoints_equal_precision(self): + def test_compute_minimal_precision_of_bin_edges_equal_precision(self): # If starting precision is equal to actual precision, should return # starting precision cutpoints = [-10, 0, 1, 1.1] discretizer = KBinsDiscretizer(starting_precision=1) - res = discretizer._compute_minimal_precision_of_cutpoints(cutpoints) + res = discretizer._compute_minimal_precision_of_bin_edges(cutpoints) 
assert res == 1 - def test_compute_minimal_precision_of_cutpoints_negative_start(self): + def test_compute_minimal_precision_of_bin_edges_negative_start(self): # Check if negative starting precision also leads to the correct result cutpoints = [-10, 0, 1, 2] discretizer = KBinsDiscretizer(starting_precision=-1) - res = discretizer._compute_minimal_precision_of_cutpoints(cutpoints) + res = discretizer._compute_minimal_precision_of_bin_edges(cutpoints) assert res == 0 - def test_compute_minimal_precision_of_cutpoints_round_up(self): + def test_compute_minimal_precision_of_bin_edges_round_up(self): # Check if negative starting precision leads to rounding up # bin edges to the nearest multiple of 10 cutpoints = [-10, 0, 10, 21] discretizer = KBinsDiscretizer(starting_precision=-1) - res = discretizer._compute_minimal_precision_of_cutpoints(cutpoints) + res = discretizer._compute_minimal_precision_of_bin_edges(cutpoints) assert res == -1 # tests for _compute_bins_from_cutpoints From d37d936955e9cafb496fd1d4456148191acd30ef Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 2 Oct 2019 13:19:21 +0200 Subject: [PATCH 05/98] Refactor TargetEncoder - Reorganised code so that public methods come first in the code as they are then easier to trace in the source code. - Added additional safety checks to make the code more robust.
- Added unittests for the TargetEncoder - Added a missing __init__.py in tests module --- cobra/preprocessing/target_encoder.py | 96 +++++++------- tests/__init__.py | 0 tests/preprocessing/test_target_encoder.py | 141 +++++++++++++++++++++ 3 files changed, 192 insertions(+), 45 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/preprocessing/test_target_encoder.py diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index d4dad60..91e3d6f 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -1,11 +1,13 @@ -import numpy as np +import logging +log = logging.getLogger(__name__) + +#import numpy as np import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin from sklearn.exceptions import NotFittedError -# Prototype of TargetEncoder class TargetEncoder(BaseEstimator, TransformerMixin): """Target encoding for categorical features. @@ -59,26 +61,6 @@ def __init__(self, columns: list=None, weight: float=0.0): # self.randomized = randomized # self.sigma = sigma - @staticmethod - def _get_categorical_columns(data: pd.DataFrame) -> list: - """Get the columns containing categorical data - (dtype "object" or "category") - - Parameters - ---------- - data : pd.DataFrame - Description - - Returns - ------- - list - List of column names containing categorical data - """ - object_columns = data.dtypes[data.dtypes == object].index - categorical_columns = data.dtypes[data.dtypes == "category"].index - - return list(set(object_columns).union(categorical_columns)) - def fit(self, X: pd.DataFrame, y: pd.Series): """Fit the TargetEncoder to X and y @@ -107,6 +89,11 @@ def fit(self, X: pd.DataFrame, y: pd.Series): global_mean = y.sum() / y.count() for column in self.columns: + if column not in X.columns: + log.warning("DataFrame has no column {}, so it will be " + "skipped in fitting" .format(column)) + continue + self._mapping[column] = 
self._fit_column(X[column], y, global_mean) def _fit_column(self, X: pd.Series, y: pd.Series, @@ -140,25 +127,6 @@ def _fit_column(self, X: pd.Series, y: pd.Series, return numerator/denominator - @staticmethod - def _clean_column_name(column_name: str) -> str: - """Clean column name string by removing "_bin" and adding "_enc" - - Parameters - ---------- - column_name : str - column name to be cleaned - - Returns - ------- - str - cleaned column name - """ - if "_bin" in column_name: - return column_name.replace("_bin", "") + "_enc" - else: - return column_name + "_enc" - def transform(self, X: pd.DataFrame, y: pd.Series=None) -> pd.DataFrame: """Summary @@ -187,13 +155,12 @@ def transform(self, X: pd.DataFrame, y: pd.Series=None) -> pd.DataFrame: raise NotFittedError(msg.format(self.__class__.__name__)) - transform_columns = list(X.columns) new_columns = [] for column in self.columns: - if column not in transform_columns: - # skip if this column was not in X - # print WARNING here!!! + if column not in X.columns: + log.warning("Column {} is not in fitted output " + "and will be skipped".format(column)) continue new_column = TargetEncoder._clean_column_name(column) @@ -203,3 +170,42 @@ def transform(self, X: pd.DataFrame, y: pd.Series=None) -> pd.DataFrame: new_columns.append(new_column) return X[new_columns] + + @staticmethod + def _get_categorical_columns(data: pd.DataFrame) -> list: + """Get the columns containing categorical data + (dtype "object" or "category") + + Parameters + ---------- + data : pd.DataFrame + Description + + Returns + ------- + list + List of column names containing categorical data + """ + object_columns = data.dtypes[data.dtypes == object].index + categorical_columns = data.dtypes[data.dtypes == "category"].index + + return list(set(object_columns).union(set(categorical_columns))) + + @staticmethod + def _clean_column_name(column_name: str) -> str: + """Clean column name string by removing "_bin" and adding "_enc" + + Parameters + 
---------- + column_name : str + column name to be cleaned + + Returns + ------- + str + cleaned column name + """ + if "_bin" in column_name: + return column_name.replace("_bin", "") + "_enc" + else: + return column_name + "_enc" diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/preprocessing/test_target_encoder.py b/tests/preprocessing/test_target_encoder.py new file mode 100644 index 0000000..cea1f3d --- /dev/null +++ b/tests/preprocessing/test_target_encoder.py @@ -0,0 +1,141 @@ +import pytest +import pandas as pd + +from cobra.preprocessing.target_encoder import TargetEncoder + + +class TestTargetEncoder: + + def test_target_encoder_constructor_value_error(self): + with pytest.raises(ValueError): + TargetEncoder(weight=-1) + + # Tests for _fit_column + def test_target_encoder_fit_column(self): + + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + encoder = TargetEncoder(columns=["variable"]) + actual = encoder._fit_column(X=df.variable, y=df.target, + global_mean=0.0) + + expected = pd.Series(data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + + pd.testing.assert_series_equal(actual, expected, + check_less_precise=5) + + def test_target_encoder_fit_column_global_mean(self): + + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + global_mean = df.target.sum() / df.target.count() # is 0.5 + + encoder = TargetEncoder(columns=["variable"], weight=1) + actual = encoder._fit_column(X=df.variable, y=df.target, + global_mean=global_mean) + + expected = pd.Series(data=[0.375, 0.500, 0.625], + index=["negative", "neutral", "positive"]) + 
expected.index.name = "variable" + + pd.testing.assert_series_equal(actual, expected, + check_less_precise=3) + + # Tests for fit method + def test_target_encoder_fit_value_error(self): + + X = pd.DataFrame({'variable': ['positive', 'positive', 'negative']}) + + target = pd.Series([1, 1, 0, 0]) + + encoder = TargetEncoder(columns=["variable"]) + with pytest.raises(ValueError): + encoder.fit(X, target) + + def test_target_encoder_fit(self): + + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + encoder = TargetEncoder(columns=["variable"]) + encoder.fit(X=df, y=df.target) + + expected = pd.Series(data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + actual = encoder._mapping["variable"] + + pd.testing.assert_series_equal(actual, expected, + check_less_precise=5) + + # Tests for transform method + def test_target_encoder_transform(self): + + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + expected = pd.DataFrame({'variable_enc': [0.666667, 0.666667, + 0.333333, 0.50000, + 0.333333, 0.666667, + 0.333333, 0.50000, + 0.50000, 0.50000]}) + + encoder = TargetEncoder(columns=["variable"]) + encoder.fit(X=df, y=df.target) + actual = encoder.transform(X=df, y=df.target) + + pd.testing.assert_frame_equal(actual, expected, + check_less_precise=5) + + # Tests for _get_categorical_columns + def test_target_encoder_get_categorical_columns(self): + + df = pd.DataFrame({"continuous": [1.0, 1.5, 2.0], + "categorical": ["negative", "neutral", "positive"], + "object": ["cats", "dogs", "goldfish"]}) + + expected = ["categorical", "object"] + + encoder = TargetEncoder() + actual = encoder._get_categorical_columns(df) + + 
# It is OK to take sets here because we also do that in the + # _get_categorical_columns function + assert set(actual) == set(expected) + + # Tests for _clean_column_name + def test_target_encoder_clean_column_name(self): + + column_name = "test_column" + expected = "test_column_enc" + + encoder = TargetEncoder() + actual = encoder._clean_column_name(column_name) + + assert actual == expected + + def test_target_encoder_clean_column_name_binned_column(self): + + column_name = "test_column_bin" + expected = "test_column_enc" + + encoder = TargetEncoder() + actual = encoder._clean_column_name(column_name) + + assert actual == expected From 7f1a2b893420046d94460870331544cbdebd646f Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 2 Oct 2019 14:23:39 +0200 Subject: [PATCH 06/98] Add unittests for all private methods of KBinsDiscretizer --- cobra/preprocessing/kbins_discretizer.py | 10 +- cobra/preprocessing/target_encoder.py | 4 +- tests/preprocessing/test_kbins_discretizer.py | 100 +++++++++++++++--- 3 files changed, 90 insertions(+), 24 deletions(-) diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index bc2a7fc..7767319 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -146,7 +146,7 @@ def _fit_column(self, data: pd.DataFrame, col_min, col_max = data[column_name].min(), data[column_name].max() if col_min == col_max: - log.warning("Predictor {} is constant and " + log.warning("Predictor '{}' is constant and " "will be ignored in computation".format(column_name)) return None @@ -160,12 +160,12 @@ def _fit_column(self, data: pd.DataFrame, col_min, col_max) if len(bin_edges) < 3: - log.warning("Only 1 bin was found for predictor {} and will be " - "ignored in computation".format(column_name)) + log.warning("Only 1 bin was found for predictor '{}' so it will " + "be ignored in computation".format(column_name)) return None if len(bin_edges) < n_bins + 1: - 
log.warning("The number of actual bins for column {} is {} " + log.warning("The number of actual bins for predictor '{}' is {} " "which is smaller than the requested number of bins " "{}".format(column_name, len(bin_edges) - 1, n_bins)) @@ -195,7 +195,7 @@ def transform(self, data: pd.DataFrame, for column_name in column_names: if column_name not in self._bins_by_column: - log.warning("Column {} is not in fitted output " + log.warning("Column '{}' is not in fitted output " "and will be skipped".format(column_name)) continue diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 91e3d6f..7ce5fbb 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -90,7 +90,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): for column in self.columns: if column not in X.columns: - log.warning("DataFrame has no column {}, so it will be " + log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting" .format(column)) continue @@ -159,7 +159,7 @@ def transform(self, X: pd.DataFrame, y: pd.Series=None) -> pd.DataFrame: for column in self.columns: if column not in X.columns: - log.warning("Column {} is not in fitted output " + log.warning("Column '{}' is not in fitted output " "and will be skipped".format(column)) continue diff --git a/tests/preprocessing/test_kbins_discretizer.py b/tests/preprocessing/test_kbins_discretizer.py index 8fd89c8..19d92ae 100644 --- a/tests/preprocessing/test_kbins_discretizer.py +++ b/tests/preprocessing/test_kbins_discretizer.py @@ -6,68 +6,134 @@ class TestKBinsDiscretizer: - # tests for _validate_n_bins function + ################# Test for public methods ################# - def test_validate_n_bins_exception_1(self): + ################# Test for private methods ################# + # Tests for _validate_n_bins function + def test_kbins_discretizer_validate_n_bins_exception_1(self): with pytest.raises(ValueError): 
KBinsDiscretizer()._validate_n_bins(n_bins=1) - def test_validate_n_bins_exception_no_integral(self): + def test_kbins_discretizer_validate_n_bins_exception_no_integral(self): with pytest.raises(ValueError): KBinsDiscretizer()._validate_n_bins(n_bins=10.5) - def test_validate_n_bins_valid_n_bins(self): + def test_kbins_discretizer_validate_n_bins_valid_n_bins(self): KBinsDiscretizer()._validate_n_bins(n_bins=2) - # tests for _compute_minimal_precision_of_bin_edges + # Test for _compute_bin_edges + def test_kbins_discretizer_compute_bin_edges_quantile_method(self): + data = pd.DataFrame({"variable": list(range(0, 11))}) # ints from 0-10 + + discretizer = KBinsDiscretizer() + actual = discretizer._compute_bin_edges(data, column_name="variable", + n_bins=4, + col_min=data.variable.min(), + col_max=data.variable.max()) + expected = [0.0, 2.5, 5, 7.5, 10.0] + + assert expected == actual + + def test_kbins_discretizer_compute_bin_edges_uniform_method(self): + + data = pd.DataFrame({"variable": list(range(0, 10))}) # ints from 0-9 + + discretizer = KBinsDiscretizer(strategy="uniform") + actual = discretizer._compute_bin_edges(data, column_name="variable", + n_bins=3, + col_min=data.variable.min(), + col_max=data.variable.max()) + expected = [0.0, 3.0, 6.0, 9.0] + + assert expected == actual + + # Tests for _compute_minimal_precision_of_bin_edges def test_compute_minimal_precision_of_bin_edges_less_precision(self): # If starting precision is bigger than actual precision, should return # starting precision - cutpoints = [-10, 0, 1, 2] + bin_edges = [-10, 0, 1, 2] discretizer = KBinsDiscretizer(starting_precision=1) - res = discretizer._compute_minimal_precision_of_bin_edges(cutpoints) + res = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) assert res == 1 def test_compute_minimal_precision_of_bin_edges_more_precision(self): # If starting precision is smaller than actual precision, should return # actual precision - cutpoints = [-10, 0, 1, 1.01] + bin_edges = 
[-10, 0, 1, 1.01] discretizer = KBinsDiscretizer() - res = discretizer._compute_minimal_precision_of_bin_edges(cutpoints) + res = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) assert res == 2 def test_compute_minimal_precision_of_bin_edges_equal_precision(self): # If starting precision is equal to actual precision, should return # starting precision - cutpoints = [-10, 0, 1, 1.1] + bin_edges = [-10, 0, 1, 1.1] discretizer = KBinsDiscretizer(starting_precision=1) - res = discretizer._compute_minimal_precision_of_bin_edges(cutpoints) + res = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) assert res == 1 def test_compute_minimal_precision_of_bin_edges_negative_start(self): # Check if negative starting precision also leads to the correct result - cutpoints = [-10, 0, 1, 2] + bin_edges = [-10, 0, 1, 2] discretizer = KBinsDiscretizer(starting_precision=-1) - res = discretizer._compute_minimal_precision_of_bin_edges(cutpoints) + res = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) assert res == 0 def test_compute_minimal_precision_of_bin_edges_round_up(self): # Check if negative starting precision leads to rounding up # bin edges to the nearest multiple of 10 - cutpoints = [-10, 0, 10, 21] + bin_edges = [-10, 0, 10, 21] discretizer = KBinsDiscretizer(starting_precision=-1) - res = discretizer._compute_minimal_precision_of_bin_edges(cutpoints) + res = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) assert res == -1 - # tests for _compute_bins_from_cutpoints + # Tests for _compute_bins_from_edges + def test_kbins_discretizer_compute_bins_from_edges(self): + + bin_edges = [0, 1, 1.5, 2] + + discretizer = KBinsDiscretizer() + actual = discretizer._compute_bins_from_edges(bin_edges) + + expected = [(0, 1), (1, 1.5), (1.5, 2)] + assert actual == expected + + def test_kbins_discretizer_compute_bins_from_edges_round_up(self): + + bin_edges = [0, 1, 1.5, 3] + + discretizer = KBinsDiscretizer() + actual = 
discretizer._compute_bins_from_edges(bin_edges) + + expected = [(0, 1), (1, 2), (2, 3)] + assert actual == expected + + # Tests for _create_bin_labels + def test_kbins_discretizer_create_bin_labels(self): + + bins = [(0, 1), (1, 2), (2, 3)] + + discretizer = KBinsDiscretizer() + actual = discretizer._create_bin_labels(bins) + expected = ["0 - 1", "1 - 2", "2 - 3"] + + assert actual == expected + + def test_kbins_discretizer_create_bin_labels_different_endpoint_fmt(self): + + bins = [(0, 1), (1, 2), (2, 3)] + + discretizer = KBinsDiscretizer(change_endpoint_format=True) + actual = discretizer._create_bin_labels(bins) + expected = ["< 1", "1 - 2", "> 2"] - # tests for _create_bin_labels + assert actual == expected From 72adaa31f9e7760eae8fd1a2132ac232f2584532 Mon Sep 17 00:00:00 2001 From: JanBenisek Date: Thu, 3 Oct 2019 09:25:48 +0200 Subject: [PATCH 07/98] added cobra_env --- .gitignore | 3 ++ cobra_env.txt | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 cobra_env.txt diff --git a/.gitignore b/.gitignore index 5a98460..9ef222c 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,6 @@ ENV/ # mypy .mypy_cache/ + +#VScode settins +.vscode diff --git a/cobra_env.txt b/cobra_env.txt new file mode 100644 index 0000000..a7e89cf --- /dev/null +++ b/cobra_env.txt @@ -0,0 +1,97 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: win-64 +@EXPLICIT +https://conda.anaconda.org/anaconda/win-64/blas-1.0-mkl.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/ca-certificates-2019.8.28-0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/icc_rt-2019.0.0-h0cc432a_1.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/intel-openmp-2019.5-281.tar.bz2 +https://repo.anaconda.com/pkgs/msys2/win-64/msys2-conda-epoch-20160418-1.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/pandoc-2.2.3.2-0.tar.bz2 
+https://conda.anaconda.org/anaconda/win-64/vs2015_runtime-15.5.2-3.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/winpty-0.4.3-4.tar.bz2 +https://repo.anaconda.com/pkgs/msys2/win-64/m2w64-gmp-6.1.0-2.tar.bz2 +https://repo.anaconda.com/pkgs/msys2/win-64/m2w64-libwinpthread-git-5.0.0.4634.697f757-2.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/mkl-2019.5-281.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/vc-14.1-h21ff451_3.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/icu-58.2-ha66f8fd_1.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/jpeg-9b-hb83a4c4_2.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/libsodium-1.0.16-h9d3ae62_0.tar.bz2 +https://repo.anaconda.com/pkgs/msys2/win-64/m2w64-gcc-libs-core-5.3.0-7.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/openssl-1.1.1-he774522_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/sqlite-3.29.0-he774522_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/zlib-1.2.11-h62dcd97_3.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/libpng-1.6.37-h2a8f88b_0.tar.bz2 +https://repo.anaconda.com/pkgs/msys2/win-64/m2w64-gcc-libgfortran-5.3.0-6.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/python-3.7.4-h5263a28_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/zeromq-4.3.1-h33f27b4_3.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/attrs-19.1.0-py37_1.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/backcall-0.1.0-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/certifi-2019.9.11-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/colorama-0.4.1-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/decorator-4.4.0-py37_1.tar.bz2 +https://conda.anaconda.org/anaconda/noarch/defusedxml-0.6.0-py_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/entrypoints-0.3-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/win-64/freetype-2.10.0-h563cfd7_1.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/ipython_genutils-0.2.0-py37_0.tar.bz2 
+https://conda.anaconda.org/conda-forge/win-64/kiwisolver-1.1.0-py37he980bc4_0.tar.bz2 +https://repo.anaconda.com/pkgs/msys2/win-64/m2w64-gcc-libs-5.3.0-7.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/markupsafe-1.1.1-py37he774522_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/mistune-0.8.4-py37he774522_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/numpy-base-1.16.4-py37hc3f5095_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/pandocfilters-1.4.2-py37_1.tar.bz2 +https://conda.anaconda.org/anaconda/noarch/parso-0.5.1-py_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/pickleshare-0.7.5-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/noarch/prometheus_client-0.7.1-py_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/pyparsing-2.4.2-py_0.tar.bz2 +https://conda.anaconda.org/anaconda/noarch/pytz-2019.2-py_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/pywin32-223-py37hfa6e2cd_1.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/pyzmq-18.1.0-py37ha925a31_0.tar.bz2 +https://repo.anaconda.com/pkgs/main/win-64/qt-5.9.7-vc14h73c81de_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/send2trash-1.5.0-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/sip-4.19.13-py37ha925a31_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/six-1.12.0-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/testpath-0.4.2-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/tornado-6.0.3-py37he774522_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/wcwidth-0.1.7-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/webencodings-0.5.1-py37_1.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/wincertstore-0.2-py37_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/cycler-0.10.0-py_1.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/jedi-0.15.1-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/mkl_random-1.0.2-py37h343c172_0.tar.bz2 
+https://conda.anaconda.org/anaconda/win-64/pyqt-5.9.2-py37ha878b3d_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/pyrsistent-0.14.11-py37he774522_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/python-dateutil-2.8.0-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/pywinpty-0.5.5-py37_1000.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/setuptools-41.2.0-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/traitlets-4.3.2-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/bleach-3.1.0-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/jinja2-2.10.1-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/jsonschema-3.0.2-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/noarch/jupyter_core-4.5.0-py_0.tar.bz2 +https://conda.anaconda.org/anaconda/noarch/pygments-2.4.2-py_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/terminado-0.8.2-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/wheel-0.33.6-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/jupyter_client-5.3.3-py37_1.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/nbformat-4.4.0-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/pip-19.2.3-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/prompt_toolkit-2.0.9-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/ipython-7.8.0-py37h39e3cac_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/nbconvert-5.6.0-py37_1.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/ipykernel-5.1.2-py37h39e3cac_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/jupyter_console-6.0.0-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/notebook-6.0.1-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/noarch/qtconsole-4.5.5-py_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/widgetsnbextension-3.5.1-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/noarch/ipywidgets-7.5.1-py_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/jupyter-1.0.0-py37_7.tar.bz2 
+https://conda.anaconda.org/anaconda/win-64/mkl-service-2.0.2-py37he774522_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/mkl_fft-1.0.12-py37h14836fe_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/numpy-1.16.4-py37h19fb1c0_0.tar.bz2 +https://conda.anaconda.org/conda-forge/win-64/matplotlib-base-3.1.1-py37h2852a4a_1.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/pandas-0.25.1-py37ha925a31_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/scipy-1.3.1-py37h29ff71c_0.tar.bz2 +https://conda.anaconda.org/conda-forge/win-64/matplotlib-3.1.1-py37_1.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/patsy-0.5.1-py37_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/statsmodels-0.10.1-py37h8c2d366_0.tar.bz2 +https://conda.anaconda.org/anaconda/win-64/seaborn-0.9.0-py37_0.tar.bz2 From b71d469837ecacbdbb248c42702f1c39b58e8c90 Mon Sep 17 00:00:00 2001 From: JanBenisek Date: Thu, 3 Oct 2019 11:45:29 +0200 Subject: [PATCH 08/98] new data, better init, test file --- cobra/preprocessing/__init__.py | 5 + datasets/titanic_data.csv | 892 ++++++++++++++++++++++++++++++++ examples/testing.py | 19 + 3 files changed, 916 insertions(+) create mode 100644 datasets/titanic_data.csv create mode 100644 examples/testing.py diff --git a/cobra/preprocessing/__init__.py b/cobra/preprocessing/__init__.py index e69de29..f3884e0 100644 --- a/cobra/preprocessing/__init__.py +++ b/cobra/preprocessing/__init__.py @@ -0,0 +1,5 @@ +from .kbins_discretizer import KBinsDiscretizer +from .target_encoder import TargetEncoder + +__all__ = ['KBinsDiscretizer', + 'TargetEncoder'] \ No newline at end of file diff --git a/datasets/titanic_data.csv b/datasets/titanic_data.csv new file mode 100644 index 0000000..5cc466e --- /dev/null +++ b/datasets/titanic_data.csv @@ -0,0 +1,892 @@ +PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked +1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S +2,1,1,"Cumings, Mrs. 
John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C +3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S +4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S +5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S +6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q +7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S +8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S +9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S +10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C +11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S +12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S +13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S +14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S +15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S +16,1,2,"Hewlett, Mrs. (Mary D Kingcome) ",female,55,0,0,248706,16,,S +17,0,3,"Rice, Master. Eugene",male,2,4,1,382652,29.125,,Q +18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13,,S +19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",female,31,1,0,345763,18,,S +20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C +21,0,2,"Fynney, Mr. Joseph J",male,35,0,0,239865,26,,S +22,1,2,"Beesley, Mr. Lawrence",male,34,0,0,248698,13,D56,S +23,1,3,"McGowan, Miss. Anna ""Annie""",female,15,0,0,330923,8.0292,,Q +24,1,1,"Sloper, Mr. William Thompson",male,28,0,0,113788,35.5,A6,S +25,0,3,"Palsson, Miss. Torborg Danira",female,8,3,1,349909,21.075,,S +26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",female,38,1,5,347077,31.3875,,S +27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C +28,0,1,"Fortune, Mr. Charles Alexander",male,19,3,2,19950,263,C23 C25 C27,S +29,1,3,"O'Dwyer, Miss. 
Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q +30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S +31,0,1,"Uruchurtu, Don. Manuel E",male,40,0,0,PC 17601,27.7208,,C +32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C +33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q +34,0,2,"Wheadon, Mr. Edward H",male,66,0,0,C.A. 24579,10.5,,S +35,0,1,"Meyer, Mr. Edgar Joseph",male,28,1,0,PC 17604,82.1708,,C +36,0,1,"Holverson, Mr. Alexander Oskar",male,42,1,0,113789,52,,S +37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C +38,0,3,"Cann, Mr. Ernest Charles",male,21,0,0,A./5. 2152,8.05,,S +39,0,3,"Vander Planke, Miss. Augusta Maria",female,18,2,0,345764,18,,S +40,1,3,"Nicola-Yarred, Miss. Jamila",female,14,1,0,2651,11.2417,,C +41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40,1,0,7546,9.475,,S +42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",female,27,1,0,11668,21,,S +43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C +44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C +45,1,3,"Devaney, Miss. Margaret Delia",female,19,0,0,330958,7.8792,,Q +46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.05,,S +47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q +48,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.75,,Q +49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C +50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18,1,0,349237,17.8,,S +51,0,3,"Panula, Master. Juha Niilo",male,7,4,1,3101295,39.6875,,S +52,0,3,"Nosworthy, Mr. Richard Cater",male,21,0,0,A/4. 39886,7.8,,S +53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49,1,0,PC 17572,76.7292,D33,C +54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",female,29,1,0,2926,26,,S +55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65,0,1,113509,61.9792,B30,C +56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S +57,1,2,"Rugg, Miss. Emily",female,21,0,0,C.A. 
31026,10.5,,S +58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C +59,1,2,"West, Miss. Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S +60,0,3,"Goodwin, Master. William Frederick",male,11,5,2,CA 2144,46.9,,S +61,0,3,"Sirayanian, Mr. Orsen",male,22,0,0,2669,7.2292,,C +62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28, +63,0,1,"Harris, Mr. Henry Birkhardt",male,45,1,0,36973,83.475,C83,S +64,0,3,"Skoog, Master. Harald",male,4,3,2,347088,27.9,,S +65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C +66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C +67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29,0,0,C.A. 29395,10.5,F33,S +68,0,3,"Crease, Mr. Ernest James",male,19,0,0,S.P. 3464,8.1583,,S +69,1,3,"Andersson, Miss. Erna Alexandra",female,17,4,2,3101281,7.925,,S +70,0,3,"Kink, Mr. Vincenz",male,26,2,0,315151,8.6625,,S +71,0,2,"Jenkin, Mr. Stephen Curnow",male,32,0,0,C.A. 33111,10.5,,S +72,0,3,"Goodwin, Miss. Lillian Amy",female,16,5,2,CA 2144,46.9,,S +73,0,2,"Hood, Mr. Ambrose Jr",male,21,0,0,S.O.C. 14879,73.5,,S +74,0,3,"Chronopoulos, Mr. Apostolos",male,26,1,0,2680,14.4542,,C +75,1,3,"Bing, Mr. Lee",male,32,0,0,1601,56.4958,,S +76,0,3,"Moen, Mr. Sigurd Hansen",male,25,0,0,348123,7.65,F G73,S +77,0,3,"Staneff, Mr. Ivan",male,,0,0,349208,7.8958,,S +78,0,3,"Moutal, Mr. Rahamin Haim",male,,0,0,374746,8.05,,S +79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29,,S +80,1,3,"Dowdell, Miss. Elizabeth",female,30,0,0,364516,12.475,,S +81,0,3,"Waelens, Mr. Achille",male,22,0,0,345767,9,,S +82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29,0,0,345779,9.5,,S +83,1,3,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q +84,0,1,"Carrau, Mr. Francisco M",male,28,0,0,113059,47.1,,S +85,1,2,"Ilett, Miss. Bertha",female,17,0,0,SO/C 14885,10.5,,S +86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",female,33,3,0,3101278,15.85,,S +87,0,3,"Ford, Mr. William Neal",male,16,1,3,W./C. 6608,34.375,,S +88,0,3,"Slocovski, Mr. 
Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S +89,1,1,"Fortune, Miss. Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S +90,0,3,"Celotti, Mr. Francesco",male,24,0,0,343275,8.05,,S +91,0,3,"Christmann, Mr. Emil",male,29,0,0,343276,8.05,,S +92,0,3,"Andreasson, Mr. Paul Edvin",male,20,0,0,347466,7.8542,,S +93,0,1,"Chaffee, Mr. Herbert Fuller",male,46,1,0,W.E.P. 5734,61.175,E31,S +94,0,3,"Dean, Mr. Bertram Frank",male,26,1,2,C.A. 2315,20.575,,S +95,0,3,"Coxon, Mr. Daniel",male,59,0,0,364500,7.25,,S +96,0,3,"Shorney, Mr. Charles Joseph",male,,0,0,374910,8.05,,S +97,0,1,"Goldschmidt, Mr. George B",male,71,0,0,PC 17754,34.6542,A5,C +98,1,1,"Greenfield, Mr. William Bertram",male,23,0,1,PC 17759,63.3583,D10 D12,C +99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34,0,1,231919,23,,S +100,0,2,"Kantor, Mr. Sinai",male,34,1,0,244367,26,,S +101,0,3,"Petranec, Miss. Matilda",female,28,0,0,349245,7.8958,,S +102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S +103,0,1,"White, Mr. Richard Frasar",male,21,0,1,35281,77.2875,D26,S +104,0,3,"Johansson, Mr. Gustaf Joel",male,33,0,0,7540,8.6542,,S +105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37,2,0,3101276,7.925,,S +106,0,3,"Mionoff, Mr. Stoytcho",male,28,0,0,349207,7.8958,,S +107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21,0,0,343120,7.65,,S +108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.775,,S +109,0,3,"Rekic, Mr. Tido",male,38,0,0,349249,7.8958,,S +110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.15,,Q +111,0,1,"Porter, Mr. Walter Chamberlain",male,47,0,0,110465,52,C110,S +112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C +113,0,3,"Barton, Mr. David John",male,22,0,0,324669,8.05,,S +114,0,3,"Jussila, Miss. Katriina",female,20,1,0,4136,9.825,,S +115,0,3,"Attalah, Miss. Malake",female,17,0,0,2627,14.4583,,C +116,0,3,"Pekoniemi, Mr. Edvard",male,21,0,0,STON/O 2. 3101294,7.925,,S +117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q +118,0,2,"Turpin, Mr. 
William John Robert",male,29,1,0,11668,21,,S +119,0,1,"Baxter, Mr. Quigg Edmond",male,24,0,1,PC 17558,247.5208,B58 B60,C +120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2,4,2,347082,31.275,,S +121,0,2,"Hickman, Mr. Stanley George",male,21,2,0,S.O.C. 14879,73.5,,S +122,0,3,"Moore, Mr. Leonard Charles",male,,0,0,A4. 54510,8.05,,S +123,0,2,"Nasser, Mr. Nicholas",male,32.5,1,0,237736,30.0708,,C +124,1,2,"Webber, Miss. Susan",female,32.5,0,0,27267,13,E101,S +125,0,1,"White, Mr. Percival Wayland",male,54,0,1,35281,77.2875,D26,S +126,1,3,"Nicola-Yarred, Master. Elias",male,12,1,0,2651,11.2417,,C +127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q +128,1,3,"Madsen, Mr. Fridtjof Arne",male,24,0,0,C 17369,7.1417,,S +129,1,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C +130,0,3,"Ekstrom, Mr. Johan",male,45,0,0,347061,6.975,,S +131,0,3,"Drazenoic, Mr. Jozef",male,33,0,0,349241,7.8958,,C +132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20,0,0,SOTON/O.Q. 3101307,7.05,,S +133,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",female,47,1,0,A/5. 3337,14.5,,S +134,1,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29,1,0,228414,26,,S +135,0,2,"Sobey, Mr. Samuel James Hayden",male,25,0,0,C.A. 29178,13,,S +136,0,2,"Richard, Mr. Emile",male,23,0,0,SC/PARIS 2133,15.0458,,C +137,1,1,"Newsom, Miss. Helen Monypeny",female,19,0,2,11752,26.2833,D47,S +138,0,1,"Futrelle, Mr. Jacques Heath",male,37,1,0,113803,53.1,C123,S +139,0,3,"Osen, Mr. Olaf Elon",male,16,0,0,7534,9.2167,,S +140,0,1,"Giglio, Mr. Victor",male,24,0,0,PC 17593,79.2,B86,C +141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C +142,1,3,"Nysten, Miss. Anna Sofia",female,22,0,0,347081,7.75,,S +143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda Dolck)",female,24,1,0,STON/O2. 3101279,15.85,,S +144,0,3,"Burke, Mr. Jeremiah",male,19,0,0,365222,6.75,,Q +145,0,2,"Andrew, Mr. Edgardo Samuel",male,18,0,0,231945,11.5,,S +146,0,2,"Nicholls, Mr. Joseph Charles",male,19,1,1,C.A. 
33112,36.75,,S +147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27,0,0,350043,7.7958,,S +148,0,3,"Ford, Miss. Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S +149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26,F2,S +150,0,2,"Byles, Rev. Thomas Roussel Davids",male,42,0,0,244310,13,,S +151,0,2,"Bateman, Rev. Robert James",male,51,0,0,S.O.P. 1166,12.525,,S +152,1,1,"Pears, Mrs. Thomas (Edith Wearne)",female,22,1,0,113776,66.6,C2,S +153,0,3,"Meo, Mr. Alfonzo",male,55.5,0,0,A.5. 11206,8.05,,S +154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S +155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S +156,0,1,"Williams, Mr. Charles Duane",male,51,0,1,PC 17597,61.3792,,C +157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q +158,0,3,"Corn, Mr. Harry",male,30,0,0,SOTON/OQ 392090,8.05,,S +159,0,3,"Smiljanic, Mr. Mile",male,,0,0,315037,8.6625,,S +160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S +161,0,3,"Cribb, Mr. John Hatfield",male,44,0,1,371362,16.1,,S +162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Milne)",female,40,0,0,C.A. 33595,15.75,,S +163,0,3,"Bengtsson, Mr. John Viktor",male,26,0,0,347068,7.775,,S +164,0,3,"Calic, Mr. Jovo",male,17,0,0,315093,8.6625,,S +165,0,3,"Panula, Master. Eino Viljami",male,1,4,1,3101295,39.6875,,S +166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S +167,1,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,,0,1,113505,55,E33,S +168,0,3,"Skoog, Mrs. William (Anna Bernhardina Karlsson)",female,45,1,4,347088,27.9,,S +169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S +170,0,3,"Ling, Mr. Lee",male,28,0,0,1601,56.4958,,S +171,0,1,"Van der hoef, Mr. Wyckoff",male,61,0,0,111240,33.5,B19,S +172,0,3,"Rice, Master. Arthur",male,4,4,1,382652,29.125,,Q +173,1,3,"Johnson, Miss. Eleanor Ileen",female,1,1,1,347742,11.1333,,S +174,0,3,"Sivola, Mr. Antti Wilhelm",male,21,0,0,STON/O 2. 
3101280,7.925,,S +175,0,1,"Smith, Mr. James Clinch",male,56,0,0,17764,30.6958,A7,C +176,0,3,"Klasen, Mr. Klas Albin",male,18,1,1,350404,7.8542,,S +177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S +178,0,1,"Isham, Miss. Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C +179,0,2,"Hale, Mr. Reginald",male,30,0,0,250653,13,,S +180,0,3,"Leonard, Mr. Lionel",male,36,0,0,LINE,0,,S +181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S +182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C +183,0,3,"Asplund, Master. Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S +184,1,2,"Becker, Master. Richard F",male,1,2,1,230136,39,F4,S +185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4,0,2,315153,22.025,,S +186,0,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50,A32,S +187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q +188,1,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45,0,0,111428,26.55,,S +189,0,3,"Bourke, Mr. John",male,40,1,1,364849,15.5,,Q +190,0,3,"Turcin, Mr. Stjepan",male,36,0,0,349247,7.8958,,S +191,1,2,"Pinsky, Mrs. (Rosa)",female,32,0,0,234604,13,,S +192,0,2,"Carbines, Mr. William",male,19,0,0,28424,13,,S +193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S +194,1,2,"Navratil, Master. Michel M",male,3,1,1,230080,26,F2,S +195,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44,0,0,PC 17610,27.7208,B4,C +196,1,1,"Lurette, Miss. Elise",female,58,0,0,PC 17569,146.5208,B80,C +197,0,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q +198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42,0,1,4579,8.4042,,S +199,1,3,"Madigan, Miss. Margaret ""Maggie""",female,,0,0,370370,7.75,,Q +200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S +201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28,0,0,345770,9.5,,S +202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S +203,0,3,"Johanson, Mr. 
Jakob Alfred",male,34,0,0,3101264,6.4958,,S +204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C +205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18,0,0,A/5 3540,8.05,,S +206,0,3,"Strom, Miss. Telma Matilda",female,2,0,1,347054,10.4625,G6,S +207,0,3,"Backstrom, Mr. Karl Alfred",male,32,1,0,3101278,15.85,,S +208,1,3,"Albimona, Mr. Nassef Cassem",male,26,0,0,2699,18.7875,,C +209,1,3,"Carr, Miss. Helen ""Ellen""",female,16,0,0,367231,7.75,,Q +210,1,1,"Blank, Mr. Henry",male,40,0,0,112277,31,A31,C +211,0,3,"Ali, Mr. Ahmed",male,24,0,0,SOTON/O.Q. 3101311,7.05,,S +212,1,2,"Cameron, Miss. Clear Annie",female,35,0,0,F.C.C. 13528,21,,S +213,0,3,"Perkin, Mr. John Henry",male,22,0,0,A/5 21174,7.25,,S +214,0,2,"Givard, Mr. Hans Kristensen",male,30,0,0,250646,13,,S +215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q +216,1,1,"Newell, Miss. Madeleine",female,31,1,0,35273,113.275,D36,C +217,1,3,"Honkanen, Miss. Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S +218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42,1,0,243847,27,,S +219,1,1,"Bazzani, Miss. Albina",female,32,0,0,11813,76.2917,D15,C +220,0,2,"Harris, Mr. Walter",male,30,0,0,W/C 14208,10.5,,S +221,1,3,"Sunderland, Mr. Victor Francis",male,16,0,0,SOTON/OQ 392089,8.05,,S +222,0,2,"Bracken, Mr. James H",male,27,0,0,220367,13,,S +223,0,3,"Green, Mr. George Henry",male,51,0,0,21440,8.05,,S +224,0,3,"Nenkoff, Mr. Christo",male,,0,0,349234,7.8958,,S +225,1,1,"Hoyt, Mr. Frederick Maxfield",male,38,1,0,19943,90,C93,S +226,0,3,"Berglund, Mr. Karl Ivar Sven",male,22,0,0,PP 4348,9.35,,S +227,1,2,"Mellors, Mr. William John",male,19,0,0,SW/PP 751,10.5,,S +228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S +229,0,2,"Fahlstrom, Mr. Arne Jonas",male,18,0,0,236171,13,,S +230,0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S +231,1,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",female,35,1,0,36973,83.475,C83,S +232,0,3,"Larsson, Mr. Bengt Edvin",male,29,0,0,347067,7.775,,S +233,0,2,"Sjostedt, Mr. 
Ernst Adolf",male,59,0,0,237442,13.5,,S +234,1,3,"Asplund, Miss. Lillian Gertrud",female,5,4,2,347077,31.3875,,S +235,0,2,"Leyson, Mr. Robert William Norman",male,24,0,0,C.A. 29566,10.5,,S +236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S +237,0,2,"Hold, Mr. Stephen",male,44,1,0,26707,26,,S +238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S +239,0,2,"Pengelly, Mr. Frederick William",male,19,0,0,28665,10.5,,S +240,0,2,"Hunt, Mr. George Henry",male,33,0,0,SCO/W 1585,12.275,,S +241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C +242,1,3,"Murphy, Miss. Katherine ""Kate""",female,,1,0,367230,15.5,,Q +243,0,2,"Coleridge, Mr. Reginald Charles",male,29,0,0,W./C. 14263,10.5,,S +244,0,3,"Maenpaa, Mr. Matti Alexanteri",male,22,0,0,STON/O 2. 3101275,7.125,,S +245,0,3,"Attalah, Mr. Sleiman",male,30,0,0,2694,7.225,,C +246,0,1,"Minahan, Dr. William Edward",male,44,2,0,19928,90,C78,Q +247,0,3,"Lindahl, Miss. Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S +248,1,2,"Hamalainen, Mrs. William (Anna)",female,24,0,2,250649,14.5,,S +249,1,1,"Beckwith, Mr. Richard Leonard",male,37,1,1,11751,52.5542,D35,S +250,0,2,"Carter, Rev. Ernest Courtenay",male,54,1,0,244252,26,,S +251,0,3,"Reed, Mr. James George",male,,0,0,362316,7.25,,S +252,0,3,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",female,29,1,1,347054,10.4625,G6,S +253,0,1,"Stead, Mr. William Thomas",male,62,0,0,113514,26.55,C87,S +254,0,3,"Lobb, Mr. William Arthur",male,30,1,0,A/5. 3336,16.1,,S +255,0,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,41,0,2,370129,20.2125,,S +256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29,0,2,2650,15.2458,,C +257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C +258,1,1,"Cherry, Miss. Gladys",female,30,0,0,110152,86.5,B77,S +259,1,1,"Ward, Miss. Anna",female,35,0,0,PC 17755,512.3292,,C +260,1,2,"Parrish, Mrs. (Lutie Davis)",female,50,0,1,230433,26,,S +261,0,3,"Smith, Mr. 
Thomas",male,,0,0,384461,7.75,,Q +262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S +263,0,1,"Taussig, Mr. Emil",male,52,1,1,110413,79.65,E67,S +264,0,1,"Harrison, Mr. William",male,40,0,0,112059,0,B94,S +265,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q +266,0,2,"Reeves, Mr. David",male,36,0,0,C.A. 17248,10.5,,S +267,0,3,"Panula, Mr. Ernesti Arvid",male,16,4,1,3101295,39.6875,,S +268,1,3,"Persson, Mr. Ernst Ulrik",male,25,1,0,347083,7.775,,S +269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58,0,1,PC 17582,153.4625,C125,S +270,1,1,"Bissette, Miss. Amelia",female,35,0,0,PC 17760,135.6333,C99,S +271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31,,S +272,1,3,"Tornquist, Mr. William Henry",male,25,0,0,LINE,0,,S +273,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",female,41,0,1,250644,19.5,,S +274,0,1,"Natsch, Mr. Charles H",male,37,0,1,PC 17596,29.7,C118,C +275,1,3,"Healy, Miss. Hanora ""Nora""",female,,0,0,370375,7.75,,Q +276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S +277,0,3,"Lindblom, Miss. Augusta Charlotta",female,45,0,0,347073,7.75,,S +278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0,,S +279,0,3,"Rice, Master. Eric",male,7,4,1,382652,29.125,,Q +280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35,1,1,C.A. 2673,20.25,,S +281,0,3,"Duane, Mr. Frank",male,65,0,0,336439,7.75,,Q +282,0,3,"Olsson, Mr. Nils Johan Goransson",male,28,0,0,347464,7.8542,,S +283,0,3,"de Pelsmaeker, Mr. Alfons",male,16,0,0,345778,9.5,,S +284,1,3,"Dorking, Mr. Edward Arthur",male,19,0,0,A/5. 10482,8.05,,S +285,0,1,"Smith, Mr. Richard William",male,,0,0,113056,26,A19,S +286,0,3,"Stankovic, Mr. Ivan",male,33,0,0,349239,8.6625,,C +287,1,3,"de Mulder, Mr. Theodore",male,30,0,0,345774,9.5,,S +288,0,3,"Naidenoff, Mr. Penko",male,22,0,0,349206,7.8958,,S +289,1,2,"Hosono, Mr. Masabumi",male,42,0,0,237798,13,,S +290,1,3,"Connolly, Miss. Kate",female,22,0,0,370373,7.75,,Q +291,1,1,"Barber, Miss. 
Ellen ""Nellie""",female,26,0,0,19877,78.85,,S +292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19,1,0,11967,91.0792,B49,C +293,0,2,"Levy, Mr. Rene Jacques",male,36,0,0,SC/Paris 2163,12.875,D,C +294,0,3,"Haas, Miss. Aloisia",female,24,0,0,349236,8.85,,S +295,0,3,"Mineff, Mr. Ivan",male,24,0,0,349233,7.8958,,S +296,0,1,"Lewy, Mr. Ervin G",male,,0,0,PC 17612,27.7208,,C +297,0,3,"Hanna, Mr. Mansour",male,23.5,0,0,2693,7.2292,,C +298,0,1,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S +299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S +300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50,0,1,PC 17558,247.5208,B58 B60,C +301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q +302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q +303,0,3,"Johnson, Mr. William Cahoone Jr",male,19,0,0,LINE,0,,S +304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q +305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S +306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S +307,1,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C +308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)",female,17,1,0,PC 17758,108.9,C65,C +309,0,2,"Abelson, Mr. Samuel",male,30,1,0,P/PP 3381,24,,C +310,1,1,"Francatelli, Miss. Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C +311,1,1,"Hays, Miss. Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C +312,1,1,"Ryerson, Miss. Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C +313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26,1,1,250651,26,,S +314,0,3,"Hendekovic, Mr. Ignjac",male,28,0,0,349243,7.8958,,S +315,0,2,"Hart, Mr. Benjamin",male,43,1,1,F.C.C. 13529,26.25,,S +316,1,3,"Nilsson, Miss. Helmina Josefina",female,26,0,0,347470,7.8542,,S +317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24,1,0,244367,26,,S +318,0,2,"Moraweck, Dr. 
Ernest",male,54,0,0,29011,14,,S +319,1,1,"Wick, Miss. Mary Natalie",female,31,0,2,36928,164.8667,C7,S +320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)",female,40,1,1,16966,134.5,E34,C +321,0,3,"Dennis, Mr. Samuel",male,22,0,0,A/5 21172,7.25,,S +322,0,3,"Danoff, Mr. Yoto",male,27,0,0,349219,7.8958,,S +323,1,2,"Slayter, Miss. Hilda Mary",female,30,0,0,234818,12.35,,Q +324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",female,22,1,1,248738,29,,S +325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S +326,1,1,"Young, Miss. Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C +327,0,3,"Nysveen, Mr. Johan Hansen",male,61,0,0,345364,6.2375,,S +328,1,2,"Ball, Mrs. (Ada E Hall)",female,36,0,0,28551,13,D,S +329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31,1,1,363291,20.525,,S +330,1,1,"Hippach, Miss. Jean Gertrude",female,16,0,1,111361,57.9792,B18,C +331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q +332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S +333,0,1,"Graham, Mr. George Edward",male,38,0,1,PC 17582,153.4625,C91,S +334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16,2,0,345764,18,,S +335,1,1,"Frauenthal, Mrs. Henry William (Clara Heinsheimer)",female,,1,0,PC 17611,133.65,,S +336,0,3,"Denkoff, Mr. Mitto",male,,0,0,349225,7.8958,,S +337,0,1,"Pears, Mr. Thomas Clinton",male,29,1,0,113776,66.6,C2,S +338,1,1,"Burns, Miss. Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C +339,1,3,"Dahl, Mr. Karl Edwart",male,45,0,0,7598,8.05,,S +340,0,1,"Blackwell, Mr. Stephen Weart",male,45,0,0,113784,35.5,T,S +341,1,2,"Navratil, Master. Edmond Roger",male,2,1,1,230080,26,F2,S +342,1,1,"Fortune, Miss. Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S +343,0,2,"Collander, Mr. Erik Gustaf",male,28,0,0,248740,13,,S +344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25,0,0,244361,13,,S +345,0,2,"Fox, Mr. Stanley Hubert",male,36,0,0,229236,13,,S +346,1,2,"Brown, Miss. 
Amelia ""Mildred""",female,24,0,0,248733,13,F33,S +347,1,2,"Smith, Miss. Marion Elsie",female,40,0,0,31418,13,,S +348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1,,S +349,1,3,"Coutts, Master. William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S +350,0,3,"Dimic, Mr. Jovan",male,42,0,0,315088,8.6625,,S +351,0,3,"Odahl, Mr. Nils Martin",male,23,0,0,7267,9.225,,S +352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35,C128,S +353,0,3,"Elias, Mr. Tannous",male,15,1,1,2695,7.2292,,C +354,0,3,"Arnold-Franchi, Mr. Josef",male,25,1,0,349237,17.8,,S +355,0,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C +356,0,3,"Vanden Steen, Mr. Leo Peter",male,28,0,0,345783,9.5,,S +357,1,1,"Bowerman, Miss. Elsie Edith",female,22,0,1,113505,55,E33,S +358,0,2,"Funk, Miss. Annie Clemmer",female,38,0,0,237671,13,,S +359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q +360,1,3,"Mockler, Miss. Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q +361,0,3,"Skoog, Mr. Wilhelm",male,40,1,4,347088,27.9,,S +362,0,2,"del Carlo, Mr. Sebastiano",male,29,1,0,SC/PARIS 2167,27.7208,,C +363,0,3,"Barbara, Mrs. (Catherine David)",female,45,0,1,2691,14.4542,,C +364,0,3,"Asim, Mr. Adola",male,35,0,0,SOTON/O.Q. 3101310,7.05,,S +365,0,3,"O'Brien, Mr. Thomas",male,,1,0,370365,15.5,,Q +366,0,3,"Adahl, Mr. Mauritz Nils Martin",male,30,0,0,C 7076,7.25,,S +367,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",female,60,1,0,110813,75.25,D37,C +368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C +369,1,3,"Jermyn, Miss. Annie",female,,0,0,14313,7.75,,Q +370,1,1,"Aubart, Mme. Leontine Pauline",female,24,0,0,PC 17477,69.3,B35,C +371,1,1,"Harder, Mr. George Achilles",male,25,1,0,11765,55.4417,E50,C +372,0,3,"Wiklund, Mr. Jakob Alfred",male,18,1,0,3101267,6.4958,,S +373,0,3,"Beavan, Mr. William Thomas",male,19,0,0,323951,8.05,,S +374,0,1,"Ringhini, Mr. Sante",male,22,0,0,PC 17760,135.6333,,C +375,0,3,"Palsson, Miss. 
Stina Viola",female,3,3,1,349909,21.075,,S +376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C +377,1,3,"Landergren, Miss. Aurora Adelia",female,22,0,0,C 7077,7.25,,S +378,0,1,"Widener, Mr. Harry Elkins",male,27,0,2,113503,211.5,C82,C +379,0,3,"Betros, Mr. Tannous",male,20,0,0,2648,4.0125,,C +380,0,3,"Gustafsson, Mr. Karl Gideon",male,19,0,0,347069,7.775,,S +381,1,1,"Bidois, Miss. Rosalie",female,42,0,0,PC 17757,227.525,,C +382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1,0,2,2653,15.7417,,C +383,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,,S +384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",female,35,1,0,113789,52,,S +385,0,3,"Plotcharsky, Mr. Vasil",male,,0,0,349227,7.8958,,S +386,0,2,"Davies, Mr. Charles Henry",male,18,0,0,S.O.C. 14879,73.5,,S +387,0,3,"Goodwin, Master. Sidney Leonard",male,1,5,2,CA 2144,46.9,,S +388,1,2,"Buss, Miss. Kate",female,36,0,0,27849,13,,S +389,0,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,,Q +390,1,2,"Lehmann, Miss. Bertha",female,17,0,0,SC 1748,12,,C +391,1,1,"Carter, Mr. William Ernest",male,36,1,2,113760,120,B96 B98,S +392,1,3,"Jansson, Mr. Carl Olof",male,21,0,0,350034,7.7958,,S +393,0,3,"Gustafsson, Mr. Johan Birger",male,28,2,0,3101277,7.925,,S +394,1,1,"Newell, Miss. Marjorie",female,23,1,0,35273,113.275,D36,C +395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",female,24,0,2,PP 9549,16.7,G6,S +396,0,3,"Johansson, Mr. Erik",male,22,0,0,350052,7.7958,,S +397,0,3,"Olsson, Miss. Elina",female,31,0,0,350407,7.8542,,S +398,0,2,"McKane, Mr. Peter David",male,46,0,0,28403,26,,S +399,0,2,"Pain, Dr. Alfred",male,23,0,0,244278,10.5,,S +400,1,2,"Trout, Mrs. William H (Jessie L)",female,28,0,0,240929,12.65,,S +401,1,3,"Niskanen, Mr. Juha",male,39,0,0,STON/O 2. 3101289,7.925,,S +402,0,3,"Adams, Mr. John",male,26,0,0,341826,8.05,,S +403,0,3,"Jussila, Miss. Mari Aina",female,21,1,0,4137,9.825,,S +404,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28,1,0,STON/O2. 
3101279,15.85,,S +405,0,3,"Oreskovic, Miss. Marija",female,20,0,0,315096,8.6625,,S +406,0,2,"Gale, Mr. Shadrach",male,34,1,0,28664,21,,S +407,0,3,"Widegren, Mr. Carl/Charles Peter",male,51,0,0,347064,7.75,,S +408,1,2,"Richards, Master. William Rowe",male,3,1,1,29106,18.75,,S +409,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21,0,0,312992,7.775,,S +410,0,3,"Lefebre, Miss. Ida",female,,3,1,4133,25.4667,,S +411,0,3,"Sdycoff, Mr. Todor",male,,0,0,349222,7.8958,,S +412,0,3,"Hart, Mr. Henry",male,,0,0,394140,6.8583,,Q +413,1,1,"Minahan, Miss. Daisy E",female,33,1,0,19928,90,C78,Q +414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0,,S +415,1,3,"Sundman, Mr. Johan Julian",male,44,0,0,STON/O 2. 3101269,7.925,,S +416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S +417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34,1,1,28220,32.5,,S +418,1,2,"Silven, Miss. Lyyli Karoliina",female,18,0,2,250652,13,,S +419,0,2,"Matthews, Mr. William John",male,30,0,0,28228,13,,S +420,0,3,"Van Impe, Miss. Catharina",female,10,0,2,345773,24.15,,S +421,0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C +422,0,3,"Charters, Mr. David",male,21,0,0,A/5. 13032,7.7333,,Q +423,0,3,"Zimmerman, Mr. Leo",male,29,0,0,315082,7.875,,S +424,0,3,"Danbom, Mrs. Ernst Gilbert (Anna Sigrid Maria Brogren)",female,28,1,1,347080,14.4,,S +425,0,3,"Rosblom, Mr. Viktor Richard",male,18,1,1,370129,20.2125,,S +426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S +427,1,2,"Clarke, Mrs. Charles V (Ada Maria Winfield)",female,28,1,0,2003,26,,S +428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S +429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q +430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32,0,0,SOTON/O.Q. 392078,8.05,E10,S +431,1,1,"Bjornstrom-Steffansson, Mr. Mauritz Hakan",male,28,0,0,110564,26.55,C52,S +432,1,3,"Thorneycroft, Mrs. 
Percival (Florence Kate White)",female,,1,0,376564,16.1,,S +433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide Slow)",female,42,1,0,SC/AH 3085,26,,S +434,0,3,"Kallio, Mr. Nikolai Erland",male,17,0,0,STON/O 2. 3101274,7.125,,S +435,0,1,"Silvey, Mr. William Baird",male,50,1,0,13507,55.9,E44,S +436,1,1,"Carter, Miss. Lucile Polk",female,14,1,2,113760,120,B96 B98,S +437,0,3,"Ford, Miss. Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S +438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24,2,3,29106,18.75,,S +439,0,1,"Fortune, Mr. Mark",male,64,1,4,19950,263,C23 C25 C27,S +440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31,0,0,C.A. 18723,10.5,,S +441,1,2,"Hart, Mrs. Benjamin (Esther Ada Bloomfield)",female,45,1,1,F.C.C. 13529,26.25,,S +442,0,3,"Hampe, Mr. Leon",male,20,0,0,345769,9.5,,S +443,0,3,"Petterson, Mr. Johan Emil",male,25,1,0,347076,7.775,,S +444,1,2,"Reynaldo, Ms. Encarnacion",female,28,0,0,230434,13,,S +445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S +446,1,1,"Dodge, Master. Washington",male,4,0,2,33638,81.8583,A34,S +447,1,2,"Mellinger, Miss. Madeleine Violet",female,13,0,1,250644,19.5,,S +448,1,1,"Seward, Mr. Frederic Kimber",male,34,0,0,113794,26.55,,S +449,1,3,"Baclini, Miss. Marie Catherine",female,5,2,1,2666,19.2583,,C +450,1,1,"Peuchen, Major. Arthur Godfrey",male,52,0,0,113786,30.5,C104,S +451,0,2,"West, Mr. Edwy Arthur",male,36,1,2,C.A. 34651,27.75,,S +452,0,3,"Hagland, Mr. Ingvald Olai Olsen",male,,1,0,65303,19.9667,,S +453,0,1,"Foreman, Mr. Benjamin Laventall",male,30,0,0,113051,27.75,C111,C +454,1,1,"Goldenberg, Mr. Samuel L",male,49,1,0,17453,89.1042,C92,C +455,0,3,"Peduzzi, Mr. Joseph",male,,0,0,A/5 2817,8.05,,S +456,1,3,"Jalsevac, Mr. Ivan",male,29,0,0,349240,7.8958,,C +457,0,1,"Millet, Mr. Francis Davis",male,65,0,0,13509,26.55,E38,S +458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S +459,1,2,"Toomey, Miss. Ellen",female,50,0,0,F.C.C. 
13531,10.5,,S +460,0,3,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q +461,1,1,"Anderson, Mr. Harry",male,48,0,0,19952,26.55,E12,S +462,0,3,"Morley, Mr. William",male,34,0,0,364506,8.05,,S +463,0,1,"Gee, Mr. Arthur H",male,47,0,0,111320,38.5,E63,S +464,0,2,"Milling, Mr. Jacob Christian",male,48,0,0,234360,13,,S +465,0,3,"Maisner, Mr. Simon",male,,0,0,A/S 2816,8.05,,S +466,0,3,"Goncalves, Mr. Manuel Estanslas",male,38,0,0,SOTON/O.Q. 3101306,7.05,,S +467,0,2,"Campbell, Mr. William",male,,0,0,239853,0,,S +468,0,1,"Smart, Mr. John Montgomery",male,56,0,0,113792,26.55,,S +469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q +470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C +471,0,3,"Keefe, Mr. Arthur",male,,0,0,323592,7.25,,S +472,0,3,"Cacic, Mr. Luka",male,38,0,0,315089,8.6625,,S +473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33,1,2,C.A. 34651,27.75,,S +474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23,0,0,SC/AH Basle 541,13.7917,D,C +475,0,3,"Strandberg, Miss. Ida Sofia",female,22,0,0,7553,9.8375,,S +476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52,A14,S +477,0,2,"Renouf, Mr. Peter Henry",male,34,1,0,31027,21,,S +478,0,3,"Braund, Mr. Lewis Richard",male,29,1,0,3460,7.0458,,S +479,0,3,"Karlsson, Mr. Nils August",male,22,0,0,350060,7.5208,,S +480,1,3,"Hirvonen, Miss. Hildur E",female,2,0,1,3101298,12.2875,,S +481,0,3,"Goodwin, Master. Harold Victor",male,9,5,2,CA 2144,46.9,,S +482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0,,S +483,0,3,"Rouse, Mr. Richard Henry",male,50,0,0,A/5 3594,8.05,,S +484,1,3,"Turkula, Mrs. (Hedwig)",female,63,0,0,4134,9.5875,,S +485,1,1,"Bishop, Mr. Dickinson H",male,25,1,0,11967,91.0792,B49,C +486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S +487,1,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",female,35,1,0,19943,90,C93,S +488,0,1,"Kent, Mr. Edward Austin",male,58,0,0,11771,29.7,B37,C +489,0,3,"Somerton, Mr. Francis William",male,30,0,0,A.5. 
18509,8.05,,S +490,1,3,"Coutts, Master. Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S +491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S +492,0,3,"Windelov, Mr. Einar",male,21,0,0,SOTON/OQ 3101317,7.25,,S +493,0,1,"Molson, Mr. Harry Markland",male,55,0,0,113787,30.5,C30,S +494,0,1,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,,C +495,0,3,"Stanley, Mr. Edward Roland",male,21,0,0,A/4 45380,8.05,,S +496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C +497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C +498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,,S +499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S +500,0,3,"Svensson, Mr. Olof",male,24,0,0,350035,7.7958,,S +501,0,3,"Calic, Mr. Petar",male,17,0,0,315086,8.6625,,S +502,0,3,"Canavan, Miss. Mary",female,21,0,0,364846,7.75,,Q +503,0,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,330909,7.6292,,Q +504,0,3,"Laitinen, Miss. Kristina Sofia",female,37,0,0,4135,9.5875,,S +505,1,1,"Maioni, Miss. Roberta",female,16,0,0,110152,86.5,B79,S +506,0,1,"Penasco y Castellana, Mr. Victor de Satode",male,18,1,0,PC 17758,108.9,C65,C +507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33,0,2,26360,26,,S +508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S +509,0,3,"Olsen, Mr. Henry Margido",male,28,0,0,C 4001,22.525,,S +510,1,3,"Lang, Mr. Fang",male,26,0,0,1601,56.4958,,S +511,1,3,"Daly, Mr. Eugene Patrick",male,29,0,0,382651,7.75,,Q +512,0,3,"Webber, Mr. James",male,,0,0,SOTON/OQ 3101316,8.05,,S +513,1,1,"McGough, Mr. James Robert",male,36,0,0,PC 17473,26.2875,E25,S +514,1,1,"Rothschild, Mrs. Martin (Elizabeth L. Barrett)",female,54,1,0,PC 17603,59.4,,C +515,0,3,"Coleff, Mr. Satio",male,24,0,0,349209,7.4958,,S +516,0,1,"Walker, Mr. William Anderson",male,47,0,0,36967,34.0208,D46,S +517,1,2,"Lemore, Mrs. (Amelia Milley)",female,34,0,0,C.A. 
34260,10.5,F33,S +518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q +519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes Hughes)",female,36,1,0,226875,26,,S +520,0,3,"Pavlovic, Mr. Stefo",male,32,0,0,349242,7.8958,,S +521,1,1,"Perreault, Miss. Anne",female,30,0,0,12749,93.5,B73,S +522,0,3,"Vovk, Mr. Janko",male,22,0,0,349252,7.8958,,S +523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.225,,C +524,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44,0,1,111361,57.9792,B18,C +525,0,3,"Kassem, Mr. Fared",male,,0,0,2700,7.2292,,C +526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q +527,1,2,"Ridsdale, Miss. Lucy",female,50,0,0,W./C. 14258,10.5,,S +528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S +529,0,3,"Salonen, Mr. Johan Werner",male,39,0,0,3101296,7.925,,S +530,0,2,"Hocking, Mr. Richard George",male,23,2,1,29104,11.5,,S +531,1,2,"Quick, Miss. Phyllis May",female,2,1,1,26360,26,,S +532,0,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C +533,0,3,"Elias, Mr. Joseph Jr",male,17,1,1,2690,7.2292,,C +534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C +535,0,3,"Cacic, Miss. Marija",female,30,0,0,315084,8.6625,,S +536,1,2,"Hart, Miss. Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S +537,0,1,"Butt, Major. Archibald Willingham",male,45,0,0,113050,26.55,B38,S +538,1,1,"LeRoy, Miss. Bertha",female,30,0,0,PC 17761,106.425,,C +539,0,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S +540,1,1,"Frolicher, Miss. Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C +541,1,1,"Crosby, Miss. Harriet R",female,36,0,2,WE/P 5735,71,B22,S +542,0,3,"Andersson, Miss. Ingeborg Constanzia",female,9,4,2,347082,31.275,,S +543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11,4,2,347082,31.275,,S +544,1,2,"Beane, Mr. Edward",male,32,1,0,2908,26,,S +545,0,1,"Douglas, Mr. Walter Donald",male,50,1,0,PC 17761,106.425,C86,C +546,0,1,"Nicholson, Mr. Arthur Ernest",male,64,0,0,693,26,,S +547,1,2,"Beane, Mrs. 
Edward (Ethel Clarke)",female,19,1,0,2908,26,,S +548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C +549,0,3,"Goldsmith, Mr. Frank John",male,33,1,1,363291,20.525,,S +550,1,2,"Davies, Master. John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S +551,1,1,"Thayer, Mr. John Borland Jr",male,17,0,2,17421,110.8833,C70,C +552,0,2,"Sharp, Mr. Percival James R",male,27,0,0,244358,26,,S +553,0,3,"O'Brien, Mr. Timothy",male,,0,0,330979,7.8292,,Q +554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22,0,0,2620,7.225,,C +555,1,3,"Ohman, Miss. Velin",female,22,0,0,347085,7.775,,S +556,0,1,"Wright, Mr. George",male,62,0,0,113807,26.55,,S +557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherland) (""Mrs Morgan"")",female,48,1,0,11755,39.6,A16,C +558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C +559,1,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",female,39,1,1,110413,79.65,E67,S +560,1,3,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",female,36,1,0,345572,17.4,,S +561,0,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.75,,Q +562,0,3,"Sivic, Mr. Husein",male,40,0,0,349251,7.8958,,S +563,0,2,"Norman, Mr. Robert Douglas",male,28,0,0,218629,13.5,,S +564,0,3,"Simmons, Mr. John",male,,0,0,SOTON/OQ 392082,8.05,,S +565,0,3,"Meanwell, Miss. (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S +566,0,3,"Davies, Mr. Alfred J",male,24,2,0,A/4 48871,24.15,,S +567,0,3,"Stoytcheff, Mr. Ilia",male,19,0,0,349205,7.8958,,S +568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29,0,4,349909,21.075,,S +569,0,3,"Doharr, Mr. Tannous",male,,0,0,2686,7.2292,,C +570,1,3,"Jonsson, Mr. Carl",male,32,0,0,350417,7.8542,,S +571,1,2,"Harris, Mr. George",male,62,0,0,S.W./PP 752,10.5,,S +572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53,2,0,11769,51.4792,C101,S +573,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36,0,0,PC 17474,26.3875,E25,S +574,1,3,"Kelly, Miss. Mary",female,,0,0,14312,7.75,,Q +575,0,3,"Rush, Mr. Alfred George John",male,16,0,0,A/4. 
20589,8.05,,S +576,0,3,"Patchett, Mr. George",male,19,0,0,358585,14.5,,S +577,1,2,"Garside, Miss. Ethel",female,34,0,0,243880,13,,S +578,1,1,"Silvey, Mrs. William Baird (Alice Munger)",female,39,1,0,13507,55.9,E44,S +579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C +580,1,3,"Jussila, Mr. Eiriik",male,32,0,0,STON/O 2. 3101286,7.925,,S +581,1,2,"Christy, Miss. Julie Rachel",female,25,1,1,237789,30,,S +582,1,1,"Thayer, Mrs. John Borland (Marian Longstreth Morris)",female,39,1,1,17421,110.8833,C68,C +583,0,2,"Downton, Mr. William James",male,54,0,0,28403,26,,S +584,0,1,"Ross, Mr. John Hugo",male,36,0,0,13049,40.125,A10,C +585,0,3,"Paulner, Mr. Uscher",male,,0,0,3411,8.7125,,C +586,1,1,"Taussig, Miss. Ruth",female,18,0,2,110413,79.65,E68,S +587,0,2,"Jarvis, Mr. John Denzil",male,47,0,0,237565,15,,S +588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60,1,1,13567,79.2,B41,C +589,0,3,"Gilinski, Mr. Eliezer",male,22,0,0,14973,8.05,,S +590,0,3,"Murdlin, Mr. Joseph",male,,0,0,A./5. 3235,8.05,,S +591,0,3,"Rintamaki, Mr. Matti",male,35,0,0,STON/O 2. 3101273,7.125,,S +592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",female,52,1,0,36947,78.2667,D20,C +593,0,3,"Elsbury, Mr. William James",male,47,0,0,A/5 3902,7.25,,S +594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q +595,0,2,"Chapman, Mr. John Henry",male,37,1,0,SC/AH 29037,26,,S +596,0,3,"Van Impe, Mr. Jean Baptiste",male,36,1,1,345773,24.15,,S +597,1,2,"Leitch, Miss. Jessie Wills",female,,0,0,248727,33,,S +598,0,3,"Johnson, Mr. Alfred",male,49,0,0,LINE,0,,S +599,0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C +600,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49,1,0,PC 17485,56.9292,A20,C +601,1,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Christy)",female,24,2,1,243847,27,,S +602,0,3,"Slabenoff, Mr. Petco",male,,0,0,349214,7.8958,,S +603,0,1,"Harrington, Mr. Charles H",male,,0,0,113796,42.4,,S +604,0,3,"Torber, Mr. Ernst William",male,44,0,0,364511,8.05,,S +605,1,1,"Homer, Mr. 
Harry (""Mr E Haven"")",male,35,0,0,111426,26.55,,C +606,0,3,"Lindell, Mr. Edvard Bengtsson",male,36,1,0,349910,15.55,,S +607,0,3,"Karaic, Mr. Milan",male,30,0,0,349246,7.8958,,S +608,1,1,"Daniel, Mr. Robert Williams",male,27,0,0,113804,30.5,,S +609,1,2,"Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue)",female,22,1,2,SC/Paris 2123,41.5792,,C +610,1,1,"Shutes, Miss. Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S +611,0,3,"Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)",female,39,1,5,347082,31.275,,S +612,0,3,"Jardin, Mr. Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.05,,S +613,1,3,"Murphy, Miss. Margaret Jane",female,,1,0,367230,15.5,,Q +614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q +615,0,3,"Brocklebank, Mr. William Alfred",male,35,0,0,364512,8.05,,S +616,1,2,"Herman, Miss. Alice",female,24,1,2,220845,65,,S +617,0,3,"Danbom, Mr. Ernst Gilbert",male,34,1,1,347080,14.4,,S +618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26,1,0,A/5. 3336,16.1,,S +619,1,2,"Becker, Miss. Marion Louise",female,4,2,1,230136,39,F4,S +620,0,2,"Gavey, Mr. Lawrence",male,26,0,0,31028,10.5,,S +621,0,3,"Yasbeck, Mr. Antoni",male,27,1,0,2659,14.4542,,C +622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42,1,0,11753,52.5542,D19,S +623,1,3,"Nakid, Mr. Sahid",male,20,1,1,2653,15.7417,,C +624,0,3,"Hansen, Mr. Henry Damsgaard",male,21,0,0,350029,7.8542,,S +625,0,3,"Bowen, Mr. David John ""Dai""",male,21,0,0,54636,16.1,,S +626,0,1,"Sutton, Mr. Frederick",male,61,0,0,36963,32.3208,D50,S +627,0,2,"Kirkland, Rev. Charles Leonard",male,57,0,0,219533,12.35,,Q +628,1,1,"Longley, Miss. Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S +629,0,3,"Bostandyeff, Mr. Guentcho",male,26,0,0,349224,7.8958,,S +630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q +631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80,0,0,27042,30,A23,S +632,0,3,"Lundahl, Mr. Johan Svensson",male,51,0,0,347743,7.0542,,S +633,1,1,"Stahelin-Maeglin, Dr. 
Max",male,32,0,0,13214,30.5,B50,C +634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0,,S +635,0,3,"Skoog, Miss. Mabel",female,9,3,2,347088,27.9,,S +636,1,2,"Davis, Miss. Mary",female,28,0,0,237668,13,,S +637,0,3,"Leinonen, Mr. Antti Gustaf",male,32,0,0,STON/O 2. 3101292,7.925,,S +638,0,2,"Collyer, Mr. Harvey",male,31,1,1,C.A. 31921,26.25,,S +639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41,0,5,3101295,39.6875,,S +640,0,3,"Thorneycroft, Mr. Percival",male,,1,0,376564,16.1,,S +641,0,3,"Jensen, Mr. Hans Peder",male,20,0,0,350050,7.8542,,S +642,1,1,"Sagesser, Mlle. Emma",female,24,0,0,PC 17477,69.3,B35,C +643,0,3,"Skoog, Miss. Margit Elizabeth",female,2,3,2,347088,27.9,,S +644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S +645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C +646,1,1,"Harper, Mr. Henry Sleeper",male,48,1,0,PC 17572,76.7292,D33,C +647,0,3,"Cor, Mr. Liudevit",male,19,0,0,349231,7.8958,,S +648,1,1,"Simonius-Blumer, Col. Oberst Alfons",male,56,0,0,13213,35.5,A26,C +649,0,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.55,,S +650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S +651,0,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S +652,1,2,"Doling, Miss. Elsie",female,18,0,1,231919,23,,S +653,0,3,"Kalvik, Mr. Johannes Halvorsen",male,21,0,0,8475,8.4333,,S +654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q +655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18,0,0,365226,6.75,,Q +656,0,2,"Hickman, Mr. Leonard Mark",male,24,2,0,S.O.C. 14879,73.5,,S +657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S +658,0,3,"Bourke, Mrs. John (Catherine)",female,32,1,1,364849,15.5,,Q +659,0,2,"Eitemiller, Mr. George Floyd",male,23,0,0,29751,13,,S +660,0,1,"Newell, Mr. Arthur Webster",male,58,0,2,35273,113.275,D48,C +661,1,1,"Frauenthal, Dr. Henry William",male,50,2,0,PC 17611,133.65,,S +662,0,3,"Badt, Mr. Mohamed",male,40,0,0,2623,7.225,,C +663,0,1,"Colley, Mr. 
Edward Pomeroy",male,47,0,0,5727,25.5875,E58,S +664,0,3,"Coleff, Mr. Peju",male,36,0,0,349210,7.4958,,S +665,1,3,"Lindqvist, Mr. Eino William",male,20,1,0,STON/O 2. 3101285,7.925,,S +666,0,2,"Hickman, Mr. Lewis",male,32,2,0,S.O.C. 14879,73.5,,S +667,0,2,"Butler, Mr. Reginald Fenton",male,25,0,0,234686,13,,S +668,0,3,"Rommetvedt, Mr. Knud Paust",male,,0,0,312993,7.775,,S +669,0,3,"Cook, Mr. Jacob",male,43,0,0,A/5 3536,8.05,,S +670,1,1,"Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)",female,,1,0,19996,52,C126,S +671,1,2,"Brown, Mrs. Thomas William Solomon (Elizabeth Catherine Ford)",female,40,1,1,29750,39,,S +672,0,1,"Davidson, Mr. Thornton",male,31,1,0,F.C. 12750,52,B71,S +673,0,2,"Mitchell, Mr. Henry Michael",male,70,0,0,C.A. 24580,10.5,,S +674,1,2,"Wilhelms, Mr. Charles",male,31,0,0,244270,13,,S +675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0,,S +676,0,3,"Edvardsson, Mr. Gustaf Hjalmar",male,18,0,0,349912,7.775,,S +677,0,3,"Sawyer, Mr. Frederick Charles",male,24.5,0,0,342826,8.05,,S +678,1,3,"Turja, Miss. Anna Sofia",female,18,0,0,4138,9.8417,,S +679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43,1,6,CA 2144,46.9,,S +680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36,0,1,PC 17755,512.3292,B51 B53 B55,C +681,0,3,"Peters, Miss. Katie",female,,0,0,330935,8.1375,,Q +682,1,1,"Hassab, Mr. Hammad",male,27,0,0,PC 17572,76.7292,D49,C +683,0,3,"Olsvigen, Mr. Thor Anderson",male,20,0,0,6563,9.225,,S +684,0,3,"Goodwin, Mr. Charles Edward",male,14,5,2,CA 2144,46.9,,S +685,0,2,"Brown, Mr. Thomas William Solomon",male,60,1,1,29750,39,,S +686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25,1,2,SC/Paris 2123,41.5792,,C +687,0,3,"Panula, Mr. Jaako Arnold",male,14,4,1,3101295,39.6875,,S +688,0,3,"Dakic, Mr. Branko",male,19,0,0,349228,10.1708,,S +689,0,3,"Fischer, Mr. Eberhard Thelander",male,18,0,0,350036,7.7958,,S +690,1,1,"Madill, Miss. Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S +691,1,1,"Dick, Mr. 
Albert Adrian",male,31,1,0,17474,57,B20,S +692,1,3,"Karun, Miss. Manca",female,4,0,1,349256,13.4167,,C +693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S +694,0,3,"Saad, Mr. Khalil",male,25,0,0,2672,7.225,,C +695,0,1,"Weir, Col. John",male,60,0,0,113800,26.55,,S +696,0,2,"Chapman, Mr. Charles Henry",male,52,0,0,248731,13.5,,S +697,0,3,"Kelly, Mr. James",male,44,0,0,363592,8.05,,S +698,1,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q +699,0,1,"Thayer, Mr. John Borland",male,49,1,1,17421,110.8833,C68,C +700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42,0,0,348121,7.65,F G63,S +701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18,1,0,PC 17757,227.525,C62 C64,C +702,1,1,"Silverthorne, Mr. Spencer Victor",male,35,0,0,PC 17475,26.2875,E24,S +703,0,3,"Barbara, Miss. Saiide",female,18,0,1,2691,14.4542,,C +704,0,3,"Gallagher, Mr. Martin",male,25,0,0,36864,7.7417,,Q +705,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,,S +706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39,0,0,250655,26,,S +707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45,0,0,223596,13.5,,S +708,1,1,"Calderhead, Mr. Edward Pennington",male,42,0,0,PC 17476,26.2875,E24,S +709,1,1,"Cleaver, Miss. Alice",female,22,0,0,113781,151.55,,S +710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C +711,1,1,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",female,24,0,0,PC 17482,49.5042,C90,C +712,0,1,"Klaber, Mr. Herman",male,,0,0,113028,26.55,C124,S +713,1,1,"Taylor, Mr. Elmer Zebley",male,48,1,0,19996,52,C126,S +714,0,3,"Larsson, Mr. August Viktor",male,29,0,0,7545,9.4833,,S +715,0,2,"Greenberg, Mr. Samuel",male,52,0,0,250647,13,,S +716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19,0,0,348124,7.65,F G73,S +717,1,1,"Endres, Miss. Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C +718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S +719,0,3,"McEvoy, Mr. 
Michael",male,,0,0,36568,15.5,,Q +720,0,3,"Johnson, Mr. Malkolm Joackim",male,33,0,0,347062,7.775,,S +721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6,0,1,248727,33,,S +722,0,3,"Jensen, Mr. Svend Lauritz",male,17,1,0,350048,7.0542,,S +723,0,2,"Gillespie, Mr. William Henry",male,34,0,0,12233,13,,S +724,0,2,"Hodges, Mr. Henry Price",male,50,0,0,250643,13,,S +725,1,1,"Chambers, Mr. Norman Campbell",male,27,1,0,113806,53.1,E8,S +726,0,3,"Oreskovic, Mr. Luka",male,20,0,0,315094,8.6625,,S +727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30,3,0,31027,21,,S +728,1,3,"Mannion, Miss. Margareth",female,,0,0,36866,7.7375,,Q +729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",male,25,1,0,236853,26,,S +730,0,3,"Ilmakangas, Miss. Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S +731,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S +732,0,3,"Hassan, Mr. Houssein G N",male,11,0,0,2699,18.7875,,C +733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0,,S +734,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13,,S +735,0,2,"Troupiansky, Mr. Moses Aaron",male,23,0,0,233639,13,,S +736,0,3,"Williams, Mr. Leslie",male,28.5,0,0,54636,16.1,,S +737,0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",female,48,1,3,W./C. 6608,34.375,,S +738,1,1,"Lesurer, Mr. Gustave J",male,35,0,0,PC 17755,512.3292,B101,C +739,0,3,"Ivanoff, Mr. Kanio",male,,0,0,349201,7.8958,,S +740,0,3,"Nankoff, Mr. Minko",male,,0,0,349218,7.8958,,S +741,1,1,"Hawksford, Mr. Walter James",male,,0,0,16988,30,D45,S +742,0,1,"Cavendish, Mr. Tyrell William",male,36,1,0,19877,78.85,C46,S +743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C +744,0,3,"McNamee, Mr. Neal",male,24,1,0,376566,16.1,,S +745,1,3,"Stranden, Mr. Juho",male,31,0,0,STON/O 2. 3101288,7.925,,S +746,0,1,"Crosby, Capt. Edward Gifford",male,70,1,1,WE/P 5735,71,B22,S +747,0,3,"Abbott, Mr. Rossmore Edward",male,16,1,1,C.A. 2673,20.25,,S +748,1,2,"Sinkkonen, Miss. 
Anna",female,30,0,0,250648,13,,S +749,0,1,"Marvin, Mr. Daniel Warner",male,19,1,0,113773,53.1,D30,S +750,0,3,"Connaghton, Mr. Michael",male,31,0,0,335097,7.75,,Q +751,1,2,"Wells, Miss. Joan",female,4,1,1,29103,23,,S +752,1,3,"Moor, Master. Meier",male,6,0,1,392096,12.475,E121,S +753,0,3,"Vande Velde, Mr. Johannes Joseph",male,33,0,0,345780,9.5,,S +754,0,3,"Jonkoff, Mr. Lalio",male,23,0,0,349204,7.8958,,S +755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48,1,2,220845,65,,S +756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S +757,0,3,"Carlsson, Mr. August Sigfrid",male,28,0,0,350042,7.7958,,S +758,0,2,"Bailey, Mr. Percy Andrew",male,18,0,0,29108,11.5,,S +759,0,3,"Theobald, Mr. Thomas Leonard",male,34,0,0,363294,8.05,,S +760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)",female,33,0,0,110152,86.5,B77,S +761,0,3,"Garfirth, Mr. John",male,,0,0,358585,14.5,,S +762,0,3,"Nirva, Mr. Iisakki Antino Aijo",male,41,0,0,SOTON/O2 3101272,7.125,,S +763,1,3,"Barah, Mr. Hanna Assi",male,20,0,0,2663,7.2292,,C +764,1,1,"Carter, Mrs. William Ernest (Lucile Polk)",female,36,1,2,113760,120,B96 B98,S +765,0,3,"Eklund, Mr. Hans Linus",male,16,0,0,347074,7.775,,S +766,1,1,"Hogeboom, Mrs. John C (Anna Andrews)",female,51,1,0,13502,77.9583,D11,S +767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C +768,0,3,"Mangan, Miss. Mary",female,30.5,0,0,364850,7.75,,Q +769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q +770,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32,0,0,8471,8.3625,,S +771,0,3,"Lievens, Mr. Rene Aime",male,24,0,0,345781,9.5,,S +772,0,3,"Jensen, Mr. Niels Peder",male,48,0,0,350047,7.8542,,S +773,0,2,"Mack, Mrs. (Mary)",female,57,0,0,S.O./P.P. 3,10.5,E77,S +774,0,3,"Elias, Mr. Dibo",male,,0,0,2674,7.225,,C +775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54,1,3,29105,23,,S +776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18,0,0,347078,7.75,,S +777,0,3,"Tobin, Mr. Roger",male,,0,0,383121,7.75,F38,Q +778,1,3,"Emanuel, Miss. 
Virginia Ethel",female,5,0,0,364516,12.475,,S +779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q +780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)",female,43,0,1,24160,211.3375,B3,S +781,1,3,"Ayoub, Miss. Banoura",female,13,0,0,2687,7.2292,,C +782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17,1,0,17474,57,B20,S +783,0,1,"Long, Mr. Milton Clyde",male,29,0,0,113501,30,D6,S +784,0,3,"Johnston, Mr. Andrew G",male,,1,2,W./C. 6607,23.45,,S +785,0,3,"Ali, Mr. William",male,25,0,0,SOTON/O.Q. 3101312,7.05,,S +786,0,3,"Harmer, Mr. Abraham (David Lishin)",male,25,0,0,374887,7.25,,S +787,1,3,"Sjoblom, Miss. Anna Sofia",female,18,0,0,3101265,7.4958,,S +788,0,3,"Rice, Master. George Hugh",male,8,4,1,382652,29.125,,Q +789,1,3,"Dean, Master. Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S +790,0,1,"Guggenheim, Mr. Benjamin",male,46,0,0,PC 17593,79.2,B82 B84,C +791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q +792,0,2,"Gaskell, Mr. Alfred",male,16,0,0,239865,26,,S +793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S +794,0,1,"Hoyt, Mr. William Fisher",male,,0,0,PC 17600,30.6958,,C +795,0,3,"Dantcheff, Mr. Ristiu",male,25,0,0,349203,7.8958,,S +796,0,2,"Otter, Mr. Richard",male,39,0,0,28213,13,,S +797,1,1,"Leader, Dr. Alice (Farnham)",female,49,0,0,17465,25.9292,D17,S +798,1,3,"Osman, Mrs. Mara",female,31,0,0,349244,8.6833,,S +799,0,3,"Ibrahim Shawah, Mr. Yousseff",male,30,0,0,2685,7.2292,,C +800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)",female,30,1,1,345773,24.15,,S +801,0,2,"Ponesell, Mr. Martin",male,34,0,0,250647,13,,S +802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31,1,1,C.A. 31921,26.25,,S +803,1,1,"Carter, Master. William Thornton II",male,11,1,2,113760,120,B96 B98,S +804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C +805,1,3,"Hedman, Mr. Oskar Arvid",male,27,0,0,347089,6.975,,S +806,0,3,"Johansson, Mr. Karl Johan",male,31,0,0,347063,7.775,,S +807,0,1,"Andrews, Mr. 
Thomas Jr",male,39,0,0,112050,0,A36,S +808,0,3,"Pettersson, Miss. Ellen Natalia",female,18,0,0,347087,7.775,,S +809,0,2,"Meyer, Mr. August",male,39,0,0,248723,13,,S +810,1,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",female,33,1,0,113806,53.1,E8,S +811,0,3,"Alexander, Mr. William",male,26,0,0,3474,7.8875,,S +812,0,3,"Lester, Mr. James",male,39,0,0,A/4 48871,24.15,,S +813,0,2,"Slemen, Mr. Richard James",male,35,0,0,28206,10.5,,S +814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S +815,0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,364499,8.05,,S +816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0,B102,S +817,0,3,"Heininen, Miss. Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S +818,0,2,"Mallet, Mr. Albert",male,31,1,1,S.C./PARIS 2079,37.0042,,C +819,0,3,"Holm, Mr. John Fredrik Alexander",male,43,0,0,C 7075,6.45,,S +820,0,3,"Skoog, Master. Karl Thorsten",male,10,3,2,347088,27.9,,S +821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gregg)",female,52,1,1,12749,93.5,B69,S +822,1,3,"Lulic, Mr. Nikola",male,27,0,0,315098,8.6625,,S +823,0,1,"Reuchlin, Jonkheer. John George",male,38,0,0,19972,0,,S +824,1,3,"Moor, Mrs. (Beila)",female,27,0,1,392096,12.475,E121,S +825,0,3,"Panula, Master. Urho Abraham",male,2,4,1,3101295,39.6875,,S +826,0,3,"Flynn, Mr. John",male,,0,0,368323,6.95,,Q +827,0,3,"Lam, Mr. Len",male,,0,0,1601,56.4958,,S +828,1,2,"Mallet, Master. Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C +829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q +830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28, +831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15,1,0,2659,14.4542,,C +832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,,S +833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C +834,0,3,"Augustsson, Mr. Albert",male,23,0,0,347468,7.8542,,S +835,0,3,"Allum, Mr. Owen George",male,18,0,0,2223,8.3,,S +836,1,1,"Compton, Miss. 
Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C +837,0,3,"Pasic, Mr. Jakob",male,21,0,0,315097,8.6625,,S +838,0,3,"Sirota, Mr. Maurice",male,,0,0,392092,8.05,,S +839,1,3,"Chip, Mr. Chang",male,32,0,0,1601,56.4958,,S +840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7,C47,C +841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20,0,0,SOTON/O2 3101287,7.925,,S +842,0,2,"Mudd, Mr. Thomas Charles",male,16,0,0,S.O./P.P. 3,10.5,,S +843,1,1,"Serepeca, Miss. Augusta",female,30,0,0,113798,31,,C +844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C +845,0,3,"Culumovic, Mr. Jeso",male,17,0,0,315090,8.6625,,S +846,0,3,"Abbing, Mr. Anthony",male,42,0,0,C.A. 5547,7.55,,S +847,0,3,"Sage, Mr. Douglas Bullen",male,,8,2,CA. 2343,69.55,,S +848,0,3,"Markoff, Mr. Marin",male,35,0,0,349213,7.8958,,C +849,0,2,"Harper, Rev. John",male,28,0,1,248727,33,,S +850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C +851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4,4,2,347082,31.275,,S +852,0,3,"Svensson, Mr. Johan",male,74,0,0,347060,7.775,,S +853,0,3,"Boulos, Miss. Nourelain",female,9,1,1,2678,15.2458,,C +854,1,1,"Lines, Miss. Mary Conover",female,16,0,1,PC 17592,39.4,D28,S +855,0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",female,44,1,0,244252,26,,S +856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18,0,1,392091,9.35,,S +857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45,1,1,36928,164.8667,,S +858,1,1,"Daly, Mr. Peter Denis ",male,51,0,0,113055,26.55,E17,S +859,1,3,"Baclini, Mrs. Solomon (Latifa Qurban)",female,24,0,3,2666,19.2583,,C +860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C +861,0,3,"Hansen, Mr. Claus Peter",male,41,2,0,350026,14.1083,,S +862,0,2,"Giles, Mr. Frederick Edward",male,21,1,0,28134,11.5,,S +863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Barron)",female,48,0,0,17466,25.9292,D17,S +864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S +865,0,2,"Gill, Mr. 
John William",male,24,0,0,233866,13,,S +866,1,2,"Bystrom, Mrs. (Karolina)",female,42,0,0,236852,13,,S +867,1,2,"Duran y More, Miss. Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C +868,0,1,"Roebling, Mr. Washington Augustus II",male,31,0,0,PC 17590,50.4958,A24,S +869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S +870,1,3,"Johnson, Master. Harold Theodor",male,4,1,1,347742,11.1333,,S +871,0,3,"Balkic, Mr. Cerin",male,26,0,0,349248,7.8958,,S +872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47,1,1,11751,52.5542,D35,S +873,0,1,"Carlsson, Mr. Frans Olof",male,33,0,0,695,5,B51 B53 B55,S +874,0,3,"Vander Cruyssen, Mr. Victor",male,47,0,0,345765,9,,S +875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28,1,0,P/PP 3381,24,,C +876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C +877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20,0,0,7534,9.8458,,S +878,0,3,"Petroff, Mr. Nedelio",male,19,0,0,349212,7.8958,,S +879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S +880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56,0,1,11767,83.1583,C50,C +881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25,0,1,230433,26,,S +882,0,3,"Markun, Mr. Johann",male,33,0,0,349257,7.8958,,S +883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22,0,0,7552,10.5167,,S +884,0,2,"Banfield, Mr. Frederick James",male,28,0,0,C.A./SOTON 34068,10.5,,S +885,0,3,"Sutehall, Mr. Henry Jr",male,25,0,0,SOTON/OQ 392076,7.05,,S +886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39,0,5,382652,29.125,,Q +887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S +888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S +889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S +890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C +891,0,3,"Dooley, Mr. 
Patrick",male,32,0,0,370376,7.75,,Q diff --git a/examples/testing.py b/examples/testing.py new file mode 100644 index 0000000..0345284 --- /dev/null +++ b/examples/testing.py @@ -0,0 +1,19 @@ +#%% +import pandas as pd +import numpy as np +import sys +sys.path.insert(0,"C:/Local/pers/Documents/GitHub/Cobra") + +ROOT = "C:/Local/pers/Documents/GitHub/Cobra/" + +#%% +df_data = pd.read_csv(ROOT + "datasets/titanic_data.csv") + +#%% +from cobra.preprocessing import KBinsDiscretizer + +KBD = KBinsDiscretizer() +df_prep = KBD.fit_transform(data=df_data, column_names=['Age','Fare']) + +#%% +from cobra.preprocessing import TargetEncoder \ No newline at end of file From e022cb73ea9451959f548b45f5776e87516194cd Mon Sep 17 00:00:00 2001 From: JanBenisek Date: Fri, 4 Oct 2019 16:31:00 +0200 Subject: [PATCH 09/98] develop = cat regrouper --- cobra/metrics/__init__.py | 3 + cobra/metrics/all_metrics_plots.py | 527 +++++++++++++++++++++++++++++ dev/preprocessor/develop.py | 279 +++++++++++++++ dev/preprocessor/new_regroup.py | 189 +++++++++++ 4 files changed, 998 insertions(+) create mode 100644 cobra/metrics/__init__.py create mode 100644 cobra/metrics/all_metrics_plots.py create mode 100644 dev/preprocessor/develop.py create mode 100644 dev/preprocessor/new_regroup.py diff --git a/cobra/metrics/__init__.py b/cobra/metrics/__init__.py new file mode 100644 index 0000000..67656f7 --- /dev/null +++ b/cobra/metrics/__init__.py @@ -0,0 +1,3 @@ +from .all_metrics_plots import Evaluator + +__all__ = ['Evaluator'] \ No newline at end of file diff --git a/cobra/metrics/all_metrics_plots.py b/cobra/metrics/all_metrics_plots.py new file mode 100644 index 0000000..d18bb64 --- /dev/null +++ b/cobra/metrics/all_metrics_plots.py @@ -0,0 +1,527 @@ +""" +====================================================================================== +--------------------------------------- Evaluation Class code ------------------------ 
+====================================================================================== +author: jan.benisek@pythonpredictins.com - benoit.vandekerkhove@pythonpredictions.com +date: 23/09/2019 +purpose: library for model evaluation class + +""" +#%% +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +import sklearn.metrics as mt +from typing import Tuple +#%% + + +class Evaluator(): + ''' + Class to evaluate models + + Parameters + ----------- + y_true : array, shape = [1, n_features] + array with true values + + y_pred_p : array, shape = [1, n_features] + array with predicted values (probabilities) + + lift_at : int , default=0.05 + calculate lift at given level (0-1) + + save_pth : str, default=None + path to where save the plot + + binary_cutoff : float, default=0.5 + cutoff to convert predictions to binary + + ''' + + def __init__(self, y_true: np.ndarray, y_pred_p: np.ndarray, + lift_at: float=0.05, save_pth: str=None, binary_cutoff: int=0.5): + + self.y_true = y_true.flatten() + self.y_pred_p = y_pred_p.flatten() #As probability + self.lift_at = lift_at + self.save_pth = save_pth + self.binary_cutoff = binary_cutoff + + self.y_pred_b = np.where(self.y_pred_p > self.binary_cutoff,1,0) + + + + + '''============================================================= + ----------------------------- PLOTS ---------------------------- + =============================================================''' + def plotROCCurve(self, desc: str=None): + ''' + Plot ROC curve and print best cutoff value + Transform probabilities predictions to bool based on best AUC based cutoff + + Parameters + ---------- + desc : str, default=None + description of the plot, used also as a name of saved plot + + ''' + if desc is None: + desc = '' + + fpr,tpr,thresholds = mt.roc_curve(self.y_true,self.y_pred_p) + + #--------------------------- + #Calculate AUC + #-------------------------- + score = mt.roc_auc_score(self.y_true, self.y_pred_p) + + fig, 
ax = plt.subplots(figsize=(8,5)) + ax.plot(fpr,tpr, color='darkorange', lw=2, label='ROC curve (area = {s:.3})'.format(s=score)) + ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') + ax.set_xlabel('False Positive Rate', fontsize=15) + ax.set_ylabel('True Positive Rate', fontsize=15) + ax.legend(loc="lower right") + ax.set_title('ROC Curve {}' .format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + '''============================================================= + ---------------------------- METRICS --------------------------- + =============================================================''' + + def printPerformance(self): + ''' + Print out performance measures + + EV.printPerformance() + %timeit 2min 19s ± 784 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + ''' + + if self.threshold != np.nan : + out_perfo = self._evaluation() + + print('=== Test on', self.test_on, '===') + print('Precision: {s:.3}'.format(s=out_perfo['precision'])) #If we mark customer as a churner, how often we are correct + print('Accuracy: {s:.3}'.format(s=out_perfo['accuracy'])) #Overall performance + print('Recall: {s:.3}'.format(s=out_perfo['recall'])) #How many churners can the model detect + print('F1 Score: {s:.3}'.format(s=out_perfo['F1'])) # 2 * (precision * recall) / (precision + recall) + print('Lift at top {l}%: {s:.3}'.format(l=self.lift_at*100, s=out_perfo['lift'])) # 2 * (precision * recall) / (precision + recall) + print('AUC: {s:.3}'.format(s=out_perfo['AUC'])) # 2 * (precision * recall) / (precision + recall) + + else : + raise ValueError('Please call .plotROCCurve() method first to get the best threshold for probabilities, and try again') + + def plotLift(self, desc : str=None, save_pth : str=None): + ''' + Method plots lift per decile + + Parameters + ---------- + save: whether plot should be saved (if yes, then now shown) + desc: description of the plot, used also as a 
name of saved plot + ''' + #--------------------- + #-- CALCULATE LIFT --- + #--------------------- +# inc_rate = self.y_true.mean() + lifts = [Evaluator.liftCalculator(y_true=self.y_true, y_pred=self.y_pred_p, lift_at=perc_lift) + for perc_lift in np.arange(0.05,1.05,0.05)] + + #--------------------- + #------- PLOT -------- + #--------------------- + if desc is None: + desc = '' + + fig, ax = plt.subplots(figsize=(8,5)) + plt.style.use('seaborn-darkgrid') + + nrows = len(lifts) + x_labels = [nrows/2-x/2 for x in np.arange(0,nrows,1)] + + #plt.bar(x_labels[::-1], df['lift'].values.tolist(), align='center', color="cornflowerblue") + plt.bar(x_labels[::-1], lifts, align='center', color="green", width=0.2) + plt.ylabel('lift', fontsize=15) + plt.xlabel('decile', fontsize=15) + ax.set_xticks(x_labels) + ax.set_xticklabels(x_labels) + + plt.axhline(y=1, color='darkorange', linestyle='--', + xmin=0.05, xmax=0.9, linewidth=3, label='Baseline') + + #Legend + ax.legend(loc='upper right') + + ##Set Axis - make them pretty + sns.despine(ax=ax, right=True, left=True) + + #Remove white lines from the second axis + ax.grid(False) + + ##Description + ax.set_title('Cumulative Lift {}'.format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + + + '''------------------------------------------------------------------- + -------------------------------- UTILS ------------------------------- + -------------------------------------------------------------------''' + def estimateCutoff(self) -> float: + ''' + Estimates optimal cutoff based on maximization of AUC curve + https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python + + Parameters + ---------- + None + + Returns + ------- + best_cutoff : float + optimal cutoff as a float <0;1> + + ''' + fpr,tpr,thresholds = mt.roc_curve(self.y_true,self.y_pred_p) + i = np.arange(len(tpr)) + roc = pd.DataFrame({'tf' : 
pd.Series(tpr-(1-fpr), index=i), + 'threshold' : pd.Series(thresholds, index=i)}) + roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]] + + best_cutoff = list(roc_t['threshold']) + + return best_cutoff[0] + + + def _testA(self, test : np.ndarray, pred : np.ndarray, train_M : np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + ''' + Limits the evaluation to potential A offers + (that a customer has not purchase in the train timeframe) + + Parameters + ---------- + test: true values -> array + pred: predictions as probabilities -> array + train_M : train matrix of interactions -> ndarray + + Output + ------ + testA: vector of interaction on potential A offers -> array + predA: vector of predictions on potential A offers -> array + ''' + + train = train_M.flatten() + testA = np.where(train>0, np.nan, test) + predA = np.where(train>0, np.nan, pred) + testA = testA[testA>=0] + predA = predA[predA>=0] + + return testA, predA + + def _evaluation(self): + ''' + Convenient function, returns various performance measures in a dict + + Parameters + ---------- + y_true: true values + y_pred: predictions as booleans + + Output + ------ + Returns dictionary with the measures + ''' + + dict_perfo = {'precision': mt.precision_score(self.y_true, self.y_pred_b), + 'accuracy': mt.accuracy_score(self.y_true, self.y_pred_b), + 'recall': mt.recall_score(self.y_true, self.y_pred_b), + 'F1': mt.f1_score(self.y_true, self.y_pred_b, average=None)[1], + 'lift': np.round(Evaluator.liftCalculator(y_true=self.y_true, + y_pred=self.y_pred_p, + lift_at=self.lift_at),2), + 'AUC': mt.roc_auc_score(self.y_true, self.y_pred_p) + } + return dict_perfo + + @staticmethod + def liftCalculator(y_true : np.ndarray, y_pred : np.ndarray, lift_at : float=0.05, **kwargs) -> float: + ''' + Calculates lift given two arrays on specified level + + Parameters + ---------- + y_true: numpy array with true values + y_pred: numpy array with predictions (probabilities) + lift_at: lift at what top percentage + + Output + ------ 
+ Scalar value, lift. + + 50.3 µs ± 1.94 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) + ''' + #Make sure it is numpy array + y_true_ = np.array(y_true) + y_pred_ = np.array(y_pred) + + #Make sure it has correct shape + y_true_ = y_true_.reshape(len(y_true_),1) + y_pred_ = y_pred_.reshape(len(y_pred_),1) + + #Merge data together + y_data = np.hstack([y_true_, y_pred_]) + + #Calculate necessary variables + nrows = len(y_data) + stop = int(np.floor(nrows*lift_at)) + avg_incidence = np.einsum('ij->j',y_true_)/float(len(y_true_)) + + #Sort and filter data + data_sorted = y_data[y_data[:,1].argsort()[::-1]][:stop,0].reshape(stop, 1) + + #Calculate lift (einsum is very fast way of summing, needs specific shape) + inc_in_top_n = np.einsum('ij->j',data_sorted)/float(len(data_sorted)) + + lift = np.round(inc_in_top_n/avg_incidence,2)[0] + + return lift + + '''------------------------------------------------------------------- + ------------------------JUST IN CASE ------------------------------- + -------------------------------------------------------------------''' + + def plotConfusionMatrix(self, labels : list=None, color : str='Reds', + save_pth : str=None, desc : str=None): + ''' + Plot Confusion matrix + + Parameters + ---------- + y_test: True values of target y + pred: Predicted values of target y, boolean + labels: labels for the matrix, if empty, values from y_test_ are used + color: Color of the matrix, its a cmap, so many values possible + save: whether plot should be saved (if yes, then now shown) + desc: description of the plot, used also as a name of saved plot + ''' + if labels is None: + labels = [str(lab) for lab in np.unique(self.y_true)] + + if desc is None: + desc = '' + + cm = mt.confusion_matrix(self.y_true, self.y_pred_b) + + fig, ax = plt.subplots(figsize=(8,5)) + ax = sns.heatmap(cm, annot=cm.astype(str), fmt="s", cmap=color, xticklabels=labels, yticklabels=labels) + ax.set_title('Confusion matrix {}'.format(desc), fontsize=20) + + if 
save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + def plotCumulativeGains(self, save_pth : str=None, desc : str=None): + ''' + Functions plot cumulative gains + + Parameters + ---------- + save: whether plot should be saved (if yes, then now shown) + desc: description of the plot, used also as a name of saved plot + ''' + if desc is None: + desc = '' + + #--------------------------- + #Calculate cumulative gains + #-------------------------- + nrows = len(self.y_true) + npositives = self.y_true.sum() + df_y_pred = pd.DataFrame({"y":self.y_true, "y_pred":self.y_pred_p}).sort_values(by='y_pred', ascending=False).reset_index(drop=True) + cgains = [0] + for stop in (np.linspace(0.01,1,100)*nrows).astype(int): + cgains.append(round(df_y_pred.loc[:stop,'y'].sum()/npositives*max(100,1),2)) + + #--------------------------- + #Plot it + #--------------------------- + plt.style.use('seaborn-darkgrid') + fig, ax_cgains = plt.subplots(figsize=(8,5)) + ax_cgains.plot(cgains, color='blue', linewidth=3, label='cumulative gains') + ax_cgains.plot(ax_cgains.get_xlim(), ax_cgains.get_ylim(), linewidth=3, ls="--", color="darkorange", label='random selection') + ax_cgains.set_title('Cumulative Gains ' + desc, fontsize=20) + + ax_cgains.set_title('Cumulative Gains {}' .format(desc), fontsize=20) + #Format axes + ax_cgains.set_xlim([0,100]) + ax_cgains.set_ylim([0,100]) + #Format ticks + ax_cgains.set_yticklabels(['{:3.0f}%'.format(x) for x in ax_cgains.get_yticks()]) + ax_cgains.set_xticklabels(['{:3.0f}%'.format(x) for x in ax_cgains.get_xticks()]) + #Legend + ax_cgains.legend(loc='lower right') + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + def plotCumulativeResponse(self, desc : str=None, save_pth : str=None): + #--------------------- + #-- CALCULATE LIFT --- + #--------------------- + inc_rate = self.y_true.mean() + lifts = 
[Evaluator.liftCalculator(y_true=self.y_true, y_pred=self.y_pred_p, lift_at=perc_lift) + for perc_lift in np.arange(0.1,1.1,0.1)] + lifts = np.array(lifts)*inc_rate*100 + #--------------------- + #------- PLOT -------- + #--------------------- + if desc is None: + desc = '' + + fig, ax = plt.subplots(figsize=(8,5)) + #plt.style.use('seaborn-darkgrid') + plt.style.use('default') + + nrows = len(lifts) + x_labels = [nrows-x for x in np.arange(0,nrows,1)] + + #plt.bar(x_labels[::-1], df['lift'].values.tolist(), align='center', color="cornflowerblue") + plt.bar(x_labels[::-1], lifts, align='center', color="#00ccff") + plt.ylabel('response (%)', fontsize=16) + plt.xlabel('decile', fontsize=16) + ax.set_xticks(x_labels) + ax.set_xticklabels(x_labels) + + plt.axhline(y=inc_rate*100, color='#ff9500', linestyle='--', + xmin=0.05, xmax=0.95, linewidth=3, label='Incidence') + + #Legend + ax.legend(loc='upper right') + + ##Set Axis - make them pretty + sns.despine(ax=ax, right=True, left=True) + + #Remove white lines from the second axis + ax.grid(False) + + ##Description + ax.set_title('Cumulative response {}'.format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + +def plotIncidence(df, variable, dim=(12,8)): + ''' + Method plots Incidence plot on train partition + Returns plot + ---------------------------------------------------- + df: dataframe with cleaned, binned, partitioned and prepared data + variable: variable for which the incidence plot will be shown` + dim: tuple with width and lentgh of the plot + ---------------------------------------------------- + ''' + def masterOfOrder(x): + ''' + Function converts interval or string (category) to a number, so the incidence plot can be orderd. + In case of interval -> '(151, 361]' to integer 151. 
+ In case of string -> order is alphabetical + Missings and Non-significants are always put at the end + + Parameters + ---------- + x: value to be converted + + Output + ------ + Order of given value + ''' + x_split = x.split(',')[0] + replace_strings = (('...', '0'),('Missing','999999999999'), ('Non-significants','999999999999')) + for repl_str in replace_strings: + x_split = x_split.replace(repl_str[0], repl_str[1]) + x_split = x_split.strip("()[]") + + try: + order = float(x_split) + except: + LETTERS = {letter: index for index, letter in enumerate(ascii_lowercase, start=1)} + order = LETTERS[x[0].lower()] + + return order + + plt.style.use('seaborn-darkgrid') + + #---------------------------------- + #------ Prepare the data -------- + #---------------------------------- + #Set up the variable and dataframe + var_prefix = 'B_' + variable + df_plt = df[['TARGET', var_prefix]][df['PARTITION'] == 'train'].copy() + + #Aggregate the data + avg_inc_rate = df_plt['TARGET'].mean() + + aggregations = { + 'bin_inc_rate': 'mean', + 'bin_size': 'count' + } + df_plt = df_plt.groupby(var_prefix, as_index=False)['TARGET'].agg(aggregations) + df_plt['avg_inc_rate'] = avg_inc_rate + + #create a sort column and sort by it + df_plt['sort_by'] = df_plt[var_prefix].apply(lambda x: masterOfOrder(x)) + df_plt.sort_values(by='sort_by', ascending=True, inplace=True) + df_plt.reset_index(inplace=True) + + #---------------------------------- + #----- Plot the incidence ------- + #---------------------------------- + fig, ax = plt.subplots(figsize=dim) + ##First Axis + #Bin size + y_pos = np.arange(len(df_plt[var_prefix])) + plt.bar(y_pos, df_plt['bin_size'].values.tolist(), align='center', color="cornflowerblue") + plt.xticks(y_pos, df_plt[var_prefix]) + plt.ylabel('Bin Size') + plt.xlabel(variable + ' Bins') + + max_inc = max(df_plt['bin_inc_rate']) + + ##Second Axis + ax2 = ax.twinx() + #incidence rate per bin + plt.plot(df_plt['bin_inc_rate'], color="darkorange", marker=".", 
markersize=20, linewidth=3, label='incidence rate per bin') + plt.plot(df_plt['avg_inc_rate'], color="dimgrey", linewidth=4, label='average incidence rate') + ax2.plot(np.nan, "cornflowerblue", linewidth=6, label = 'bin size') #dummy line to have label on second axis from first + ax2.set_yticks(np.arange(0, max_inc+0.05, 0.05)) + ax2.set_yticklabels(['{:3.1f}%'.format(x*100) for x in ax2.get_yticks()]) + plt.ylabel('Incidence') + + ##Set Axis + sns.despine(ax=ax, right=True, left=True) + sns.despine(ax=ax2, left=True, right=False) + ax2.spines['right'].set_color('white') + + #remove white line from second grid axes + #the white lines are reguler, Spyder sometimes fails to visualize it (try to export the pic!) + ax2.grid(False) + + ##Description + fig.suptitle('Incidence Plot - ' + variable, fontsize=20, y=1.02) + ax2.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,ncol=1, mode="expand", borderaxespad=0.) + plt.show() diff --git a/dev/preprocessor/develop.py b/dev/preprocessor/develop.py new file mode 100644 index 0000000..b1cb9bf --- /dev/null +++ b/dev/preprocessor/develop.py @@ -0,0 +1,279 @@ +#%% +import pandas as pd +import numpy as np +from random import shuffle +from scipy import stats +from typing import Dict, Tuple + +import logging +log = logging.getLogger(__name__) + +ROOT = "C:/Local/pers/Documents/GitHub/Cobra/" +df_data = pd.read_csv(ROOT + "datasets/titanic_data.csv") +df_data.rename(columns={'Survived': 'TARGET'}, inplace=True) + +split = ['TRAIN']*int(df_data.shape[0]*0.5) + \ + ['TEST']*int(df_data.shape[0]*0.2)+ \ + ['VALIDATION']*int(np.ceil(df_data.shape[0]*0.3)) + +shuffle(split) + +df_data['PARTITION'] = split + +df_x = pd.DataFrame(df_data[['Parch', 'Embarked']][df_data['PARTITION'] == "TRAIN"]) +df_y = df_data['TARGET'][df_data['PARTITION'] == "TRAIN"] + + +#%% +column = 'Embarked' +scale_cont = True +incidence_mean = df_y.mean() +pval_thresh = 0.001 +keep_categories = [] +keep = 'Missing' +category_map = {} +replace_with = 
'non-significant' + +for category in grps: + #category = 'S' + df_aux = pd.concat([df_x[column], df_y], axis=1) + df_aux['obs_other'] = np.where(df_aux[column] == category, 0, 1) + + cont_table = pd.crosstab(df_aux['obs_other'], df_aux['TARGET'], margins=False) + + if scale_cont: + size_other_cat = cont_table.iloc[1].sum() + cont_table.iloc[1, 0] = (1-incidence_mean)*size_other_cat + cont_table.iloc[1, 1] = incidence_mean*size_other_cat + cont_table = cont_table.values.astype(np.int64) + + pval = stats.chi2_contingency(cont_table, correction=False)[1] + #0.17914169501249405 + + if pval<=pval_thresh: + keep_categories.append(category) + +if keep not in keep_categories and keep in df_x[column].unique().tolist(): + keep_categories.append(keep) + +for category in df_x[column].unique().tolist(): + if category in keep_categories: + category_map[category] = category + else: + category_map[category] = replace_with + + + + + +#%% + +class CategoryRegrouper(): + + """ + TOOD + -test the keep_categories and give warning if category not in column + -transform will be just df.replace() with a dict + -I am keeping missings, but not in the original code, inspect + -ask geert about the _removeCategories function + + -write the rest + -combine with categorical_processor.py + -add to init + -test if same as the old code + -unit tests + + Regroups categories in categorical variables if based on signicicance + with target variable + + Attributes + ---------- + scale_cont : bool, default=True + whether contingency table should be scaled before chi-2 + pval_thresh : float, default=0.001 + significance threshold for regroupping + regroup_rename : str, default='non-significant' + new name of non-significant regroupped variables + missing_rename : str, default='Missing' + new name of missing categories + keep_missing : bool, default=True + whether missing category should be kept in the result + forced_categories : Dict, default=None + dictionary to force categories - + for each colum dict 
of {col:[forced vars]} + """ + + def __init__(self, scale_cont: bool=True, + pval_thresh: float=0.001, + regroup_rename: str="non-significant", + missing_rename: str="Missing", + keep_missing: bool=True, + forced_categories: Dict=None): + self.scale_cont = scale_cont + self.pval_thresh = pval_thresh + self.regroup_rename = regroup_rename + self.missing_rename = missing_rename + self.keep_missing = keep_missing + self.forced_categories = forced_categories + + def fit(self): + pass + + def _fit_column(self, X: pd.DataFrame, + y: pd.Series, + column: str) -> Dict: + + category_map = {} + keep_categories = [] + self.incidence_mean = y.mean() + all_uq_categories = X[column].unique().tolist() + + # Rename target + y.rename("TARGET", inplace=True) + + # Replace missings + X = self._replaceMissings(X=X, column=column, + replace_with=self.missing_rename) + + # Remove small categories + categories = self._removeCategories(X=X, y=y, column=column) + + # Inspect remaining categories and test significance + for category in categories: + df_aux = pd.concat([X[column], y], axis=1) + df_aux['other_cats'] = np.where(df_aux[column] == category, 0, 1) + cont_table = pd.crosstab(index=df_aux['other_cats'], + columns=df_aux['TARGET'], + margins=False) + + # if true, we scale the "other" categories + if self.scale_cont: + size_other_cats = cont_table.iloc[1].sum() + cont_table.iloc[1, 0] = (1-self.incidence_mean)*size_other_cats + cont_table.iloc[1, 1] = self.incidence_mean*size_other_cats + cont_table = cont_table.values.astype(np.int64) + + pval = stats.chi2_contingency(cont_table, correction=False)[1] + + # If significant, keep it + if pval <= self.pval_thresh: + keep_categories.append(category) + + # Keep "Missing" even if it wasn't selected if + # it is in the original categories and set to True + if ((self.missing_rename not in keep_categories) and + (self.missing_rename in all_uq_categories) and self.keep_missing): + keep_categories.append(self.missing_rename) + + # Keep forced 
categories + if self.forced_categories is not None: + # If doesnt exists, give warning + forced = [col for col in self.forced_categories[column] + if col in all_uq_categories] + + # Extend list and remove duplicates + keep_categories = list(set(keep_categories.extend(forced))) + + difference = set(forced) - set(self.forced_categories[column]) + if len(difference) > 0: + log.warning("Following forced categories: {} " + "are not in column: {}.".format(difference, + column)) + + # Return dictionary as {old column : new column} + for category in all_uq_categories: + if category in keep_categories: + category_map[category] = category + else: + category_map[category] = self.regroup_rename + + return category_map + + def transform(self): + pass + + def _transform_column(self): + pass + + def fit_transform(self): + pass + + def _replaceMissings(self, X: pd.DataFrame, + column: str, + replace_with: str='Missing') -> pd.DataFrame: + """ + Method replaces missing and empty cells with `Missing` (default) in + a pd.DataFrame + + df_tst = _replaceMissings(X=df_x, column='Embarked') + + Parameters + ---------- + X : pd.DataFrame + Dataframe where a value will be replaced if empty or nan + column : str + Column to be analyzed for missings + replace_with : str default='Missing' + string to replace the missings + + Raises + ------ + ValueError + in case input column is not a string + + Returns + ------- + pd.DataFrame + modified dataframe with replaced missings + """ + if not X[column].dtype == 'object': + raise TypeError("columns must be a string") + + X[column].fillna(replace_with, inplace=True) + X[column] = X[column].str.strip() + X[column].replace('', replace_with, inplace=True) + + return X + + def _removeCategories(self, X: pd.DataFrame, + y: pd.Series, + column: str, + threshold: int=5) -> np.ndarray: + """ + Method removes category which fail to meet certain condition + + grps = _removeGroups(X=df_x, y=df_y, column='Embarked') + + Parameters + ---------- + X : pd.DataFrame 
+ Dataframe with columns to be inspected for group removal + y : pd.Series + Series with target + column : str + Column to be analyzed group removal + threshold : int default=5 + Threshold for group removal + + Returns + ------- + np.ndarray + numpy array with groups to be kept + """ + category_cnts = pd.DataFrame(X.groupby(column)[column].count()) + train_inc = y.mean() + factor = max(train_inc, 1-train_inc) + keep_categories = category_cnts.where((category_cnts*factor) > + threshold) + + return np.array(keep_categories.index.tolist()) + + + +#%% +CR = CategoryRegrouper() + +output = CR._fit_column(X=df_x, y=df_y, column='Embarked') +output + +#%% diff --git a/dev/preprocessor/new_regroup.py b/dev/preprocessor/new_regroup.py new file mode 100644 index 0000000..6a467c2 --- /dev/null +++ b/dev/preprocessor/new_regroup.py @@ -0,0 +1,189 @@ +#%% +import pandas as pd +import numpy as np +from random import shuffle +from scipy import stats + +ROOT = "C:/Local/pers/Documents/GitHub/Cobra/" +df_data = pd.read_csv(ROOT + "datasets/titanic_data.csv") +df_data.rename(columns={'Survived': 'TARGET'}, inplace=True) + +split = ['TRAIN']*int(df_data.shape[0]*0.5) + \ + ['TEST']*int(df_data.shape[0]*0.2)+ \ + ['VALIDATION']*int(np.ceil(df_data.shape[0]*0.3)) + +shuffle(split) + +df_data['PARTITION'] = split + +#%% +''' ORIGINAL CODE ''' +def __regroup(var,target,train,pval_thresh=0.01,dummy=True,keep='Missing',rename='Other'): + ''' + Method regroups categorical variables + Returns DF mask + ---------------------------------------------------- + var: input pd.Serie with cat column + target: pd.Serie with target variable + train: pd.Serie with parition variable + pval_thresh: threshold for regrouping + dummy: scale of booleans (?) + keep: keep specific groups (?) 
+ rename: rename the insignificant category + ---------------------------------------------------- + - Each group is tested with a chi² for relevant incidence differences in comparison to a rest-group + - The rest group has the size of the remaining groups and an 'overall average incidence' (if dummy=True) or + - remaining groups average incidence' (if dummy=False) + - Groups with a pvalue above the threshold are relabled to a single group + ''' + + # Define the chi² test condition + # Groups that do not meet the condition are not analyzed and will be unconditionally relabled + def _chi2cond_(var=var,target=target,train=train): + varcounts = var[train].groupby(by=var).count() + train_inc = target[train].sum()/len(target[train]) + factor = max(train_inc, 1-train_inc) + analyze_mask = (varcounts*factor)>5 + analyze_groups = analyze_mask.index[analyze_mask].values + return analyze_groups + + # Compute overal incidence mean + incidence_mean = target[train].mean() + # Create container of which groups will be kept, compared to the groups which will be relabled + keepgroups = [] + # Cycle and test each group that meets the chi² condition + for group in _chi2cond_(): + # Container for target 0/1 observations of the group under scrutiny + obs_group = [] + # Counts of the target 0/1 occurences for the group under scrutiny + obs_group.append(((target[train]==0)&(var[train]==group)).sum()) + obs_group.append(((target[train]==1)&(var[train]==group)).sum()) + obs_group = np.array(obs_group) + # Container for target 0/1 observations of the remaining groups together + obs_other = [] + # Counts of the target 0/1 occurences for the remaining groups together + obs_other.append(((target[train]==0)&(var[train]!=group)).sum()) + obs_other.append(((target[train]==1)&(var[train]!=group)).sum()) + obs_other = np.array(obs_other) + # If dummy=True, we scale the two groups of target 0/1 occurences such that the incidence is equal to the overall incidence + # The size of the two groups of 
target 0/1 occurences is still equal to the size of the remaining groups + if dummy: + obs_other_size = obs_other.sum() + obs_other[0]=(1-incidence_mean)*obs_other_size # 0(1) index coincides with target = 0(1) + obs_other[1]=( incidence_mean)*obs_other_size + obs = np.array([obs_group,obs_other]) + # Place at least 1 observation to avoid error in chi2 test + obs[obs==0] = 1 + # Perform chi² test + pval = stats.chi2_contingency(obs, correction=False)[1] + # If pval outperforms threshold, append the group in the keepgroups list + if pval<=pval_thresh: + keepgroups.append(group) + #elif group==keep: + # keepgroups.append(group) + # If the specific group to be kept (e.g. 'Missing') didn't pass the test, append it to the keepgroups list + if keep not in keepgroups: + keepgroups.append(keep) + # Makes a list of all groups not in the keepgroups list + regroup_mask = [val not in keepgroups for val in var.values] + var_regroup = var.copy() + # Rename those groups + var_regroup[regroup_mask] = rename + var_regroup.name = "B_"+var.name + info = (var.name+": from "+str(len(var.unique()))+" to "+str(len(var_regroup.unique()))) + return var_regroup, info + +#%% +''' RUN ORIGINAL CODE ''' +result = __regroup(var=df_data['Embarked'], #Cabin, Pclass, SibSp, Parch, Embarked + target=df_data.loc[:,'TARGET'], + train=df_data['PARTITION']=='TRAIN', + pval_thresh=0.05, + dummy=True, + keep='Missing', + rename='Non-significants') + +print(result[0].unique()) +print(result[0].head(n=5)) +print(result[1]) +df_tst = result[0] + +#%% +''' TEST CHISQR CONDITION ''' +def _chi2cond_(var,target,train): + #simple group by - pandas series + varcounts = var[train].groupby(by=var).count() + #train incidence - 0.3775280898876405 + train_inc = target[train].sum()/len(target[train]) + #Why? 
-0.6224719101123595 + factor = max(train_inc, 1-train_inc) + #which groups to analyze - boolean + analyze_mask = (varcounts*factor)>5 + #filter groups to be kept - array([0, 1, 2], dtype=int64) + analyze_groups = analyze_mask.index[analyze_mask].values + return analyze_groups + +chi = _chi2cond_(var=df_data['Embarked'], + target=df_data.loc[:,'TARGET'], + train=df_data['PARTITION']=='TRAIN') + +#%% +varcounts = df_data['Parch'][df_data['PARTITION']=='TRAIN'].groupby(by=df_data['Parch']).count() +train_inc = df_data.loc[:,'TARGET'][df_data['PARTITION']=='TRAIN'].sum()/len(df_data.loc[:,'TARGET'][df_data['PARTITION']=='TRAIN']) +factor = max(train_inc, 1-train_inc) +analyze_mask = (varcounts*factor)>5 +analyze_groups = analyze_mask.index[analyze_mask].values + +#%% +df_data['TARGET'][df_data['PARTITION']=='TRAIN'].mean() + +#%% +''' TEST TESTING ''' +target = df_data.loc[:,'TARGET'] +train = df_data['PARTITION']=='TRAIN' +var = df_data['Embarked'] + +for group in chi: + group == 'S' + # Container for target 0/1 observations of the group under scrutiny + obs_group = [] + # Counts of the target 0/1 occurences for the group under scrutiny + obs_group.append(((target[train]==0)&(var[train]==group)).sum()) + obs_group.append(((target[train]==1)&(var[train]==group)).sum()) + obs_group = np.array(obs_group) + # Container for target 0/1 observations of the remaining groups together + obs_other = [] + # Counts of the target 0/1 occurences for the remaining groups together + obs_other.append(((target[train]==0)&(var[train]!=group)).sum()) + obs_other.append(((target[train]==1)&(var[train]!=group)).sum()) + obs_other = np.array(obs_other) + +#S +#obs_group +#array([225, 95], dtype=int64) +# +#obs_other +#array([58, 67], dtype=int64) + +#%% +pd.crosstab(df.regiment, df_data.loc[:,'TARGET'], margins=True) + +#%% +incidence_mean = target[train].mean() +dummy=True + +if dummy: + obs_other_size = obs_other.sum() #400 + obs_other[0]=(1-incidence_mean)*obs_other_size # 0(1) index 
coincides with target = 0(1) + obs_other[1]=( incidence_mean)*obs_other_size +obs = np.array([obs_group,obs_other]) +# Place at least 1 observation to avoid error in chi2 test +obs[obs==0] = 1 +# Perform chi² test +pval = stats.chi2_contingency(obs, correction=False)[1] + +#obs +#array([[ 19, 26], +# [248, 151]], dtype=int64) + +#%% From dde17de1990fbdfd45da19d9ba49f06ef7ceb9a1 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Fri, 11 Oct 2019 13:51:55 +0200 Subject: [PATCH 10/98] Add matrics module to cobra --- cobra/metrics/__init__.py | 3 + cobra/metrics/all_metrics_plots.py | 527 +++++++++++++++++++++++++++++ 2 files changed, 530 insertions(+) create mode 100644 cobra/metrics/__init__.py create mode 100644 cobra/metrics/all_metrics_plots.py diff --git a/cobra/metrics/__init__.py b/cobra/metrics/__init__.py new file mode 100644 index 0000000..67656f7 --- /dev/null +++ b/cobra/metrics/__init__.py @@ -0,0 +1,3 @@ +from .all_metrics_plots import Evaluator + +__all__ = ['Evaluator'] \ No newline at end of file diff --git a/cobra/metrics/all_metrics_plots.py b/cobra/metrics/all_metrics_plots.py new file mode 100644 index 0000000..d18bb64 --- /dev/null +++ b/cobra/metrics/all_metrics_plots.py @@ -0,0 +1,527 @@ +""" +====================================================================================== +--------------------------------------- Evaluation Class code ------------------------ +====================================================================================== +author: jan.benisek@pythonpredictins.com - benoit.vandekerkhove@pythonpredictions.com +date: 23/09/2019 +purpose: library for model evaluation class + +""" +#%% +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +import sklearn.metrics as mt +from typing import Tuple +#%% + + +class Evaluator(): + ''' + Class to evaluate models + + Parameters + ----------- + y_true : array, shape = [1, n_features] + array with true values + + y_pred_p : array, shape 
= [1, n_features] + array with predicted values (probabilities) + + lift_at : int , default=0.05 + calculate lift at given level (0-1) + + save_pth : str, default=None + path to where save the plot + + binary_cutoff : float, default=0.5 + cutoff to convert predictions to binary + + ''' + + def __init__(self, y_true: np.ndarray, y_pred_p: np.ndarray, + lift_at: float=0.05, save_pth: str=None, binary_cutoff: int=0.5): + + self.y_true = y_true.flatten() + self.y_pred_p = y_pred_p.flatten() #As probability + self.lift_at = lift_at + self.save_pth = save_pth + self.binary_cutoff = binary_cutoff + + self.y_pred_b = np.where(self.y_pred_p > self.binary_cutoff,1,0) + + + + + '''============================================================= + ----------------------------- PLOTS ---------------------------- + =============================================================''' + def plotROCCurve(self, desc: str=None): + ''' + Plot ROC curve and print best cutoff value + Transform probabilities predictions to bool based on best AUC based cutoff + + Parameters + ---------- + desc : str, default=None + description of the plot, used also as a name of saved plot + + ''' + if desc is None: + desc = '' + + fpr,tpr,thresholds = mt.roc_curve(self.y_true,self.y_pred_p) + + #--------------------------- + #Calculate AUC + #-------------------------- + score = mt.roc_auc_score(self.y_true, self.y_pred_p) + + fig, ax = plt.subplots(figsize=(8,5)) + ax.plot(fpr,tpr, color='darkorange', lw=2, label='ROC curve (area = {s:.3})'.format(s=score)) + ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') + ax.set_xlabel('False Positive Rate', fontsize=15) + ax.set_ylabel('True Positive Rate', fontsize=15) + ax.legend(loc="lower right") + ax.set_title('ROC Curve {}' .format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + '''============================================================= + 
---------------------------- METRICS --------------------------- + =============================================================''' + + def printPerformance(self): + ''' + Print out performance measures + + EV.printPerformance() + %timeit 2min 19s ± 784 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + ''' + + if self.threshold != np.nan : + out_perfo = self._evaluation() + + print('=== Test on', self.test_on, '===') + print('Precision: {s:.3}'.format(s=out_perfo['precision'])) #If we mark customer as a churner, how often we are correct + print('Accuracy: {s:.3}'.format(s=out_perfo['accuracy'])) #Overall performance + print('Recall: {s:.3}'.format(s=out_perfo['recall'])) #How many churners can the model detect + print('F1 Score: {s:.3}'.format(s=out_perfo['F1'])) # 2 * (precision * recall) / (precision + recall) + print('Lift at top {l}%: {s:.3}'.format(l=self.lift_at*100, s=out_perfo['lift'])) # 2 * (precision * recall) / (precision + recall) + print('AUC: {s:.3}'.format(s=out_perfo['AUC'])) # 2 * (precision * recall) / (precision + recall) + + else : + raise ValueError('Please call .plotROCCurve() method first to get the best threshold for probabilities, and try again') + + def plotLift(self, desc : str=None, save_pth : str=None): + ''' + Method plots lift per decile + + Parameters + ---------- + save: whether plot should be saved (if yes, then now shown) + desc: description of the plot, used also as a name of saved plot + ''' + #--------------------- + #-- CALCULATE LIFT --- + #--------------------- +# inc_rate = self.y_true.mean() + lifts = [Evaluator.liftCalculator(y_true=self.y_true, y_pred=self.y_pred_p, lift_at=perc_lift) + for perc_lift in np.arange(0.05,1.05,0.05)] + + #--------------------- + #------- PLOT -------- + #--------------------- + if desc is None: + desc = '' + + fig, ax = plt.subplots(figsize=(8,5)) + plt.style.use('seaborn-darkgrid') + + nrows = len(lifts) + x_labels = [nrows/2-x/2 for x in np.arange(0,nrows,1)] + + 
#plt.bar(x_labels[::-1], df['lift'].values.tolist(), align='center', color="cornflowerblue") + plt.bar(x_labels[::-1], lifts, align='center', color="green", width=0.2) + plt.ylabel('lift', fontsize=15) + plt.xlabel('decile', fontsize=15) + ax.set_xticks(x_labels) + ax.set_xticklabels(x_labels) + + plt.axhline(y=1, color='darkorange', linestyle='--', + xmin=0.05, xmax=0.9, linewidth=3, label='Baseline') + + #Legend + ax.legend(loc='upper right') + + ##Set Axis - make them pretty + sns.despine(ax=ax, right=True, left=True) + + #Remove white lines from the second axis + ax.grid(False) + + ##Description + ax.set_title('Cumulative Lift {}'.format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + + + '''------------------------------------------------------------------- + -------------------------------- UTILS ------------------------------- + -------------------------------------------------------------------''' + def estimateCutoff(self) -> float: + ''' + Estimates optimal cutoff based on maximization of AUC curve + https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python + + Parameters + ---------- + None + + Returns + ------- + best_cutoff : float + optimal cutoff as a float <0;1> + + ''' + fpr,tpr,thresholds = mt.roc_curve(self.y_true,self.y_pred_p) + i = np.arange(len(tpr)) + roc = pd.DataFrame({'tf' : pd.Series(tpr-(1-fpr), index=i), + 'threshold' : pd.Series(thresholds, index=i)}) + roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]] + + best_cutoff = list(roc_t['threshold']) + + return best_cutoff[0] + + + def _testA(self, test : np.ndarray, pred : np.ndarray, train_M : np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + ''' + Limits the evaluation to potential A offers + (that a customer has not purchase in the train timeframe) + + Parameters + ---------- + test: true values -> array + pred: predictions as probabilities -> array + train_M : train matrix of 
interactions -> ndarray + + Output + ------ + testA: vector of interaction on potential A offers -> array + predA: vector of predictions on potential A offers -> array + ''' + + train = train_M.flatten() + testA = np.where(train>0, np.nan, test) + predA = np.where(train>0, np.nan, pred) + testA = testA[testA>=0] + predA = predA[predA>=0] + + return testA, predA + + def _evaluation(self): + ''' + Convenient function, returns various performance measures in a dict + + Parameters + ---------- + y_true: true values + y_pred: predictions as booleans + + Output + ------ + Returns dictionary with the measures + ''' + + dict_perfo = {'precision': mt.precision_score(self.y_true, self.y_pred_b), + 'accuracy': mt.accuracy_score(self.y_true, self.y_pred_b), + 'recall': mt.recall_score(self.y_true, self.y_pred_b), + 'F1': mt.f1_score(self.y_true, self.y_pred_b, average=None)[1], + 'lift': np.round(Evaluator.liftCalculator(y_true=self.y_true, + y_pred=self.y_pred_p, + lift_at=self.lift_at),2), + 'AUC': mt.roc_auc_score(self.y_true, self.y_pred_p) + } + return dict_perfo + + @staticmethod + def liftCalculator(y_true : np.ndarray, y_pred : np.ndarray, lift_at : float=0.05, **kwargs) -> float: + ''' + Calculates lift given two arrays on specified level + + Parameters + ---------- + y_true: numpy array with true values + y_pred: numpy array with predictions (probabilities) + lift_at: lift at what top percentage + + Output + ------ + Scalar value, lift. + + 50.3 µs ± 1.94 µs per loop (mean ± std. dev. 
of 7 runs, 10000 loops each) + ''' + #Make sure it is numpy array + y_true_ = np.array(y_true) + y_pred_ = np.array(y_pred) + + #Make sure it has correct shape + y_true_ = y_true_.reshape(len(y_true_),1) + y_pred_ = y_pred_.reshape(len(y_pred_),1) + + #Merge data together + y_data = np.hstack([y_true_, y_pred_]) + + #Calculate necessary variables + nrows = len(y_data) + stop = int(np.floor(nrows*lift_at)) + avg_incidence = np.einsum('ij->j',y_true_)/float(len(y_true_)) + + #Sort and filter data + data_sorted = y_data[y_data[:,1].argsort()[::-1]][:stop,0].reshape(stop, 1) + + #Calculate lift (einsum is very fast way of summing, needs specific shape) + inc_in_top_n = np.einsum('ij->j',data_sorted)/float(len(data_sorted)) + + lift = np.round(inc_in_top_n/avg_incidence,2)[0] + + return lift + + '''------------------------------------------------------------------- + ------------------------JUST IN CASE ------------------------------- + -------------------------------------------------------------------''' + + def plotConfusionMatrix(self, labels : list=None, color : str='Reds', + save_pth : str=None, desc : str=None): + ''' + Plot Confusion matrix + + Parameters + ---------- + y_test: True values of target y + pred: Predicted values of target y, boolean + labels: labels for the matrix, if empty, values from y_test_ are used + color: Color of the matrix, its a cmap, so many values possible + save: whether plot should be saved (if yes, then now shown) + desc: description of the plot, used also as a name of saved plot + ''' + if labels is None: + labels = [str(lab) for lab in np.unique(self.y_true)] + + if desc is None: + desc = '' + + cm = mt.confusion_matrix(self.y_true, self.y_pred_b) + + fig, ax = plt.subplots(figsize=(8,5)) + ax = sns.heatmap(cm, annot=cm.astype(str), fmt="s", cmap=color, xticklabels=labels, yticklabels=labels) + ax.set_title('Confusion matrix {}'.format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, 
bbox_inches='tight') + + plt.show() + + def plotCumulativeGains(self, save_pth : str=None, desc : str=None): + ''' + Functions plot cumulative gains + + Parameters + ---------- + save: whether plot should be saved (if yes, then now shown) + desc: description of the plot, used also as a name of saved plot + ''' + if desc is None: + desc = '' + + #--------------------------- + #Calculate cumulative gains + #-------------------------- + nrows = len(self.y_true) + npositives = self.y_true.sum() + df_y_pred = pd.DataFrame({"y":self.y_true, "y_pred":self.y_pred_p}).sort_values(by='y_pred', ascending=False).reset_index(drop=True) + cgains = [0] + for stop in (np.linspace(0.01,1,100)*nrows).astype(int): + cgains.append(round(df_y_pred.loc[:stop,'y'].sum()/npositives*max(100,1),2)) + + #--------------------------- + #Plot it + #--------------------------- + plt.style.use('seaborn-darkgrid') + fig, ax_cgains = plt.subplots(figsize=(8,5)) + ax_cgains.plot(cgains, color='blue', linewidth=3, label='cumulative gains') + ax_cgains.plot(ax_cgains.get_xlim(), ax_cgains.get_ylim(), linewidth=3, ls="--", color="darkorange", label='random selection') + ax_cgains.set_title('Cumulative Gains ' + desc, fontsize=20) + + ax_cgains.set_title('Cumulative Gains {}' .format(desc), fontsize=20) + #Format axes + ax_cgains.set_xlim([0,100]) + ax_cgains.set_ylim([0,100]) + #Format ticks + ax_cgains.set_yticklabels(['{:3.0f}%'.format(x) for x in ax_cgains.get_yticks()]) + ax_cgains.set_xticklabels(['{:3.0f}%'.format(x) for x in ax_cgains.get_xticks()]) + #Legend + ax_cgains.legend(loc='lower right') + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + def plotCumulativeResponse(self, desc : str=None, save_pth : str=None): + #--------------------- + #-- CALCULATE LIFT --- + #--------------------- + inc_rate = self.y_true.mean() + lifts = [Evaluator.liftCalculator(y_true=self.y_true, y_pred=self.y_pred_p, lift_at=perc_lift) + for 
perc_lift in np.arange(0.1,1.1,0.1)] + lifts = np.array(lifts)*inc_rate*100 + #--------------------- + #------- PLOT -------- + #--------------------- + if desc is None: + desc = '' + + fig, ax = plt.subplots(figsize=(8,5)) + #plt.style.use('seaborn-darkgrid') + plt.style.use('default') + + nrows = len(lifts) + x_labels = [nrows-x for x in np.arange(0,nrows,1)] + + #plt.bar(x_labels[::-1], df['lift'].values.tolist(), align='center', color="cornflowerblue") + plt.bar(x_labels[::-1], lifts, align='center', color="#00ccff") + plt.ylabel('response (%)', fontsize=16) + plt.xlabel('decile', fontsize=16) + ax.set_xticks(x_labels) + ax.set_xticklabels(x_labels) + + plt.axhline(y=inc_rate*100, color='#ff9500', linestyle='--', + xmin=0.05, xmax=0.95, linewidth=3, label='Incidence') + + #Legend + ax.legend(loc='upper right') + + ##Set Axis - make them pretty + sns.despine(ax=ax, right=True, left=True) + + #Remove white lines from the second axis + ax.grid(False) + + ##Description + ax.set_title('Cumulative response {}'.format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + +def plotIncidence(df, variable, dim=(12,8)): + ''' + Method plots Incidence plot on train partition + Returns plot + ---------------------------------------------------- + df: dataframe with cleaned, binned, partitioned and prepared data + variable: variable for which the incidence plot will be shown` + dim: tuple with width and lentgh of the plot + ---------------------------------------------------- + ''' + def masterOfOrder(x): + ''' + Function converts interval or string (category) to a number, so the incidence plot can be orderd. + In case of interval -> '(151, 361]' to integer 151. 
+ In case of string -> order is alphabetical + Missings and Non-significants are always put at the end + + Parameters + ---------- + x: value to be converted + + Output + ------ + Order of given value + ''' + x_split = x.split(',')[0] + replace_strings = (('...', '0'),('Missing','999999999999'), ('Non-significants','999999999999')) + for repl_str in replace_strings: + x_split = x_split.replace(repl_str[0], repl_str[1]) + x_split = x_split.strip("()[]") + + try: + order = float(x_split) + except: + LETTERS = {letter: index for index, letter in enumerate(ascii_lowercase, start=1)} + order = LETTERS[x[0].lower()] + + return order + + plt.style.use('seaborn-darkgrid') + + #---------------------------------- + #------ Prepare the data -------- + #---------------------------------- + #Set up the variable and dataframe + var_prefix = 'B_' + variable + df_plt = df[['TARGET', var_prefix]][df['PARTITION'] == 'train'].copy() + + #Aggregate the data + avg_inc_rate = df_plt['TARGET'].mean() + + aggregations = { + 'bin_inc_rate': 'mean', + 'bin_size': 'count' + } + df_plt = df_plt.groupby(var_prefix, as_index=False)['TARGET'].agg(aggregations) + df_plt['avg_inc_rate'] = avg_inc_rate + + #create a sort column and sort by it + df_plt['sort_by'] = df_plt[var_prefix].apply(lambda x: masterOfOrder(x)) + df_plt.sort_values(by='sort_by', ascending=True, inplace=True) + df_plt.reset_index(inplace=True) + + #---------------------------------- + #----- Plot the incidence ------- + #---------------------------------- + fig, ax = plt.subplots(figsize=dim) + ##First Axis + #Bin size + y_pos = np.arange(len(df_plt[var_prefix])) + plt.bar(y_pos, df_plt['bin_size'].values.tolist(), align='center', color="cornflowerblue") + plt.xticks(y_pos, df_plt[var_prefix]) + plt.ylabel('Bin Size') + plt.xlabel(variable + ' Bins') + + max_inc = max(df_plt['bin_inc_rate']) + + ##Second Axis + ax2 = ax.twinx() + #incidence rate per bin + plt.plot(df_plt['bin_inc_rate'], color="darkorange", marker=".", 
markersize=20, linewidth=3, label='incidence rate per bin') + plt.plot(df_plt['avg_inc_rate'], color="dimgrey", linewidth=4, label='average incidence rate') + ax2.plot(np.nan, "cornflowerblue", linewidth=6, label = 'bin size') #dummy line to have label on second axis from first + ax2.set_yticks(np.arange(0, max_inc+0.05, 0.05)) + ax2.set_yticklabels(['{:3.1f}%'.format(x*100) for x in ax2.get_yticks()]) + plt.ylabel('Incidence') + + ##Set Axis + sns.despine(ax=ax, right=True, left=True) + sns.despine(ax=ax2, left=True, right=False) + ax2.spines['right'].set_color('white') + + #remove white line from second grid axes + #the white lines are reguler, Spyder sometimes fails to visualize it (try to export the pic!) + ax2.grid(False) + + ##Description + fig.suptitle('Incidence Plot - ' + variable, fontsize=20, y=1.02) + ax2.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,ncol=1, mode="expand", borderaxespad=0.) + plt.show() From 03b5b4d3b957b0f3dd1b3f7699d9f29f497f8296 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Fri, 11 Oct 2019 13:53:14 +0200 Subject: [PATCH 11/98] Remove metrics module which was put in the wrong branch --- cobra/metrics/__init__.py | 3 - cobra/metrics/all_metrics_plots.py | 527 ----------------------------- 2 files changed, 530 deletions(-) delete mode 100644 cobra/metrics/__init__.py delete mode 100644 cobra/metrics/all_metrics_plots.py diff --git a/cobra/metrics/__init__.py b/cobra/metrics/__init__.py deleted file mode 100644 index 67656f7..0000000 --- a/cobra/metrics/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .all_metrics_plots import Evaluator - -__all__ = ['Evaluator'] \ No newline at end of file diff --git a/cobra/metrics/all_metrics_plots.py b/cobra/metrics/all_metrics_plots.py deleted file mode 100644 index d18bb64..0000000 --- a/cobra/metrics/all_metrics_plots.py +++ /dev/null @@ -1,527 +0,0 @@ -""" -====================================================================================== ---------------------------------------- 
Evaluation Class code ------------------------ -====================================================================================== -author: jan.benisek@pythonpredictins.com - benoit.vandekerkhove@pythonpredictions.com -date: 23/09/2019 -purpose: library for model evaluation class - -""" -#%% -import numpy as np -import pandas as pd -import seaborn as sns -import matplotlib.pyplot as plt -import sklearn.metrics as mt -from typing import Tuple -#%% - - -class Evaluator(): - ''' - Class to evaluate models - - Parameters - ----------- - y_true : array, shape = [1, n_features] - array with true values - - y_pred_p : array, shape = [1, n_features] - array with predicted values (probabilities) - - lift_at : int , default=0.05 - calculate lift at given level (0-1) - - save_pth : str, default=None - path to where save the plot - - binary_cutoff : float, default=0.5 - cutoff to convert predictions to binary - - ''' - - def __init__(self, y_true: np.ndarray, y_pred_p: np.ndarray, - lift_at: float=0.05, save_pth: str=None, binary_cutoff: int=0.5): - - self.y_true = y_true.flatten() - self.y_pred_p = y_pred_p.flatten() #As probability - self.lift_at = lift_at - self.save_pth = save_pth - self.binary_cutoff = binary_cutoff - - self.y_pred_b = np.where(self.y_pred_p > self.binary_cutoff,1,0) - - - - - '''============================================================= - ----------------------------- PLOTS ---------------------------- - =============================================================''' - def plotROCCurve(self, desc: str=None): - ''' - Plot ROC curve and print best cutoff value - Transform probabilities predictions to bool based on best AUC based cutoff - - Parameters - ---------- - desc : str, default=None - description of the plot, used also as a name of saved plot - - ''' - if desc is None: - desc = '' - - fpr,tpr,thresholds = mt.roc_curve(self.y_true,self.y_pred_p) - - #--------------------------- - #Calculate AUC - #-------------------------- - score = 
mt.roc_auc_score(self.y_true, self.y_pred_p) - - fig, ax = plt.subplots(figsize=(8,5)) - ax.plot(fpr,tpr, color='darkorange', lw=2, label='ROC curve (area = {s:.3})'.format(s=score)) - ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') - ax.set_xlabel('False Positive Rate', fontsize=15) - ax.set_ylabel('True Positive Rate', fontsize=15) - ax.legend(loc="lower right") - ax.set_title('ROC Curve {}' .format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - '''============================================================= - ---------------------------- METRICS --------------------------- - =============================================================''' - - def printPerformance(self): - ''' - Print out performance measures - - EV.printPerformance() - %timeit 2min 19s ± 784 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - ''' - - if self.threshold != np.nan : - out_perfo = self._evaluation() - - print('=== Test on', self.test_on, '===') - print('Precision: {s:.3}'.format(s=out_perfo['precision'])) #If we mark customer as a churner, how often we are correct - print('Accuracy: {s:.3}'.format(s=out_perfo['accuracy'])) #Overall performance - print('Recall: {s:.3}'.format(s=out_perfo['recall'])) #How many churners can the model detect - print('F1 Score: {s:.3}'.format(s=out_perfo['F1'])) # 2 * (precision * recall) / (precision + recall) - print('Lift at top {l}%: {s:.3}'.format(l=self.lift_at*100, s=out_perfo['lift'])) # 2 * (precision * recall) / (precision + recall) - print('AUC: {s:.3}'.format(s=out_perfo['AUC'])) # 2 * (precision * recall) / (precision + recall) - - else : - raise ValueError('Please call .plotROCCurve() method first to get the best threshold for probabilities, and try again') - - def plotLift(self, desc : str=None, save_pth : str=None): - ''' - Method plots lift per decile - - Parameters - ---------- - save: whether plot should be saved (if yes, then now 
shown) - desc: description of the plot, used also as a name of saved plot - ''' - #--------------------- - #-- CALCULATE LIFT --- - #--------------------- -# inc_rate = self.y_true.mean() - lifts = [Evaluator.liftCalculator(y_true=self.y_true, y_pred=self.y_pred_p, lift_at=perc_lift) - for perc_lift in np.arange(0.05,1.05,0.05)] - - #--------------------- - #------- PLOT -------- - #--------------------- - if desc is None: - desc = '' - - fig, ax = plt.subplots(figsize=(8,5)) - plt.style.use('seaborn-darkgrid') - - nrows = len(lifts) - x_labels = [nrows/2-x/2 for x in np.arange(0,nrows,1)] - - #plt.bar(x_labels[::-1], df['lift'].values.tolist(), align='center', color="cornflowerblue") - plt.bar(x_labels[::-1], lifts, align='center', color="green", width=0.2) - plt.ylabel('lift', fontsize=15) - plt.xlabel('decile', fontsize=15) - ax.set_xticks(x_labels) - ax.set_xticklabels(x_labels) - - plt.axhline(y=1, color='darkorange', linestyle='--', - xmin=0.05, xmax=0.9, linewidth=3, label='Baseline') - - #Legend - ax.legend(loc='upper right') - - ##Set Axis - make them pretty - sns.despine(ax=ax, right=True, left=True) - - #Remove white lines from the second axis - ax.grid(False) - - ##Description - ax.set_title('Cumulative Lift {}'.format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - - - '''------------------------------------------------------------------- - -------------------------------- UTILS ------------------------------- - -------------------------------------------------------------------''' - def estimateCutoff(self) -> float: - ''' - Estimates optimal cutoff based on maximization of AUC curve - https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python - - Parameters - ---------- - None - - Returns - ------- - best_cutoff : float - optimal cutoff as a float <0;1> - - ''' - fpr,tpr,thresholds = mt.roc_curve(self.y_true,self.y_pred_p) - i = 
np.arange(len(tpr)) - roc = pd.DataFrame({'tf' : pd.Series(tpr-(1-fpr), index=i), - 'threshold' : pd.Series(thresholds, index=i)}) - roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]] - - best_cutoff = list(roc_t['threshold']) - - return best_cutoff[0] - - - def _testA(self, test : np.ndarray, pred : np.ndarray, train_M : np.ndarray) -> Tuple[np.ndarray, np.ndarray]: - ''' - Limits the evaluation to potential A offers - (that a customer has not purchase in the train timeframe) - - Parameters - ---------- - test: true values -> array - pred: predictions as probabilities -> array - train_M : train matrix of interactions -> ndarray - - Output - ------ - testA: vector of interaction on potential A offers -> array - predA: vector of predictions on potential A offers -> array - ''' - - train = train_M.flatten() - testA = np.where(train>0, np.nan, test) - predA = np.where(train>0, np.nan, pred) - testA = testA[testA>=0] - predA = predA[predA>=0] - - return testA, predA - - def _evaluation(self): - ''' - Convenient function, returns various performance measures in a dict - - Parameters - ---------- - y_true: true values - y_pred: predictions as booleans - - Output - ------ - Returns dictionary with the measures - ''' - - dict_perfo = {'precision': mt.precision_score(self.y_true, self.y_pred_b), - 'accuracy': mt.accuracy_score(self.y_true, self.y_pred_b), - 'recall': mt.recall_score(self.y_true, self.y_pred_b), - 'F1': mt.f1_score(self.y_true, self.y_pred_b, average=None)[1], - 'lift': np.round(Evaluator.liftCalculator(y_true=self.y_true, - y_pred=self.y_pred_p, - lift_at=self.lift_at),2), - 'AUC': mt.roc_auc_score(self.y_true, self.y_pred_p) - } - return dict_perfo - - @staticmethod - def liftCalculator(y_true : np.ndarray, y_pred : np.ndarray, lift_at : float=0.05, **kwargs) -> float: - ''' - Calculates lift given two arrays on specified level - - Parameters - ---------- - y_true: numpy array with true values - y_pred: numpy array with predictions (probabilities) - lift_at: 
lift at what top percentage - - Output - ------ - Scalar value, lift. - - 50.3 µs ± 1.94 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) - ''' - #Make sure it is numpy array - y_true_ = np.array(y_true) - y_pred_ = np.array(y_pred) - - #Make sure it has correct shape - y_true_ = y_true_.reshape(len(y_true_),1) - y_pred_ = y_pred_.reshape(len(y_pred_),1) - - #Merge data together - y_data = np.hstack([y_true_, y_pred_]) - - #Calculate necessary variables - nrows = len(y_data) - stop = int(np.floor(nrows*lift_at)) - avg_incidence = np.einsum('ij->j',y_true_)/float(len(y_true_)) - - #Sort and filter data - data_sorted = y_data[y_data[:,1].argsort()[::-1]][:stop,0].reshape(stop, 1) - - #Calculate lift (einsum is very fast way of summing, needs specific shape) - inc_in_top_n = np.einsum('ij->j',data_sorted)/float(len(data_sorted)) - - lift = np.round(inc_in_top_n/avg_incidence,2)[0] - - return lift - - '''------------------------------------------------------------------- - ------------------------JUST IN CASE ------------------------------- - -------------------------------------------------------------------''' - - def plotConfusionMatrix(self, labels : list=None, color : str='Reds', - save_pth : str=None, desc : str=None): - ''' - Plot Confusion matrix - - Parameters - ---------- - y_test: True values of target y - pred: Predicted values of target y, boolean - labels: labels for the matrix, if empty, values from y_test_ are used - color: Color of the matrix, its a cmap, so many values possible - save: whether plot should be saved (if yes, then now shown) - desc: description of the plot, used also as a name of saved plot - ''' - if labels is None: - labels = [str(lab) for lab in np.unique(self.y_true)] - - if desc is None: - desc = '' - - cm = mt.confusion_matrix(self.y_true, self.y_pred_b) - - fig, ax = plt.subplots(figsize=(8,5)) - ax = sns.heatmap(cm, annot=cm.astype(str), fmt="s", cmap=color, xticklabels=labels, yticklabels=labels) - 
ax.set_title('Confusion matrix {}'.format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - def plotCumulativeGains(self, save_pth : str=None, desc : str=None): - ''' - Functions plot cumulative gains - - Parameters - ---------- - save: whether plot should be saved (if yes, then now shown) - desc: description of the plot, used also as a name of saved plot - ''' - if desc is None: - desc = '' - - #--------------------------- - #Calculate cumulative gains - #-------------------------- - nrows = len(self.y_true) - npositives = self.y_true.sum() - df_y_pred = pd.DataFrame({"y":self.y_true, "y_pred":self.y_pred_p}).sort_values(by='y_pred', ascending=False).reset_index(drop=True) - cgains = [0] - for stop in (np.linspace(0.01,1,100)*nrows).astype(int): - cgains.append(round(df_y_pred.loc[:stop,'y'].sum()/npositives*max(100,1),2)) - - #--------------------------- - #Plot it - #--------------------------- - plt.style.use('seaborn-darkgrid') - fig, ax_cgains = plt.subplots(figsize=(8,5)) - ax_cgains.plot(cgains, color='blue', linewidth=3, label='cumulative gains') - ax_cgains.plot(ax_cgains.get_xlim(), ax_cgains.get_ylim(), linewidth=3, ls="--", color="darkorange", label='random selection') - ax_cgains.set_title('Cumulative Gains ' + desc, fontsize=20) - - ax_cgains.set_title('Cumulative Gains {}' .format(desc), fontsize=20) - #Format axes - ax_cgains.set_xlim([0,100]) - ax_cgains.set_ylim([0,100]) - #Format ticks - ax_cgains.set_yticklabels(['{:3.0f}%'.format(x) for x in ax_cgains.get_yticks()]) - ax_cgains.set_xticklabels(['{:3.0f}%'.format(x) for x in ax_cgains.get_xticks()]) - #Legend - ax_cgains.legend(loc='lower right') - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - def plotCumulativeResponse(self, desc : str=None, save_pth : str=None): - #--------------------- - #-- CALCULATE LIFT --- - #--------------------- - 
inc_rate = self.y_true.mean() - lifts = [Evaluator.liftCalculator(y_true=self.y_true, y_pred=self.y_pred_p, lift_at=perc_lift) - for perc_lift in np.arange(0.1,1.1,0.1)] - lifts = np.array(lifts)*inc_rate*100 - #--------------------- - #------- PLOT -------- - #--------------------- - if desc is None: - desc = '' - - fig, ax = plt.subplots(figsize=(8,5)) - #plt.style.use('seaborn-darkgrid') - plt.style.use('default') - - nrows = len(lifts) - x_labels = [nrows-x for x in np.arange(0,nrows,1)] - - #plt.bar(x_labels[::-1], df['lift'].values.tolist(), align='center', color="cornflowerblue") - plt.bar(x_labels[::-1], lifts, align='center', color="#00ccff") - plt.ylabel('response (%)', fontsize=16) - plt.xlabel('decile', fontsize=16) - ax.set_xticks(x_labels) - ax.set_xticklabels(x_labels) - - plt.axhline(y=inc_rate*100, color='#ff9500', linestyle='--', - xmin=0.05, xmax=0.95, linewidth=3, label='Incidence') - - #Legend - ax.legend(loc='upper right') - - ##Set Axis - make them pretty - sns.despine(ax=ax, right=True, left=True) - - #Remove white lines from the second axis - ax.grid(False) - - ##Description - ax.set_title('Cumulative response {}'.format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - -def plotIncidence(df, variable, dim=(12,8)): - ''' - Method plots Incidence plot on train partition - Returns plot - ---------------------------------------------------- - df: dataframe with cleaned, binned, partitioned and prepared data - variable: variable for which the incidence plot will be shown` - dim: tuple with width and lentgh of the plot - ---------------------------------------------------- - ''' - def masterOfOrder(x): - ''' - Function converts interval or string (category) to a number, so the incidence plot can be orderd. - In case of interval -> '(151, 361]' to integer 151. 
- In case of string -> order is alphabetical - Missings and Non-significants are always put at the end - - Parameters - ---------- - x: value to be converted - - Output - ------ - Order of given value - ''' - x_split = x.split(',')[0] - replace_strings = (('...', '0'),('Missing','999999999999'), ('Non-significants','999999999999')) - for repl_str in replace_strings: - x_split = x_split.replace(repl_str[0], repl_str[1]) - x_split = x_split.strip("()[]") - - try: - order = float(x_split) - except: - LETTERS = {letter: index for index, letter in enumerate(ascii_lowercase, start=1)} - order = LETTERS[x[0].lower()] - - return order - - plt.style.use('seaborn-darkgrid') - - #---------------------------------- - #------ Prepare the data -------- - #---------------------------------- - #Set up the variable and dataframe - var_prefix = 'B_' + variable - df_plt = df[['TARGET', var_prefix]][df['PARTITION'] == 'train'].copy() - - #Aggregate the data - avg_inc_rate = df_plt['TARGET'].mean() - - aggregations = { - 'bin_inc_rate': 'mean', - 'bin_size': 'count' - } - df_plt = df_plt.groupby(var_prefix, as_index=False)['TARGET'].agg(aggregations) - df_plt['avg_inc_rate'] = avg_inc_rate - - #create a sort column and sort by it - df_plt['sort_by'] = df_plt[var_prefix].apply(lambda x: masterOfOrder(x)) - df_plt.sort_values(by='sort_by', ascending=True, inplace=True) - df_plt.reset_index(inplace=True) - - #---------------------------------- - #----- Plot the incidence ------- - #---------------------------------- - fig, ax = plt.subplots(figsize=dim) - ##First Axis - #Bin size - y_pos = np.arange(len(df_plt[var_prefix])) - plt.bar(y_pos, df_plt['bin_size'].values.tolist(), align='center', color="cornflowerblue") - plt.xticks(y_pos, df_plt[var_prefix]) - plt.ylabel('Bin Size') - plt.xlabel(variable + ' Bins') - - max_inc = max(df_plt['bin_inc_rate']) - - ##Second Axis - ax2 = ax.twinx() - #incidence rate per bin - plt.plot(df_plt['bin_inc_rate'], color="darkorange", marker=".", 
markersize=20, linewidth=3, label='incidence rate per bin') - plt.plot(df_plt['avg_inc_rate'], color="dimgrey", linewidth=4, label='average incidence rate') - ax2.plot(np.nan, "cornflowerblue", linewidth=6, label = 'bin size') #dummy line to have label on second axis from first - ax2.set_yticks(np.arange(0, max_inc+0.05, 0.05)) - ax2.set_yticklabels(['{:3.1f}%'.format(x*100) for x in ax2.get_yticks()]) - plt.ylabel('Incidence') - - ##Set Axis - sns.despine(ax=ax, right=True, left=True) - sns.despine(ax=ax2, left=True, right=False) - ax2.spines['right'].set_color('white') - - #remove white line from second grid axes - #the white lines are reguler, Spyder sometimes fails to visualize it (try to export the pic!) - ax2.grid(False) - - ##Description - fig.suptitle('Incidence Plot - ' + variable, fontsize=20, y=1.02) - ax2.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,ncol=1, mode="expand", borderaxespad=0.) - plt.show() From 67703d3d7cd6f310b1ca3acc83732217123e1efc Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 14 Oct 2019 08:46:10 +0200 Subject: [PATCH 12/98] Add model building module --- cobra/model_building/univariate_selection.py | 101 +++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 cobra/model_building/univariate_selection.py diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py new file mode 100644 index 0000000..725792e --- /dev/null +++ b/cobra/model_building/univariate_selection.py @@ -0,0 +1,101 @@ +"""Summary +""" +import pandas as pd +from sklearn.metrics import roc_auc_score + + +def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, + target_enc_selection_data: pd.DataFrame, + predictors: list, + target_column: str, + preselect_auc_threshold: float, + preselect_overtrain_threshold: float + ) -> pd.DataFrame: + """Summary + + Args: + target_enc_train_data (pd.DataFrame): Train data + target_enc_selection_data (pd.DataFrame): Selection data + 
predictors (list): list of predictors (e.g. column names in the train + and selection data sets) + target_column (str): name of the target column + preselect_auc_threshold (float): Description + preselect_overtrain_threshold (float): Description + + Returns: + pd.DataFrame: DataFrame containing for each variable the train auc and + test auc allong with a boolean indicating whether or not it is selected + based on the criteria + """ + result = [] + + for predictor in predictors: + + cleaned_predictor = _clean_predictor_name(predictor) + + auc_train = roc_auc_score(target_enc_train_data[predictor], + target_enc_train_data[target_column]) + + auc_selection = roc_auc_score( + target_enc_selection_data[predictor], + target_enc_selection_data[target_column] + ) + + result.append({"predictor": cleaned_predictor, + "AUC train": auc_train, + "AUC selection": auc_selection}) + + df_auc = pd.DataFrame(result) + + # Filter based on min AUC + auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold + + # Identify those variables for which the AUC difference between train + # and selection is within a user-defined ratio + auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"]) * 100 + < preselect_overtrain_threshold) + + df_auc["preselection"] = auc_thresh & auc_overtrain + + return df_auc + + +def compute_correlations(target_enc_train_data: pd.DataFrame, + predictors: list) -> pd.DataFrame: + """Given a DataFrame and a list of predictors, compute the correlations + amongst the predictors in the DataFrame + + Args: + target_enc_train_data (pd.DataFrame): data to compute correlation + matrix from + predictors (list): List of column names of the DataFrame between which + to compute correlations + + Returns: + pd.DataFrame: The correlation matrix of the training set + """ + + correlations = target_enc_train_data[predictors].corr() + + predictors_cleaned = [_clean_predictor_name(predictor) + for predictor in predictors] + + # Change index and columns with the 
cleaned version of the predictors + # e.g. change "var1_enc" with "var1" + correlations.columns = predictors_cleaned + correlations.index = predictors_cleaned + + return correlations + + +def _clean_predictor_name(predictor: str) -> str: + """Strip-off redundant suffix (e.g. "_enc" or "_bin") from the predictor + name to return a clean version of the predictor + + Args: + predictor (str): Description + + Returns: + str: Description + """ + return predictor.replace("_enc", "").replace("_bin", "") From 1e8bdaa35c1fc4cb5a07d43d2f9f833eed1d74dd Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 14 Oct 2019 15:08:53 +0200 Subject: [PATCH 13/98] Bug fix in KbinsDiscretizer _fit_column The bug occured when auto_adapt_bins was set to True, but it fixed right now. Tests are updated so that this function is also covered. In fact, all private member functions related to fitting are covered at the moment. --- cobra/preprocessing/kbins_discretizer.py | 4 +-- tests/preprocessing/test_kbins_discretizer.py | 36 +++++++++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 7767319..5e8eb56 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -17,7 +17,7 @@ import pandas as pd from sklearn.exceptions import NotFittedError -from sklearn.cluster import KMeans +#from sklearn.cluster import KMeans class KBinsDiscretizer: @@ -154,7 +154,7 @@ def _fit_column(self, data: pd.DataFrame, if self.auto_adapt_bins: size = len(data.index) missing_pct = data[column_name].isnull().sum()/size - n_bins = int(max((1 - missing_pct) * n_bins), 2) + n_bins = int(max(round((1 - missing_pct) * n_bins), 2)) bin_edges = self._compute_bin_edges(data, column_name, n_bins, col_min, col_max) diff --git a/tests/preprocessing/test_kbins_discretizer.py b/tests/preprocessing/test_kbins_discretizer.py index 19d92ae..2512186 100644 --- 
a/tests/preprocessing/test_kbins_discretizer.py +++ b/tests/preprocessing/test_kbins_discretizer.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd import pytest @@ -117,6 +118,41 @@ def test_kbins_discretizer_compute_bins_from_edges_round_up(self): expected = [(0, 1), (1, 2), (2, 3)] assert actual == expected + # Tests for _fit_column + def test_kbins_discretizer_fit_column_regular(self): + + data = pd.DataFrame({"variable": list(range(0, 11))}) # ints from 0-10 + + discretizer = KBinsDiscretizer(n_bins=4) + actual = discretizer._fit_column(data, column_name="variable") + + expected = [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), (8.0, 10.0)] + + assert expected == actual + + def test_kbins_discretizer_fit_column_auto_adapt_bins(self): + + data = pd.DataFrame({"variable": list(range(0, 11)) + + ([np.nan] * 17)}) # ints from 0-10 with 17 nan's + + discretizer = KBinsDiscretizer(auto_adapt_bins=True) + actual = discretizer._fit_column(data, column_name="variable") + + expected = [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), (8.0, 10.0)] + + assert expected == actual + + def test_kbins_discretizer_fit_column_two_bin_edges(self): + + data = pd.DataFrame({"variable": [0] + ([1] * 100)}) # almost constant + + discretizer = KBinsDiscretizer() + actual = discretizer._fit_column(data, column_name="variable") + + expected = None + + assert expected == actual + # Tests for _create_bin_labels def test_kbins_discretizer_create_bin_labels(self): From e596c12154aba6e10b806cc306a106bc01a9a712 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 14 Oct 2019 15:28:55 +0200 Subject: [PATCH 14/98] Add more docstrings to preprocessing module methods --- cobra/preprocessing/kbins_discretizer.py | 9 +++++++-- cobra/preprocessing/target_encoder.py | 18 +++++------------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 5e8eb56..391c842 100644 --- a/cobra/preprocessing/kbins_discretizer.py 
+++ b/cobra/preprocessing/kbins_discretizer.py @@ -22,7 +22,11 @@ class KBinsDiscretizer: - """Bin continuous data into intervals of predefined size + """Bin continuous data into intervals of predefined size. This provides a + way to partition continuous data into discrete values, i.e. tranform + continuous data into nominal data. This can make a linear model more + expressive as it introduces nonlinearity to the model, while maintaining + the interpretability of the model afterwards. Attributes ---------- @@ -173,7 +177,8 @@ def _fit_column(self, data: pd.DataFrame, def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: - """Summary + """Discretizes the data in the given list of columns by mapping each + number to the appropriate bin computed by the fit method Parameters ---------- diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 7ce5fbb..be9da12 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -1,3 +1,6 @@ +""" + Incidence Replacement Module +""" import logging log = logging.getLogger(__name__) @@ -36,19 +39,7 @@ class TargetEncoder(BaseEstimator, TransformerMixin): """ def __init__(self, columns: list=None, weight: float=0.0): - """Constructor - Parameters - ---------- - columns : list, optional - A list of columns to encode, if None, all string columns will be - encoded. - weight : float, optional - Smoothing parameters (non-negative). The higher the value of the - parameter, the bigger the contribution of the overall mean. When - set to zero, there is no smoothing - (e.g. the pure target incidence is used) - """ if weight < 0: raise ValueError("The value of weight cannot be smaller than zero") @@ -128,7 +119,8 @@ def _fit_column(self, X: pd.Series, y: pd.Series, return numerator/denominator def transform(self, X: pd.DataFrame, y: pd.Series=None) -> pd.DataFrame: - """Summary + """Replace (e.g. 
encode) categories of each column with its average + incidence which was computed when the fit method was called Parameters ---------- From c2603af4132bf5b509b5a34179e49d3782bd9f20 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 21 Oct 2019 15:36:21 +0200 Subject: [PATCH 15/98] Bug fix in label formatting of KBinsDiscretizer When the change_endpoint format was set to True, the label formatting did not depend on the "closed" attribute so we had to fix that. --- cobra/preprocessing/kbins_discretizer.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 391c842..8ae7e3d 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -70,7 +70,7 @@ def __init__(self, n_bins: int=10, strategy: str="quantile", self.n_bins = n_bins self.strategy = strategy.lower() - self.closed = closed + self.closed = closed.lower() self.auto_adapt_bins = auto_adapt_bins self.starting_precision = starting_precision self.label_format = label_format @@ -419,6 +419,14 @@ def _create_index(intervals: List[tuple], pd.IntervalIndex Description """ + + # check if closed is of the proper form + if closed not in ["left", "right"]: + raise ValueError("{}: valid options for 'closed' are {}. " + "Got strategy={!r} instead." + .format(KBinsDiscretizer.__name__, + ["left", "right"], closed)) + # deepcopy variable because we do not want to modify the content # of intervals (which is still used outside of this function) _intervals = deepcopy(intervals) @@ -450,7 +458,11 @@ def _create_bin_labels(self, bins: List[tuple]) -> list: # Format first and last bin as < x and > y resp. 
if self.change_endpoint_format: - bin_labels[0] = "< {}".format(bins[0][1]) - bin_labels[-1] = "> {}".format(bins[-1][0]) + if self.closed == "left": + bin_labels[0] = "< {}".format(bins[0][1]) + bin_labels[-1] = ">= {}".format(bins[-1][0]) + else: + bin_labels[0] = "<= {}".format(bins[0][1]) + bin_labels[-1] = "> {}".format(bins[-1][0]) return bin_labels From fcb6ef39805d6298c7a5dd867bebdaf66f88d628 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 21 Oct 2019 15:37:56 +0200 Subject: [PATCH 16/98] Refactor test organization of kbins_discretizer Cleaned tests by making them parametrizable. --- tests/preprocessing/test_kbins_discretizer.py | 252 ++++++++---------- 1 file changed, 112 insertions(+), 140 deletions(-) diff --git a/tests/preprocessing/test_kbins_discretizer.py b/tests/preprocessing/test_kbins_discretizer.py index 2512186..a2ecdc2 100644 --- a/tests/preprocessing/test_kbins_discretizer.py +++ b/tests/preprocessing/test_kbins_discretizer.py @@ -1,175 +1,147 @@ +from contextlib import contextmanager +import pytest + import numpy as np import pandas as pd -import pytest from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer +@contextmanager +def does_not_raise(): + yield + + class TestKBinsDiscretizer: ################# Test for public methods ################# ################# Test for private methods ################# - # Tests for _validate_n_bins function - def test_kbins_discretizer_validate_n_bins_exception_1(self): - - with pytest.raises(ValueError): - KBinsDiscretizer()._validate_n_bins(n_bins=1) - - def test_kbins_discretizer_validate_n_bins_exception_no_integral(self): - - with pytest.raises(ValueError): - KBinsDiscretizer()._validate_n_bins(n_bins=10.5) + @pytest.mark.parametrize("n_bins, expectation", + [(1, pytest.raises(ValueError)), + (10.5, pytest.raises(ValueError)), + (2, does_not_raise())], + ids=["invalid_int", "float", "normal"]) + def test_validate_n_bins_exception(self, n_bins, expectation): + with expectation: + 
assert KBinsDiscretizer()._validate_n_bins(n_bins=n_bins) is None + + def test_transform_column(self): + + data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) + discretizer = KBinsDiscretizer(n_bins=3, strategy="unform") + + bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)] + + actual = discretizer._transform_column(data, "variable", bins) + + categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"] + + expected = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) + expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4 + + ["3.0 - 6.0"]*3 + + ["6.0 - 9.0"]*3 + + ["Missing"], + categories=categories, + ordered=True) + + # assert using pandas testing module + pd.testing.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("n_bins, auto_adapt_bins, data, expected", + [(4, False, + pd.DataFrame({"variable": list(range(0, 11))}), + [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), + (8.0, 10.0)]), + (10, True, + # ints from 0-10 with 17 nan's + pd.DataFrame({"variable": list(range(0, 11)) + + ([np.nan] * 17)}), + [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), + (8.0, 10.0)]), + (10, False, + # almost constant + pd.DataFrame({"variable": [0] + ([1] * 100)}), + None)], + ids=["regular", "auto_adapt_bins", + "two bin edges"]) + def test_fit_column(self, n_bins, auto_adapt_bins, data, expected): + discretizer = KBinsDiscretizer(n_bins=n_bins, + auto_adapt_bins=auto_adapt_bins) - def test_kbins_discretizer_validate_n_bins_valid_n_bins(self): - - KBinsDiscretizer()._validate_n_bins(n_bins=2) - - # Test for _compute_bin_edges - def test_kbins_discretizer_compute_bin_edges_quantile_method(self): - - data = pd.DataFrame({"variable": list(range(0, 11))}) # ints from 0-10 - - discretizer = KBinsDiscretizer() - actual = discretizer._compute_bin_edges(data, column_name="variable", - n_bins=4, - col_min=data.variable.min(), - col_max=data.variable.max()) - expected = [0.0, 2.5, 5, 7.5, 10.0] - - assert expected == actual + actual = discretizer._fit_column(data, 
column_name="variable") - def test_kbins_discretizer_compute_bin_edges_uniform_method(self): + assert actual == expected - data = pd.DataFrame({"variable": list(range(0, 10))}) # ints from 0-9 + @pytest.mark.parametrize("strategy, n_bins, data, expected", + [("quantile", # strategy + 4, # n_bins + # data (ints from 0 - 10): + pd.DataFrame({"variable": list(range(0, 11))}), + [0.0, 2.5, 5, 7.5, 10.0]), # expected result + ("uniform", # strategy + 3, # n_bins + # data (ints from 0 - 9): + pd.DataFrame({"variable": list(range(0, 10))}), + [0.0, 3.0, 6.0, 9.0])], # expected result + ids=["quantile", "uniform"]) + def test_compute_bin_edges(self, strategy, n_bins, data, expected): + + discretizer = KBinsDiscretizer(strategy=strategy) - discretizer = KBinsDiscretizer(strategy="uniform") actual = discretizer._compute_bin_edges(data, column_name="variable", - n_bins=3, + n_bins=n_bins, col_min=data.variable.min(), col_max=data.variable.max()) - expected = [0.0, 3.0, 6.0, 9.0] - - assert expected == actual - - # Tests for _compute_minimal_precision_of_bin_edges - def test_compute_minimal_precision_of_bin_edges_less_precision(self): - # If starting precision is bigger than actual precision, should return - # starting precision - - bin_edges = [-10, 0, 1, 2] - discretizer = KBinsDiscretizer(starting_precision=1) - res = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) - assert res == 1 - - def test_compute_minimal_precision_of_bin_edges_more_precision(self): - # If starting precision is smaller than actual precision, should return - # actual precision - - bin_edges = [-10, 0, 1, 1.01] - discretizer = KBinsDiscretizer() - res = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) - assert res == 2 - - def test_compute_minimal_precision_of_bin_edges_equal_precision(self): - # If starting precision is equal to actual precision, should return - # starting precision - - bin_edges = [-10, 0, 1, 1.1] - discretizer = KBinsDiscretizer(starting_precision=1) - 
res = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) - assert res == 1 - - def test_compute_minimal_precision_of_bin_edges_negative_start(self): - # Check if negative starting precision also leads to the correct result - - bin_edges = [-10, 0, 1, 2] - discretizer = KBinsDiscretizer(starting_precision=-1) - res = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) - assert res == 0 - - def test_compute_minimal_precision_of_bin_edges_round_up(self): - # Check if negative starting precision leads to rounding up - # bin edges to the nearest multiple of 10 - - bin_edges = [-10, 0, 10, 21] - discretizer = KBinsDiscretizer(starting_precision=-1) - res = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) - assert res == -1 - # Tests for _compute_bins_from_edges - def test_kbins_discretizer_compute_bins_from_edges(self): - - bin_edges = [0, 1, 1.5, 2] - - discretizer = KBinsDiscretizer() - actual = discretizer._compute_bins_from_edges(bin_edges) - - expected = [(0, 1), (1, 1.5), (1.5, 2)] assert actual == expected - def test_kbins_discretizer_compute_bins_from_edges_round_up(self): + @pytest.mark.parametrize("bin_edges, starting_precision, expected", + [([-10, 0, 1, 2], 1, 1), + ([-10, 0, 1, 1.01], 0, 2), + ([-10, 0, 1, 1.1], 1, 1), + ([-10, 0, 1, 2], -1, 0), + ([-10, 0, 10, 21], -1, -1)], + ids=["less precision", "more precision", + "equal precision", "negative start", + "round up"]) + def test_compute_minimal_precision_of_bin_edges(self, bin_edges, + starting_precision, + expected): - bin_edges = [0, 1, 1.5, 3] + discretizer = KBinsDiscretizer(starting_precision=starting_precision) - discretizer = KBinsDiscretizer() - actual = discretizer._compute_bins_from_edges(bin_edges) + actual = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) - expected = [(0, 1), (1, 2), (2, 3)] assert actual == expected - # Tests for _fit_column - def test_kbins_discretizer_fit_column_regular(self): - - data = pd.DataFrame({"variable": 
list(range(0, 11))}) # ints from 0-10 - - discretizer = KBinsDiscretizer(n_bins=4) - actual = discretizer._fit_column(data, column_name="variable") - - expected = [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), (8.0, 10.0)] - - assert expected == actual - - def test_kbins_discretizer_fit_column_auto_adapt_bins(self): - - data = pd.DataFrame({"variable": list(range(0, 11)) + - ([np.nan] * 17)}) # ints from 0-10 with 17 nan's - - discretizer = KBinsDiscretizer(auto_adapt_bins=True) - actual = discretizer._fit_column(data, column_name="variable") - - expected = [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), (8.0, 10.0)] - - assert expected == actual - - def test_kbins_discretizer_fit_column_two_bin_edges(self): - - data = pd.DataFrame({"variable": [0] + ([1] * 100)}) # almost constant + @pytest.mark.parametrize("bin_edges, expected", + [([0, 1, 1.5, 2], [(0, 1), (1, 1.5), (1.5, 2)]), + ([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)])]) + def test_compute_bins_from_edges(self, bin_edges, expected): discretizer = KBinsDiscretizer() - actual = discretizer._fit_column(data, column_name="variable") - - expected = None - - assert expected == actual - - # Tests for _create_bin_labels - def test_kbins_discretizer_create_bin_labels(self): - - bins = [(0, 1), (1, 2), (2, 3)] - - discretizer = KBinsDiscretizer() - actual = discretizer._create_bin_labels(bins) - expected = ["0 - 1", "1 - 2", "2 - 3"] + actual = discretizer._compute_bins_from_edges(bin_edges) assert actual == expected - def test_kbins_discretizer_create_bin_labels_different_endpoint_fmt(self): - - bins = [(0, 1), (1, 2), (2, 3)] + @pytest.mark.parametrize("change_endpoint_format, closed, bins, expected", + [(False, "right", [(0, 1), (1, 2), (2, 3)], + ["0 - 1", "1 - 2", "2 - 3"]), + (True, "right", [(0, 1), (1, 2), (2, 3)], + ["<= 1", "1 - 2", "> 2"]), + (True, "left", [(0, 1), (1, 2), (2, 3)], + ["< 1", "1 - 2", ">= 2"])], + ids=["standard format", "different endpoints", + "different endpoints left"]) + def test_create_bin_labels(self, 
change_endpoint_format, closed, + bins, expected): + + discretizer = KBinsDiscretizer( + closed=closed, + change_endpoint_format=change_endpoint_format + ) - discretizer = KBinsDiscretizer(change_endpoint_format=True) actual = discretizer._create_bin_labels(bins) - expected = ["< 1", "1 - 2", "> 2"] assert actual == expected From 20dc24b66b37fca56c50ea33c4df693dedb0cf26 Mon Sep 17 00:00:00 2001 From: JanBenisek Date: Fri, 22 Nov 2019 14:35:04 +0100 Subject: [PATCH 17/98] Add categorical regrouper Part of preprocessin module. The class regroups categorical variables based on incidence. --- cobra/preprocessing/__init__.py | 4 +- cobra/preprocessing/categorical_regrouper.py | 391 +++++++++++++++++++ dev/preprocessor/__init__.py | 0 dev/preprocessor/categorical_regrouper.py | 391 +++++++++++++++++++ dev/preprocessor/compare.py | 154 ++++++++ dev/preprocessor/develop.py | 255 +----------- 6 files changed, 952 insertions(+), 243 deletions(-) create mode 100644 cobra/preprocessing/categorical_regrouper.py create mode 100644 dev/preprocessor/__init__.py create mode 100644 dev/preprocessor/categorical_regrouper.py create mode 100644 dev/preprocessor/compare.py diff --git a/cobra/preprocessing/__init__.py b/cobra/preprocessing/__init__.py index f3884e0..98c3ef9 100644 --- a/cobra/preprocessing/__init__.py +++ b/cobra/preprocessing/__init__.py @@ -1,5 +1,7 @@ from .kbins_discretizer import KBinsDiscretizer from .target_encoder import TargetEncoder +from .categorical_regrouper import CategoryRegrouper __all__ = ['KBinsDiscretizer', - 'TargetEncoder'] \ No newline at end of file + 'TargetEncoder', + 'CategoryRegrouper'] \ No newline at end of file diff --git a/cobra/preprocessing/categorical_regrouper.py b/cobra/preprocessing/categorical_regrouper.py new file mode 100644 index 0000000..a6276d4 --- /dev/null +++ b/cobra/preprocessing/categorical_regrouper.py @@ -0,0 +1,391 @@ + +import pandas as pd +import numpy as np +from scipy import stats +from typing import Dict +from 
sklearn.base import BaseEstimator, TransformerMixin +from sklearn.exceptions import NotFittedError +import logging + +log = logging.getLogger(__name__) + + +class CategoryRegrouper(BaseEstimator, TransformerMixin): + """ + Regroups categories in categorical variables based on significance + with target variable. + + Parameters + ---------- + scale_cont : bool, default=True + Whether contingency table should be scaled before chi^2.' + + pval_thresh : float, default=0.001 + Significance threshold for regroupping. + + regroup_rename : str, default='non-significant' + New name of non-significant regroupped variables. + + missing_rename : str, default='Missing' + New name of missing categories. + + keep_missing : bool, default=Falsse + Whether missing category should be kept in the result. + + forced_categories : Dict, default=None + Dictionary to force categories - + for each colum dict of {col:[forced vars]}. + + Attributes + ---------- + all_category_map_ : Dict + Dictionary with mapping for each variable. + """ + def __init__(self, scale_cont: bool = True, + pval_thresh: float = 0.001, + regroup_rename: str = "non-significant", + missing_rename: str = "Missing", + keep_missing: bool = False, + forced_categories: Dict = None): + self.scale_cont = scale_cont + self.pval_thresh = pval_thresh + self.regroup_rename = regroup_rename + self.missing_rename = missing_rename + self.keep_missing = keep_missing + self.forced_categories = forced_categories + + def fit(self, X: pd.DataFrame, + y: pd.Series, + columns: list = []): + """ + Method regroups categories whole DataFrame. + + Parameters + ---------- + X : pd.DataFrame + Dataframe with all the columns. + + y: pd.Series + Series with target variable. + + columns : list, default=[] + Columns to be regrouped. + + Raises + ------ + ValueError + In case X and y are not of the same length. + + Returns + ------- + None + Only fits the instance of the class. 
+ """ + self.all_category_map_ = {} + + if len(X.index) != len(y.index): + raise ValueError("The length of X is {}, but the length of y is {}" + .format(len(X.index), len(y.index))) + + if not columns: + columns = CategoryRegrouper._get_categorical_columns(X) + log.warning("All object-type columns have been selected") + + for column in columns: + if column not in X.columns: + log.warning("DataFrame has no column '{}', so it will be " + "skipped in fitting" .format(column)) + continue + + self.all_category_map_[column] = self._fit_column(X=X, + y=y, + column=column) + + def _fit_column(self, X: pd.DataFrame, + y: pd.Series, + column: str) -> Dict: + """ + Method regroups categories in given column. + + Parameters + ---------- + X : pd.Series + Series with one column to be transformed. + + y: pd.Series + Series with target variable + + column : str + Column to be regrouped. + + Raises + ------ + ValueError + in case input column is not a string. + + Returns + ------- + Dict + Returns dictionary as {old category : new category} for + specific column. 
+ """ + category_map = {} + keep_categories = [] + incidence_mean = y.mean() + + # Rename target + y.rename("TARGET", inplace=True) + + # Replace missings + X = self._replaceMissings(X=X, column=column, + replace_with=self.missing_rename) + + all_uq_categories = X[column].unique().tolist() + + # Remove small categories + categories = self._removeCategories(X=X, y=y, column=column) + + # Inspect remaining categories and test significance + for category in categories: + df_aux = pd.concat([X[column], y], axis=1) + df_aux['other_cats'] = np.where(df_aux[column] == category, 0, 1) + cont_table = pd.crosstab(index=df_aux['other_cats'], + columns=df_aux['TARGET'], + margins=False) + + # if true, we scale the "other" categories + if self.scale_cont: + size_other_cats = cont_table.iloc[1].sum() + cont_table.iloc[1, 0] = (1-incidence_mean)*size_other_cats + cont_table.iloc[1, 1] = incidence_mean*size_other_cats + cont_table = cont_table.values.astype(np.int64) + + pval = stats.chi2_contingency(cont_table, correction=False)[1] + + # If significant, keep it + if pval <= self.pval_thresh: + keep_categories.append(category) + + # Keep "Missing" even if it wasn't selected if + # it is in the original categories and set to True + if ((self.missing_rename not in keep_categories) and + (self.missing_rename in all_uq_categories) and self.keep_missing): + keep_categories.append(self.missing_rename) + + # Keep forced categories + if self.forced_categories is not None: + # If doesnt exists, give warning + forced = [col for col in self.forced_categories[column] + if col in all_uq_categories] + + # Extend list and remove duplicates + keep_categories = list(set(keep_categories.extend(forced))) + + difference = set(forced) - set(self.forced_categories[column]) + if len(difference) > 0: + log.warning("Following forced categories: {} " + "are not in column: {}.".format(difference, + column)) + + # Return dictionary as {old column : new column} + for category in all_uq_categories: + if 
category in keep_categories: + category_map[category] = category + else: + category_map[category] = self.regroup_rename + + return category_map + + def transform(self, X: pd.DataFrame, + columns: list = []) -> pd.DataFrame: + """ + Method transforms specified columns. + + Parameters + ---------- + X : pd.DataFrame + Dataframe with all the columns. + + columns : list, default=[] + Columns to be regrouped. + + Raises + ------ + NotFittedError + If fit() method has not been called. + + ValueError + If columns to be transformed have not been fitted. + + Returns + ------- + pd.DataFrame + Returns transformed DataFrame with new columns as "col_regrouped". + """ + if len(self.all_category_map_) == 0: + msg = ("This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + fitted_columns = list(self.all_category_map_.keys()) + + # if specified columns not in fitted Dict, raise error + if not set(columns).issubset(set(fitted_columns)): + diff_cols = set(columns).difference(set(fitted_columns)) + raise ValueError("Following columns are not fitted: " + "{}".format(diff_cols)) + + X_tr = X.copy() + for column in columns: + X_tr[column + "_regrouped"] = self._transform_column(X=X, + column=column) + + return X_tr + + def _transform_column(self, X: pd.DataFrame, + column: str) -> pd.Series: + """ + Method transforms specified columns. + + Parameters + ---------- + X : pd.DataFrame + Dataframe with all the columns. + + column : str + Column to be regrouped. + + Returns + ------- + pd.Series + Returns DataFrame with regrouped variable as category datatype. 
+ """ + X_tr = X[column].copy() + X_tr[column + "_regrouped"] = X_tr.replace( + to_replace=self.all_category_map_[column]) + + X_tr[column + "_regrouped"] = X_tr[column + + "_regrouped"].astype('category') + + return X_tr[column + "_regrouped"] + + def fit_transform(self, X: pd.DataFrame, + y: pd.Series, + columns: list = []) -> pd.DataFrame: + """ + Auxiliary method fits and transforms specified columns. + + Parameters + ---------- + X : pd.DataFrame + Dataframe with all the columns. + + y : pd.Series + Series with target variable + + column : list, default=[] + Columns to be regrouped. + + Returns + ------- + pd.DataFrame + Returns DataFrame with regrouped variable as category datatype. + """ + self.fit(X=X, y=y, columns=columns) + + X_tr = self.transform(X=X, columns=columns) + + return X_tr + + def _replaceMissings(self, X: pd.DataFrame, + column: str, + replace_with: str = 'Missing') -> pd.DataFrame: + """ + Method replaces missing and empty cells with `Missing` (default) in + a pd.DataFrame. + + Parameters + ---------- + X : pd.DataFrame + Dataframe where a value will be replaced if empty or nan. + + column : str + Column to be analyzed for missings. + + replace_with : str default='Missing' + String to replace the missings. + + Raises + ------ + ValueError + In case input column is not a string. + + Returns + ------- + pd.DataFrame + Modified dataframe with replaced missings. + """ + if X[column].dtype != 'O' or X[column].dtype != 'object': + raise TypeError("column {} must be a string".format(column)) + + X[column].fillna(replace_with, inplace=True) + X[column] = X[column].astype(str).str.strip() + X[column].replace('', replace_with, inplace=True) + + return X + + def _removeCategories(self, X: pd.DataFrame, + y: pd.Series, + column: str, + threshold: int = 5) -> np.ndarray: + """ + Method removes category which fail to meet certain condition + + Parameters + ---------- + X : pd.DataFrame + Dataframe with columns to be inspected for group removal. 
+ + y : pd.Series + Series with target. + + column : str + Column to be analyzed group removal. + + threshold : int default=5 + Threshold for group removal. + + Returns + ------- + np.ndarray + Numpy array with groups to be kept. + """ + category_cnts = pd.DataFrame(X.groupby(column)[column].count()) + train_inc = y.mean() + factor = max(train_inc, 1-train_inc) + keep_categories = category_cnts.where((category_cnts*factor) > + threshold) + + return np.array(keep_categories.index.tolist()) + + @staticmethod + def _get_categorical_columns(data: pd.DataFrame) -> list: + """Get the columns containing categorical data + (dtype "object" or "category") + + Parameters + ---------- + data : pd.DataFrame + Dataframe from which categorical variables + will be extracted. + + Returns + ------- + list + List of column names containing categorical data. + """ + object_columns = data.dtypes[data.dtypes == object].index + categorical_columns = data.dtypes[data.dtypes == "category"].index + + return list(set(object_columns).union(set(categorical_columns))) diff --git a/dev/preprocessor/__init__.py b/dev/preprocessor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dev/preprocessor/categorical_regrouper.py b/dev/preprocessor/categorical_regrouper.py new file mode 100644 index 0000000..a6276d4 --- /dev/null +++ b/dev/preprocessor/categorical_regrouper.py @@ -0,0 +1,391 @@ + +import pandas as pd +import numpy as np +from scipy import stats +from typing import Dict +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.exceptions import NotFittedError +import logging + +log = logging.getLogger(__name__) + + +class CategoryRegrouper(BaseEstimator, TransformerMixin): + """ + Regroups categories in categorical variables based on significance + with target variable. + + Parameters + ---------- + scale_cont : bool, default=True + Whether contingency table should be scaled before chi^2.' 
+ + pval_thresh : float, default=0.001 + Significance threshold for regroupping. + + regroup_rename : str, default='non-significant' + New name of non-significant regroupped variables. + + missing_rename : str, default='Missing' + New name of missing categories. + + keep_missing : bool, default=Falsse + Whether missing category should be kept in the result. + + forced_categories : Dict, default=None + Dictionary to force categories - + for each colum dict of {col:[forced vars]}. + + Attributes + ---------- + all_category_map_ : Dict + Dictionary with mapping for each variable. + """ + def __init__(self, scale_cont: bool = True, + pval_thresh: float = 0.001, + regroup_rename: str = "non-significant", + missing_rename: str = "Missing", + keep_missing: bool = False, + forced_categories: Dict = None): + self.scale_cont = scale_cont + self.pval_thresh = pval_thresh + self.regroup_rename = regroup_rename + self.missing_rename = missing_rename + self.keep_missing = keep_missing + self.forced_categories = forced_categories + + def fit(self, X: pd.DataFrame, + y: pd.Series, + columns: list = []): + """ + Method regroups categories whole DataFrame. + + Parameters + ---------- + X : pd.DataFrame + Dataframe with all the columns. + + y: pd.Series + Series with target variable. + + columns : list, default=[] + Columns to be regrouped. + + Raises + ------ + ValueError + In case X and y are not of the same length. + + Returns + ------- + None + Only fits the instance of the class. 
+ """ + self.all_category_map_ = {} + + if len(X.index) != len(y.index): + raise ValueError("The length of X is {}, but the length of y is {}" + .format(len(X.index), len(y.index))) + + if not columns: + columns = CategoryRegrouper._get_categorical_columns(X) + log.warning("All object-type columns have been selected") + + for column in columns: + if column not in X.columns: + log.warning("DataFrame has no column '{}', so it will be " + "skipped in fitting" .format(column)) + continue + + self.all_category_map_[column] = self._fit_column(X=X, + y=y, + column=column) + + def _fit_column(self, X: pd.DataFrame, + y: pd.Series, + column: str) -> Dict: + """ + Method regroups categories in given column. + + Parameters + ---------- + X : pd.Series + Series with one column to be transformed. + + y: pd.Series + Series with target variable + + column : str + Column to be regrouped. + + Raises + ------ + ValueError + in case input column is not a string. + + Returns + ------- + Dict + Returns dictionary as {old category : new category} for + specific column. 
+ """ + category_map = {} + keep_categories = [] + incidence_mean = y.mean() + + # Rename target + y.rename("TARGET", inplace=True) + + # Replace missings + X = self._replaceMissings(X=X, column=column, + replace_with=self.missing_rename) + + all_uq_categories = X[column].unique().tolist() + + # Remove small categories + categories = self._removeCategories(X=X, y=y, column=column) + + # Inspect remaining categories and test significance + for category in categories: + df_aux = pd.concat([X[column], y], axis=1) + df_aux['other_cats'] = np.where(df_aux[column] == category, 0, 1) + cont_table = pd.crosstab(index=df_aux['other_cats'], + columns=df_aux['TARGET'], + margins=False) + + # if true, we scale the "other" categories + if self.scale_cont: + size_other_cats = cont_table.iloc[1].sum() + cont_table.iloc[1, 0] = (1-incidence_mean)*size_other_cats + cont_table.iloc[1, 1] = incidence_mean*size_other_cats + cont_table = cont_table.values.astype(np.int64) + + pval = stats.chi2_contingency(cont_table, correction=False)[1] + + # If significant, keep it + if pval <= self.pval_thresh: + keep_categories.append(category) + + # Keep "Missing" even if it wasn't selected if + # it is in the original categories and set to True + if ((self.missing_rename not in keep_categories) and + (self.missing_rename in all_uq_categories) and self.keep_missing): + keep_categories.append(self.missing_rename) + + # Keep forced categories + if self.forced_categories is not None: + # If doesnt exists, give warning + forced = [col for col in self.forced_categories[column] + if col in all_uq_categories] + + # Extend list and remove duplicates + keep_categories = list(set(keep_categories.extend(forced))) + + difference = set(forced) - set(self.forced_categories[column]) + if len(difference) > 0: + log.warning("Following forced categories: {} " + "are not in column: {}.".format(difference, + column)) + + # Return dictionary as {old column : new column} + for category in all_uq_categories: + if 
category in keep_categories: + category_map[category] = category + else: + category_map[category] = self.regroup_rename + + return category_map + + def transform(self, X: pd.DataFrame, + columns: list = []) -> pd.DataFrame: + """ + Method transforms specified columns. + + Parameters + ---------- + X : pd.DataFrame + Dataframe with all the columns. + + columns : list, default=[] + Columns to be regrouped. + + Raises + ------ + NotFittedError + If fit() method has not been called. + + ValueError + If columns to be transformed have not been fitted. + + Returns + ------- + pd.DataFrame + Returns transformed DataFrame with new columns as "col_regrouped". + """ + if len(self.all_category_map_) == 0: + msg = ("This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + fitted_columns = list(self.all_category_map_.keys()) + + # if specified columns not in fitted Dict, raise error + if not set(columns).issubset(set(fitted_columns)): + diff_cols = set(columns).difference(set(fitted_columns)) + raise ValueError("Following columns are not fitted: " + "{}".format(diff_cols)) + + X_tr = X.copy() + for column in columns: + X_tr[column + "_regrouped"] = self._transform_column(X=X, + column=column) + + return X_tr + + def _transform_column(self, X: pd.DataFrame, + column: str) -> pd.Series: + """ + Method transforms specified columns. + + Parameters + ---------- + X : pd.DataFrame + Dataframe with all the columns. + + column : str + Column to be regrouped. + + Returns + ------- + pd.Series + Returns DataFrame with regrouped variable as category datatype. 
+ """ + X_tr = X[column].copy() + X_tr[column + "_regrouped"] = X_tr.replace( + to_replace=self.all_category_map_[column]) + + X_tr[column + "_regrouped"] = X_tr[column + + "_regrouped"].astype('category') + + return X_tr[column + "_regrouped"] + + def fit_transform(self, X: pd.DataFrame, + y: pd.Series, + columns: list = []) -> pd.DataFrame: + """ + Auxiliary method fits and transforms specified columns. + + Parameters + ---------- + X : pd.DataFrame + Dataframe with all the columns. + + y : pd.Series + Series with target variable + + column : list, default=[] + Columns to be regrouped. + + Returns + ------- + pd.DataFrame + Returns DataFrame with regrouped variable as category datatype. + """ + self.fit(X=X, y=y, columns=columns) + + X_tr = self.transform(X=X, columns=columns) + + return X_tr + + def _replaceMissings(self, X: pd.DataFrame, + column: str, + replace_with: str = 'Missing') -> pd.DataFrame: + """ + Method replaces missing and empty cells with `Missing` (default) in + a pd.DataFrame. + + Parameters + ---------- + X : pd.DataFrame + Dataframe where a value will be replaced if empty or nan. + + column : str + Column to be analyzed for missings. + + replace_with : str default='Missing' + String to replace the missings. + + Raises + ------ + ValueError + In case input column is not a string. + + Returns + ------- + pd.DataFrame + Modified dataframe with replaced missings. + """ + if X[column].dtype != 'O' or X[column].dtype != 'object': + raise TypeError("column {} must be a string".format(column)) + + X[column].fillna(replace_with, inplace=True) + X[column] = X[column].astype(str).str.strip() + X[column].replace('', replace_with, inplace=True) + + return X + + def _removeCategories(self, X: pd.DataFrame, + y: pd.Series, + column: str, + threshold: int = 5) -> np.ndarray: + """ + Method removes category which fail to meet certain condition + + Parameters + ---------- + X : pd.DataFrame + Dataframe with columns to be inspected for group removal. 
+ + y : pd.Series + Series with target. + + column : str + Column to be analyzed group removal. + + threshold : int default=5 + Threshold for group removal. + + Returns + ------- + np.ndarray + Numpy array with groups to be kept. + """ + category_cnts = pd.DataFrame(X.groupby(column)[column].count()) + train_inc = y.mean() + factor = max(train_inc, 1-train_inc) + keep_categories = category_cnts.where((category_cnts*factor) > + threshold) + + return np.array(keep_categories.index.tolist()) + + @staticmethod + def _get_categorical_columns(data: pd.DataFrame) -> list: + """Get the columns containing categorical data + (dtype "object" or "category") + + Parameters + ---------- + data : pd.DataFrame + Dataframe from which categorical variables + will be extracted. + + Returns + ------- + list + List of column names containing categorical data. + """ + object_columns = data.dtypes[data.dtypes == object].index + categorical_columns = data.dtypes[data.dtypes == "category"].index + + return list(set(object_columns).union(set(categorical_columns))) diff --git a/dev/preprocessor/compare.py b/dev/preprocessor/compare.py new file mode 100644 index 0000000..f863bfa --- /dev/null +++ b/dev/preprocessor/compare.py @@ -0,0 +1,154 @@ +#%% +import pandas as pd +import numpy as np +from random import shuffle +from scipy import stats +from typing import Dict, Tuple +import sys + +sys.path.insert(0,"C:/Local/pers/Documents/GitHub/Cobra/dev") + +import preprocessor.categorical_regrouper as pr + +import logging +log = logging.getLogger(__name__) + +ROOT = "C:/Local/pers/Documents/GitHub/Cobra/" +df_data = pd.read_csv(ROOT + "datasets/titanic_data.csv") +df_data.rename(columns={'Survived': 'TARGET'}, inplace=True) +df_data['Pclass'] = df_data['Pclass'].astype(object) + +split = ['TRAIN']*int(df_data.shape[0]*0.7) + \ + ['TEST']*int(df_data.shape[0]*0.2)+ \ + ['VALIDATION']*int(np.ceil(df_data.shape[0]*0.1)) + +shuffle(split) + +df_data['PARTITION'] = split + +df_x = 
pd.DataFrame(df_data[['Pclass', 'Embarked']][df_data['PARTITION'] == "TRAIN"]) +df_y = df_data['TARGET'][df_data['PARTITION'] == "TRAIN"] + +#%% +""" NEW SOLUTION """ +CR = pr.CategoryRegrouper() + +CR.fit(X=df_x, y=df_y, columns=["Embarked", "Pclass"]) +print(CR.all_category_map_) +df_new = CR.transform(X=df_x, columns=["Embarked", "Pclass"]) + +#%% +""" OLD SOLUTION """ +def __regroup(var,target,train,pval_thresh=0.01,dummy=True,keep='Missing',rename='Other'): + ''' + Method regroups categorical variables + Returns DF mask + ---------------------------------------------------- + var: input pd.Serie with cat column + target: pd.Serie with target variable + train: pd.Serie with parition variable + pval_thresh: threshold for regrouping + dummy: scale of booleans (?) + keep: keep specific groups (?) + rename: rename the insignificant category + ---------------------------------------------------- + - Each group is tested with a chi² for relevant incidence differences in comparison to a rest-group + - The rest group has the size of the remaining groups and an 'overall average incidence' (if dummy=True) or + - remaining groups average incidence' (if dummy=False) + - Groups with a pvalue above the threshold are relabled to a single group + ''' + + # Define the chi² test condition + # Groups that do not meet the condition are not analyzed and will be unconditionally relabled + def _chi2cond_(var=var,target=target,train=train): + varcounts = var[train].groupby(by=var).count() + train_inc = target[train].sum()/len(target[train]) + factor = max(train_inc, 1-train_inc) + analyze_mask = (varcounts*factor)>5 + analyze_groups = analyze_mask.index[analyze_mask].values + return analyze_groups + + # Compute overal incidence mean + incidence_mean = target[train].mean() + # Create container of which groups will be kept, compared to the groups which will be relabled + keepgroups = [] + # Cycle and test each group that meets the chi² condition + for group in _chi2cond_(): + # 
Container for target 0/1 observations of the group under scrutiny + obs_group = [] + # Counts of the target 0/1 occurences for the group under scrutiny + obs_group.append(((target[train]==0)&(var[train]==group)).sum()) + obs_group.append(((target[train]==1)&(var[train]==group)).sum()) + obs_group = np.array(obs_group) + # Container for target 0/1 observations of the remaining groups together + obs_other = [] + # Counts of the target 0/1 occurences for the remaining groups together + obs_other.append(((target[train]==0)&(var[train]!=group)).sum()) + obs_other.append(((target[train]==1)&(var[train]!=group)).sum()) + obs_other = np.array(obs_other) + # If dummy=True, we scale the two groups of target 0/1 occurences such that the incidence is equal to the overall incidence + # The size of the two groups of target 0/1 occurences is still equal to the size of the remaining groups + if dummy: + obs_other_size = obs_other.sum() + obs_other[0]=(1-incidence_mean)*obs_other_size # 0(1) index coincides with target = 0(1) + obs_other[1]=( incidence_mean)*obs_other_size + obs = np.array([obs_group,obs_other]) + # Place at least 1 observation to avoid error in chi2 test + obs[obs==0] = 1 + # Perform chi² test + pval = stats.chi2_contingency(obs, correction=False)[1] + # If pval outperforms threshold, append the group in the keepgroups list + if pval<=pval_thresh: + keepgroups.append(group) + #elif group==keep: + # keepgroups.append(group) + # If the specific group to be kept (e.g. 
'Missing') didn't pass the test, append it to the keepgroups list + if keep not in keepgroups: + keepgroups.append(keep) + # Makes a list of all groups not in the keepgroups list + regroup_mask = [val not in keepgroups for val in var.values] + var_regroup = var.copy() + # Rename those groups + var_regroup[regroup_mask] = rename + var_regroup.name = "B_"+var.name + info = (var.name+": from "+str(len(var.unique()))+" to "+str(len(var_regroup.unique()))) + return var_regroup, info + +#%% +result = __regroup(var=df_data['Pclass'], #Cabin, Pclass, SibSp, Parch, Embarked + target=df_data.loc[:,'TARGET'], + train=df_data['PARTITION']=='TRAIN', + pval_thresh=0.05, + dummy=True, + keep='Missing', + rename='non-significant') + +print(result[0].unique()) +print(result[0].head(n=5)) +#print(result[1]) +df_orig = result[0].to_frame() +df_orig.columns = ["old"] +df_orig["old"] = df_orig["old"].astype('str') +df_orig["old"] = df_orig["old"].astype('category') + +df_orig['split'] = df_data['PARTITION'] +df_orig = df_orig[df_orig['split'] == 'TRAIN'] + + +#%% +""" COMPARE """ +#df_orig.loc[:,"new"] = df_new['Embarked_regrouped'].copy() +df_orig.loc[:,"new"] = df_new['Pclass_regrouped'].copy() + +print(df_orig) + +df_orig['compare'] = df_orig["new"] == df_orig["old"] + +print(df_orig[df_orig['compare'] == False]) + + + +#%% + + +#%% diff --git a/dev/preprocessor/develop.py b/dev/preprocessor/develop.py index b1cb9bf..6e8c4dc 100644 --- a/dev/preprocessor/develop.py +++ b/dev/preprocessor/develop.py @@ -4,6 +4,11 @@ from random import shuffle from scipy import stats from typing import Dict, Tuple +import sys + +sys.path.insert(0,"C:/Local/pers/Documents/GitHub/Cobra/dev/preprocessor") + +import preprocessor.categorical_regrouper as pr import logging log = logging.getLogger(__name__) @@ -11,6 +16,7 @@ ROOT = "C:/Local/pers/Documents/GitHub/Cobra/" df_data = pd.read_csv(ROOT + "datasets/titanic_data.csv") df_data.rename(columns={'Survived': 'TARGET'}, inplace=True) +df_data['Pclass'] = 
df_data['Pclass'].astype(object) split = ['TRAIN']*int(df_data.shape[0]*0.5) + \ ['TEST']*int(df_data.shape[0]*0.2)+ \ @@ -20,260 +26,25 @@ df_data['PARTITION'] = split -df_x = pd.DataFrame(df_data[['Parch', 'Embarked']][df_data['PARTITION'] == "TRAIN"]) +df_x = pd.DataFrame(df_data[['Pclass', 'Embarked']][df_data['PARTITION'] == "TRAIN"]) df_y = df_data['TARGET'][df_data['PARTITION'] == "TRAIN"] #%% -column = 'Embarked' -scale_cont = True -incidence_mean = df_y.mean() -pval_thresh = 0.001 -keep_categories = [] -keep = 'Missing' -category_map = {} -replace_with = 'non-significant' - -for category in grps: - #category = 'S' - df_aux = pd.concat([df_x[column], df_y], axis=1) - df_aux['obs_other'] = np.where(df_aux[column] == category, 0, 1) - - cont_table = pd.crosstab(df_aux['obs_other'], df_aux['TARGET'], margins=False) - - if scale_cont: - size_other_cat = cont_table.iloc[1].sum() - cont_table.iloc[1, 0] = (1-incidence_mean)*size_other_cat - cont_table.iloc[1, 1] = incidence_mean*size_other_cat - cont_table = cont_table.values.astype(np.int64) - - pval = stats.chi2_contingency(cont_table, correction=False)[1] - #0.17914169501249405 - - if pval<=pval_thresh: - keep_categories.append(category) - -if keep not in keep_categories and keep in df_x[column].unique().tolist(): - keep_categories.append(keep) - -for category in df_x[column].unique().tolist(): - if category in keep_categories: - category_map[category] = category - else: - category_map[category] = replace_with - - - +""" NEW SOLUTION """ +CR = pr.CategoryRegrouper() +CR.fit(X=df_x, y=df_y, columns=["Embarked", "Pclass"]) +print(CR.all_category_map_) +df_X_tr = CR.transform(X=df_x, columns=["Embarked", "Pclass"]) #%% +""" OLD SOLUTION """ -class CategoryRegrouper(): - - """ - TOOD - -test the keep_categories and give warning if category not in column - -transform will be just df.replace() with a dict - -I am keeping missings, but not in the original code, inspect - -ask geert about the _removeCategories 
function - - -write the rest - -combine with categorical_processor.py - -add to init - -test if same as the old code - -unit tests - - Regroups categories in categorical variables if based on signicicance - with target variable - - Attributes - ---------- - scale_cont : bool, default=True - whether contingency table should be scaled before chi-2 - pval_thresh : float, default=0.001 - significance threshold for regroupping - regroup_rename : str, default='non-significant' - new name of non-significant regroupped variables - missing_rename : str, default='Missing' - new name of missing categories - keep_missing : bool, default=True - whether missing category should be kept in the result - forced_categories : Dict, default=None - dictionary to force categories - - for each colum dict of {col:[forced vars]} - """ - - def __init__(self, scale_cont: bool=True, - pval_thresh: float=0.001, - regroup_rename: str="non-significant", - missing_rename: str="Missing", - keep_missing: bool=True, - forced_categories: Dict=None): - self.scale_cont = scale_cont - self.pval_thresh = pval_thresh - self.regroup_rename = regroup_rename - self.missing_rename = missing_rename - self.keep_missing = keep_missing - self.forced_categories = forced_categories - - def fit(self): - pass - - def _fit_column(self, X: pd.DataFrame, - y: pd.Series, - column: str) -> Dict: - - category_map = {} - keep_categories = [] - self.incidence_mean = y.mean() - all_uq_categories = X[column].unique().tolist() - - # Rename target - y.rename("TARGET", inplace=True) - - # Replace missings - X = self._replaceMissings(X=X, column=column, - replace_with=self.missing_rename) - - # Remove small categories - categories = self._removeCategories(X=X, y=y, column=column) - - # Inspect remaining categories and test significance - for category in categories: - df_aux = pd.concat([X[column], y], axis=1) - df_aux['other_cats'] = np.where(df_aux[column] == category, 0, 1) - cont_table = pd.crosstab(index=df_aux['other_cats'], - 
columns=df_aux['TARGET'], - margins=False) - - # if true, we scale the "other" categories - if self.scale_cont: - size_other_cats = cont_table.iloc[1].sum() - cont_table.iloc[1, 0] = (1-self.incidence_mean)*size_other_cats - cont_table.iloc[1, 1] = self.incidence_mean*size_other_cats - cont_table = cont_table.values.astype(np.int64) - pval = stats.chi2_contingency(cont_table, correction=False)[1] - # If significant, keep it - if pval <= self.pval_thresh: - keep_categories.append(category) - # Keep "Missing" even if it wasn't selected if - # it is in the original categories and set to True - if ((self.missing_rename not in keep_categories) and - (self.missing_rename in all_uq_categories) and self.keep_missing): - keep_categories.append(self.missing_rename) - # Keep forced categories - if self.forced_categories is not None: - # If doesnt exists, give warning - forced = [col for col in self.forced_categories[column] - if col in all_uq_categories] - - # Extend list and remove duplicates - keep_categories = list(set(keep_categories.extend(forced))) - - difference = set(forced) - set(self.forced_categories[column]) - if len(difference) > 0: - log.warning("Following forced categories: {} " - "are not in column: {}.".format(difference, - column)) - - # Return dictionary as {old column : new column} - for category in all_uq_categories: - if category in keep_categories: - category_map[category] = category - else: - category_map[category] = self.regroup_rename - - return category_map - - def transform(self): - pass - - def _transform_column(self): - pass - - def fit_transform(self): - pass - - def _replaceMissings(self, X: pd.DataFrame, - column: str, - replace_with: str='Missing') -> pd.DataFrame: - """ - Method replaces missing and empty cells with `Missing` (default) in - a pd.DataFrame - - df_tst = _replaceMissings(X=df_x, column='Embarked') - - Parameters - ---------- - X : pd.DataFrame - Dataframe where a value will be replaced if empty or nan - column : str - Column to 
be analyzed for missings - replace_with : str default='Missing' - string to replace the missings - - Raises - ------ - ValueError - in case input column is not a string - - Returns - ------- - pd.DataFrame - modified dataframe with replaced missings - """ - if not X[column].dtype == 'object': - raise TypeError("columns must be a string") - - X[column].fillna(replace_with, inplace=True) - X[column] = X[column].str.strip() - X[column].replace('', replace_with, inplace=True) - - return X - - def _removeCategories(self, X: pd.DataFrame, - y: pd.Series, - column: str, - threshold: int=5) -> np.ndarray: - """ - Method removes category which fail to meet certain condition - - grps = _removeGroups(X=df_x, y=df_y, column='Embarked') - - Parameters - ---------- - X : pd.DataFrame - Dataframe with columns to be inspected for group removal - y : pd.Series - Series with target - column : str - Column to be analyzed group removal - threshold : int default=5 - Threshold for group removal - - Returns - ------- - np.ndarray - numpy array with groups to be kept - """ - category_cnts = pd.DataFrame(X.groupby(column)[column].count()) - train_inc = y.mean() - factor = max(train_inc, 1-train_inc) - keep_categories = category_cnts.where((category_cnts*factor) > - threshold) - - return np.array(keep_categories.index.tolist()) - - - -#%% -CR = CategoryRegrouper() -output = CR._fit_column(X=df_x, y=df_y, column='Embarked') -output #%% From 855caaf49a284bcccb1b7091b0861ccdcf1bc627 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Thu, 28 Nov 2019 14:03:16 +0100 Subject: [PATCH 18/98] Bug fix in TargetEncoder When applying incidence replacement on variables of type "category", the resulting dtype was also of type category which should of course be float instead! 
--- cobra/preprocessing/kbins_discretizer.py | 15 +++++++++++---- cobra/preprocessing/target_encoder.py | 13 +++++++++++-- tests/preprocessing/test_target_encoder.py | 3 +++ 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 8ae7e3d..57bd08c 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -1,8 +1,15 @@ """ -This class is a rework of -https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/preprocessing/_discretization.py -However, it is purely written in pandas instead of numpy because -it is more intuitive +This module is a rework of +https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/preprocessing/ +_discretization.py +However, it is purely written in pandas instead of numpy because it is more +intuitive + +Also, some custom modifications were included to allign it with our methodology + +Authors: +- Geert Verstraeten (methodology) +- Matthias Roels (implementation) """ # standard lib imports from copy import deepcopy diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index be9da12..f487f94 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -1,5 +1,10 @@ """ - Incidence Replacement Module +Incidence Replacement Module. 
The implementation is inspired by +https://contrib.scikit-learn.org/categorical-encoding/index.html + +Authors: +- Geert Verstraeten (methodology) +- Matthias Roels (implementation) """ import logging log = logging.getLogger(__name__) @@ -157,7 +162,11 @@ def transform(self, X: pd.DataFrame, y: pd.Series=None) -> pd.DataFrame: new_column = TargetEncoder._clean_column_name(column) - X[new_column] = X[column].map(self._mapping[column]) + # Convert dtype to float because when the original dtype + # is of type "category", the resulting dtype is also of type + # "category" + X[new_column] = (X[column].map(self._mapping[column]) + .astype("float")) new_columns.append(new_column) diff --git a/tests/preprocessing/test_target_encoder.py b/tests/preprocessing/test_target_encoder.py index cea1f3d..6766571 100644 --- a/tests/preprocessing/test_target_encoder.py +++ b/tests/preprocessing/test_target_encoder.py @@ -90,6 +90,9 @@ def test_target_encoder_transform(self): 'neutral'], 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + expected = pd.DataFrame({'variable_enc': [0.666667, 0.666667, 0.333333, 0.50000, 0.333333, 0.666667, From d6a5ceb5cb50771d4893cde50af063159418b239 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Thu, 28 Nov 2019 16:54:20 +0100 Subject: [PATCH 19/98] Refactor API of TargetEncoder - Change API of TargetEncoder to make it compatable with the one of KBinsDiscretizer - Add (de)serialization options to TargetEncoder - Modify tests accordingly --- cobra/preprocessing/target_encoder.py | 123 ++++++++++++--------- tests/preprocessing/test_target_encoder.py | 114 ++++++++++++------- 2 files changed, 147 insertions(+), 90 deletions(-) diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index f487f94..2fc2369 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -12,11 +12,11 @@ 
#import numpy as np import pandas as pd -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError -class TargetEncoder(BaseEstimator, TransformerMixin): +class TargetEncoder(BaseEstimator): """Target encoding for categorical features. @@ -43,12 +43,11 @@ class TargetEncoder(BaseEstimator, TransformerMixin): zero, there is no smoothing (e.g. the pure target incidence is used). """ - def __init__(self, columns: list=None, weight: float=0.0): + def __init__(self, weight: float=0.0): if weight < 0: raise ValueError("The value of weight cannot be smaller than zero") - self.columns = columns self.weight = weight self._mapping = {} # placeholder for fitted output @@ -57,40 +56,79 @@ def __init__(self, columns: list=None, weight: float=0.0): # self.randomized = randomized # self.sigma = sigma - def fit(self, X: pd.DataFrame, y: pd.Series): - """Fit the TargetEncoder to X and y + def to_dict(self) -> dict: + """Return the attributes of TargetEncoder in a dictionary + + Returns + ------- + dict + Contains the attributes of TargetEncoder instance with the names + as keys + """ + params = self.get_params() + + params["_mapping"] = { + key: value.to_dict() + for key, value in self._mapping.items() + } + + return params + + def from_dict(self, params: dict): + """Summary Parameters ---------- - X : pd.DataFrame + params : dict + Description + """ + + if "weight" in params and type(params["weight"]) == float: + self.weight = params["weight"] + + _mapping = {} + if "_mapping" in params and type(params["_mapping"]) == dict: + _mapping = params["_mapping"] + + def dict_to_series(key, value): + s = pd.Series(value) + s.index.name = key + return s + + self._mapping = { + key: dict_to_series(key, value) + for key, value in _mapping.items() + } + + return self + + def fit(self, data: pd.DataFrame, column_names: list, + target_column: str): + """Fit the TargetEncoder to data and y + + Parameters + 
---------- + data : pd.DataFrame data used to compute the mapping to encode the categorical variables with. - y : pd.Series - series containing the targets for each observation - - Raises - ------ - ValueError - if the length of X and y are not equal + column_names : list + Columns of data to be encoded + target_column : str + Column name of the target """ - # The lengths must be equal - if len(X.index) != len(y.index): - raise ValueError("The length of X is {}, but the length of y is {}" - .format(len(X.index), len(y.index))) - - if self.columns is None: - self.columns = TargetEncoder._get_categorical_columns(X) # compute global mean (target incidence in case of binary target) + y = data[target_column] global_mean = y.sum() / y.count() - for column in self.columns: - if column not in X.columns: + for column in column_names: + if column not in data.columns: log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting" .format(column)) continue - self._mapping[column] = self._fit_column(X[column], y, global_mean) + self._mapping[column] = self._fit_column(data[column], y, + global_mean) def _fit_column(self, X: pd.Series, y: pd.Series, global_mean: float) -> pd.Series: @@ -123,7 +161,8 @@ def _fit_column(self, X: pd.Series, y: pd.Series, return numerator/denominator - def transform(self, X: pd.DataFrame, y: pd.Series=None) -> pd.DataFrame: + def transform(self, data: pd.DataFrame, + column_names: list) -> pd.DataFrame: """Replace (e.g. 
encode) categories of each column with its average incidence which was computed when the fit method was called @@ -131,8 +170,8 @@ def transform(self, X: pd.DataFrame, y: pd.Series=None) -> pd.DataFrame: ---------- X : pd.DataFrame data to encode - y : pd.Series, optional - Ignored (added for compatibility with scikit-learn) + column_names : list + Columns of data to be encoded Returns ------- @@ -153,9 +192,9 @@ def transform(self, X: pd.DataFrame, y: pd.Series=None) -> pd.DataFrame: raise NotFittedError(msg.format(self.__class__.__name__)) new_columns = [] - for column in self.columns: + for column in column_names: - if column not in X.columns: + if column not in data.columns: log.warning("Column '{}' is not in fitted output " "and will be skipped".format(column)) continue @@ -165,32 +204,12 @@ def transform(self, X: pd.DataFrame, y: pd.Series=None) -> pd.DataFrame: # Convert dtype to float because when the original dtype # is of type "category", the resulting dtype is also of type # "category" - X[new_column] = (X[column].map(self._mapping[column]) - .astype("float")) + data[new_column] = (data[column].map(self._mapping[column]) + .astype("float")) new_columns.append(new_column) - return X[new_columns] - - @staticmethod - def _get_categorical_columns(data: pd.DataFrame) -> list: - """Get the columns containing categorical data - (dtype "object" or "category") - - Parameters - ---------- - data : pd.DataFrame - Description - - Returns - ------- - list - List of column names containing categorical data - """ - object_columns = data.dtypes[data.dtypes == object].index - categorical_columns = data.dtypes[data.dtypes == "category"].index - - return list(set(object_columns).union(set(categorical_columns))) + return data @staticmethod def _clean_column_name(column_name: str) -> str: diff --git a/tests/preprocessing/test_target_encoder.py b/tests/preprocessing/test_target_encoder.py index 6766571..2b4144a 100644 --- a/tests/preprocessing/test_target_encoder.py +++ 
b/tests/preprocessing/test_target_encoder.py @@ -10,6 +10,71 @@ def test_target_encoder_constructor_value_error(self): with pytest.raises(ValueError): TargetEncoder(weight=-1) + # Tests for to_dict and from_dict + def test_target_encoder_to_dict(self): + + encoder = TargetEncoder() + + mapping_data = pd.Series(data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"]) + mapping_data.index.name = "variable" + + encoder._mapping["variable"] = mapping_data + + actual = encoder.to_dict() + + expected = {"weight": 0.0, + "_mapping": {"variable": { + "negative": 0.333333, + "neutral": 0.50000, + "positive": 0.666667 + }}} + + assert actual == expected + + @pytest.mark.parametrize("attribute", + [("weight",), ("mapping",)], + ids=["test_weight", "test_mapping"]) + def test_target_encoder_from_dict_unfitted(self, attribute): + + encoder = TargetEncoder() + + data = {"weight": 1.0} + encoder.from_dict(data) + + if attribute == "weight": + actual = encoder.weight + expected = 1.0 + + assert expected == actual + elif attribute == "mapping": + actual = encoder._mapping + expected = {} + + assert expected == actual + + def test_target_encoder_from_dict(self): + + encoder = TargetEncoder() + + data = {"weight": 0.0, + "_mapping": {"variable": { + "negative": 0.333333, + "neutral": 0.50000, + "positive": 0.666667 + }}} + + encoder.from_dict(data) + + expected = pd.Series(data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + + actual = encoder._mapping["variable"] + + pd.testing.assert_series_equal(actual, expected, + check_less_precise=5) + # Tests for _fit_column def test_target_encoder_fit_column(self): @@ -19,7 +84,7 @@ def test_target_encoder_fit_column(self): 'neutral'], 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - encoder = TargetEncoder(columns=["variable"]) + encoder = TargetEncoder() actual = encoder._fit_column(X=df.variable, y=df.target, global_mean=0.0) @@ -40,7 +105,7 @@ def 
test_target_encoder_fit_column_global_mean(self): global_mean = df.target.sum() / df.target.count() # is 0.5 - encoder = TargetEncoder(columns=["variable"], weight=1) + encoder = TargetEncoder(weight=1) actual = encoder._fit_column(X=df.variable, y=df.target, global_mean=global_mean) @@ -52,16 +117,6 @@ def test_target_encoder_fit_column_global_mean(self): check_less_precise=3) # Tests for fit method - def test_target_encoder_fit_value_error(self): - - X = pd.DataFrame({'variable': ['positive', 'positive', 'negative']}) - - target = pd.Series([1, 1, 0, 0]) - - encoder = TargetEncoder(columns=["variable"]) - with pytest.raises(ValueError): - encoder.fit(X, target) - def test_target_encoder_fit(self): df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', @@ -70,8 +125,8 @@ def test_target_encoder_fit(self): 'neutral'], 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - encoder = TargetEncoder(columns=["variable"]) - encoder.fit(X=df, y=df.target) + encoder = TargetEncoder() + encoder.fit(data=df, column_names=["variable"], target_column="target") expected = pd.Series(data=[0.333333, 0.50000, 0.666667], index=["negative", "neutral", "positive"]) @@ -93,35 +148,18 @@ def test_target_encoder_transform(self): # inputs of TargetEncoder will be of dtype category df["variable"] = df["variable"].astype("category") - expected = pd.DataFrame({'variable_enc': [0.666667, 0.666667, - 0.333333, 0.50000, - 0.333333, 0.666667, - 0.333333, 0.50000, - 0.50000, 0.50000]}) + expected = df.copy() + expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000, + 0.333333, 0.666667, 0.333333, 0.50000, + 0.50000, 0.50000] - encoder = TargetEncoder(columns=["variable"]) - encoder.fit(X=df, y=df.target) - actual = encoder.transform(X=df, y=df.target) + encoder = TargetEncoder() + encoder.fit(data=df, column_names=["variable"], target_column="target") + actual = encoder.transform(data=df, column_names=["variable"]) pd.testing.assert_frame_equal(actual, expected, 
check_less_precise=5) - # Tests for _get_categorical_columns - def test_target_encoder_get_categorical_columns(self): - - df = pd.DataFrame({"continuous": [1.0, 1.5, 2.0], - "categorical": ["negative", "neutral", "positive"], - "object": ["cats", "dogs", "goldfish"]}) - - expected = ["categorical", "object"] - - encoder = TargetEncoder() - actual = encoder._get_categorical_columns(df) - - # It is OK to take sets here because we also do that in the - # _get_categorical_columns function - assert set(actual) == set(expected) - # Tests for _clean_column_name def test_target_encoder_clean_column_name(self): From 4ffc7f07b04bfaf55612e7813c0c6feaf9ea1018 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Thu, 28 Nov 2019 18:53:14 +0100 Subject: [PATCH 20/98] Add (de)serialization to KBinsDiscretizer Serialization transforms instance attributes to JSON and back Added unittests for these methods Renamed serialization methods in TargetEncoder and fixed bug in unittests of TargetEncoder --- cobra/preprocessing/kbins_discretizer.py | 62 +++++++++++++++++-- cobra/preprocessing/target_encoder.py | 10 +-- tests/preprocessing/test_kbins_discretizer.py | 60 ++++++++++++++++++ tests/preprocessing/test_target_encoder.py | 16 ++--- 4 files changed, 132 insertions(+), 16 deletions(-) diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 57bd08c..89c4621 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -23,11 +23,12 @@ import numpy as np import pandas as pd +from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError #from sklearn.cluster import KMeans -class KBinsDiscretizer: +class KBinsDiscretizer(BaseEstimator): """Bin continuous data into intervals of predefined size. This provides a way to partition continuous data into discrete values, i.e. 
tranform @@ -64,6 +65,9 @@ class KBinsDiscretizer: """ valid_strategies = ("uniform", "quantile") + valid_keys = ["n_bins", "strategy", "closed", "auto_adapt_bins", + "starting_precision", "label_format", + "change_endpoint_format"] def __init__(self, n_bins: int=10, strategy: str="quantile", closed: str="right", @@ -109,9 +113,59 @@ def _validate_n_bins(self, n_bins: int): "of bins. Received {}, expected at least 2." .format(KBinsDiscretizer.__name__, n_bins)) - def set_bins_by_columns(self, bins_by_column: List[tuple]): - # To do: add checks! - self._bins_by_column = bins_by_column + def attributes_to_dict(self) -> dict: + """Return the attributes of KBinsDiscretizer in a dictionary + + Returns + ------- + dict + Contains the attributes of KBinsDiscretizer instance with the names + as keys + """ + params = self.get_params() + + params["_bins_by_column"] = { + key: [list(tup) for tup in value] + for key, value in self._bins_by_column.items() + } + + return params + + def set_attributes_from_dict(self, params: dict): + """Set instance attributes from a dictionary of values with key the + name of the attribute. + + Parameters + ---------- + params : dict + Contains the attributes of KBinsDiscretizer with their + names as key. + + Raises + ------ + ValueError + In case _bins_by_column is not of type dict + """ + _bins_by_column = params.pop("_bins_by_column", {}) + + if type(_bins_by_column) != dict: + raise ValueError("_bins_by_column is expected to be a dict " + "but is of type {} instead" + .format(type(_bins_by_column))) + + # Clean out params dictionary to remove unknown keys (for safety!) 
+ params = {key: params[key] for key in params if key in self.valid_keys} + + # We cannot turn this method into a classmethod as we want to make use + # of the following method from BaseEstimator: + self.set_params(**params) + + self._bins_by_column = { + key: [tuple(l) for l in value] + for key, value in _bins_by_column.items() + } + + return self def fit(self, data: pd.DataFrame, column_names: list): """Fits the estimator diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 2fc2369..86ff882 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -56,7 +56,7 @@ def __init__(self, weight: float=0.0): # self.randomized = randomized # self.sigma = sigma - def to_dict(self) -> dict: + def attributes_to_dict(self) -> dict: """Return the attributes of TargetEncoder in a dictionary Returns @@ -74,13 +74,15 @@ def to_dict(self) -> dict: return params - def from_dict(self, params: dict): - """Summary + def set_attributes_from_dict(self, params: dict): + """Set instance attributes from a dictionary of values with key the + name of the attribute. Parameters ---------- params : dict - Description + Contains the attributes of TargetEncoder with their + names as key. 
""" if "weight" in params and type(params["weight"]) == float: diff --git a/tests/preprocessing/test_kbins_discretizer.py b/tests/preprocessing/test_kbins_discretizer.py index a2ecdc2..4c39e90 100644 --- a/tests/preprocessing/test_kbins_discretizer.py +++ b/tests/preprocessing/test_kbins_discretizer.py @@ -15,6 +15,66 @@ def does_not_raise(): class TestKBinsDiscretizer: ################# Test for public methods ################# + def test_attributes_to_dict(self): + + discretizer = KBinsDiscretizer() + + bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)] + discretizer._bins_by_column = {"variable": bins} + + actual = discretizer.attributes_to_dict() + + expected = { + "n_bins": 10, + "strategy": "quantile", + "closed": "right", + "auto_adapt_bins": False, + "starting_precision": 0, + "label_format": "{} - {}", + "change_endpoint_format": False, + "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0], + [6.0, 9.0]]} + } + + assert actual == expected + + @pytest.mark.parametrize("attribute", + ["n_bins", "strategy", "closed", + "auto_adapt_bins", "starting_precision", + "label_format", "change_endpoint_format", + "_bins_by_column"], + ids=["n_bins", "strategy", "closed", + "auto_adapt_bins", "starting_precision", + "label_format", "change_endpoint_format", + "_bins_by_column"]) + def test_set_attributes_from_dict(self, attribute): + + discretizer = KBinsDiscretizer() + + params = { + "n_bins": 5, + "strategy": "uniform", + "closed": "left", + "auto_adapt_bins": True, + "starting_precision": 1, + "label_format": "[,)", + "change_endpoint_format": True, + "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0], + [6.0, 9.0]]} + } + + expected = params[attribute] + + if attribute == "_bins_by_column": + # list of list is transformed to a list of tuples + # in KBinsDiscretizer!!! 
+ expected = {"variable": [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)]} + + discretizer.set_attributes_from_dict(params) + + actual = getattr(discretizer, attribute) + + assert actual == expected ################# Test for private methods ################# @pytest.mark.parametrize("n_bins, expectation", diff --git a/tests/preprocessing/test_target_encoder.py b/tests/preprocessing/test_target_encoder.py index 2b4144a..25b5f3b 100644 --- a/tests/preprocessing/test_target_encoder.py +++ b/tests/preprocessing/test_target_encoder.py @@ -10,8 +10,8 @@ def test_target_encoder_constructor_value_error(self): with pytest.raises(ValueError): TargetEncoder(weight=-1) - # Tests for to_dict and from_dict - def test_target_encoder_to_dict(self): + # Tests for attributes_attributes_to_dict and set_attributes_from_dict + def test_target_encoder_attributes_to_dict(self): encoder = TargetEncoder() @@ -21,7 +21,7 @@ def test_target_encoder_to_dict(self): encoder._mapping["variable"] = mapping_data - actual = encoder.to_dict() + actual = encoder.attributes_to_dict() expected = {"weight": 0.0, "_mapping": {"variable": { @@ -33,14 +33,14 @@ def test_target_encoder_to_dict(self): assert actual == expected @pytest.mark.parametrize("attribute", - [("weight",), ("mapping",)], + ["weight", "mapping"], ids=["test_weight", "test_mapping"]) - def test_target_encoder_from_dict_unfitted(self, attribute): + def test_target_encoder_set_attributes_from_dict_unfitted(self, attribute): encoder = TargetEncoder() data = {"weight": 1.0} - encoder.from_dict(data) + encoder.set_attributes_from_dict(data) if attribute == "weight": actual = encoder.weight @@ -53,7 +53,7 @@ def test_target_encoder_from_dict_unfitted(self, attribute): assert expected == actual - def test_target_encoder_from_dict(self): + def test_target_encoder_set_attributes_from_dict(self): encoder = TargetEncoder() @@ -64,7 +64,7 @@ def test_target_encoder_from_dict(self): "positive": 0.666667 }}} - encoder.from_dict(data) + 
encoder.set_attributes_from_dict(data) expected = pd.Series(data=[0.333333, 0.50000, 0.666667], index=["negative", "neutral", "positive"]) From 1d58ba452db864a7443c1c8715b4763201190415 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Fri, 13 Dec 2019 16:05:50 +0100 Subject: [PATCH 21/98] Clean up of old files --- cobra/preprocessing/kbins_discretizer.py | 5 + cobra/utils.py | 10 +- dev/preprocessor/__init__.py | 0 dev/preprocessor/categorical_regrouper.py | 391 --------------------- dev/preprocessor/compare.py | 154 --------- dev/preprocessor/develop.py | 50 --- dev/preprocessor/new_regroup.py | 189 ----------- tests/model_build.PNG | Bin 106292 -> 0 bytes tests/model_comp.PNG | Bin 84607 -> 0 bytes tests/predictors.PNG | Bin 72134 -> 0 bytes tests/stability_tests.py | 396 ---------------------- tests/tests.py | 394 --------------------- 12 files changed, 10 insertions(+), 1579 deletions(-) delete mode 100644 dev/preprocessor/__init__.py delete mode 100644 dev/preprocessor/categorical_regrouper.py delete mode 100644 dev/preprocessor/compare.py delete mode 100644 dev/preprocessor/develop.py delete mode 100644 dev/preprocessor/new_regroup.py delete mode 100644 tests/model_build.PNG delete mode 100644 tests/model_comp.PNG delete mode 100644 tests/predictors.PNG delete mode 100644 tests/stability_tests.py delete mode 100644 tests/tests.py diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 89c4621..603bac5 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -186,6 +186,11 @@ def fit(self, data: pd.DataFrame, column_names: list): for column_name in column_names: + if column_name not in data.columns: + log.warning("DataFrame has no column '{}', so it will be " + "skipped in fitting" .format(column_name)) + continue + bins = self._fit_column(data, column_name) # Add to bins_by_column for later use diff --git a/cobra/utils.py b/cobra/utils.py index b6f25ec..ea09c64 
100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -5,7 +5,7 @@ def get_column_datatypes(data: pd.DataFrame, target_column_name: str=None, id_column_name: str=None, - numeric_is_categorical_threshold: int=10) -> dict: + threshold_numeric_is_categorical: int=10) -> dict: """Get a list of column names by data type from a pandas DataFrame, excluding the id column and the target_column if provided @@ -17,7 +17,7 @@ def get_column_datatypes(data: pd.DataFrame, Description id_column_name : str, optional Description - numeric_is_categorical_threshold : int, optional + threshold_numeric_is_categorical : int, optional Threshold to decide whether a numeric variable is categorical based on the number of unique values in that column @@ -47,10 +47,10 @@ def get_column_datatypes(data: pd.DataFrame, # Remark: numeric variables can still be "categorical" # i.e. when they only contain some distinct values! # We only consider a variable continuous if they have more distinct values - # than the requested number bins (using numeric_is_categorical_threshold) + # than the requested number bins (using threshold_numeric_is_categorical) - # continuous if more than numeric_is_categorical_threshold distinct values - threshold = numeric_is_categorical_threshold + # continuous if more than threshold_numeric_is_categorical distinct values + threshold = threshold_numeric_is_categorical vars_cat_numeric = set([col for col in vars_numeric if len(data[col].unique()) < threshold]) diff --git a/dev/preprocessor/__init__.py b/dev/preprocessor/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/dev/preprocessor/categorical_regrouper.py b/dev/preprocessor/categorical_regrouper.py deleted file mode 100644 index a6276d4..0000000 --- a/dev/preprocessor/categorical_regrouper.py +++ /dev/null @@ -1,391 +0,0 @@ - -import pandas as pd -import numpy as np -from scipy import stats -from typing import Dict -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.exceptions 
import NotFittedError -import logging - -log = logging.getLogger(__name__) - - -class CategoryRegrouper(BaseEstimator, TransformerMixin): - """ - Regroups categories in categorical variables based on significance - with target variable. - - Parameters - ---------- - scale_cont : bool, default=True - Whether contingency table should be scaled before chi^2.' - - pval_thresh : float, default=0.001 - Significance threshold for regroupping. - - regroup_rename : str, default='non-significant' - New name of non-significant regroupped variables. - - missing_rename : str, default='Missing' - New name of missing categories. - - keep_missing : bool, default=Falsse - Whether missing category should be kept in the result. - - forced_categories : Dict, default=None - Dictionary to force categories - - for each colum dict of {col:[forced vars]}. - - Attributes - ---------- - all_category_map_ : Dict - Dictionary with mapping for each variable. - """ - def __init__(self, scale_cont: bool = True, - pval_thresh: float = 0.001, - regroup_rename: str = "non-significant", - missing_rename: str = "Missing", - keep_missing: bool = False, - forced_categories: Dict = None): - self.scale_cont = scale_cont - self.pval_thresh = pval_thresh - self.regroup_rename = regroup_rename - self.missing_rename = missing_rename - self.keep_missing = keep_missing - self.forced_categories = forced_categories - - def fit(self, X: pd.DataFrame, - y: pd.Series, - columns: list = []): - """ - Method regroups categories whole DataFrame. - - Parameters - ---------- - X : pd.DataFrame - Dataframe with all the columns. - - y: pd.Series - Series with target variable. - - columns : list, default=[] - Columns to be regrouped. - - Raises - ------ - ValueError - In case X and y are not of the same length. - - Returns - ------- - None - Only fits the instance of the class. 
- """ - self.all_category_map_ = {} - - if len(X.index) != len(y.index): - raise ValueError("The length of X is {}, but the length of y is {}" - .format(len(X.index), len(y.index))) - - if not columns: - columns = CategoryRegrouper._get_categorical_columns(X) - log.warning("All object-type columns have been selected") - - for column in columns: - if column not in X.columns: - log.warning("DataFrame has no column '{}', so it will be " - "skipped in fitting" .format(column)) - continue - - self.all_category_map_[column] = self._fit_column(X=X, - y=y, - column=column) - - def _fit_column(self, X: pd.DataFrame, - y: pd.Series, - column: str) -> Dict: - """ - Method regroups categories in given column. - - Parameters - ---------- - X : pd.Series - Series with one column to be transformed. - - y: pd.Series - Series with target variable - - column : str - Column to be regrouped. - - Raises - ------ - ValueError - in case input column is not a string. - - Returns - ------- - Dict - Returns dictionary as {old category : new category} for - specific column. 
- """ - category_map = {} - keep_categories = [] - incidence_mean = y.mean() - - # Rename target - y.rename("TARGET", inplace=True) - - # Replace missings - X = self._replaceMissings(X=X, column=column, - replace_with=self.missing_rename) - - all_uq_categories = X[column].unique().tolist() - - # Remove small categories - categories = self._removeCategories(X=X, y=y, column=column) - - # Inspect remaining categories and test significance - for category in categories: - df_aux = pd.concat([X[column], y], axis=1) - df_aux['other_cats'] = np.where(df_aux[column] == category, 0, 1) - cont_table = pd.crosstab(index=df_aux['other_cats'], - columns=df_aux['TARGET'], - margins=False) - - # if true, we scale the "other" categories - if self.scale_cont: - size_other_cats = cont_table.iloc[1].sum() - cont_table.iloc[1, 0] = (1-incidence_mean)*size_other_cats - cont_table.iloc[1, 1] = incidence_mean*size_other_cats - cont_table = cont_table.values.astype(np.int64) - - pval = stats.chi2_contingency(cont_table, correction=False)[1] - - # If significant, keep it - if pval <= self.pval_thresh: - keep_categories.append(category) - - # Keep "Missing" even if it wasn't selected if - # it is in the original categories and set to True - if ((self.missing_rename not in keep_categories) and - (self.missing_rename in all_uq_categories) and self.keep_missing): - keep_categories.append(self.missing_rename) - - # Keep forced categories - if self.forced_categories is not None: - # If doesnt exists, give warning - forced = [col for col in self.forced_categories[column] - if col in all_uq_categories] - - # Extend list and remove duplicates - keep_categories = list(set(keep_categories.extend(forced))) - - difference = set(forced) - set(self.forced_categories[column]) - if len(difference) > 0: - log.warning("Following forced categories: {} " - "are not in column: {}.".format(difference, - column)) - - # Return dictionary as {old column : new column} - for category in all_uq_categories: - if 
category in keep_categories: - category_map[category] = category - else: - category_map[category] = self.regroup_rename - - return category_map - - def transform(self, X: pd.DataFrame, - columns: list = []) -> pd.DataFrame: - """ - Method transforms specified columns. - - Parameters - ---------- - X : pd.DataFrame - Dataframe with all the columns. - - columns : list, default=[] - Columns to be regrouped. - - Raises - ------ - NotFittedError - If fit() method has not been called. - - ValueError - If columns to be transformed have not been fitted. - - Returns - ------- - pd.DataFrame - Returns transformed DataFrame with new columns as "col_regrouped". - """ - if len(self.all_category_map_) == 0: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - - raise NotFittedError(msg.format(self.__class__.__name__)) - - fitted_columns = list(self.all_category_map_.keys()) - - # if specified columns not in fitted Dict, raise error - if not set(columns).issubset(set(fitted_columns)): - diff_cols = set(columns).difference(set(fitted_columns)) - raise ValueError("Following columns are not fitted: " - "{}".format(diff_cols)) - - X_tr = X.copy() - for column in columns: - X_tr[column + "_regrouped"] = self._transform_column(X=X, - column=column) - - return X_tr - - def _transform_column(self, X: pd.DataFrame, - column: str) -> pd.Series: - """ - Method transforms specified columns. - - Parameters - ---------- - X : pd.DataFrame - Dataframe with all the columns. - - column : str - Column to be regrouped. - - Returns - ------- - pd.Series - Returns DataFrame with regrouped variable as category datatype. 
- """ - X_tr = X[column].copy() - X_tr[column + "_regrouped"] = X_tr.replace( - to_replace=self.all_category_map_[column]) - - X_tr[column + "_regrouped"] = X_tr[column + - "_regrouped"].astype('category') - - return X_tr[column + "_regrouped"] - - def fit_transform(self, X: pd.DataFrame, - y: pd.Series, - columns: list = []) -> pd.DataFrame: - """ - Auxiliary method fits and transforms specified columns. - - Parameters - ---------- - X : pd.DataFrame - Dataframe with all the columns. - - y : pd.Series - Series with target variable - - column : list, default=[] - Columns to be regrouped. - - Returns - ------- - pd.DataFrame - Returns DataFrame with regrouped variable as category datatype. - """ - self.fit(X=X, y=y, columns=columns) - - X_tr = self.transform(X=X, columns=columns) - - return X_tr - - def _replaceMissings(self, X: pd.DataFrame, - column: str, - replace_with: str = 'Missing') -> pd.DataFrame: - """ - Method replaces missing and empty cells with `Missing` (default) in - a pd.DataFrame. - - Parameters - ---------- - X : pd.DataFrame - Dataframe where a value will be replaced if empty or nan. - - column : str - Column to be analyzed for missings. - - replace_with : str default='Missing' - String to replace the missings. - - Raises - ------ - ValueError - In case input column is not a string. - - Returns - ------- - pd.DataFrame - Modified dataframe with replaced missings. - """ - if X[column].dtype != 'O' or X[column].dtype != 'object': - raise TypeError("column {} must be a string".format(column)) - - X[column].fillna(replace_with, inplace=True) - X[column] = X[column].astype(str).str.strip() - X[column].replace('', replace_with, inplace=True) - - return X - - def _removeCategories(self, X: pd.DataFrame, - y: pd.Series, - column: str, - threshold: int = 5) -> np.ndarray: - """ - Method removes category which fail to meet certain condition - - Parameters - ---------- - X : pd.DataFrame - Dataframe with columns to be inspected for group removal. 
- - y : pd.Series - Series with target. - - column : str - Column to be analyzed group removal. - - threshold : int default=5 - Threshold for group removal. - - Returns - ------- - np.ndarray - Numpy array with groups to be kept. - """ - category_cnts = pd.DataFrame(X.groupby(column)[column].count()) - train_inc = y.mean() - factor = max(train_inc, 1-train_inc) - keep_categories = category_cnts.where((category_cnts*factor) > - threshold) - - return np.array(keep_categories.index.tolist()) - - @staticmethod - def _get_categorical_columns(data: pd.DataFrame) -> list: - """Get the columns containing categorical data - (dtype "object" or "category") - - Parameters - ---------- - data : pd.DataFrame - Dataframe from which categorical variables - will be extracted. - - Returns - ------- - list - List of column names containing categorical data. - """ - object_columns = data.dtypes[data.dtypes == object].index - categorical_columns = data.dtypes[data.dtypes == "category"].index - - return list(set(object_columns).union(set(categorical_columns))) diff --git a/dev/preprocessor/compare.py b/dev/preprocessor/compare.py deleted file mode 100644 index f863bfa..0000000 --- a/dev/preprocessor/compare.py +++ /dev/null @@ -1,154 +0,0 @@ -#%% -import pandas as pd -import numpy as np -from random import shuffle -from scipy import stats -from typing import Dict, Tuple -import sys - -sys.path.insert(0,"C:/Local/pers/Documents/GitHub/Cobra/dev") - -import preprocessor.categorical_regrouper as pr - -import logging -log = logging.getLogger(__name__) - -ROOT = "C:/Local/pers/Documents/GitHub/Cobra/" -df_data = pd.read_csv(ROOT + "datasets/titanic_data.csv") -df_data.rename(columns={'Survived': 'TARGET'}, inplace=True) -df_data['Pclass'] = df_data['Pclass'].astype(object) - -split = ['TRAIN']*int(df_data.shape[0]*0.7) + \ - ['TEST']*int(df_data.shape[0]*0.2)+ \ - ['VALIDATION']*int(np.ceil(df_data.shape[0]*0.1)) - -shuffle(split) - -df_data['PARTITION'] = split - -df_x = 
pd.DataFrame(df_data[['Pclass', 'Embarked']][df_data['PARTITION'] == "TRAIN"]) -df_y = df_data['TARGET'][df_data['PARTITION'] == "TRAIN"] - -#%% -""" NEW SOLUTION """ -CR = pr.CategoryRegrouper() - -CR.fit(X=df_x, y=df_y, columns=["Embarked", "Pclass"]) -print(CR.all_category_map_) -df_new = CR.transform(X=df_x, columns=["Embarked", "Pclass"]) - -#%% -""" OLD SOLUTION """ -def __regroup(var,target,train,pval_thresh=0.01,dummy=True,keep='Missing',rename='Other'): - ''' - Method regroups categorical variables - Returns DF mask - ---------------------------------------------------- - var: input pd.Serie with cat column - target: pd.Serie with target variable - train: pd.Serie with parition variable - pval_thresh: threshold for regrouping - dummy: scale of booleans (?) - keep: keep specific groups (?) - rename: rename the insignificant category - ---------------------------------------------------- - - Each group is tested with a chi² for relevant incidence differences in comparison to a rest-group - - The rest group has the size of the remaining groups and an 'overall average incidence' (if dummy=True) or - - remaining groups average incidence' (if dummy=False) - - Groups with a pvalue above the threshold are relabled to a single group - ''' - - # Define the chi² test condition - # Groups that do not meet the condition are not analyzed and will be unconditionally relabled - def _chi2cond_(var=var,target=target,train=train): - varcounts = var[train].groupby(by=var).count() - train_inc = target[train].sum()/len(target[train]) - factor = max(train_inc, 1-train_inc) - analyze_mask = (varcounts*factor)>5 - analyze_groups = analyze_mask.index[analyze_mask].values - return analyze_groups - - # Compute overal incidence mean - incidence_mean = target[train].mean() - # Create container of which groups will be kept, compared to the groups which will be relabled - keepgroups = [] - # Cycle and test each group that meets the chi² condition - for group in _chi2cond_(): - # 
Container for target 0/1 observations of the group under scrutiny - obs_group = [] - # Counts of the target 0/1 occurences for the group under scrutiny - obs_group.append(((target[train]==0)&(var[train]==group)).sum()) - obs_group.append(((target[train]==1)&(var[train]==group)).sum()) - obs_group = np.array(obs_group) - # Container for target 0/1 observations of the remaining groups together - obs_other = [] - # Counts of the target 0/1 occurences for the remaining groups together - obs_other.append(((target[train]==0)&(var[train]!=group)).sum()) - obs_other.append(((target[train]==1)&(var[train]!=group)).sum()) - obs_other = np.array(obs_other) - # If dummy=True, we scale the two groups of target 0/1 occurences such that the incidence is equal to the overall incidence - # The size of the two groups of target 0/1 occurences is still equal to the size of the remaining groups - if dummy: - obs_other_size = obs_other.sum() - obs_other[0]=(1-incidence_mean)*obs_other_size # 0(1) index coincides with target = 0(1) - obs_other[1]=( incidence_mean)*obs_other_size - obs = np.array([obs_group,obs_other]) - # Place at least 1 observation to avoid error in chi2 test - obs[obs==0] = 1 - # Perform chi² test - pval = stats.chi2_contingency(obs, correction=False)[1] - # If pval outperforms threshold, append the group in the keepgroups list - if pval<=pval_thresh: - keepgroups.append(group) - #elif group==keep: - # keepgroups.append(group) - # If the specific group to be kept (e.g. 
'Missing') didn't pass the test, append it to the keepgroups list - if keep not in keepgroups: - keepgroups.append(keep) - # Makes a list of all groups not in the keepgroups list - regroup_mask = [val not in keepgroups for val in var.values] - var_regroup = var.copy() - # Rename those groups - var_regroup[regroup_mask] = rename - var_regroup.name = "B_"+var.name - info = (var.name+": from "+str(len(var.unique()))+" to "+str(len(var_regroup.unique()))) - return var_regroup, info - -#%% -result = __regroup(var=df_data['Pclass'], #Cabin, Pclass, SibSp, Parch, Embarked - target=df_data.loc[:,'TARGET'], - train=df_data['PARTITION']=='TRAIN', - pval_thresh=0.05, - dummy=True, - keep='Missing', - rename='non-significant') - -print(result[0].unique()) -print(result[0].head(n=5)) -#print(result[1]) -df_orig = result[0].to_frame() -df_orig.columns = ["old"] -df_orig["old"] = df_orig["old"].astype('str') -df_orig["old"] = df_orig["old"].astype('category') - -df_orig['split'] = df_data['PARTITION'] -df_orig = df_orig[df_orig['split'] == 'TRAIN'] - - -#%% -""" COMPARE """ -#df_orig.loc[:,"new"] = df_new['Embarked_regrouped'].copy() -df_orig.loc[:,"new"] = df_new['Pclass_regrouped'].copy() - -print(df_orig) - -df_orig['compare'] = df_orig["new"] == df_orig["old"] - -print(df_orig[df_orig['compare'] == False]) - - - -#%% - - -#%% diff --git a/dev/preprocessor/develop.py b/dev/preprocessor/develop.py deleted file mode 100644 index 6e8c4dc..0000000 --- a/dev/preprocessor/develop.py +++ /dev/null @@ -1,50 +0,0 @@ -#%% -import pandas as pd -import numpy as np -from random import shuffle -from scipy import stats -from typing import Dict, Tuple -import sys - -sys.path.insert(0,"C:/Local/pers/Documents/GitHub/Cobra/dev/preprocessor") - -import preprocessor.categorical_regrouper as pr - -import logging -log = logging.getLogger(__name__) - -ROOT = "C:/Local/pers/Documents/GitHub/Cobra/" -df_data = pd.read_csv(ROOT + "datasets/titanic_data.csv") -df_data.rename(columns={'Survived': 
'TARGET'}, inplace=True) -df_data['Pclass'] = df_data['Pclass'].astype(object) - -split = ['TRAIN']*int(df_data.shape[0]*0.5) + \ - ['TEST']*int(df_data.shape[0]*0.2)+ \ - ['VALIDATION']*int(np.ceil(df_data.shape[0]*0.3)) - -shuffle(split) - -df_data['PARTITION'] = split - -df_x = pd.DataFrame(df_data[['Pclass', 'Embarked']][df_data['PARTITION'] == "TRAIN"]) -df_y = df_data['TARGET'][df_data['PARTITION'] == "TRAIN"] - - -#%% -""" NEW SOLUTION """ -CR = pr.CategoryRegrouper() - -CR.fit(X=df_x, y=df_y, columns=["Embarked", "Pclass"]) -print(CR.all_category_map_) -df_X_tr = CR.transform(X=df_x, columns=["Embarked", "Pclass"]) - -#%% -""" OLD SOLUTION """ - - - - - - - -#%% diff --git a/dev/preprocessor/new_regroup.py b/dev/preprocessor/new_regroup.py deleted file mode 100644 index 6a467c2..0000000 --- a/dev/preprocessor/new_regroup.py +++ /dev/null @@ -1,189 +0,0 @@ -#%% -import pandas as pd -import numpy as np -from random import shuffle -from scipy import stats - -ROOT = "C:/Local/pers/Documents/GitHub/Cobra/" -df_data = pd.read_csv(ROOT + "datasets/titanic_data.csv") -df_data.rename(columns={'Survived': 'TARGET'}, inplace=True) - -split = ['TRAIN']*int(df_data.shape[0]*0.5) + \ - ['TEST']*int(df_data.shape[0]*0.2)+ \ - ['VALIDATION']*int(np.ceil(df_data.shape[0]*0.3)) - -shuffle(split) - -df_data['PARTITION'] = split - -#%% -''' ORIGINAL CODE ''' -def __regroup(var,target,train,pval_thresh=0.01,dummy=True,keep='Missing',rename='Other'): - ''' - Method regroups categorical variables - Returns DF mask - ---------------------------------------------------- - var: input pd.Serie with cat column - target: pd.Serie with target variable - train: pd.Serie with parition variable - pval_thresh: threshold for regrouping - dummy: scale of booleans (?) - keep: keep specific groups (?) 
- rename: rename the insignificant category - ---------------------------------------------------- - - Each group is tested with a chi² for relevant incidence differences in comparison to a rest-group - - The rest group has the size of the remaining groups and an 'overall average incidence' (if dummy=True) or - - remaining groups average incidence' (if dummy=False) - - Groups with a pvalue above the threshold are relabled to a single group - ''' - - # Define the chi² test condition - # Groups that do not meet the condition are not analyzed and will be unconditionally relabled - def _chi2cond_(var=var,target=target,train=train): - varcounts = var[train].groupby(by=var).count() - train_inc = target[train].sum()/len(target[train]) - factor = max(train_inc, 1-train_inc) - analyze_mask = (varcounts*factor)>5 - analyze_groups = analyze_mask.index[analyze_mask].values - return analyze_groups - - # Compute overal incidence mean - incidence_mean = target[train].mean() - # Create container of which groups will be kept, compared to the groups which will be relabled - keepgroups = [] - # Cycle and test each group that meets the chi² condition - for group in _chi2cond_(): - # Container for target 0/1 observations of the group under scrutiny - obs_group = [] - # Counts of the target 0/1 occurences for the group under scrutiny - obs_group.append(((target[train]==0)&(var[train]==group)).sum()) - obs_group.append(((target[train]==1)&(var[train]==group)).sum()) - obs_group = np.array(obs_group) - # Container for target 0/1 observations of the remaining groups together - obs_other = [] - # Counts of the target 0/1 occurences for the remaining groups together - obs_other.append(((target[train]==0)&(var[train]!=group)).sum()) - obs_other.append(((target[train]==1)&(var[train]!=group)).sum()) - obs_other = np.array(obs_other) - # If dummy=True, we scale the two groups of target 0/1 occurences such that the incidence is equal to the overall incidence - # The size of the two groups of 
target 0/1 occurences is still equal to the size of the remaining groups - if dummy: - obs_other_size = obs_other.sum() - obs_other[0]=(1-incidence_mean)*obs_other_size # 0(1) index coincides with target = 0(1) - obs_other[1]=( incidence_mean)*obs_other_size - obs = np.array([obs_group,obs_other]) - # Place at least 1 observation to avoid error in chi2 test - obs[obs==0] = 1 - # Perform chi² test - pval = stats.chi2_contingency(obs, correction=False)[1] - # If pval outperforms threshold, append the group in the keepgroups list - if pval<=pval_thresh: - keepgroups.append(group) - #elif group==keep: - # keepgroups.append(group) - # If the specific group to be kept (e.g. 'Missing') didn't pass the test, append it to the keepgroups list - if keep not in keepgroups: - keepgroups.append(keep) - # Makes a list of all groups not in the keepgroups list - regroup_mask = [val not in keepgroups for val in var.values] - var_regroup = var.copy() - # Rename those groups - var_regroup[regroup_mask] = rename - var_regroup.name = "B_"+var.name - info = (var.name+": from "+str(len(var.unique()))+" to "+str(len(var_regroup.unique()))) - return var_regroup, info - -#%% -''' RUN ORIGINAL CODE ''' -result = __regroup(var=df_data['Embarked'], #Cabin, Pclass, SibSp, Parch, Embarked - target=df_data.loc[:,'TARGET'], - train=df_data['PARTITION']=='TRAIN', - pval_thresh=0.05, - dummy=True, - keep='Missing', - rename='Non-significants') - -print(result[0].unique()) -print(result[0].head(n=5)) -print(result[1]) -df_tst = result[0] - -#%% -''' TEST CHISQR CONDITION ''' -def _chi2cond_(var,target,train): - #simple group by - pandas series - varcounts = var[train].groupby(by=var).count() - #train incidence - 0.3775280898876405 - train_inc = target[train].sum()/len(target[train]) - #Why? 
-0.6224719101123595 - factor = max(train_inc, 1-train_inc) - #which groups to analyze - boolean - analyze_mask = (varcounts*factor)>5 - #filter groups to be kept - array([0, 1, 2], dtype=int64) - analyze_groups = analyze_mask.index[analyze_mask].values - return analyze_groups - -chi = _chi2cond_(var=df_data['Embarked'], - target=df_data.loc[:,'TARGET'], - train=df_data['PARTITION']=='TRAIN') - -#%% -varcounts = df_data['Parch'][df_data['PARTITION']=='TRAIN'].groupby(by=df_data['Parch']).count() -train_inc = df_data.loc[:,'TARGET'][df_data['PARTITION']=='TRAIN'].sum()/len(df_data.loc[:,'TARGET'][df_data['PARTITION']=='TRAIN']) -factor = max(train_inc, 1-train_inc) -analyze_mask = (varcounts*factor)>5 -analyze_groups = analyze_mask.index[analyze_mask].values - -#%% -df_data['TARGET'][df_data['PARTITION']=='TRAIN'].mean() - -#%% -''' TEST TESTING ''' -target = df_data.loc[:,'TARGET'] -train = df_data['PARTITION']=='TRAIN' -var = df_data['Embarked'] - -for group in chi: - group == 'S' - # Container for target 0/1 observations of the group under scrutiny - obs_group = [] - # Counts of the target 0/1 occurences for the group under scrutiny - obs_group.append(((target[train]==0)&(var[train]==group)).sum()) - obs_group.append(((target[train]==1)&(var[train]==group)).sum()) - obs_group = np.array(obs_group) - # Container for target 0/1 observations of the remaining groups together - obs_other = [] - # Counts of the target 0/1 occurences for the remaining groups together - obs_other.append(((target[train]==0)&(var[train]!=group)).sum()) - obs_other.append(((target[train]==1)&(var[train]!=group)).sum()) - obs_other = np.array(obs_other) - -#S -#obs_group -#array([225, 95], dtype=int64) -# -#obs_other -#array([58, 67], dtype=int64) - -#%% -pd.crosstab(df.regiment, df_data.loc[:,'TARGET'], margins=True) - -#%% -incidence_mean = target[train].mean() -dummy=True - -if dummy: - obs_other_size = obs_other.sum() #400 - obs_other[0]=(1-incidence_mean)*obs_other_size # 0(1) index 
coincides with target = 0(1) - obs_other[1]=( incidence_mean)*obs_other_size -obs = np.array([obs_group,obs_other]) -# Place at least 1 observation to avoid error in chi2 test -obs[obs==0] = 1 -# Perform chi² test -pval = stats.chi2_contingency(obs, correction=False)[1] - -#obs -#array([[ 19, 26], -# [248, 151]], dtype=int64) - -#%% diff --git a/tests/model_build.PNG b/tests/model_build.PNG deleted file mode 100644 index 7560c6941ee00e8ad9255b2bfd3b0fca6052e780..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 106292 zcmdSAcT|&E*EX(?3WFFKhfxFste{e&^v>8|6a=N$AVO%NLnwj3C<+z?1tD}qq?aHq z5E4*OK>~!50tA9c2_ZBI0YeCUw=?s+kIwu2zCXXUep$;Ea3zO*@3YU|`?{`u9^NoD z0_~O9yJN==(AD2BoA1~m2;Q+{*YlqQfd7%?#_R^(b_SapUEERCdxQ!6vfJar^$R<8 zyiO9{blC&^E)?*)UGR<_zl41Ix0CKJ6TV}|cHz~_7c9de^L)YVlT4@aB$;2{s@ulM zZLOL8aq8S~=f(8X;^z+E0sV5zMX36j;=r?L0U@C~E7!80nGB_9781Cx2FDf_7T)7& zW6iW9?jD2lQ%6E?hnbm5wX|;NTbY@`dj?&KV4-Cm^d%|_Nw(rxk_XBB)p?7&$TGjf zd6}Y@zkj`3NL&8p!1piFV-u&2fB%xb*k=6wS1y~}yN7!4{VV+^A@KK095{dA7s&TZ z9KU_M;tXznj+r7Xru4goh3MjOjtZ67GS%GI6SfmZjf%%DnS9lGwYCQ8BA44tuS^q$ z{+RP9&U5*yGL6YUuShlg&-xfb*+gNS4d8@i(_*Nf-~O;?8yPGQ|6SdSxzWtIx#a0- z=+2To{Hob7!&aR&jY4$H?;9H%Zcyl{(|ex8MTc&P&v|_GzvMdUUGSf$pm~v(&5fWx zJ1#;vwbZ3QqQ#UTxYNeQ#^$Gg{TXK6sm}!I^Gh$P=Xn%a2*-4;45%_ zhr|a1B(MCBdq+)$z^}{JUN)F=j)y6sT)RRp4F;ThdB=vnTQ=835+Si44C5=EyFQ~j z`n^Xi+Kem8@FkTN_|7MdH{0L-el^?++H#E1K2gXD?)Y&2zkS$u*CQWwhvyM9unZBw z6SBB&F&sSN;~t9{$zXj62v$FGM$%RPzkRWCaQXcrnQS$af%eirlHmCXdg)xz6FpEo zYWn~3CBQUaCp$ZrTyEw}``ZUX;NXal_O%f+k|Fv>AZfh$4F3e~cb*u1kdbl4u4m@e zjnltth2$WQq~cpjXbOrRP#|9gu-r|0i4oxD!W zh9!5bEueaPt1`P(@GXDt7Yg*Y!4=z>i4rW{tBjo@7N_A2k`H#NC>lJ8L)Ax#dFLo8 zDrSu54G(K&WMpuPY9+HbXcPo)Odo`~`R{0W_oU&6C;r9hG|&-D6GkTA9iWlXdjl5=#5-w72Km zjD(hq(IBFtg3c~Rqfh#>a_Y6KG24Ia6A>|%O$khUTiFjGQA#0G5D3JSWO*s^1h_>( zM0d5iE-a*Lb+zs$idv&LSeX|Wdq5O|F!0;B)GRf~OwHa5vbmrSzmA%oMjNgK`Kg1A z=}BRVG!>_;xORelt8l$(e7w;sZrc^D;I-^*{b1|f?BSu+3QUflGwVwz@B1*;xZSku zcfwP%h$`j5pjp6HjO$vG9_^ zt%)BL+|1 
zd2hn&#OmQ?VofPU0yHy&$Q#npQ|BYT6c(&>x+W2}c9}(t=kN50z-<8!48p0Zl4LHa zIDv|&UeVE~8hE!iol5)B^FzCbi%_16(NMvLGS+qv<& zzD>OI*Ut;R$Vo)I@2~&PVu2?<9DvxXz8d5+J%4K>Sc$67E!a=~Ml7U|)L! z44-vj9aC+KfYBOFEW@=_R9s2aO5X+dlVh&D+;bF9zPPmbGbB%> zb8KDzs0T3+Myn)!+s{OIlgtg|*RFyjf|wuvQcE~3Cy3#+-X$uSn^wf`QPx(U85KaI zCvLsFhbq@nF+u7SX}7J+n?PFDaGh^1nnHeOOy@cO>_zEE$=5H%dwGR3H%3`EX>7$j z-fgGot;ZOh^(U+!V%Am(M^liXFoO>Yfo)t3FbF5Aeyy?^{#F%-5=*(B7b}i5<7B2U z2YGvUCDcT1-ESX|yq7|hj-u~QS5&N2{@f#@7don`9CUqTM7#U_0#9Z$Il-~n-OywI ze$e^7!NS1C73_Tf`Sb1LGiEvZ@?oP9Nka0gI)SW(unZ=C-QYBkx$SLoxhw;j{f4T` z12UZp%JtbPBKr?bb=V6lFW~i6Qcy973nnj`4&OuBW=*#dG+T7?_?wliw$3LdG}^Lq z;>P^ykJ%_t&_`m!>u3Y5-54pd;qy_s_?E?Nr*-fmZgOsmkTd?|#m-sF?gFr=C}fSp z$#Ot?&%W9@Gdr6e=;=AMJ4NDJk&5`$Viot0nW3S^Y?!vcA(g2N(pwbviBqnwQ|wCFxd>3^S%E0+Ur7Zx0)9`YU+v4{Z*1_8O2l_P>-5@Wuovkgv{ z?A{qYN?!I8E|WOEf8Li6?GX3Id*Xa?*;_QZdxS>pl1{V;CS+uiq>8CQaN z7CGL-8BM2Z9nmV-HWknwHCtn|4jIVE&74p-kD&w`4XUCge_@lM6Q{-#%DviTH8H)} z_UN+Ba33jqPU6V_;hT6z}~; zy;!~PZ!X{{XJV-U#m0d)~uq*?kiuAEtJTF1J~Pr30?*88QR`|G$V#ij%y|C}v;-*2^! 
z;UDWZ|C#N~QC>$qFBwWL6r(vAk5e{G2+TdfIsFICyu*v~UkJQVniB$ZhN>1+pN8y- zwig5)aH5qsNI?u=PgKF2twC~5#4ZxT`*f*`YttuNf1C2WC%JOG2xjNCF;gQy!+ySJ zBde^#eQa%cWuTC-^4S<{92S=J%KVJV$bbOKb#iO=8eX%q;OtV%dUVs$o%rrrC9^y& zMyiw=ug{@3S1Z!gG*{d8U)Mp}@XL@@+eWR0E512%R~ETkE=yZVx<%H^fSnytHLXIF zR*hy(7?;#9Tmde?VSMP+OI-EGVu zAJu$!z#96hq7i4d-6K@b{exZI%wnmt(^HRmP2yMnv9Bg%?`5KP(z=olZ z-%wJ(_Ll#yo)zX-Yll(89g&eidE`YxzNONYWgF?#avO)9ApGI6&v?q?KFoLorfJ85 zbHW<=@d#!-5ThIT>5RC@T~quEJuS6U{9Um@pFVm_$tNwU5MFkShBmJgdLq|4o#c+N ze)+qBL+X(~nYY96ARikWj8KD|tn78o$`h{_o0cw!I(qN$Yxki>OMIm7(jesOg?&8J zfi+Y+l5nq`P?&r}p(R4f>6ih%-4UCvr0iD;yoVyD&~l=?)#<88IY^CfdQ*u7iX6NLg15(;x$o6CXx z$Uf+X?(YsAoDHfSVNadIA&3Aclx)|26$EwSk=|Gvz{EDB9cexkUl16+YGUS(J`%O{ z^oaiE%O-xg(x5}D{&HXYd*QyDaMi@bccdVD`l4C8u~cfY3&kmHT#mnM*QpM9#>eLM z&Ro3%OZrR&2MkCWf~EMUN|?ZYMt&;wc!j|b6G}j6>N6~(SA&3RT`94_uvrv;Qm&k; zYDkn=f(BvBZ#^_{R)(jfB_6SGmiFM!`o9`~GsbZ;yEk&M|9(Ywx7+lyhuKkxlX5bL zI|)l`Q`1QsE6S=2{;ZPwbW8+$yP)}~g>d=GM)@8tDZtcS_0?)aj9CmMn&z1jwmtb+Uu9SyY|(CP)6^60v~B7W`$OVp34s~%2|Gx>geTWV z_aTnp8^n#mONpr=dNL}es}ep<%jV-ojrupspWqzr>|!8rh|!}t-`h^SIsi4W`=C|6 z>a2$QC~bRl@^Zh-b&X?#gFyWnE8wl!>{C)3^6~Wr)Aa+~+%9kM1h0p~3#ZGDL`_r+ zHLqT5U=6wOg$Md?&Al2e6f0U^MCINXo?GkO!aoxpky{L!Z+#6E@rAGp2AIm+IS=k; z{)w4GZ=J^UW`aXg2OR`tw4>gQDu;T8znb5AX#oq5z<@v}2ylt5+{>89t`hqdU}t1< zGqDll0eqZ$7%TL?l$wH&aruN^&ZX z%avKFNcXeyceaywQ+Kah6^M2ElzjA zH_Z3#R-Vo@D|plUXH!H=qaPBYf1sjy!aIMSH{7{-3)R{>32Yaq`LwdFEB6|^f4L@Y z>0+e?U-Cukx=1wBg{|v`oCz#k(LTon2oFW**n^rw?0r>pLD7`>=Wp*bK~E}%xf6wusfLl?0_A4_w4+?$u@v*k zcz{lK(Psrs!*^FDHv0Pd23Ljkh=8~SQSGeeH8c=6GLR*~tmtqTl-0<#A%o8!x6=&` zK^8{vq!!CwZ-#cqf9tGQjchLcro!3!O41ouPEpIBMu$vK51&~|GW%H>+f|I0;U$4) zrrfk)4ZL+5;h;{MVf%F(ZW_~yI~8EMWf6HnI zKbw-ZY>qN&7@x&~z)7igGC3HI%0!_n-|$gy8u)fiG!Oj{4V-R0R?ax7o|@Yp_tLL0$*0?K`cnoAS`s|6sI^W%F|dJ?4!ynrgB$o;yt z>%A~>nj}21VPZUH3if7QS>#EFcP6|cRo4_2TCK^14~&{x`Wv&fTgWmuCtI#Cu(wni zr(qv`cRgmsJ|4QVlpHJj)ZQVKEvm=zlY}MayA%s)QM=OU*6KC>OX+f#PH@gr%cFt1 zEqP-|c-%p4VUhVPzGd?un>(j+vvs9YpN;GfpjovP7u)1U&uBFTOuP)n!^&uvQMBci 
zdj(ONI8wxRu+9MmDpB6CYPGg8A_g_7nXmkw|8)ZDrDxieGf`vMI@)c5l1URop~+}_ zJx7kl&g*WHyq1>?*8%_xo9_b`xVT8M<1OsC!)teiPDOgyKq(G3tTn-6>mFYhMpuPQ zp7XQXjEc==^4(F@=P7rZx_Gq{pLSJB+?X+@w3e`N(;}K}1V}KCY93R1!lniv^LXf~ zBDa27S##?^q{GNl8I@z+UVMRAt|wnZSq21#ag|8gprjTmRr_BSG2>3t@L#t8WOM`P z;*q^sRL)sS;E*NfDBc^a$O$?H37Jp@9bWNutr6 zuh#=r$_CO50&Q-n>n^1-_szgcjq0|d@f7;FJ>D;^L+2uIcnZkNo}UWeqV}&-k9Ox# zG-ATf%A@_ybi%*~r;z-u%G*RY>q5A@)rPwbG{n!UXvXH`B)_k@A^Q;RP+>>DZYg%wUyMquIiamr?c)2P|iti`<- zuhELAi3k1K`^S$uE&8ysvg5u_6MlOn5}1?as!_Ea879Q6XGw_1cA#J2C*2;>)vXOo z;DTk?sM?9b#NQRp=q#Y>I~3xpgsEVk*HssWPp7sK3KjQ)IpK&FC!S?kRCwv!6$`1r zQZZBb*5^V(5D_Wz+%c+(Oj~Sp=v7n8x_Pp_y?r34o~&ySd^;#;ERY}}_4e@o?c@0c zWEr@Hf4I0f*UtQdh+I!e&y9@wg-uRBLcBThwm8xY$UT+@<&~=>rV2op$UHNWj&zhp z`m}OQXB8o8Z*T!$k`t0xwKu6<&duuHeqyB)P!E($H~F@i|2P(^{!>6R^`kLGU* zV=W;WAo;b$rHUIxwI^}5@)lU%o1$l{7ET<2%ta}39#F=8L`OO@a|Xl_<5@uKS|JR^ z%d{gHNyW7*MU=Zj%$6=TE9=UZlko+5e+FuuGwrq%V7e&P({*CfPD{D|tdTigwa~)M ze=Fl-MUFb(H#NdrTJ4%>!WfWM0G7a}(Uz)T&m7 z#7XZR#Ra?JmPL@-RkU+eCT(c0(o5~tU>c`PKHNru_PqY$t@6hz$eY-8;(_`si zS3^Tp!`Pnc_tilu6VQ-%9-xUr1MR`M+_!EODla4qru7kdimI3Za)RY@lO0fp$a z{iHkouQt1Xc_p!VF}2+Y5`>stKG4KjYreK6TLQ4+Yf^d9%_G6oFP~ZXgS7Hy6R#{d z>bx(duBRk&`pa5tx)6lh}6LtchBmZ53WrVRz0T3DNIbulB=)g!AdZpdeNVsH-(Rk zjJT&=0slt*oCAO!sT`-|wYA{QsjI{5Uw5)`?_dQ5y_1qBF5D(1|51I-ZhT=c*ti~( zqgeW9rBHv-#~{4dv&#FTj|S62i;rlpB<~YRO7}?5)rH$DIbnSa9pebF^wPW~Q8Rr- z6xjLv9e6mho;fu&g`JkV@JB%RO?_=ndn;0Qlh(Nw{rd!xA+O*}BfdWRZuWGuL5{TW z-Oj{Ybf=`rp!i{%;*2nIYsegmddtnZ8OHxmG;zyKzK!su=^*!WeZ^+%t>;tLU)fR- zS|~`ReO`cOdOar5sguxwXI_tuKs#nnK*PBsC0+fWlCGqWc5Q5Md8M1tXM2o*WZeQV z%YeF3#+JB6XWl@?ED9GEC|OBDC_%WI!L?fKhzJK8*5$G0XvV7Tt`a?lAs6 z0|R>v38{M-7cD{e>MXwFQ243lP>-TJ1J$$ORxqpVRD35NMg!P-%E| zqOe5-Z=CB_zGRb>&$fpU3zSaaMqg>KTsN|ZV58oT&RF>$)0w)S7d_Ji7kg1&S`Z|? 
zG0?88r3E#d(TiN(6Wnia^x3myTD#@^PeO$IQRH@Gg9=CCLulu4JDhLDaO;V83YaL- zzl*xH9yPFs#a3Ku-ae+a@|jTMzr8bJj_J`;dqeFFi8113=K#q|-I%f<9!A$Wsu6LI z7yVV(3m1OM-+32?45y3iThDmYrxffr-eR^Uu=7PP@7R53_pz+{PuNyOO$~U3h-Ax{kP8tmi zsf^OJB}Ev_+RD4GdHm}qK5ldQcBORmg9CI3A@$o#)LhZn3hD*$nkZz%;M{prbdM6j_kqb;9gdJnjiE_}X&Kp5rpDGkenVxU+me|gBW<}lNmnR>z8!UA18d$G zaD$CXtF?P&<^4668ETtxq(ZF>+dy+Z73sV7b!$817OO6QeKK9-`#M{P(IX%25vNBG zLUQz72huq3C3LpLwUGe7>mVcko619V6#^-#cbl$FOwX$WI9rOMZ)U+t`?6NdBf*D# zS~Nj|k7jV;+kp!fgt^)7o;Vc37(!+19kk*0V0|YF5ep+~T7fIibytQaS5{t60uqb6 zpBQ2}B4+B9d`Uu?SR*GRzM#N$MhhijJNG~jq-;@I$8KbS{?!IIY-_#k=?M}!LUM6S{DFCNxUk4Vv+K?AsgsO<~$*SJtLGCX{0 zjLz`Pb1pjp=ZL2#Ez-an3C&eiekz}p6DMg@U*2ZlvS$Wof?FWF8QFdB*9nfgrUEuB zukJpDwYC-529-`}Kv4~E+-Kd~r*qA*yr48`*c!!q8>i$PwvUMCjhrR5UYAzcdCe`W zgBHulpY*-`S57c!+}I_)UWY_np^s&y#*pUY!#Nv!iAybxPZ_e-r>6NAN(!abkIakA z2*!E<4P3f#SLJQpJratE*@`VyP`Yzy)%=!*{*w3OGT(Ch zEzjXKY=c3VxG!&lQ`O)x;v3$qlayAGls4WO^#gMZR9k^afbV*lcv3+uo&1P$1r5Wl z`2)1d2(1U(a1(PDM_R9RDHKS2v}k9Xg8g8*qtdzUdw%pc!As%=RyGc(ItW>rNlYPfi^-y8S3 zs=uuIV^AF7_w1YU-!9fXc_a3k6qMUtD+mnQ{5I$d4ml_wUODU+~2PT$8V>|sl2 zpXFLP=*zp`q{n4!tr@enWivl1<6=Ns#(k3&bG>_OB=n=xKL?I-%GZEO`*%NtFb<^p zUsRl1YVYfN%9W9JWCh}Xpc+%8A#c#Zw=Y5nET8_-&E#C6SSkEh0C!i|Rb7~R^XY|nDmT?={iB$_Jm@^UZu%QB+jOoH z^PnE{nC!c1_YfgDJR>Ydl|s8bp(E4}dx<0W&shHnS}xtuT$o!_2Z(aaA0V{k;lt-J zyXfCqAqA4K7&zmNj-!BN`=33efV0Hi(v3-}^(f-%qo`K~^5fz8N1cP|Ql@DZMh@P~ z+{wxaE!_$J4Wgw1-7iNZ9#uueOB1toJhqe?DOMIS#;e1P8<_TbX{zLRB}VclT3l8Q zw;J1VFr)0}PP>%D5bvj+ZH8!wEN+!<+_YT%3--RW(Re|wa2w&Q_YEdOsZby}CBE{Oq zg=KXe?RzsPnLLt1XPU4myfjxsv4A(=%gHFfnza!Qla8w7m}Wh$*f?UG6P8-*eRR$~ zF>COzNd$_=^$%*u`7d%)_bAv$4>#O*34RxeZxR>x@;uy ztlz}mdFJpHm7d*$mFZ?gg^$LfgcUf0PkYl+^HIjG6q5ofCqS5$)SH>D(l-6zQgsO4Oe&7TZwMoCqqk_ zdl>4HHw7&`--wNEYuf}BjeZ^|i&#yKdnU2jw8k_yIyvWSRlp-x-#|YEQVBFn*j(LpCCAg&D+$d~a|8({BY2L@P9M+X z*~L7Nj7phw^Qgzf*;YHrhH$ zjx^n>(~ytvz#IIeQojKBskx0X`>i^FOb8qv8Jgi>l2uyY6%}Tdw7OQsk9+6T znxp=G111@G5oEXuxCz@^-(yJXGu?%m-72u&JLXO1=&ufuFZLnmt-}r7YgH@OPG~Aj 
zO_Th_+$gK6YNlIxj9&kQ9AV<$l2%RS+JzGwp&IfNr>uboRv-KlC@v}a*cSna4o&0& z#0tr)DT@1Ob$vYxp4#9#V*A$+&Q|#CNy+~*V3@4*LVznZGQcFlT2ykK@njDs{xk79!n7o1`(+Khq4I zLDP3k_nQ2MPX0N2rBRmnRRwc(?9IZZb{a|hc6CVBNP`3I1D;;k`XtiHQjswpRKF34 z!3-&$H0QjEO{&mLox}RYjCi$~yk$-!o#M(^XE*AD%Pg096N_`hB+*rHpvBj@BJ92g9$~p3xk9;e!Y* z4L>qwy7UX8kbZ6A9QW*vgc@GpPX?xmOgkR@a)1;j7$G`EB$pZu)tEFQc054 z_@$;DH&86Z7=|uEvHDO3?csizdr4V=EO->pZNVeiT&v;UTpM0XIdZ>b>hdq;-rvY3 za8Oc0!Z|25-%Imw{~P0&7o3hKUan?;>c4x6K7LQ4eKsV$kJ|ZOYyw>S-nM zte7h1Ygb#T;bae~Jlx>b*HD*z@RXQO@kCyEg;lI(G|-j*s;c}Hjs&;&M=nXMDJoTFQa9+1)0J4rdgYz` zmMYDWx(~|(iHEFK^;odV$1OI0+P{kPp>o*lI(@TK)7pusdG=X0M z=I}~}M@t&_FTSOORUHh>^a1da6o4b6u~@08j_?2IPb)tJnbY6=nK+&ohe)0CuV1>H)GKKLapsfXOkkiLgQ&0|RRJ z3&~|f`;a&kL<(~pM}ifpzY9sljI8Hl5-L>(EXnNG{mbUWhoDoO&;}qwS@i12Y(VO9 zE&e&^Um;~1(yNp=_LN);K!3d!{?H15ZE$T>)q9ODz<{FwWaE;(>etqr>HfICWc7ar zK^vh$=9brN;W0@44kBP|=CDIHa2gk9&!R{A%f4}ly2k=ZB>cUU%@dGO3;eSb&{~B~N{b5?oJ!BO^ z5qZ;fA9hpi)8HRB{0+)Z>`~h>h_u{$!gscKXBPc-sGrO5J(4RN0_JZLu_AwwAwLi!!+P}OWQ1?EMTW$Cj$h4Y6xOJa4V^jH4 z1I*U~N6(J+zcV3V^$z1pDtN!yv!YhJbVW}qRi)oBZC3%dbknP^xT6}AWyN<*6FI)n zc~P-A6r$603$R_&*YWGzL#$To|7J(HV|^4z*Xq{d;couXmjE|C+%* z`Qu04UH(1%;s1;+fBWI#)BiWa;Q!AIfrTjRf#u7!&iuOf<+-sDFa11J;j~r8?>+uB zkEi#v=P9ME(G&j`b&nta+)EXQ>`yA1dXF;gZ0X1wt(Oj8(b5X6xoY@9c_zDCQ6cNr zl}iVUsQy@7vSKtC!&3SEqEskLHEQwSlbyhOAcM6JOmuFX+qW885TAHNh2a(&{c-o% zCdWi3JlC|4-ycTfil5Bi-wgLA>iy@hA76~hnShSJc_6g-t<_aLqPPW;q6YL(&1J70 zfrhlCDc?Mss)d+;r6qzW?^|8H>sD)5AaHg~jqPAypx5l#g8KKe^xVkrf?<@a-A*8_ z-W6IH6L!wse~7MhIyW8b@+N)?=?S{-fs-C#07Oqac-DrVAygJrk^PyC*1 z-!)s8ru;Ta5&DQ-NT_Mb))EoSjiO|WO6_dG;e6VDEuw~%osh=x{;R>mX&^Az5$k=o zxqCIiEttFFE=A}P53_S{7bZwYd@0cqk;rv(pMiRL31Qxr)@|cQdx=Md;6s%o!Aaj8 z4INs|#PplKbfE<-mPh%Q9+B+R+nrVF+Couw&pX&P#J$bk3!RVZKfF$hnn>0XvB=ny zag9j${Q0GjMLJ}hx!R56Li`wi$DH2!cS|?cDoR33i}(ykE)09?A}*Ee_})tq+E7;U z`dUW7us5^~8i^3>j+m8floh2%*xA~?@eK&=6+t)rwMFG?KSpqaPk8ChCvl zM9JJ+MpF7Gzd)}mX*M-AHOVT2)u9=2$SQ92beE4?s|z9ZA#do<4H49n8&r{SB)b69 zgb_sP;o$X;1JiPg$$F9N#%8M=IQ!LTJ;Qx?yb>MhwoTUyqCWJ@IIOy~t)#`Yx4D8` 
z^i;^ZAoby9JlPoiiF#)bA2%MT*v?j9bG{oGo353`^|dKP~N7^nY}LNi={RU6Bv^axHsZ5(;QJz-fl zLd<6Is4BFsLF!s7iW4os>89GmOMg`JvW-ZoNUIP@b4Hvok1pVi53~Q_;`-a+TB^!k z7n{?{rE2Q3IgXVsbFGUpD}yygMu)4|(psWt9HO;P3i?re^%Vs-%2VqHIfUNPS|h*c zpw2u4WYT;S_bL-?5qe%3J!p(VXR^Lj8DCjPAgv_IdK%JR*SWH zq$4)Oo0!^5{0cSh&9__h5Buylmee|~7nb+5imyTF;hEiQXtFRJa^2k_vP3{%3g`B$ zgfr~)6$yH)GZ1Fl#0wW6<0pH1)W&Rl{b^X?Q=-j}xqI~`<|#u|KHk+p+Y zIAAx*kE0Urb=`NO84?awNk6L)x}XQodK+2z39sb1pfp^(C>KL^W+W}*Y|<|B&S)9B zIej^d#j%QRA=gI&pX9IqLOO;c)^8knhD#&!D;#W^|BLUQWoqP)JxuX{26|Z?(rq8S zwqK9$HOyufLoKg)wV1V+M^1&5_;C5^AN3;j70r?!Ow|@by-Q;i2dAXigJvQ;0&xsB zr}k^b<21O|x$vIs_};~x!H}ZjVhxXe92C;fp`tqHn%h zWokQrHHdBb^?St}r;XJM9@2+mqXzh^8euS=X2m zdpp+C(nOXta24UwzEX+2iNXk}d#H}h71rFs!cu~?T#a+^{Os$Ond`{nBVm>07kK#5 zd5>G9bs4Q<$-r88^+X{o2mX3miq(}41zh&hl|Q1db6qooK+oTPaLMzUN!=0wE13BDxwAYZbm)OzcGlo8+t*UHcr71>5(GvY^J?7SO{ z81ecFchmN4BNV&!r3qKEW3T%=5mSQLxq3lM+*`IqL0gdXE>JVb%7DVUzzJ}$JY7HD zxFgW}rw!(=SohYM@O8CHsg8u|)g=FR0@v~Fcx&QMic`>2mc^e*_1#(Cf0Q#W5U-xq zJHZS=xHc@M;NQOq=mEUVlGw>|m9W%qpaoOU*XC4#7u##w@1eG^v(Y`+ zz6R#%`6gE)WSAW{b zxWohV8i<6D__(yKFN3?nP@~cmgWN@bU+0#s^P~SCt~B>gEz5Tre^V_pv*!-o@#Lq+ zy&}Mv@$cf2KKoY5sEZvvAV(eObd6*|i?>Ek^HOcI?^QU<|!RK}@!IXIb zTd!>prv4KcPkj&ERr*Ci=BGrDL`huLYVInf2_dbjn_<#(?QH7E(*<|S&-z|qjKWVp z6`+&GnD`Ezq||x&imGv5cw``bUd2(r1|2YLnI(XTggR&S0&_kQW779f7n(k6J1wFD z@p+l59k@%0O%(Dcx2{(Bth;{fON8m!>_Kn8hW)a-y6ix^&$6Zj8Ez{^v!C6 zm7rB|gK7n7aJX;XvBLEc5pevb_PsPQLRlgruGq$Yq$lF=k&Nd;OE{D3*L~lpN0_v~ zK@|{^x}@ll$^>W1(=X?}2B-6>Mrydx;P@keMqM%9j8?SOIOWSz#p&8-J7d6+`*d1? 
zbVn&;W1V>jB(i>9iYJ|df4erL`F;gPyT|%mN2VElp4YKGZaUw)5H#n!qievUUZHL+|2vfA?>}%it*#3RHdAUmh^sT47*WC-2 zRO_xq6YTb#mSXjeV&r~>Y+&v`Np|S3flW;NVAAk5nXzzt!P-1tX1Bg?xK~oYly(aU z+=IOl?nNjg(+B%H8?{7iszNi_fbQ;T!d#MMOvc17q9snc*HIbkbHm3D`ln*NB*YWbKZNMD+U2NMB>LOOT!Nx z9Z(;igl|yHq{oldCr_9e6mKZG+iCBd2}~<8c;XU zd2ug_I!6NAtPy5_VyVG!Ij#m4wNR1${Vi+h-7DnG)EZLYYRM%v-*BcqweuecgqYbz zc=$YqH8kjvbv+%mR-sy1X!1)}TxFq;u>(p=L~j=r$5xgvC@2U5Bq5!9G9FvY7w{@>eCQP;=yc7;dI8rf#`juCeQ6x~+vI=5 zNX^XhO53!$=m=)UB;(VDmS= zZWTB1|0^Wc0-n9hOrMjTvERv9E2VGjvxS+AD&x-b?(ZfIh2mcZG^%P zZue0@8;5J9=A!yv3yIq2>P1=u0|v}}fJhUU-qX|Lt~z6R)-`cLTm3sG>G*N=I0tC} z5itsji#zV6adxLolaO=19vp6ZhPa{2j~^{_kbo2^*9LJ{pzP6WAaQ^Tw}iJMB<(?0 zi#>$SeIFJFB>NSG!Nul-K@)9zf*;%3d}=&6SLsHh(V&Ppht40@&2;z$;vEpd95^K^ z>qqu9Mj;eH#yZ;CC-oMpB!Bp*?BGy4!P4Ed3WB#k5?H#k2$+t}GmiN3`HHKBMM}0d zyzJyCsZ^4Agf=*EwpwAo0~SG{gw^*NdScNPmB{TT#<}h3}WWU2quor{@+I z{X{jrJ4@z%>IRNnMrLMmMuvDsZf@GcheAviOJ(OPfyA=7r+hyDTg%YprYtTFT=T1{ zQWCvA*HSl{DsST!5D+l7wHkqvxb3Y4I!Daa0|u~Mi%hGq-~Ykmw`&rBD}ul#=jRJN z(Ei0d)(CXW#@hNMiXhJB;xd%LVB@o?$rOq;JG(1H+cynbJAB&??&bB7wp8NS5;c79 z1uzt}<)hVXpjdpY2)Nho3X@ueyCW!eHb2CKN!siCC?+#P^pLk_$y~6U*r7uvjkXWp zzrSKpb>sAIV@d%pJugoqA6g%z{fmWB;0R*&0l+T2tnK|FD$|d+3WMY0jJj)O)y|P< zZ|8o$CUwB>jE|*ifF(@PMrbTvI@rzaUBV7Bt$R*2wM&zg@Wb~%o=;m2I;-cAd!^JR zUw_FwFS4}YB<68bla`m4*LnC&&RV^;GRZ3JN897RB;c~s&n99?%%P$52-)e-*I4XX zJRUzYjOs{~Qo%f{T$?9Q67?e%xu_{&4E>BzQl6q+^rl-%1b^kViaWjP@dzo3JZq03 zd*dBD7A22>BMhcVzVdhsg2bno3$j9UzatngUmn$G?FY-52g{x&eKL=|h`-M2?lvJi z;QUS@ctaBtE~2lzb3(NB_5J4N5F$y7k>-%?4UJBa#^{Yzsn(70>&s!Zh4oHM3@OU8d5?fSVM?+deI3LIHbNn6`1cd zS|2@$jvQkd-&KoLXwI8`Km;x{HQG|jXz_%h{)biC_mZR_RfEg=}3b*k(Y1}ffsfB`^TWy!&Hf9*2%I)a_6=izyYk1E;e5lMopP#%L3)&85 z--ncsBjC%94|2vAFCr@oq=TCre?A3?v9`8;)!f|twcD68P-v{2+Er1$9ZBa!iz*wU z9GoVgn(3DCCV?dNkl;P2GJ#m_HVeI&ei8oaSBc9bw^Xz*-lVeh>l5N~8WQy4S{f$u z(rTe~`UqSeSKQ$Az9kFtw7m#$lfZX8t7;HSgO+K>?nP zCr|1|th^GRZmg^O_3}e%9+UZP%%P_AaitBA6-`WwDCBX0+x^oLkC>I$oPZfN7&b?- zJU=6iI&E(VfAZ3@;BB_)f_C$EK!g*zKostlFLj`lFdyxsrQ;u~+pMeY|3sS5%d#{I 
zi=Lq9Jkh>pK3)!zt>3RLF+g!Z-5)Pk!c4r3%f>1=3g%!FC6z1|@v|*(y6Hl68^PD^ zJBu&-e2eBt1mYU711+!UhM_(_-J>lWgaJ=EtLGY!o12ge4@Ko3aRwO9rY0s8<*lGj z)u?@7YX2RHk)}3dr{WWPe1`$OxZoHZFLlh5VC#F1v3IoXPPWy0A#gdkFVU3b`={YD zpv{+T%IxcToGJWr!B}2d2Sqt_cTp8JNc*;VvHB~^dOu+9e@PKAIj-F_hsv2yo~9dr z%^qHO`QK~?s?oJJW64`{ogE?CNT)nx6Bg_3_kY-X@3^M7W_=Vl3MwKZTahM;6qOPI z0V#rr6zNTo4$?%L(tB(a1%c24(xpr90s$06N`MH`t4Kl#J=72q?h3lwdEb4`@4Wkd z?jQGF{xNDo)_1L0GtbOCGo$ZA+V#@mDF`Zv3OmlmZpzO6a5m>7JWRCwKo*--PMT5m z$Y*TuE!w%O@5`+mX05m8ypLPF4-JgmWP)wRbyIYm6*f?S4?WTC8>pFh5Rr{^l}2dP zj!MgXbxbGp7VE-+ybyXoQ=}3@IwxJbTbT*VBkQ~@t8!C@WRAAhEk_KGP2&A`Y zr(EBDt8d+;MH{t|I5ah~cG3^^XB|8k*Ux6+?(SYl1G!_IMQ%BfQyq{mP3gj`U=C!m z9J8>H4|p*uE-o%@!q*BgjVgDV1ZkVEQj`sM2NteqH;GH`X!d>3pcG-LvDIh!Y)fGi zo<7_iRWlyw(ZBLQJ#PtBNH?Grp4~!QRMt}2QVi;F$BL)|Cy8@*1v;+Peooyr!8eMb{qGaL6cz8z<}s` z8fxnrV^2e4#$-LW+7AoV*bUSclWi;*d%6UC77x23@uFo zvl-T#mJUFu$6+6g?%j(}feXa@%?mc7-iN-cicCsObgrjH_zT_qOG7QBzK5r{V7&8q zR-cujVFD9BxeA+Xo#9zo+%qL_O{$&cv$UCJd_K%)k7Grop8e(1-3A_>@$DUNv|e`? zf5Es1!i|lMvkD8hp8R>0!6Qz`CNYR3TFvBDMHXtj%whzJ)$*v?-J)P13RNrWy>wvhAY!qB?|)9#>i0*~J~W<9ia=`UX% zcgzBm6}ji@6`8~^@Tgy-yfqKr>x~=)P|J}i1_AueV2y#2d`1DImG(}Fe?c3SD?-9$W{tP?o;3N z5@u&-FR}o%nmg~+|Cjz}_bq;cNSNI~`?z#vW@fWgw)5l1-J{Z#COWy%YxkIwO=?_k z_so^&M&%X$`Ei5e_$+84eI4u1m`>$Z1-}k_OTNRS63VPm7j!PtDG1 z)EVdZH{baSX1S%BToMF-=7_Ehq-0U6 z@`V=`GJpBZ;I^aPf8kg>;QId@3*#?m@o#zq{?BxfE3qp$jaof43*=YOCqwquC$*f0 zv>ik2&g-$G__4x?d1Cze-yN|fDgZ_{8Vly%3UF{Zj?U8uuGX>@9gO?qWLAMrZfj+` z^=Z3zLU+K)ArS}OFfb)7_dBK;sA=7Te{?EDdaraUR*Z$=Gu)skuM9{p( z-)^ecO*LO*!0QhT_vy6?IQ6(I#tjvZu@c<<+4E9=sG{$m$r7L%S2!_z&5ey={;ON^ z9~<;|ip<*KRwXa0ChXv&<2lws(-_fCA-ic6EZ}sH%pTzyW4og%}oA>UYOFb|9>I>@y=CaPbYJl7MLbDAY9aO8a@ zW<^xK;pkBcf$6^_?S+K=rTvd-MA7LdoV(FMeAhNLD^f^ zt_rr$#ExjDbKW@8K8HABdUGSh(e&7rXXoU%W{yGXY_ArVFQ>f^pOvO(`+b3auPepw zG)&Y;ZA*`co+gdk`IIew+&w->^hA~~zM$0Pw-`zHD{#or9_hA7!0bpo2=5jx90{5U z@gAx1cTy!jpZLWv=)35he>rbYzhgIPo2l5iCavO~n>9baJD_lJ~KAh$bV#kub+ z)cCsnlQcr#t6N`);vjcdw@{M4Y@VKWcuD$)XlCi?`RG)6pX;+V;NfqBK)kZAe@)Y$aWgK~!q`;l%@=F~O!5oN+L 
z!=tNn|4$FRv9`1MNl5JK=0LwDMbOXG^&Hc8l=G#|dR;wMnqtvS-?_4Tt*EG_6VeT@An)&iJ_| zW=Ht>Mu*G83U8lUmM0Cl>!^6@_qa;>qQ@cpM{J*c;bd;*2>g3>^*E0<9EUp`E-L3bw^Q(1XF#w-+IWWP5A7o+0D8aP zZ*={CRN(pvCnZ0#mlTN@G8$m_qU0V2e_4aA(K0x9r?nT!i(?BPG?)&fb z_B-#%SeR6BaDpCGuE^%_=Vf=I$ujEL@6Y*I>_J6ip|8L_{{=S9e z7ypqZ|DSiHv8D;h^5$nU!Xvd%+Kg-oe;n59qCC2LdA}XiY;PZRD3IbGt14M_+c;oX zv9xj$v|a{3?%d>${<8o2L)WK1>ij#6D*Jzc^V+q}qLT5WD zDJVE$0Re~BWI7%O>xTp#XN%0u&HeKtOQt;djFYO#8p3~$%E~_do_8TI)v@@RPvdNSo>Q|gc&H>qdG9Nk~ z+p=MgbbdG^tPXbZ8fZX%@#A{6GQmGn_S^87udf5N)zrUXVB#vWj-*k}ZaEz?Qm`?< z>r8!P{ScsK<`F(G)Iqq02QhVVvu((R)=W=A%QZq~$x zaIaFsdiui0^f{LRQy*QdO#vO`!g5;2>cc{nb*V!w_vZB!NYrM=rmIB7o%hY9VZ=Cz zXn%p11a`jz6sg=lH~JM@d*ty*cklA3siPOw4um_;D@&@{u@xvOUPh<|bPd&n8JAFI z%}UF;9W4s!t;&#NUPJ^O3DZ-tsc=mp5qOu%GLmI&`nurxynYQ_%WM=B%v6g?xDKF# z1|6c;UtSKpKS}D@A>G6KYuc_bE`>9_CAO!gWMY(ee6mIj*!fi3e1g~e-mXoS#$ZdA zh%dGnt}B!m89kdK25lOaES+xgG4=4LpfDSl!cz8MCJV`@u(^e}u5kCbn@N|x9ysdu zw%_d*vVeEdGeA<0i;e%fLeAF;1=|^WW9odPppGU4V{KuEI!RhZ;(7DdnY8xReuYUy(r`@s{xR~qFN&KknroTs0-UN@3L01RcVlPD1 ztT_p`lo4q3wLK^(lq96qfZBFHmG#_0g5T~yihJ;G*xl+U+Lx*d%cm;p8L%Ebu3r2} zW$j5VJ?eAg;oi&5g)Y6hgSUB^m#$FhN}0TuJ2PkIonPSMdN*Xb&RNmhkNOPQ@&jyM zZNiWP6lPoI1Zg#xv}U=+ugRtprboA)zx8{oJg>atmsXz;%R3i-CzW$OuUh6zFBj|S zKx}DY@+9$AxK=LRB7JgrVctCtl@bV;1n@d^{Am9v&AX8{CcjYJXu2;UDLc~wEuh=; zI0)l1Pt|rb&J2T3+}=pZtnT-E*DIbk=#^g%c6sytPR-Gn&AF0(aifVzlK~iQUG|E< zt8sPMdd(sQMe6x)`x0#^NKE~*7cn=r1y?4xu^fgtoU_?+gSn+mMMu+ajTsT&Kazpm zde6={)>WPID5WU2<{C==y+D|>Yvz{I)|3{Y7Wwz@eCl3Xg!2yY9HMw)*|`7odgc<6 zm1`D~3SC_q8DNWzKGsC0e(7F)YHbXxi zo|k^l+nZ-7ii7+Zw6^bR5t84(veIL^|*5C}&b|6WgJzZKEWCfe7p^)od1=n~ z5ZmVs(2lN!cewghtcdPx;+-X_;Sbsyn?tPQTf;Sx=lag_^yu@IKJ9v^U5ZkozT#WI z>76vZQsAxJWwk!q4Fj4=?b}aCkS{wy3ncL*P{IW#!6B1>p8RHk%xK9FV(|8#?Hk2! 
z>h_7qVX~z0O#XHs$sfJ;^Xiaa?<10b{iWc4%xnN6`EiNKZ~VXYL-vAs_;NmsLTFxq zYw@q5?b=HQ-;|0Eir$pPk+!bLTt+4ri{HgXjf=0>2jS`4rfr^QN;3YmxW6y$VN(En zO2bc+^!>Akm%nD%t)l9y6DRY1;G+|)#+7&Lze|(Xeg9U|Ta~ZSbEt4=9^wkg2{Et- znH=GcGm_@x$r|TFy{rm=T&Be^-v+B%(^i)wW8=3Lw>KC5D`;~lUMao3)UbNmwj0i@JU(n0yA#?93#T$i z&mI`sH~s=eMQ-thWg`Km00n%IxV!e%I{L2nMn02OG$j9^UFvCEz#ZEpe8iEE#!8fb zW9YNz!(SfFHop{$PHdWKG3BMHCZ!(Kwea5XZNuwms1kXetas1xWhTYBN&E^yR)S));^%0prQ=Z> zt-h5i)1_?54nheI8}_jHKBQ2ccXye-P@Y=y3}>CULJw~?XN7a)+I!Uk1YZ;{qz#Ru zkH064#v?)@GBfzlP<|QybQKNnCCEF1MAS~v^lp-RoxFM$sm#kyfOx!2!07ZCR=l77 zndrr%c{Bd)1SzAD$v1N`Q9{L@@HJKew0F|TH}8$D$?aa~)tSZZUXAwdHnBGYjM+}> z{voHIo8~43k?~;d>ppwX0Um?s6=aH5rHX zF(96mDGB2{ck!7lju!zm9eEn)G99V31tHbgCc;>v;X zq4lEH++Ij)TW(6sl#UFSBF}`SqaFVHr!C~_z_4nd7!>5IZ zyu^i8y-@@4?n}c|19^LE&q$7QUaSU7k%a-2({3XRiqA|tlU|Y!L-SQwyQ+})0nBiP z-A5xK+ZQ2P-p8Z-1uy>}A3^F$54|oH`CS-7q=#FhGyTp^*Z2V-PT8mu92K+sqQgeA zQ|Tsc$5q$f57|DXT?HT;$=i8vjbBX%h<)zasm&Jt);|{<5BS=BFFhg`Q<{kkSpJtrp1FLb{_lU z^_T0;ikizk#AJ9a3+FLfiTd;<(y%?vy&wc%n<{;yp;CE;7vJdo=<|0g>y5~*WPiis zsL)nq*@d1&JiiWI4x67zx<850nti1q#3iU>OD;v`a79{GkV6+jrI61kvVp}gMnyO! zvOWOfIIo0N0GCj8ovAWBvjx-B7>w^6o~wemlTnxQc|<x zvymo>!!E=x$x}}^jbzHmK`A2mOjzr63dSq6}jem=#{yf>!eDQ z4BBB$nEOWWw_>a5I-K^jiI;zk{1O|1#p|L7zp0-IdT=x!(Xc0r|F%-@YxbZ$Aqyin zK2JxBY6}WtL|?do#C5~&;0J@vJ~3LC;`#+(Rc;}y>n64BW>$OtzWQNZ7!1{NietDH z&UpqMS;iW>mDU^S>(2q%`~J4IFB7plVOKuTxs5YY_kU}QuG*2$y6fS;Jxw+Q(KkKy zWsE{!1!X|HlzZltd|pxmHNaUHRz9x1+2hld186flZl;_Q^RS*${#l)gmq!u5Phw~ ztJry1mQ8R_a!-HF{&rnw&iMUx`yEiU@HR|a>MF??H##U*K*QGbPzm3IZV*e$1O?sq zAI9aTWrb<>MwI!HE{Tiy7$kSPwi9)xaPfk)EHus*4XqHj5JavqdZaMe*RL$@QLFb> zbgPEjcF~?qN^46UH0s5Gg&VY74UL?uz$?}b?A2G4AG^C%gn*V2y^HpCY$W9!6O2r$ zcxxdSaDm@yw_2WaEHfE^e;~Mwdn|`AkmiddAwBgpRE0G=PaYr5JOQe>8PmU?|3AIQ zqDH`pFJ?K1$WO@}6=67*tg`pUD6U>ho43W{h@eY=mrJcy-D385n5#XWX-%pcNVC$+ zXm$s``bKQ_OizPAwziB@a#RjVP3qcaXVM)%Mw;`KoQ0=6u%VKKeTxV&NgaW`pds>omq3GJ}_K{BR=8LO0Dy zKh#$iJyI0B8{`f%oukL6AAo6i?k#;Wo9#ukl2p^~-BmQLbPj5x6~+<`brg+<*K*P! 
zEu|lobK+G(>}E8tqy3i6Ek?yE+)1wMm@mr3zdW>VwL?i@dy*B2b)D#EF_OQMaZuW? zjmVwJ`?^w`M3=kdi>D&>#RTl3TgLSPx<&D`4^SI&J}-k59zc{UP{TjtOivC1`DDj8 zMwX-e?;GEUGY~iLpBEmFU9%8^JHXf5ZJA7^FJ1^S6+~$)ydl`xCJ`fy-jFWXw(kZe z0s~8iNnX}dA4{Nhjw$ADb+KPWIH$#E<%tSKhck>0g7)Ntb3k3L1q*q_Gt^OAxa zOqWjM>kjPQ4Kj^eQ8Ymae173&(<&(}fQPYg9w;C~k;)QP_TCvKlCBHC z)mFT_*#%V(Jk1_mg2`4rBaz?$YaWfTq**dI<2?i;oq`FkaZqQ;FU9?emq{n_vp_3a z8o1`SDuZ{|LO4t@hdByNea6}4@p29vPID*mN&Yap@ZeD)0i2QnI%kWg+C|?|^t0Jx z>1FCTEqLYSg~6w)J}E&|PTej!NbAE-?GzeZz)OJ@Wy0y}&;f*h1WBPlPyqhf zlpm5s%x>kqu9Joj@WXu{wE0As!##-n+B~6F+wpxl+Zzx>w%Mwq7o;ud+04b~l7z2& zTu58X*^_HgCmFl`M^=Fx;8Z&I0^SGkB4?R*|G}4JGgzhHXIPF6aAh0IsWR;wnBq<9 zsd|*>Rd669Fzoso6auVo+icH)wS~6$zL3(0`K?WHwuL$&<`eGxA*@@ZW74to_}v$+ zXP2Ee*pF>j9qgM?#1`B$b^FM6%fPL-D-VK4rGC}2Y#8FfF*b3eoMpsoE2EqBVU=78 z5=+10@CbTrx@L}$<(YTdn?Bdpn7kaZ!p`2lDV{R2R34uC%<5S^4O0#?#2kHB8SdU) zHgU1J0vZ!LrCh?=jGK(zWTtqRr>3auZboUsaM?a`IXY=Yd?a}JY$QpXmy8j)&H^hC z0bpbDe9$H<9oab4eUiIS&X_rDv-s2Jy79|-#QG&AlhZClp%UN0gAo$WI8|&zJM85$ zjetJ^jAjzaM?~Nq<`M}RmUj~nb9*sSxqBQl(>xhgY0bQA%na_aa*B;}YpfzycAX8- z!c!dVtecA0uxm%5@72&EaH3&p%`>HWUa36{7YquTqITwuFKwU#Z;{+p(NClI66<+G zIHJZ^gU#IOs*TjMva&d1l;a1A#Y>0sL0v>^d%H4l*xErkxp+{ax+W@`_T+$q9H{D# zq>mHmZ&YKjAHJepC44@cmR|*Q6s1rjJy0c{4GRxPuCK2b9=-x#!GC(|S{-R1E&=lK z`gME*v77NW>L!WxT4bCcB||FUI**xrxeWW7Th$&DzZEx9g)O^0<8oa)V#EB4v%l_6 zY!IfwJ;c}D_xABl)ppZxYmDE=wF}Tf9W=pEo|K}pbV3L#Q#(5`zL70?%fC(!J@%X< zh51yqjWRlbX-_=Ho91)U(Wi!PS3iAO%lH^bqST?R1pFD{zHg-!6@58?JxOd4y6XY> z(}`aJS=Qw^pdfaLqow)_uj>JJ&ot}t%tphT2$o+|f42=Kiz01T8=G_fPmu^YG9Gcm{h?=dBD)ysC94raZ`fLfgKRG$ zYNQA$u}!t+C)%bBQuqr!(Ij@0_;4?)<*iEbOT74(W)R1#QoV)+{`R-fY}friW8nc9&)MyrOrE+Gnh9b$Jn3?>~WepinBNrYY4A z8jNQPZXyGoGuO{P5F!R;_q(~f-?g^34vkk;08}QX)f{zo_4ZGnUL0Zn{Q0vbXhXOw zr=s|lmW~byngAYgqoSfx)y!J1t*RP$VKp{3#=Q+jt3`)}9k+FIf&*UH{ou)-ey*C) zen<`L7P`JQ6}HP_RQBaLFl32pXr~VY@5CnZ1aEDH0vL zWq2WRm=?_7Y#yfh08}QcW7Z~9A9-e=A8s$RlJ;#Q9Vksn^u3~_%Jw;!9+F0`MQ(nf zJ~Ey3dDcnUwVFg)K#wp1%=aH49&*TsGeY4wgeEcuZx09VuSGIwhXyx#Jv=X{IqB-}N#vk+IsPr{UB*#Py}`NX$lB 
z#?IE%eFfuTJj#2GSs{6|a+xP0pNW>z;abLlWZI)5o?qz|Cm(4l{`#5e_+1*RDg#=c zin9jn1@L~2zM6+g6-%JkP~EIg&sNs5R^L|MvLPxU8Dea`l{fDi4_<`#^_}o^bnNmy z?R|WsHf=)2c=~eqd|ze$Vy{2lhEM(kkXiVtVyY?R&d*^Qm_xMFv5wco5E)9v=F9mDK~r^Dlm zUcD3V6=pU2_j5U#|C!RYeX$~B7zgXXdNG;!-jkRngQ@Og0u0Sb-;^z zWt4<>nFFKrxb?zX?>l}Ok;7|UzA!#v?I()npW=XCJBQO@ZpS~tN*Og7(kuF&8>#$05V!<%& z$ISigu3l<0AKy-gDZNZI5o`KE;%@?Van4_qg3Ms>c1L z*Gz}9%Nc!?BMHf&VYzFgNGg(su{$z&^#$~N(IUQCSlXy7BEGA7(r5S)%WuBXU%WP??vgj?%!eV)Xh}z(#J2|T*|#m%UdC0-3uVcjOgS=YkdMtg^-h?l|e^R2lSXMRw1~VG#lY z8wTo%T3C1Qk>1YGFw0w96d#uMgZCFPzI*xAoqpPE_bw?d-|VjzxOALv=5?XlZ7oz< zvdTbBMjXKEYP%k=&66P@+@K0jJ8DD4C#uNHH0TTBwxVt; z?sH>vkEqt;HLY5lC2^>&&9_kTGn`I&d3Ofg-T``iU<;f*zx&)gfXTz)R|nc(=BC7k zd=*n37&S#!Wcmq}dFwj|v_laa>@FzOHVVE=JOE2sIDy|Y<)gjW2OZ=C6Sg=ShA-es z=wXn|UHZ5XOS4nmuyBbUEybJIU?o3=w_y>zGxC8=x9_Cz4WyX&Ug1r^H~BQb+=oZY+<(4O>VFtA^&HR%%ot1JSfa;=3^A5i z86-f3!js=SZt4}gzqU75&(edmw)R};5#PF09f4;Pm)>9}KTY(D=$dfhT+>((yS_cq z$Y^yRAHj#-$vlVAxkY5}Tz4-Bx4A`nsEbbW+jnYdJLlz;X$7Dl%<VBJr&Q^aynS-sF>vt^ez;$xT{aUCH4`u;4) zO#r*cI`_t8%ZZ;%pg~BRyt#nLXKX}zob#>?|OJ6q1}diBy(78)-Cm}5X)9n;)CLx1L;6pT*I@>p00qhQ;KtsXA4Y3B6b_X zi7sAa;eG{P1?iVIs-qyG(Aq>*Vj|gjo=a}fSAZu+582xlTX|{g!_vn!HVdLSme2l* zM$>F35gR6y`!R z5Ed5Zb|8~Hd+s3s0s$DbtR!6lJkR#GQtDAjuR%IDJQu=9`-|Jl>T*| zy0xlS5XMz;nTC25$vFLjWcON=iC}Sj|1q@Q_wjc^XMqWC8vg#?tJX4M5aJ+IJtnTY z!=R5J6JtBu{iQ)Osk5<4`cFU;%=crZZ}*S z-`U7~$sQ3KFB4U5pWwUnl3nm4M#jHStaaXh-7eE*$B9$#-c`aEt%w2@Qci0EqJeHN zT@OudAYU(ylWGUiCY+z1JjpP;HAr#Sut^lmi< zn)}mk0BW44p?6WnLuiLC56poDtVfQ9J_t_Cf>WcM8MwfpF~-?|c7V?3nn_y=_$tgvs;Uk|StFTA6ui!CropF}tABG%W#-%z-K(yZw28#+#OhBQ0% zqV6Ud?FHjDtE3=>=UyPw`ML4$H#{c<=sWIMI)E@t+sRoD5S}tV#%r3qYhEq2#v_E- zw$ksQJ!rU{n8b*0e9^kR93LmdfI}9=UTa_Hw&N)BQRRxnfAckVdYxBsM{(7^&0IEzwegCL9{~asR(G(4Cwp# zBeAw0n)^4p!c5MRrc9`QH^02o9|VT{)sKhyxm?WOrLPuRb0lNU(=N}mc9qQ?Zef-; z38<{yH*0o#7k8*TIb?dcaf*~29vUx4@uR~zNb0kmsk3>cRhih7?UyN62>CDAkJT5b zwu)ZXXc#jzJ+`d;Tm-bCb>a&Qkv+q_pi{+vg#Gyql%q|!Z4=T^CQZk<>a=BSAl%N} z&UTQ=#Q(YP_ik9JY^L{j{0wGt2GZ8s8|`%*GYf|rlLi`g;m&id^bBTLVg5DzP|udB 
z_ErKqvY|J9eURjH+1R>Ezk0C=Df2jLC~SWVPW|*i|IHSpa)6|7DF1aXkgAGg(2?Os zz4p@qTc8ED;4G8A_bm{J3BBMlE^0oGi>-=V zDNQ`3X!>TPd7KmeY^6iw<`=qC&r>_2>3)%`Ef9bVf!-i?7q|u*6=ja`ACYa~`n7Ku%zqdqVf zp|1lx*|i9|EKbD61pK;Uw?s^RIBq6$c+=mQj4Dze@2Am0E`)5*Mx}oMjZ3XnL#cBCLd-iVJX2HV}n9>psRec0GOG3)L9*gnk%e50ZtXnV_j4+ zpUtWp>>W6*JnP@r>A^L8zlya5`ES}_)|UOjsMi*@9@o;VIZ}&BQHnAIeXmNd9;pN& zI1TZFkSAnwkW1F~L3P-N^?7zM7DKZ{m-Z5IYyKEZ(@e$mBJTnnV1XR?4KV-fQ<}lk zFK~;5+Q)kpu_pOiaOV4_SMB=sG`!ac>&(QgF>nCd^<~q!I{&ddFprIwzWux|-s2-q zF{!1@Vs#q@TptHB<$|7xSUn1~Eu{ELtv>CR^s5Mz(2m1A9u;1eVc@cnA7xCQ-42SE?afo{Xtdm>9(;iN^${pC`2_ZLBye zP)QnOT0G*8V`Q4AoffeqG(*}k{o`h?=(zRMayOmy1!s+Dwv~Trh%{Tu7{xB;Rd2is zRijf5s`3k=y|i%+N{TBtS;9Tk@viN)3nx{Ml3Kl-jI)m&bMo3pjh+yn>~|xMG{2@@55Wku1VL zN~0D6eX>hdXJd>^TB*bp6e3|?3fkp8vAHR7wrVE5THFx!T zJo2b@mi}N!{Uq#->=mZ>r{5Iv(*FR=-&XoM>I4FfcO(s^`3L3}V|>>SN_rX_;Wq&u`8^s={=sgHZ9KDsrX) zWYl-=YkHZj8)Eul6L31ZCAjJQ*@yw-&X|KS*eYXsFU9ay)9{z9@u%X)K1<5$(m-0g zB`qI$J76I+uRk3F6$jUbHJ#X+HzXv=TvM9Ogv*OI`_D45F2+>$i_Xn5MQm2~q?Q(p z87}e9FR;P8$K7ogyJgJWC+u2Qj&l@ndu}KOym?gd{usbYkKZG~iW{n@l$`DBR-972(68#Hk&my(iQ0>x=ad zf5Td?0q|zCp3rxQHiFX&O|x8Co5!5i@q6?QCgk+ZPj@fzS4+_m4kAqzGN8kW>>BY> z#iX3|4MAitf5n+fCZK%`t@{tf6jk%#c&~G$tdp!(8$(-N8y#&;#SyHAERSs!%yzob zS&b@^LS(;+jmo}dBg>a(p|;{}LFYk4}8NN8x_8%C91+P_QKcrsniJ1A=*7`lEj zFt(dMIZd*MoazITc4&E5j!4Rx+dW;Opl-Ux({OC3Lp3TZ9S5lH&$`_BZB^UM6b{ipbFY(>@XeKigUL ztJ7+Y4_@ooy)%n;mUX5e$#LB`Yz0>nq!&p%WYcT>e?B7HE! 
z?XN%wlGg3*_s5PMQ&P0NWz`Ly)@eB=H<#p=e$C;VEni>+$u~$(M)()qCZ!~HRz9le z$;E!GDw#(fp-TqZ1?rp#aEf7*^swZ~kI~d(G z9zHx{O=Uhub?zr%0NoMTHFv6>8M8cPIv6{Dv;D!)Hknw_h};JB=_kgr^~IOxVsv`0 z@7ancduRK4uPoxI$Jk@Vs{G|md~%IV=#N}(Sbf_H1mwW#WtTN946(D~ z`c-!ml-TLn~<6l#yJ^HJkzFEO8Zl__i{46+A`>#J{n+V2fqKw2H6nTO#Rpn z`+I-JPi8c6DEBrHadLL|m#3@Qc`ygkY_TGxNMs2d4p8G><*4T9mg#E7Qh?d zzJUB)T}AbA89rQ0xze(8@s#nWx@BlX=*AZE{TB7NfZ8PkFgYpJ8F(5@y6>)Ezuue* z3Z`MHWQR_4Uu;3d_=i=y4M-d>;Hx)0Z-Y}7EQ@A=P3i1u888l`8Eb9{bkMcd`s|tb z7uaZ4KV31V;T_*wQ06k^`E$NT#W)z@L|i%=Yo^W2v-GZ2jf&(`#}&gYY(d}!}X=%lDP zoFr0hu`SakDIpDJI8DC6Taenbp6#l(yd6zB8BaIX=9+n~6y2FegU5WgGp;wx;mS{p zkB8Y;`o8t~YB+JZVMA?#)1zUl4yPhEdV{Q7C`R@z=f4-q^G73Na%13`WP+M2P7Rj{ zTZT`}?Y!7cK(aNSW_pP%i&yj_G9Jp>C=`sK$}>|E6O zm}mJGWuKCgLN^`R2D(iMw#|y?ukR8#Q^T8;B>oLY;Psc}KLXljO&!*T88#$gXNipv z=SK^Ms`v1X+f~RY1}OLP+4p=C?#=Ax_&domL+70d8C7by<L8H(Yu5Xtk2eSG(bp%l>bOebRuw zMJ}i+mTNtA+7hr6ZR7F6Ubf$qMu7PmPNOs3COMhq z>(cPLm1h=y6&euE552iB?zcbuyk@2T@6Vi~GeQ)Q8MJ@PXnEF;*apk^_8UoyV@25C$g@vjkifoC|Jry>u5P4Rd<1Nr@yg#Ei#x)AV5lJ~u6# zzffp9tr`gAvw}&*TSWuKLT&%$uIw`{DH)yizZEP-X+O^)=MzRvJ`|JhY4pP(+KYtQ zAHb10q0ZA%Y0W-^`6(Gg#-}nE0~Ut-LXenFIR191lW$QFtq;9Eiei1`X6;xke`0|}N?OxKZEQ>A) zZ5^qRuU|s2-}9f6hwXW2qSq0ZO4D0s)`Y}ni1pVR+d&UNCMGb{mHPrOFYm|GlK*T| z-mCwxo{8lhyYYyttCY>%F@Pa{(V>i;Y-2c0SD+C`lOaBx^Bx3w(%d1L;@zN5j)s0- zT=dpRO5ZQEHw54`CNbFt_i|;PyvA9u@sQnpiiqE%(EGHy56vTFlGESQgn!p20esU` z8PG|0vQ72!-xcA`@0|lV3vZ-T6y8WfvnSi)Hkz`Dqo{g;I<{zKH(nj)lXYtvC!0of zKB2Q6be4)It6AOn^4eYPE>q|O1h zUsMUkrsHtOl_`B#0}F$fTpkeQiRC(yq0`U<95f~I!OAsJQR`oBCkgQLw*lhH`sQZr zi$5%}fr9N4H#wF_aqh^E6xmNQ?9-4cGJo^$*~5N-ac2k2cdGjOP_$eKos!FR6*^#b z`HDH_V~JBNU_ZUFG3c@^I>!}O?7CA$M~dD8tscE6$4*;Zv1BLk6C zBO{||z*H(zSF}8N_bzC0@DKDLM?PAoG@3=QaC%_x%V|zXe4h(B3n?zRF%5RDiD7~l zp0%f0$bWs>0e(ce2*w*MwYa!6;3H4cDr3J#ilQO8@NeK_X>n(er4P=;w^?>vRH5Ys zbL*(UzOj%KpT8Td|Fw?3$%iRy3z(aTAy?4^eQwYx>_G4w7PLVR_efV?B`Si%@5E_J zB!kZ0ZHXD_OYwbW+`X=ktxeESx>vqa@<8y1*Esen49Ry?`8iK)z|D`NIJXB6l!k|g zRW)-_1&JvskZOkoN2ou>np^-&@q}}~g6lV01+~CI?jKCuClr3CF){HT 
zjL&c9g%u-&79nJMPp0^q0Fwb?VpO1#wGPR_zBRn)VYEt#EW+)AES(__0DNp$D72uX z8$r2sbz=vkCD?$PsIIP<0Zor+6u|R4dd)rlHZ;R#T zlM{me`JHrpeap7=fV;}^oEKb2YcDT-(APQ>l+Acbk{fTW$-P5WyM8RVFGlw#E$YES zqoU8CX<+Ch{CTC6^QS_W`dz2~x7g!EF{pabnHK90fP>o@bMqrnt|Xe#>l#0q*Iy*Q zOdY$2jcoM^yw$}+*ec?7?L~v+2DpSE)Z&+byd%I&97#P3| z8_K{4(Zl~tN!@qG1ZU`>T4%9|MuxYM+WwsPE;v8dLi@>aOt`ku@sS7cf>43i6|U03 zZ!fLVG_RoYSkIn~1;GJ^d!P!w6^xJk4sG?g{LhwA3;?s-)CAX<3zU~G^H!Lb>2h0s zYqo-X{W5mgg%6se2DqrC8U-8BuIykmv+hH)+>aPQ-|Oh=dUJ!JlI|Z__`~ld)@G)e z;C@IN?o?kDcq{)DKsgS2x1@zQdMa3dbu?R7)Nnd5GB#$6;J1fD8>;W*j|-iSJn$!h z{GCnny9oYIX5F*HN)laN;sgq3MmikK7B3OMz4S8u$GZi>18?ga=WyE_VsGWCiR|48c&_N_xktrd*J_^w}ubnaBWB8E(wSv7XE zL;S2WrE7bnLE}r?c=hZ~VS%)gf3}$w`P11LK-_F=%S}1OmatvN&oSZIO;~Lw%ttAK zfp7rZfyU3xA4+R$4Y_%ESiZL$dl>XjjGy^Hj|Aa>FcuJ)OdONe6^*f1ZD$=!ZES4F zz3E4f-=#zXo-7j^TTF9vbLy`PSN{iYggB^p8~Vf^t_DR6dy7K4s!(w2^lBO+S^Lf)MH{aWIE2!Gv*tI6wDgM`)JlqXKsmXh zAvQ;a&F{$1r#d&h7h(u_bB4GkWPE(QnLs?BfB-Y7Rc*>J4P|Cwp)uitlS^7o(*5uw zS<jWI-vaP0jCHo0qZb!ho)Fliqka}MaKO(bQ@W~@3@c)Hq*NTf z-QFvCG5=>fo))b`EbA$~_;REDzN`QQoirc$Jtp8;cZI+utaz`>oB{k-NGB!{gkAOv zMn2E^n1E8j&u1?I4hsx!(<_QF6%x|}- z@Xziw=j>6(Q^hdftE{MK`(Zzn(=-n&9{^~J96$Sc#2BT4ay&a9V8RGWLlI*HF*P;~ zYmU&D)ccg3d=O+Hrx-rnm|qC$$Uy?$FJ8NSuLU__-)($pJ;=)0S-YmTHl!+Gad{cs zN()CO8qNKB`#Dx zUt1A`BX3J2#ezZD_sxdDfP|YJ3fpN;3-pKXl70 z;GIF`(kAAiLb#$4&wfQk#YUKAOmy@uT@K-&3x7c^ zJ9B(di1z(csjxTiwGOGMxNNamGVjL@9kPt(eEXBP1XPc=^E)w`E7_p=B>s>-dui8ROMF4RoM>^OyYK||OG{j>V{ikX@ zNM(e!Y&oQ|E>5%N_&2M4?z!gc)p-U#SsFlT2sN*)zk0$;hz}}!z=J{f5ugQyFd$Wt zkWBOMXkc9pdWTCC5Xxv7fL#NWAf5w*0|N~OU`jtQF{{rlcar)kUIL=l7c+54h|RsT zhL8ymbzj*Cct3ALOB=G1V$#Y<8jRZaLF%`nctN`n2z>Z&YZ$ojl)ZZO4rb9?s`@K( zAouQMjyB*Bgp=z`ro6(&&LzmR-FXvQ!P0QS|NbgMLPX>{`b(uS^7KrZlr ze7m?HR`^k~);#M17{P0FX_=_C0=WG^lKt6htBF6Ew0}8w4!wOcal~qmCO=)98{1hk zx88vkw)PFG9-f{F>tkqR8sM6~KjbX}--b8@E<)3d47bF8VMd5On7rbw7aBfKlqYS6 zBQqzC-#^B#6#c`2Z2`5RaDX_5Yhs`pkvHi9E8=mC&F$rzTUw)i$~uFCgRiW9P5n;4 z+FrFxOfd(p)SNBv?f*vofNf?M!u~;*B{;b~6LexAz6WU2%T3h93He$7y!G?+dQ 
z5}c6rF%MRKx6NAw`YY={-)o2f<7nb14vC!1`6FHmO<14srgXsM*20yzMBG&M!UAZvOuGN(1S_UpN zsIIQ=lfezpSh-rJ@2O5@{a?tbMQw{5U1upUIp0WU9bM>qbH(}(egOfkYYIn=R4oEV zS9}*K_f-mQa@Z}jBG@r?zbLrg9oB*_rU%!$2gUfO+pozL-B%<7F?HWw9a<|BB=#8n zNAL6S*ZYOE>^txeW8~{}Hr5=Mkq7s!()zIVr_I|2n9yIK;U0kvl=ubtqWhSUTW5~h z+U)z#-7N(WeV$1xjQruinM!3t-y<16$Aq?uT$v z-I_VY80@E-gHfU7KiGrhHG_vKk8fMCjQL$vZy?^Yg6bLvMpUhJGad`MD;zwye*dyF zz@7qsumq%qar3Ova26iTcw=$hdonJ@AUaMb`uX%6X(Z^uKuN4|h2RB;wkNRiOOeYTj09oP=Xi2-`{q@{;BNm^kCx9%D>%IzC=~Mqqd)>GPPaSzG7CbWU@xM z(J`QLf|c(ndF@8KJCo(kSnx!Y<T*jFj$!r;lo1Tt|uY>b<$V z=iZielZMOHs*{&*m^9pas()?rs#^+MCF^#qEjhDGQsL|w8)X(#|KhB^C&ISWXGH0v zfl}^Z-{jdB5t=>~dL(AY9DO{K;6E`iR&8mmO(EtKHL_3U;uj~Hxus_HSA z=t%&>pTk1kybfJB0HqigRz6ioE@%mzc$BjHe#*>dUYSw#%e&IY&xw&+m6`}WDZ>G;*iWSKS49&KDYcq zsVqdJy;WF49t{@i@} z2HtP>(+$Fv#ZO03E+jEg_e+6qR_O`+!iBy#FRWOvr*U35WPD8VxZOpV{}U;~LdOsa zxwk`GXf!QcwE3xO&{&fqr3gSn!a_p9lt13HBbCR0+VQ6EKO%v722ORi-+HrXP5}S) zTKmrkh*oMOSKmskRqyR*R*jOU5@$sp9zOKM+pFp>PpeK0d~Q69vg~~@lqmgg-X&3a zt&HQNwwju1d#i_0$Xy-U0%bRv)5a$5;WtLJ1&>pd_gNPff!K68#~iC%!L=J0mH&`8 zP$lI3`@j!lR_wa^)|uc@)JUA#qe7ecgef38PDGT{v5|r`oa;XQgG37S zS}UwGb696KppIxAu9_(Lcv4ZOl)CePoXUr(uQDTXK>N}~(z250RsH+;M2}td?D0`jzDd?1bX|)+1c{{PF9}bb%;7u2>~ge{(_?(jHn!Q~N3zg{cjqT_yBLu=D>Rz1E+K@Sj^q8E@S=ueEBZ$ ze7Al0#o030u>U@o6kf$LWR8bRe4mCCUVekYP=3K<^i#Ao_}c!tyq~EAk7CGvFh6n{ z(W;L(@4JevDC3Roh94Dkk_>9LWI zFH`kuXc<;F_p&p8>vi#*d$}8<*FHuZbin19PG;Yj;>+`iibGrA1yUhhK^T}>Bf@3` zcxWMNcyp`Gw8Ivf{(s5KDw%`N7wjz=Zu$NMBdleATXcfb4ILO4s!H+&3P$7?bst7> z8n7e5ucshoR&A+owqMOY&nrQkar!nhMTd$zgJ_CiX8j30r|1MSKOcidJXN^TvG*uu z?GtH~&gJP1+ax^NP^plaD1gjGC2TfT+LpJPFXBx^(Ts#Npa!H!9JrbJ{i1+busV3| zb~O>$oGmi~VFg$Q*-XaTJqAaEoK977Yyr<$JN>8F zSLcaVcD%90KKy3VPe7m~h!nS1aH-czk&t1mYG{-1h$zLRRKby4(PEc<0#@s9{=p#q z^1FlEcn(`BWo0px7=s=WETxd44ULB53;Ii@&ng?KeDx-N+OUQ%{Ehbd(X{)sOA?T| z9Rkw6O298~mSIFyJ$!7itqsBHPUp)-zm!vH;vYM%T*`RyXCYUUyoraoo2w=f1q%r; zox$GREAd+&@}}2%gK>QK;W;-&i;hYf2}oBs=G6O?M3=D2dcPnpm<8GIt+oIiRR+}b zle#f&eGITn;o+}WWQP7=HX9TRg|Q{ZzrCNyp!dN8Mt*bm 
z&I_BMuNf-6{gSQ`x+OmWB?`mAydQ~f!NXLDmsCckvf!VCYV$lPJ-c7caBLjn`p!%U z0q5~+iZf#Gh0mKfuDDG(asLa{EL|ozy!3JzWqTRo?by9)n}~|4vDZg!Z_etyGYvM<%u9k%Wwnr`J_1@EMPX-qnK$#JuM+Sj;2xyCMwmvR_J1N;cheo8sSK(kI+CI1*7TEz>xrn^7Lfyvvs*EkT7Tul zw(6Ab_e#icAX<)Q<42vSd8ci(O~h1=uB5#7f7=hN8#o!}k4Ar>*7EVn5Hxh&zz#sR zY;bX?c?9zNAs9$vYDPqt5T#aMTl{2FO_=Pp682FdT&03wxN zQxk9|l2masmy3L&_S_y7+e#VvVZcu!O7{J5?BZgBn1U1H>>|T|YySDcHhreTwex`8 z@1Tl`6@?Wm&p-IXrrAUY(!y0>yQpL$q=ly;_~)ld|9s#0T<^B{verL8)rlMy^b z9UVUeZ&_YwJ$G9EHYPR4D8I5J>(WNBL-N17L)YY8;5SXt{bxN`6XmEps#tGYh~FS# z{>{NG5F~@6VKEt#N+%G%<+#B1tSPo&@uj9p};?d*PhG;DZ#0y z-t;TYbpgrLWCN*xWvajGvbSU2J}dJka*q+7N)i}VbA;E+3b~{Xz{#eQx0eCdwcLRN zSN2`ZDXI$MtDRe}_j`dT2-h>@dgl6o5iOhZJ6y^jy=)LGNcLbIWi-iqsf^sH4ZkjZ zZ}p7Acf0RYcVa;x)q_mfOxi1n9Xo@UQThxsqE@GqiRfDMsEc@WiLYUN-t7asR4N6- z3S2_MnU6qlpSI*$fl_P6S4q&xG%4*ubC_RvSf@pqg(pZF(qQ+H>pqFzsJ_1$Wz!fB ziPiUE)j%!@rZHz??^wrNZ-Q|-I&XBzp|0X#`pJl>=NO)HDOK>5pCUC!!M_~KeaKA7 z@E?+apSz}eu(IBFl@+M{$qvatFvcnwowJCK8pSZL$ErDjfcC70boch{+mlKhKd15O%zc&hO{=V?Btl!4GUY#^ zmW2GUImQlfXSs2|L3Olwan0DD+zl&vPs!iz05FoMyZ5KpN41x~5d2OO*$7!C>;6Q+ z;mX)W_5nk2J=Do~($4EkvMILT{;TW0iXO|RJS(>~UAz;}UFNC&rvHUj!e=rb_K_!g4<@b9)cLi&yYud5Z==U-%R>)h zc#Jw4i!QIYO?}?j+L$b5f;y#cI^I=sxPt2ho67u`z5drzU02fAGuIiFAZ`Wc%`W1U zU`O=#s*B}c5^&pkCIM#l^(;R8;$hr04T^^B8fsLcl??8wPI1rEm z-}vIABXDW=Da}`VH5K9GHAD4JwPvxzMl`fl20MJV9P43P}f2$RK8c7`D?y10fv9C!WwnWM(@h-p96O)E3d28 zi1a=OWR`yu{!_eF5?pKxCX7~YQyY6ARn|OqvpBIpKHOEP{@WocG$~jwV~6az5Kt8L z@x!GwJSKW0x;-Mf%WnEbt-4jm-Zi_FlAjp79IA*zwIf9WqtWt7itwk1=2nE~NEYFg zP3XiSZ?hRChIhiiTM}`fzv1sWX1VDL?ujkDqxUKTnBQyg>= z7-r{?YXrULZIk7H+NpE)vpq8E>ZZJ+uMU+mFi|#qm#(0$hbs3c!hq?OJ9_kd>FTZQ zdNVs z6Q4>h0mGsynEIzz#-E5fkN~QB1%FUDfWha89sA%+oSYh>b`j_gDG#*f+yf3~nU;a5 zdFu5|PGDeazQ0m(Z@-#JX`b?eT`9kfh*<~c`*{6iJov_|C3dmQ%?tkuOGPws`{qWc zDw$N(e_7bLT4s=E9fSqWngmwXCp2q@r_7}8ZO&xGZTKVJ7Jn%>D<~$lIq?;IyhM_j z`Q$y5224%nx|+v+CT=q^5P zz6pt4@{Trr@9GNEB%8I&rw{Q;InFhk#UAsyQOZ(8n1M;P_LqeVsY;QlM@LCzRKLYz zvBNEo*kuccjQsP$!hbx3rG<%>S6G*a7v}oODpxxV?QRSO(9$-@^0y~rzkswJ%o)g7 
z^q_071D!X$uq8(qr{tZ1KeOoda!He-8Cu-C*@W<=#B|r`y(b!^u6^mhx*fvPN3d(a zJ=!1KI1+ojN&1pGqDYQIHXP%$=7_>nf#gq~f6H8#r#%3kP2pIvZsrMC+I!Ep&Yhy& zPJNhrmijyoYhGMU;-%=q|~Md0XU+@-?NKfA^4shm{_Sw+fP8TJ9l@?cFRzesUe(oZ?07m~{f^bTlb;Fwkep1_b|@bN=hj)#TeyO2DEL_!`2TvED6^ zAIgDJWFi;=0t$4r))FKRY&xUt)b{?7r4 zVAUhjG3mQOCV;C$(`gQX=tk^s-}@9V4PuBi^*bqzze08r)s(M4O*Jl?Xv}RQi8A@HQz~`@nqEd$z^C?jF zTW6MXLClCYY#B4=gK(o4B+hEzPgJP!7)lGC{q#XwnEM2qEdF%~CV7^Qiv|h{%g^Cu zHWq+vF#UZ4F>fqHCqdpN9J@Ccsu6^&gjDg6IBDf-5VL+~ww^BP=lj1mQgI*aU0(>q z3qB)i!{uO(-XbQ3GI(ity5F3$<@?NXKt=}PRcER1zrV-fTeSnEMgNA_X~}hao(7s9 zUv3`k+$@+W?(RK_0{Wmr_z+RLpPsh*cgq{SJGb0b=Ou)hAcdk3<$ZbhD*iYnnD^ox zet*AuP|Q1pD_fELyKv2s`POLv_UGb$!{^N|f=oWT9B2th1a!PE1)g7_&+}UV=_;A^ zlcOd8jc548IDZ1zC4X$Wua;Y37#o-$O$+-YS^YQRrBl(6oV52tZjxA=B?q5aURcJb zCZDjDkB^vX@G@m8ciGU7w))pWSDh=~{R6d~Ko?G-T{=xwOvkuz>hi>ITeSMhjo*cH zTfb@4kHBY|DA26ZCg^4oeQ*xVl|LAyUxy9PUx!S|d4V#>u8_G=A0tW^N1}&p5`Y!n zjmbJ$yX-g1DtBV^I`4@VKp}GuY3hhDaF+DDAoVVTO4OoM)dME;->Qckkw50izYbRE zbpqeM1tj;w6(9*mb{YXO9K1OP^^gP08U@vY%{gN)e`kDty^^vW^CV^QD~j}-Xf(51?BK@udtwC-ou9v=gl}y|8003Gx46Jr@NCim#Pe;i#F_{*~@Qr z6A&r?V+Ba6vVPCix4d{@C+(q#jweE-h_2ocZsbHAQf%N|QBVimK1ia*K_CX4naKaD1J3&m<^>x;1FNMIa;<-`Cdykjx zFJ(M$1n?~utyGHt^6AZOkRAi14Fo=?e++znwUXW2zv=oluLM)YnNxHqB0_tL$iX|e z7p6%?IHQC_M2gtljoTcId%t@kZ$FrdLvjzrZuGas;No5I!re}vMXMr8y9%r|fHI`~ zrun?Frqk8#xoOG5LDYOtWD1d!t3j;j_1H(plmxB2K|!$o_fb&HDj{)`+hX}{Hl6M_ zN-${wL=Q|?j*t&VRIo>++kf(}(ir~g^gtsx7&vvJ><^zG={LVFvwqG)ucM&C8%0U6 z)URqBgeEEl+x|LU=k8K)yybtVun;ljm#ZW7%219`ligf2iay>5yn2Uo0+czWI_J8# z^wL~EfQA_;rH>lgbOQThH2KRSWGyb-H5UdTV)!`yk{f-IaJ&<^gBsutHd5At`c6l7 zf6{(1i{Lj+621LtZ?^}zWg{6)G~duom5^fG(@>fE5b;`57z9#L`z|i=XWwGaecf%ydODMLm-sd=%Ij8XR z0`#2X$&Gp#&Uu{}6Ld8E)>_Tezv({5HkR>a+i^>~5UAqRhRs}Li;xBnk#MBo4~HH4 zc5eCZ@4b-IoBLBiF;KSU64eG67pZ#>yHpj(sKI(L3d~FlMCABJ>iHQ9riQ6jmBIR$ z;&?4$IBK7~N)3RMZj@cAzXBYRz;e1|McZd!#X-3hSn;^;12S}1IWc-1{h{L#LjvVR zx;6o5SC9N8bOOcpX?P3!nA`3+DboF`%9Enah>Bg1*o19!JpKgG9GVtv0zpv0G$X|1rS*8zhG52OdK~m}M 
z$rJ4oKjZLY8uXw>Wo$+T=TxfbWa1tWZ9}wDL(4v{7kht@Fo;pQx0IcH{{X<)s0KCo z=hu&AdnPD3p7yo94{jLO-VX>{z*#?D|AS&f{LRg#Y^37*q-%R2ks8ln=oXrfA2R9u z!1}>HOAS8y;{vD^0jdFwS5mz`oG9l$yq-L zGel9%9AXzv*i1Keg$Xb%XM>hnOaa!5X@ed`O zW9P)(ji9gsQEZp6L{Rt1g96o(8_%-=-BnFZ?K6DYW6@tZ1ItP|7-18L zdjVSr_>_MjJO0zBJ>cZx1;t80YXgU^|DzTEm&3L^3ET=9;<%~?@m9}qq_6oUDZ98x zh=xUcMFp4$&SDDix_rwdPf0Y3r&z0EyRp+jEnak`wRu{9urq0 zROJ4+Qu((eWrh-br<{=LG*Xzi-IN91Ej^CL0Xsw)V*~p@L&7$j_4wL|X!hKQR{GSC zFzTr$hu@j285k|?({#y$zt_5%4SqnBLx9H`d*P4itUbR{o`4z7RtLNCQv9Yu95P<& zkvZt&P3puns$0_-3}LfqhOFVX&`G%+5b<@X z(n~o8(dZZgW^&TIt$JC1sP3LI#S0gpzc09HAONz_PZ{j}!l3BLOk0@hAHYX~huIumWwUz-D$;XVj zv8BT9F-8s)RDbmBwf0#ax2pT1d`D}-$HyyTJ-gin-asQx0=gi^xWP%{PK}Z2cbhX( z_bU}SDNfy3{!Jh-Z8c(Zvlq-*M*b}v8L)?cnx3=tXWEmYz#l~Ja!bKycz7r zWFa0EE3emFUgvgH=-8tIMnx;-V8{A6@M#o?2Tm|m)z8(;MUlMfHFWqk%Qf0IAzgc(vZJ~SU6nrZiR&%47 z1}zcEpubwTiA@v!x@UlPe;Abc_ue`9A6E^TT=4JKcQpNPR$nLZz>A-Elnuv%fm#>7 zqsrfxW><^Z6_=k68klbhRafno=EBc)4#VshjrJkN#{wtA}fuC4TEZ5}4Va$VX)6yb)j8vMl-l#ku_Y39?h`EG_V;fASW zx)eRtV)E|&ea%M(=;+|Y+?^vZ+@BRrzWR_U^F9n^k{!0Is71kyNVO^(mP#}r`j>C= zD=jca*ZSlgT%c%slZ5ZLQI*xbGJ5+y&9vzfESA^ys z!HW(meOYlj#E#pRtT+=DyY{`R#Pw!!MKH-CE$N=5RIDD2;2l~RHJvmW9oM+2ECSg-15zqZ?bpjsHMB6iGxbMFCl7oN6UXM6HFCme^(jm~({$9*dh2fTV-w8#-FVDc5(i|%oB51MW05yJji+UL0< zHCD!1W_L>$!H;l~a$|;>_TBDvk@@qjNZ>mKCwI)DM1!fe9n2*E5E%?^x)W z_0VAX(_D_*!y^Q9VT7K+50r>H*B3`g{wMu1iJY6uzhS-Bj{^OA&Du)niUSuBS+f;iNIMbo&-gEGVVRFmuD0$f$<0c^ z#XIAMY|lvv(kpeXmL0j27b@;@+;C&@AXz8iVvKJ@DlqaU-n$S(AsRsxueR3Lv9S6dTV1sYw;+VZEp40GCU8tL=No*u+k3-td zVU#_|CT&0Id>_@xA=j(Sp(n?Y=wJmO=liZz>|0=;eo|16N-9icS67E;%6OrchF

`Igza{R2|}>#qDM#Jy09*oEhd8=_JrX+HHWYgRcaA2b(Eb8WxM9#IYk3VVm+dR8rV89O*>}?4@GB_ zo{>nGKa~u_QhWTL!cf%HL^(jTi7t0R!&ui9pP^&ah%0DXZP>ZNo#b_X5&8+wm@o$2 zbYmw|7xsu|7IwjPyn%)*9O(Q&U?nIY%b!&8zg9Qoc<((T+?Xy`5pMS8O9-OJhrR<# z8X%`kZZRRCCfzmQ@=7WTX2pX=tS80ynaCNB7$2w5gQPH~vI+KL-P>I6;j97WFU0CG z?@8(wM@m^;bF>qm6r)X+@K#)dASI@5jB=GP#TtB)GknkrOJTYfWZxmY81}1@MKwP& zCJ42n4)=H+-;y;K@n0#I#%%So=)k(iG{51%($~{{r^{xx8%(*=;3z5Go!kL816yWD z*%46&^RpiX&ZGujHh=A8y7m0H<8gkH=eWF@fF37HdI9?(L_d|=xQ?=S77-JT?U^}g z;`i{C{lSvqL2cC2FN2!?HS;cr@uTF92mVPi(H#}%U65fOTByd?as>I$NQ8d5l)ARo zae^4x?-i-YIbs&66DId6Y!~tR0^Ng16=xmKCbtn>9wF1n zB=rv}Ci}}xYe=>Y2X|CN@iTfHA{M3@q=h|(k+}(2^>nE|`W7s5`3_h=%U0svJ#+O; zQDO3j8|E|aqohfzSq~9JLIzsnL(;IvZ0;zmW4`-&1;R&B>+!T-3Uyjiy;p5S<)U9% zx>EFbeJgHQ?jBLPSn0@IGhyzY_VWZLo=DuqP_h~(86fS(`%;P8=`drGyz%X;MK0=R z=_~1%ij*49!kQUS;QOr&>|17Ju*&K^6YlYq*Wr(Y+Kxbu`EVfcdHl?BpZc^{k%KhS zEZq@pTg~bbIL%1E$&15iGIOs!troESz*6^_lxDDYB5=8*vNWZ7hoUgGml@=gzz?k+ zLemvPXF}afXkd5;w~h|+W0DF}R>p+?a%ygUmUr1p*5#*2Y{jZArZTx62)~50;D8o} zQw!snru36$R*Ewep`c~0WaEYHo^2?hPBr@J(aQ9fBzYQqQdv+*lGktxxm}%nU~e&b zp;?U8qe^knB5LZ<+ezq069wK0S_tHfoN<{ckyjm^dxJ_wkx7Vwno$k^oG-=2E~>)_ zk{tgtZ{$lx0aO7SsrGbHv_%7=$14E_OK731 zm6ofKG+q?6qXwd8ay#YL(`sZjsaGcJ$bSJ3-@ja zRQwUIiUJ|*e}Yy2AabYF@5Bf`=eea8F`h2IKGZNHRp}_r`qnBLNr>rrVIw;L|Er>G zU=+F+#Oh1kfdA6gvXqfQupP)3lD}FccB=x|l(KJ??tdy({B!IN0mx8JbP?*;`?rR- zrR3$OG=R;4hO)NrcjD(>yrmrL=ZpZt-NOo4Nv40(jQ_J+a48F9XSW<2tJZw&(1gmi z(DCjAZAJgu{k4=GK?yqe_vcsd$_8ekGy1Oc@jp6kzByVb%jDR)`>Sn%kbE^fqB)06 zNdA|mrf+5jwBK=Xtbh{7Zxy87|Nl_GwSfK)In)mQLP~|I&WX78sWK0>s0wJ9HU|TA z6gL=uuUgQn0Zfo4+dvYW9!ppEAovBz6W;XhPuonii7B5GqK4WQwZmmzDt)LEl}n{| zMM_f^HH%Gy`eY8h#CFh{Js+9MZ6bB=a*lAN+;g$@tSyy{?NT!sG|==S%XimcnPgyzW)T+ zmME)QUPZT|3%<=>p99d7g%KqFL{$QBZ-0xmhc21*GAM>4AQMu!KcG(?(q3|(D+2CM zq)NH;Ft-D0WU8=xV+)B;QFVLra1Bmmq*}eQ9LJd%IaXjjo4iNiP@&kFFfpl%d|bAj zYx&9+72?9Xp2WIyHHMqnqw+%>b#b>*)NA?v2%7CVE{{oEYhMOJILJ9w#a4t?-Ao_% z)O%`1D=9uFZhf@xGT-sGIY;knB{q#)Oe%a!VEBTadI3T(&N@ez6$Qf{=?05sqC7I{ 
z=-V&kLeN1&%prJ~e}qs-Ja#bjw3RPkZad}kGvkDvoWBweKjEZC%F9fq@L1GTIaM!#J9?GpWo&E-uy9*1&P@6dnA=$btt z$7_)~(RjP`=qZgrCvaPaq(d*71nd!xI(>;1zRY8J15Uj;Rs0jhm+f2 zQaY-tBSm5yB)SekjWp)Pfl#Tsp<|43uCK&ubzn^Dva2FuAIC%Xu6bCq>dBqn{?(rH zIIUuf{*3yO-hR;1W`ZJ(agHV++s-PzePZERB*HV~{-K{U?3$FQM@PMCS%*h(*v1jr zB;qQK1j@z0n7J-jZGGT7Q4IQ=y1C?=e!#>`72?SUlNQHKC=)Z-CrdM(F z_*?ZRo+7q$qCM+WtbF?biL_R&3U%=0J_|4W$Snfm(;I7D@eGFDr5wLFe@57+m+N<_ zdDcFf$mxGqf7fkm46UM1#2atd7A+wY)ocBZF-H;)d<>emW8(}Mvp6%o4ZLz@o>T7N z7B1e-7||nK5gBqq5*R0%*d(LwVa(Ywc@-{>77fVFKo`it*z?tnT-e5SS00bLE#H|~ z)|>|)FFR7oYhySZ;Bw^1eHXQ-QC!B&_i}Fwt;*h|fa$v4Ip&(s&{O7Ibe?#{!-dRj z8p)Mi5PF0&{t=j)Zl9;(maXB=Tw1W`(gEwz#8k2*&08Hy@dBYPDP|EGy_9?$QU{Cn zZ&dw}BMAb>Zl1J|1JE<`n1ifexU3&<)~=aj>XZZ<+vdg!L}p7#HvryVHy<44Z*nTC z13;ahRCLoh>;Pdj6p0vF4G=tEmxH>jxvYgrzuV%)76YJpo%Vw4K0cnER* zK4{t2%PlZz87nik36tEmW0sXB@OiW{*L656MF32TAT+A-ns*JKZCNj~`;XhvS9|vq z59z~u#wrCGMtqlLRv5+jbmz}})|clt*pP^-Rx1nlH*JEo5@r1x&_Nw!H(mEme#czd zOX;vX!#lhAM?Y8Sxgo_oO+pu39Vqg3R@Q#zBJqlcTXIehkQrHV2@R#MHejp>;k&_ z?sY_XU5kZ@(-|eh&#+bs=;?S;Np~BB<##kmHWYb>4MusIX!0kP-ydrhMW04=*LlPo z&&?uQlHD7VqA(~+1h&sPq*!hS7gc11M^~Rp&(%h0_L-IX)t5@4FrEU(D%^Z?gUvlh zyp`arI-0TL{T{JeIyS*s5^K)7^autz9Arxz=y}Dsb}hwk4iTu`5}chM%I$l4mGt8s zw{i=qo>TT72+TWNQbxB=Ytk_5aNZelN7KA>gi?lGd|i}%Yt9e}*7%BO`Q8p51vi;N zrcF$2Z9@%_qoBc3Q&3WyeA}Pb9~yriVI)OH4Pg@D>#w>mGH9s5Z6+VWm3&#l z6UJ@WwnGg6Hlk_Y=xeyO%nEe%K`^+7?ce`ijwZKnwViS3DydKp<8*L;W?KieS&=P` zC9>~;gIV6WQJX){JngGYmwCLo=PuBeduXyzNT_uee|IOb>dQNnl-hFG6wfmk3ftXJqld`FqSlx7Z#$ zSbpW>Vg{>~9`cF%!fbVJ`{?;iOhqjrPm{GpdiLn*b}KD&0gFZ0^%+kUnmc&muGq{l zFWB?^Sd%DNev8MLv&oFd{#m+Zu0JGcH%TtwmrB);dpciX%`d{s>!?HL%BAE8#Y4(U zJrsF{@uE9@TDMu|qa&<`S7Vm01q-Tk6!G()9+fthXL)`N62LN+GKmB41(sE+lQ%UA zXn&Tt_ozCt{j<@KJ9VlU%^<--3Qx;CsP?_S-^VnSTPK^_vJvfP(TFr3TW{@7W_eB# zeBYp$t}~yD!wRKnqzqitB2%MSN{-*jaSO4k^9}A9Sf*z2^;G0zM-xq%o{rfR?Fd#y z>;!D<0wWq>;^eQg_f!Wgzcr`7n_r#yYRF?O53Iy%ru+pEL#@$QMo*|D7h3WBsIz~r zGzAqamKGvrPVmQOfbGg3z?oum=SlQ=qGbu#8m!FS@qK1+aUqkRmk6!(Z!ZSEUrIEg 
zSO1QN!4FFn9_Z!2oig}g>A0bn{|;Ee4@>$9z1)I4$tKM~6D5~2yB`Bh=w)^S#V$L| zr2TBzehlBB-)84{&^Q10FFQ(cY4#Ju=lK4iyJ=~D6*L*^`^WR$OLMrs|6S<&bD+oY z```QaeN+m&Aot^&gFo1NdTG?rx8M8rAqMx^%`-p0{qD?7rU+T z(*OSgz8>8F{S(;h>9r8Bp!V$uPtRIFaKRb|)RnyS(4XD66h!|0N*)xd1v0h&lDbUV zW~hGM1wvu)kF9I(|0in9_Ylti1Vw+`?&3c!T=x0?zC$?B^4o{VfXe6G&?X*G$UKgO z?)VNY-G-^)zSWXXTHN+kd*#i|c45b|qR(4mUiS6PSykf&EQ-5?0TE@)zZ??T#*l`MU#%d%h#O>k-_52Gv zLD3O5VZ6iYW6J4}&)@bS;Ck4>+R&!uk9m(swymfF#73X&_tR$q`f&nRpt;FJ#X6oT zKU+J{uJWfzLf1#h)Y;&kAs9~VLbyLEy7sd+N$&j4$3Ew8Ahov(vy=}kgZVUSkzBHZ z9T@cHr2dT~tq7vqvF_KBcOq-IL=wIfH-urev*T6qZp0#18NN`LWj;l7OhHPPVEgAj z&KQl5z_gLuj^KJj}DEB3c)YqUyT;4f3P+--lWJ+V* z8@w6jZ8pskf1>9ZtI}a^J4LC5i*f|){S*&&bC3F=n@*Z{ADuNkRFbK^VRU0j^jXu! zp*bxT5Kc?24Xr>pe7LlcIVa7)3?L3u*FT?h)aJS^)eK{>;?1oVF;@%XeUbAXj1sBb z!!+h(bM#`_v&1%56EeoflzHqCrBy^3jpF1;-Ves5)XD<fPM4l{ZH`{(KgzNhE3q9tp2MU(BDPwE$(=!)jEs6!2tH9!BgxE8qZ=fb zaeUO9UDyGXJlQcjvg$$j#&_*jjp*7~yJ|Hgk!hrQ?31wTa{Qr%>0py7cjBpRCPH@C zW1e0dZYDlu^;2!-(Pp0W90|L^+{17}O40GBUa*=`wYn;f?JKRY125=tvCBDX*aPD@ z0`iNS(I?{vBR4*Qhs&cK;QM#>brlY?W)YE0pIffY0|yRgGmnilH_jifF;63|;RwKd zbL2yW&d6svk5*s3a!ZcZHBg&E_bcd^WfZkPa~fNHm61=IUYy-Y+UJv9H+Rl}=H6ah zQ9&b95eb;+=-NoRwVHQRai4Mz$FEIO@lK$r)F%3NoML&*upHwbqYUQT$~gjhCbi$+ zXw=|%t>1CXbFIB!)184NKwxa0J2RkjX)vjEF1@*Ak@-adKbR!tS{^^sj6Xj2viwEO z@#2sLtI^B(s%4xU@3TjiM0t*|ODVsdksbMVwbs^ziX36T@uQJ>FNl4bTOAjvc-KK( z#l;a4+K=YGzZVx%p@gE1Zt-I@Zfva?b#fea-i1sZiTvCuaYD6u8OLL898fe(Zd$MHEi+FJ+j_0 z{#1wL>AEjYNA|l;TA2pLOj?E2UG}?SfN&n&k!AfT)_(b=NZ*sPe&(Y)oEw5orX9nI zG;`F6Pu(qzSm;*rJYC*QwT*?FLYyKYLW5mLSZB05B{I%lAGKxIw&3ZN#C}1Xt z;jc52wQYJbZ|(@q2VKr_#dmh$!nE&mWy;xsGw~(1({q{w!Ku^5Ga^oIX1K<#Lgq1( zfr#wsx6u@2$MJ}ltTGyT-u65*cfJ_MLbf*_ZCX6IXhf6aNOs6nW?M~7qeBbl_8wXN zQpGS~ra_daxj=iX8>U$3aZXBWtatH z;5SI+%64=~XRJ1_pL4>5d!-zOcMM@2nMuPa&GL>WQ$o&MF9 zwjM{=Ct~dflC-8@U2;vSO<|>3_eA+inKmjB2VC-_y)C`QhevYJOpKTFlRI(OuLv2C zSt1Oqo{Ck_f>qmG`p3>(PL2SlZ?b$MH%vD8502#8CG#2(99-Gqc61HLiMns;*%Gz< zu|7m_0z+`a#I+QBj$CD=pA8RR!SVRgH=8>o3)#)HU)_gL@e{M?ROw|L)&gGvW#DE0 
zUxv_|!N5X@J4xI`%3F}5-?8_|#APkamhm@bk#atc1^U;jCCDrJl~HduCC1UjAb%K8 z2XVyq?Hh=Me!3vVp5M3?qq!?AHL?-%fo%7;z^Ee?m&rAL@ZE8Tg99me_SB#H`GslC z`P*1T$&@;h@}F9}UTnNJKVk7Lc1ho(w+6z?j#(+QN6A7_j)B`AXn!5S?E$o=VfL^M zam;5E^K>&qpGAAA8eh|Vnan*dwW5k1E5uU^1g0fSTf<9Jq==4vR5v@Gcj#ts8@qDJGY~MgXwFGHuYx_i=h@xQRlh$iphNQyh z2BFg_va!e)`N-7JaYm8SL~79s)V`Cf3U_EtGA>H%01aaHdQSnHw6@0KoDU zftcf)bXTYf_A^XphtId~=}}b$v7q(G`h_ox{U&|6m4YwbFjDG(Wo#0=yJj~JiGP^1 zC3JF2g$mYIo0`ATD%EB5ak-RBr;>+hQR7%Bp}tucfYpjV)^O31&aq|1-3|U3Y0`(` zX5_(4r*3=dSTz@MT*bd-Y$i-UUAe+7p?q$PdrV8_culzXCr)xK*YVe8)CVNAyfSMi z%H|rZD{_}K6t6~sJ6$6mi88)8F*f~)2gLB8WtT&!?I}vdcG{qX^7X-gcNT*!%*%N0 z#m589(=d>MVx8(q6vR9Q!#yW-gASTO&lTAl9qJQ{$Hmrgm!Eu-d~vtn+K)Q*RZmSG ze%ulowmICiv~7$=XnykCzocKe?8fNLPgb|N`tF`q2c2Xyl(lj>m+fQxCqq0&T#>?o zQD$w-DH&LeFv)JZVp^0~k*{p7TV%~XmJMdK6@`7W=TdA-!7aivunA(6(1`*?w`)=a zy`}44B?C$Y(8EZuW187i`6BwW(h-PrYgqn7dxIH&E%6a(u>*TwsUce91)dW#KA;6Y z-o14pAU5a1gCX~s_xYpiLn@yprHGl)d!Am)(<@#Xlr5Wzn)dV~4~o7#evP~n`H})= zE>{EwZ^wub6Xi)x0!W|C#QA5Evdin|jxW?v8zavb5C6n$4Xq@no^;m}g5WmlD)`kPl|n^n>6Q}o41{q)^iYPn!Exa!-PS-!|? 
z2TU1mJgPmCvS4b}3rnvO`&oywv3he<8(JSc&I$&hi~dP@Qc zW(wZ@Q~e(`XOR&Zs3lb$3mN8@YgWmG^uf-8K@(Pw!3Zl}E6mE?{{%1c{AQ@}i7O(Tgo(j;#JRRGbtYQM_j zm@RQ9a{i7~`DD%fCF3R80c9V;fgwA#sf4b|>+7SsGA{{>C;%o!FcR|G2H?IV*D+G7 z!JNp@4`9g_p}3*6i9@TmnW)yDziAbH?tC|4%eh?^&yZW9E3#Ehz$|Kq89lEO7o2n% z^K%G!I@c>ubnX+Ey2W#r-vEn_@L0gx61zhjT|6Q+mlQAlP;bAiqtRl+{ogw%HG|6f zFLreysE@bn^g67$cSY(Ipv3+dcDEqIclKJ9?(!)$q@j3W89)zpW+!e;n|k7(2pt~l zV=*aVsryJXwg?+@|L)N-0=INt_NaqP`X67d*$YiC@4DA!337s@o7WHiFV3z5tm&(J zx7F4SRa8`j)CCSO3bIE&i$E15Dk>rpRF*OwY)I19feTO>k(ny8kv&ubQBV-VkQpFC zLYB>+pVpJzU}^u-)vpgJa|Qi0U`>JDY39pvGVpG<$KsaxZj58{E?wQeLdOTM#bPSjHoR&C8k?(B!4LE7&l zxmsP3U@*jP+v2u1yxj-W~uP(28pj@PM~hhQmYxs&_lC)c#5lRRNKAYNgv+ zh|k%!3Z?VE?~$hr%KwmGd!7Gp+9<84`C%xfEElP*3#2Qk8PW|!UgyBLe$!ypYdGtV zrIbvqiBtSnSfNwTK((g9;9*W8xwJKt%PDdrfd+LolIgUiU+PXAf7!O+90C?#g|vNZ zx>N@s6^T<=j&hcKIlD6YnRsPa=WP{_AzTjnmaNu$n1UASH3?vrm)Dve^@_!CflfI6 zuUz)ZENR<4{=N2>bjYb8v-eMjaH{0=A7~Grr!FleVcj3JVj;(aA;-iNb&loX+l9BD zT-+w{)fTD*Pg!@lWxM#?uzfNw|2MZ?KGBcW&PV7l((#Auu2g&ews!G$tMR1(D!q*a z$Gfg%Azao5T~Mntx-VeJ4n_fnG+E=+G2CRH_jYdsXKeu_WAn##VO{sOb|V6{cBx@ zk6}j^uwRV*A)T*=n*6nxb=x+-c(uwcycJsFD_g^*;Bi5b!qm;uy96@(Prp5!AckS! 
z+5re4`Mv-GX+M040uZIkz3$yEp%Qu@Dwx7V3UWhUu6NgLldb})#d$adGL#|d7`O$?@<$kv(G6Xo2iUc z!v>;cqCorEinjyj?FB!ySuFo{2<}e&22yfLyE97Pv|T^NEGQghqMQlSx_2re*Lu!k z5-1~Mr=Qe%0A`Xw$_j`p;XRagn;!{Uneb@gc%fr=*<_SO*duI^xF)Ik0znUtHE#9M z&_R!-Qs)F=S~o4zruc*Pn@=_c3j;uTWl!MOyrw{>B00-qtN`gLZTw53Fw!ql|3*ITWqE7uqHy7!|2q$R)*iXl{Z~h)E%)fm^?2Z==2J4P<_>$P5lhngzg8B z>()gBfV5ioDaWa=`%RL^_wG#;*%aGBeN14);uz7XMe$-4Rl(O7>l6BU@I2~wXJV70 zezu;oU%&}EIhEyA{s_d>w=)63)AyNgmma@$TXNeXtbh#cQc&-E9o@ePmP5%Z(eRh5 zNFJa{WMDNfe2xYtzw$XWMPLg;zJYxL&A^Esxibz0a>YjqCt8UL(E-u5C@rdE!0EM} zJ$w7itUE6C()NWu_xv2;C*QIo!1mHze&2tZzCa_E;C?AIWB95#7Zk&^J|8SYe=G?m zcw;br%d*V{#>qB5tt}u5Nf_R<`u0I?f#zjTqZ#XH{}Ev+IJ$7x+HU}k%FQ4teXRup zSlVS{bSQz9W!UY|@d~U;0p)KET6>Q@D|GMd$Qxvf@CB>gJI>%yd1QqEez);CZnUm+ zo5K)unE`R2BiG^fvSrJDge5#r-&k1XDN5Mte98GWn1vvX5c%=juu~?RG6HMM6&&$_ zRi?v1wv&IVl+q&K6BBYRQIc&PZUoC*CAY=+Z`c-?LciS#=%U9;gOo^pcLJFZtMB4i z37+$7rVZSC5Ikh>`05^%Kli>7hg+?&Bgr*(1U#n&E)!v(k^ys|le1WAIT4FYX3ji> zP~*;MOyu++u^k4i8Yhh(OcCnC?v-Mxu zXftw^waCB51=R}6+#mAXOwh^lb^79*&FlzSJs>;494fnaJ7%6KLdEqSIeQr7fl||s zd_A>M&Le+Gmk1L8E)f;@@d;Kk#Rw+MR(Z}FtPtAqqH%Ag!5obq}K_7nhB$%5WIe8C@7 z!A^`Lc#26y0v2wMs0M+(L^>aHFi~3ad4~QewR*767i2R15XtjpCbP}?&P2hp%_(~R z4wnzn+072@gdOlstJ-P|}G>-w7O z;EW`2VIyJXi2N^zmVLQbdi!f9;*#Z@bxkn-x6&S7arxU7fzrP@PV#T*=<7B3s{knw zdF<_>unhbUtyiis;940{TOX>ZCmr#9HcTcKEcE?8XrD z?`H*ZpSP8}VEGOYl6dzb;${MBaN2d;4p3p{*99S5A!bGM2a-iR?`ZMJJXrQM;#A@? 
zn9fQ>-kLc^LUPS)iH+p)of(}$t~1vTA{xqr zq}tss_qAD4dB+2K5)@iMGULr^;3VYLmO(LM-@DX_?$enlknEy@^Fs#FjIh9dAxLN0!uZp zE*9L*Le~Q3b}b@r0z#4Eq!>)`x*xUsDD~cW!ma=|UG4wBq~Un{7#qxyv?x<}=F3`0 z3Q}30quHsxB5w6+8PH8FaRt?@QSPS-s&@(+al;8BUKYf&j9FqGZ2iV#7!{^Fa{_`6 zT;RR=ZjL+8y$WWX^lq&=f8{lviU0pQg1k}n9mM&K4WnD`-5>K(L6kMt@2L=zuhUEV z8I?((tuCHSfE7R#Fo0&g08SLY7ch%{Bu-wFF9nYuMA7EnxX1_0Uak}#d-BPfrZS%h}iflMDH0tG}hLVaE7-0c5MhOe_)`xRk~ZP!e#dSP*-sZEP?ySQ~Zua zN^PT9VLw<^K9;Y^?~Zcnh%0)x0+FZl+1ZW986bi)mr^y8LB3 zj=!qUP@H>gKTK`xtG|sTcI)0KP0)%Oc>i@u!)^sca*Cn%7g2?D7$a{=%xKv3L7L1z z9I19a*S7D*OxXfJRWnClzxo%y+h92DoSc6Btc#wseBzX;u209F;_hN*X&m_;&(jzSo3d(l^lrZ{5w2&P~A;iR$z017~;Uo&qh@?yYttUJ%25> zrt&t3=1eIk9V_Z|qzc+9haZc^vFrXw#0nS|<~iKU{T&ZM?Zk-#p7xo=D!G5^y<#oF zE9r*rze_hcxBcz^*O)A}fT`9>f;R_jEp>DZhAfX@)^3tkM$2y>Ivfc`jfh#aPJmgA zyG9x-Ga!48o2c_KHaVvF2hBvpwq-29Pd$DzS?lq^i38q-`LwVyy$n{9=i@78)P~qM z$?qHgzvl;qSb;LisTq^sn%!8>p^>v$*+=}6;Nseie1$t;6YVy3)8ZKv#`X3*kbMR^ zcy=C;-XYM5#AE2DZXYd6&ko_>p?GaCz#UKXQde>4vMV(p^GB@0U40O@^MS$YZ+fa% zP?1B7=szoRB;WxpJW$Q@+R=0-5xdZ!9t0LZeS*8U%Nru5tUz@XOk7#jA8Q8_g^JFt zxAB|8i=M@eCuz2`Wo=z`lO~yf2AxJxgNJ>!sgC$vwdU;TV-)L4OPpFg$j+Q!iFQD+ z6klq?-`ii8?-w&@>thEdJ1wHN>!Ewj-Jt6xb$J!lip8GZXfwVb; zHP-ZkXGJ)QQ>gw^awl>ia%K&ja7rIrYBP2d*;a)_JA?GNJ`C?l)#2LnJ6Bn@2?{2D zYKAbAgyz#<1P|(q2Ym;fC-~J$;mIwEcO`}9zZI8lq7QUCl+800v0|AIc*%oPzYbY7 z{0n2fxz+!(^{1^v@%-;*eJ}sacTcwRhk@$RxqJVloAxL@GuMN_@8m>Q>Lo2d5zl+q z-!W0IRUS$8w3t%PhqB)3lH%oFT3J+T6-aakW%lDw8%rxraH&Ap8#j<_-+6Q?Tm}qM z?u~v>+T-6)IQhrLe{;1KZS(znwX0FQd*Sq^GTyc`f{uui#mY^9W`)SsVnFl;s{j^7 zUDn3lHjtgph#-pihfnx{SQ{V5z7M8q-?S<6JU?Z2C+L&0w5!8$bqDZeN_*PPVkW^d zbJcZSN7Gae+aX(cy)SkKw3XdEw}6v#TZ$4@Kp|&zt_$3G%#88wSy=Rc8oWC|%20p? 
zZRB8H_804w zTEa#em7=S>Oa^U35$ze+1KzB#JT|i3vEKxhx#rzkbhXQNa^aOA2F3b%0+PB#w?KH4 zVE2=;EART57$4>Z)_UqzLs5M6iH3OScLpUx5_0q+vt&a2TXtAjW9DCnRju>CGgvxh z!9+c*Ot~2Wcjhf0hW_qYmQswB^Tu28NPpk8&eX?k_{NAL8hCB?&(Oo|w|5_~C{>Bf z!SbKi{s?|}L9D%7fA{Sg^OL$~(2~`!w~$9i%l?gg7)98N5>TQGl>+Z-ojf@$ z3;fkbVRkFU#pR0~CsfSk(!p!B1zKD1@mOA)(r<6+Y`|Fqqax0Jj;Wne)S#b=J`M}vXqCQ!h=e~9@%u?+x3r&6ByrnUt?3yD6UEOAgKUz*D4sxxDHK}_ z!-a7SI~fG9kp&!+QyaCrymhD53dCiQx`0~k%x{qAf?9nOXbnma^OAXbHF}Qh0rOmP zP61_pRA=|Huc!~ej~asZ_cLZ-;FvFcNCgv4yEnQ8JGIbMT3KM&e+-~PsFn|f{?=uk z=pw-f7u&oh+mSzwPjIgzxH7u@Wggj3iM-IQ?PrAA9q6gqna}}VC5w4$Ovew7H{E-T z+n&tVlm-xfY{{gb9(mazbtE89OeD)FStSfyBo>K1x$b~YKA{xpgI)wZbPf4yUI}>ds>@ANFks@% z8M5`r;wB)R=SMn~4j4_>o@FcRYk;%<{Df}0hL>JpnAj<9NMZ)^&7a!Bnta|QtNxQ! zADvsQ4+gD+VQ*Ef5cK}Hs=!tx=F$}E_RN8Cl27yNZ&woWiE?LXLkgn{`C~&UFstao zA8s~9J26R8bM4s3_ojxQMs@GG8oT46A8#m~>1!yEzl2zI3l?QXTT<(D`(GcGAuyZ& z(1$iw>D=`97h9QYkR zSxm5h{}d~jy-N-4-~KqTvJ-J};9=jgf__dn+FWVH^^F9U zv@c%G-rJs_Fg9Z3sFW4s@00?fJ)|udZcV*pz4MvSpU>fUIWZ?=h@cJ*;}J#VQhrWo z`+Yh08~9^O#Wpt!U{|;)=3U{31n9gC9L(kXrzx~iuaNVA`F^5YO8;Lkt72cdsqMsw z>A>W3^4d;?<$0e_Ke-72Bt^!+xntDEtD|^hC~D=t6;Qd9Jh5^}$uM zjuBNr`=di^f?^6UxSV`2kBk_*2R7-BZ0Xybf)BEcOrru##^Md){hX!M8j`llzeARP zNVeN=zGH|Mwuo4>xzni=)*0!7Q_jI-K3?YB>(@($c*h%? zh6w<6xeKe1uGA7iP9O3McXSZpIkLIN%vPx>T{igv?cc`t>N+IfzxsKDMVbF;x|8pi znm)JiH&H9FX7GW^shP3$PJT<)@FQvTn=ZjCB@Lh#Lw@U2{lLmR zBl(Cbxun1AIe;rA4^=rv&Q%4pLj6&3vng^YQYFunIBBHIepU~P>>lW0JH`;xH%QP|i6|^7;J!F2!D@-V zS>ng*fzHj0_We60IOb^I?$q0?0NNte_6?1astoW1k`D-JzlJiRZEPuVEHxGGJ0Td} zP?Y!CTAWEV%})C{!cvP6o$Au@d!cO%@+ld_l4-X$Q9Yyon>U%#n_@;hC)#(#Ow1#K zMS!Rw`G6XB29F+m)o2Ah_-E|GDeH(^;-o<>4A$`Q=Xo=nqI8@3+|sa}VCe3Q$n;o0 z%$&n5uJ1vcGhs(Ttvyi!CqVHpxj==P<(5n={;-%oNa=F);N>A)=11Zu>qYs{Z7cN0 zFb=MDn+{&g2uT5TpWII)*@y1Z$<DcYWh z7IZ^o^z(_XfP}^Rtyu+r%93|`|*5F|lK z^w_g|O!zg0gy%bW-ole5=k5u5FXQv14M0##*paNyYv4o_5nlJ9CyEOrG(M&UzT>bT zi~qmx+A3oqNqx%@)ST3@Riiq!3{m3OUD)ShuK+~l;a-lnKd`^Bn83;~@&3)6g!;v* z!?>L*QAr1?V=`b1OksROkQ0+aFO7p=$fLlu(jY+aLckfh#(q0? 
z(dSNEf2(}j@Jf(@XAV$g(yx(GE>z+B1u>bn z+>ae(svd1+F>Q>rw)}GV1S3_|8|lc^7_c)R0HnnrI}kkA=C)FfcC>eIlC5D+Dk=I( zjoJclBXY;@KT*W0w;8kuZtRSUjLw!!sqK;L3BrNbGfYBXtrz&TU+8E9_6{`vQ%PlO zXqR0P{KKjf9O;!obyvd|BnP*2=jQ;8KJ#|fo6yp@TEM@$3G1=$kXppG!*ZW#SE?eR@;qo+`Jz0Zg|FO-NAjaW?)w$m);Z)FD4T)uh&Xr{QWYq5 zwJ{9dZFoqgrQf?Nu=Rjx&k_ip;fU`reqDav^)LQ2Rih&oVv6&WX)P19%#(mGa#kWM zTGD?RZ)=$!CG+|pR!Gm#$}m%oT`PagZ*WoVL^8g}bF~R7vJ>=FE`8puij>;4MI8cNUELDMR&f=sTVT z^&CKw?+EvLQ=eAETHc6n8BGR`XbI-AGz51TQ0fh!^z1j|=frt8a6UY@J7agp1|rzn z`Pi*bQ@CoMTo`4z&e}a^t|}JDth1p4v$ZcR;_$-oAH}j>kf==G9-yiSZi$z5Pq)Qw z5@(h4WMb4XGW59oH%(3)l_A+OX${S3yG}xD$$jlS+vvpKfW%9JQ7m#iV5K6A^~h~o zpnmhmm2ewELCTaYpb*9G zwrFNRq07)DU%gY(L;y@UoeXsh=AU`n5PdFyx49N^CVGAF{u@@`zFsrCacq5p9_Z6^ zp~L#vqzqQH$L-RC8xdl#6o-*3wiI%Tpxy{ZctQxs^OX$f`@LT-FgU#g!!^|n95^&( zrq8(ogtO|O?FE5B4|}@#6>HP(0B%*2>S;d2JIn^-IUz$#Ism2b_Ta-9&O^_0baH_* zDad=#3`$Bf@Rla?n#yd2jgE1?k4o4%p#YY^&>t^D+4ZVxQATGFzk%QXvn@DMkR0>? z)3F}kEMM$DsBo?7zna9cH3?JeBAb318n@0;y%xD>_8hi{bb=DFEn36}qGLh@icJ*x z=w}Cl-XgWi^>zr+prsoq>Q~e&ujcQC#JGTIDv1YuHc&f4WUfA1wV(Vdw%X)OE_%AX zMM6bg#rcD;B;M6A71E~wNMRfW+*woniu&^PSspG{r7{2&fu8A zuF+}9@1D6XUVrnypM!&3!p;syZ~Lik3gC0GYl;B7;U&S#)qA`{mr7H_g^kjQ5>DZ! z5YP|;^Q*f&9OM*#-lr?@GFZgfuPkDi!$PL>QoZm0+@y3(!9HHizA#0ae6-!*vu3X{ zQ~hZ=RZKhqKb@d#7akN25S5cuT89I2b|wD$-ptIBqAAE%!-*fgI3T^t2Q?NpLS%Ot zRB_o*6;~PYm22^+qbc+iAzLGB44JFA*YOCTEo?1ch1Rh8kFEXh4%Gk7m~@+he7z~G zG{peC#ZMy1C{EYPi*Q4R~?&|(%`p`-6=;$#> z9wCnZYUk~I80a2OdR9nTm5h*=CHDs{D4wm%f2o8x4-_vklQ#ud1it{L0(?(U^-7CC zIyKZfs?PrWfR(;l3r-n6jfAikH zP0qH-p*6$o^Vk#Xp}gv!yy(V?6q2`wEjf2E`NlGEQW7{{rR(j1%B#qns?Z#+ku6DC ze!TQ!w{+3vE|wzHMC4YvvP1LqR~9jN*7#B_ph-1@qEHx}U+^hBFEzo!+Kq+>R|i3T zF|+C2->;6HYQD+Xb)eCW-WKJ_?J*>SUpBTva$uukzwax`1V*$=HcgfENB z*cJAA1H`ibgXJE&b69xhxF|cVe)MENTKK&sUXDY#7YSa2pLR02A)P*_0-o560N19g zAescO?j`6u;S1@>l(881TX^+?oRAC&{|q3JeP|hJwnMoiKFeAw4^wAme3EJ@M@7%&uHrgDa2}j^eP_srtduFdX)UeZ6n2v6-DmV+zshaHZ&;Y~*;{c? 
z+a_>Fc8~1B>nVZ4KBtcTeV$00-_!_l;uUUhM)KXVMT8~=S zj4Wa%>Qb`e==ipSnK4ldzP)Uya41=J0tH;r3kH^^<_m_L+DFwsvWDoV^8i|v80Y@b zPD7tg2TMG*6g{{FejcdkK}8`qk2>;G%tRcsjT2m;x$n+RFZ)loHQ;Adt72rHW5@8M zMK&8*-Ku!s;H8-Y-k>S6Fz|@?Bm8|Y$!O7_SZ7XRh z`5^A^gYd(x@T}@Xh3@mMZfdgrETH^KV*ImIYfiEjl>Q;5s zfx$>Aff1Ha^TNkfi0KmbF`7=Y27LS(uKr~ps&731?f6#b-0gc{e^tQyVtOpk>6((+ zz&y}w8)j~isFn&i+zDY1qJzr>I?lmH(<`FukG49eN#x#x4fk+WuNo2G2b`>|auitP zVz7d+Jg0ib#|!8XU={cW4M#l3F#Y6O)}}pz=6!LQG5f`RyzBi^U*E%>l>EgX{6$RA z4Wc~y!g|hN4}eF3)GOo?0zcB6UW`7A#LEY5QmL%ChtmQ7<;#1x&UPPWiT5}Xb5<;9 zTZWyg6qCAy7o~iFK{`z%U)AqZ$k^m^p*Qg-6O=RC-=IAKWQoX53z7dFm+)qw<~G<8 z!z-qb)-8k=gWT!$S>uU{=k7@hqJ)dUOkA9i1_&)PPz?MHgvJDGQ5;7DZ3RxUCqNv( z+J)N^RTrtCLujz|uP0|bgf3_naxdmtQ*)6-R$_4W7*sh0svOXdRy^`6r4WN==>btr z>pk^ZNIWd=d=~IF2ZNO<$6685WdqxOKZRC}4y?V0GXkw3XM?s9bN;MhEsuJz21MVl z_|@iDD-rnh(tHaYfHCRdLYX=>Yc$v^c#Y}w9kK!$v)HR%?<@ek$nZG6$U;Q31&wJG zg$XTPkv=oW8eRzCm2j;EJ~zUNmamN!hj$#S8SpX~#xOUP>{UGjZK z)PNIcPKdm$QuY&0g^7OJ@)PEwEKx3rIOR~x57GVLxNMF|CTl-H0G1P;jXfb}^04 z&~}I>M1}BDUW+&5N7**ee)hr1I}`{9=K3*!I-urh`HfGM8QRD1`kA|p)urAC4|4v~ z{>$`Vjsrj=3#Mg1G3@yL9>7zBlrcE)up2!eJJ7< zR~f<7H&4Ke694d*@i(u*WZ-mIgmz20*&G`L_V@)d&JDmEIDw~5>h(jk?OJi)BsdA4 zI^h)PTose~Ouw-(U;{urm4=?MgXL`@+rkj0x4*dqdW>f&8dwC{4t)eaKMEyWL`$#? 
z`6*h{>|q87Z?i@LO$7KHrK{>9j56>x75!?$y|qg$*2+m5%t1m$yo$rH9>a{d1v2K@ zJ)nIJtfx}cgBS)?#j2k?i29hmUQiP~^LOL}?t*E1yfD>XS}ip7ATRU=;z?{~LL^ys zKA`?_I7UVgUI01({%KV)tZiqJC`)P8CP`Z<*q*fYD0drTS7yn`ul#oxE88!MhOod= zNFt&LXu_*4jmZV9A?wvQ3x59OJ{H24{KvOl9(4VB-^f=CxcFvX?OH>&(eM3FTn##1 zb(&)}P9P5C4gR4BvWHoi3}zFEJ=!|Z$+Cg49$&8?xbws1`@m{8NE(8H4><<(@hC#u zP6XeT-!>jk2K+)TmW%W5(xsl4R2G9SJGK7mdBSA6c(ak9{9U($4?yc}8yW{+f4Gai z|4hBvEGP;S9JRT|RnUC7eXipGZMFc8gKsZT-IbAc;q4~3>Y}e_2YD{kUFj>XNKgpX}^DI4p%rH+Q+XmrRWfoU1+$ zwg5yLI}?~24%5BFoBYTKv=?9d=804jv^?o`yn9iY#mSxz=;Q=#e+T`ytf8iNhaM?k z2kg(Rk%;XUm@ZUV_!$-Atl!btzs^e1F%H~u_!h#u!ZTNFd>V`ZC10xg+&wWU0wYhV zFPWlwvo(q54X=O!=A_%-TuEB4R`2a)r7zw?*!VH9Se3X;R)Jc8*VTV8jf`=>tJv=v z`VscqK&pP6PJQ;-2TgogR~M+3+ctfiei_!`GDr z6n4*tL&+{q=6&ZKL6Se4FiC1a;^Mk;-BBrc6S*+E%N+L!a3kgH5|0r8Mqy^``wZP8dHQ>d0c@XKyPaSN~Ohv9SA^ogqlS4F*Lb&y^PtEmPgZ|L2d7O6EQKd=e6v-$H|)@fES z(m25uxb7&$GWq;8QFbaAJ z{5MbKAp7;h9P`~nv?VZ%XA<@j#jsX6ImUtiiMa-&6+qMn-xbgmYOxJyS+?@pj|Czy z#=^{wtCE2`BS$tNeVammj#hI` zxtZ;-7A9NaTvBMASx_`4JPCiPkiGTbH?0)VvWVx66SZhWCtT)Uzw&mwGva9-P@} zICMC=iVbDiq*i8lT>RpzIA;km0h3Ii!idL=PCD2*Nd+2-b;kpl*SNdggC z4&@$gm)Gxp$3shcJ2HJiSV3i7yg}vlDW+9HTx_q0v|a+S*u>W=7HHkkFv>(R8Zp(l zzchCJtTWh3I7JM6Rx7Bs9A?Q9{sIIBXf%6OI@YI2If$5g$T+hcaic)z8(|Z`M7`Ll z>h~Wj)b{|#_+!_}{V8BvcDr>P4dWnfkhl<2^>w-(LR{Lw3fX#eQ{Lrh6R3o zK!pFU^x}t`oWf@I4)O3Bq`~-Fxu=Z6$*4ECzPX8jdm41)vtHw%uxoN&q2Qfs$1f&M zD`v2&B;J#=x*ehyM8%bnz&qy&CTA62Ivoco?~Rbm$0_cE^LF@{@T;V)way~ezFQqA zWpua)S8ti@`~+24lc~kj;}7pZoyo!s7y_WC6fX+okIE9ynR=9{PcnJWex)n~v)9!9 zrf~ppk6fwMAW#ghSiS`2f7-W||EVAoT-O`5Kb3M^f3{+<+i{G&FO&dey$3*)M9jNsd-ml7`nKLd z*Bz4SU1g-4PCt?gryPl{HUOG~chdtTegv7k3SsPkoOyvYm_*rI`489lWLOD0#K$uU zKD@s8IWaSC|4hvobgDvJtLQrJH9&sm5#^-I&YiUr7hxz$C0m-sUNL<)X({CsRF5T? 
z(ItNq6@F>}Z3nu#pno(i@S4DFZQ*`TX*KjkCHZp-&z@?(&FEmOE&5!)A&K4QtIAD9 z-^DYDywYI{Ttv@2KS>0)&?gXzG?=BM0n|K*+ys0p3{2IipTdgjF{A7(O2x7C4^AN< zzWn22wwH70-_x#9j6>^Om+dK;3eN-n0`EL=x+xv>Do(?Rxmd!yrOcEX%9f^bg=tJu zp*yf$&^_vi)!tEtR9oq^?*CgBymb0^?Re~MjFvTZr3 zyQ4mTqC1$sw>+iZu!KD=e<*(8 zCdDPW*V{lKOuorh!%P|^Vt>zEN-kO|iQx3)kxS&(C!BO9mRc95QHv72F#(!1XiyEG zYe{sL`$$m3B!QbC<>K`Xu$iOvSp8UGJ?s3rP%Ag@eSvO|TK_X14*4QpS@XUxF1jQk ziiH|WJK4Q^a)f)IsY)h)rTT+5ytHE+QWmzej)xaPL@02D4AAl2Z%17V>wq^v&(p8=j;R&^#qtSh=~428#&8&?f)am7OAGq zaCP~M9$|1`{QA*CZrJJHsD%|Hsz1 z8D+_1B;b#r)NF2277oG9G_FBBKCXOXUb+NrCTDchK6WrC$cV{5#9F~+pR|H~b+PA% zEHgktAO(~RbMg0QsVc~nS5uq}O7GIhdh3R^*0Kur2z2(-6ZSNKVT4dbpPQ|3GZfpY zbRxGj7JIJN!*ZGAXrjtU2h%Mn5!A`+=)MPb>(0>9h3}<@O|$%hJ9h;XGT!BH!#Hw5 zSta{hI1}Icyq|3{t}fa`K>Qm0;KumDg6;ks*W3r`E=~OpyEJK}kQg{N#%2gbWc%TBnY_ze&Gr&OgnErPcIXY!eQg_oOBk>i5hRFm zf&X$;7lz%!iIc{*nFlt8fh1s?iM#_ygFm2?s;sjvZM#4J_ zKEj>X5IPLWcS`8>!VFMw`G62LK|{-|i!xeY?tr4-w}>xE!780Ha!DEU%Hh&ryZIk| z4h8+}q2SgU$i>u%OR3tAqx;f70RPI zLZFcPCJ!nZQ<%6IZm9EPGzueMYtb4Nr`8N3H*6M*izLa}r;e3vssz?wV<|K?dMjd{ zF+$=`4`0Wn%PMC>>PPPmqQDv&*oNra%?b7?A-Mycr0xkyKf@B-mv2&$k|CgBYf;yI zfto&!{h#p%Eo_Up0&p+}2Y@JV_TGv+pTwp5nr z2Mc7ugs>a|svKdWQ5fTXDE3{Xh({jH=bN>ZWyd7`*hrOxf{yx?N|Q zgdgC*PAkDUJ;_*4dLxkcF8MF(V5A<498tC3XmXzS0R`rfelpBYqGvO2UpnmLfj>zo zX1|%DB;qTBXY$_vWh1Ed=;*f{U99`@rF2MHHI6LgP^QPqYTS$Z00IhV>X~KvKutau zEW0eqdoY8WV2j_Sk~Sx~ToCt3A3M#&N#>9`NWm*V{L$xYa^9bNzQzCIpp_)Anxp%D zwo*${zyL?f6NWI!lM9}vd9D-OJT1OSYj+S%@gDK zJIY0ns6oxnpVd{)K)Eo#S2fy|_i8Mkw@)mP6%Uva!xpdMETQOmy_mm}w#Yl$EhDpI? 
zZGGZug##vx$H{tNk_WFq*$M9*tpc>|kNSTB4HI8p_yB08wR){%{lSBq(SNSh{~%Kj ziDScZWdz|Fn`RSh@wmbkIIz3iSpU}0M%ItN8vP{!<C)~})L%Yw7#h-mlD z?*k^H`HW{I>7pN_P2TNxg+Cj=8|OW-m+t`nq4)3-48C>j7rX6+{q_+>$;?+lK*$S6 z{P|)BYmIu&*fnXVlx=jEB zhFfaF-w3K7D+A4mtL?&*0lc2rsjV|Y z1Q-oQb;L`oNwDp3{m`u(YfA#sxtgzG=OY};2_QwPfQsrzAfc;@zlsxXYwz{n%tn@l z%=X+u1<#G0!u{e2z<9!TSw9=5Zj2 zGR+NCPf3W3_a=*h(pmAi*uOxn-}G*W6m-~G>;cQtww2F`lrh2GI9Bgd7p*Sc>f6+iKv+Dt#Lm;FKR zQwFSeslKrT9O2)BSSbhf0s}?kaJM?#WgR}8eG_EK2OcRsogFsaRN);|E+|+Tw?09v zTu-MWTN~sI{qbpk#5DA6tsC*TXx_a_D;K)An zVtb%QTl21wv9b=~A51mh3)&HgsaH_YX?GWW?DQtpV;rfz6UB@tShgjoB_!0NGi`XU z;T`YgeO}eO0w64DU7St&Ac1emGWe2TXX`wYmVl8XF0_wVIODbaM~5J8<4_)(m4`Cf znF)L1cYvYYzqG}$HZCJLACQn=hUO|6fUAFJ%IbNo<$o_m2wv9Y%mNoMK!@Mxg0jn+ zqKb4E?+8Ut%NQlCl9oe(@*rK2w7kyNT|Og|XSVF*0;%;kGKU(gWj71XK^3V4b^Ye` z%$501Ok8+$mt-JKo4ad}$TRllRa~j<`<{|eex>w^Q#ut}gHzX|c+11Zu8B8}r-Bj` z)QSU1TyKUD-ccQX|N3UlR&Hl|U4mDkLX?KKoh!M&NkhPkR&Ylh=lci(Z)tyJ8Sdq# zup+eH#Kq1^Of#sz0J{4UeVJz47MSR136qZWkcL+Nqq6C-m;pENip=mXM+=|Z|Vbbtc_ z2w__}$$aW)u7bN`NvR`mPyR7ra$l|?Or2n6OU=dCNAEfeOhm8-VY{ogK6H)sAGb)r*&@le}vof_2Un5*veg zcVg%hjN^P*fPgh}imG}CJnn5-0@PGl7Ny>T`q!7Jzx^9oJx!zXP!Y z9CQ4lcoC8@7R%frJCypv{rVZzaKt#;JUiZg-aGImk06t?F~}fqiO{A*dx1 zU7BQq&OeyPF?GWF+i-$^c2sGmuf1Ei{gk)wPXjHon5C198ThAD;;9hUBX!#J zFc*ehT>Sk~wC`<#D6Ki2TH+3dliJeq6QQs6ldY$XbzI(STODiCR|0TTFA;cb+8xM3HzX)fxFGyMk zE^O{q90@CM5i0{1;JG!RdJ;n9%86M}AR%;{%DD6|_hBJicl>nI4pQnlF-q{_cAwkb zcBPi&?QCwMdu*B@{?y-iD8J|JZ=0U7q#1t&03$6~3>SAZ)<)S;_2`?YCs67WrW&lA z+6SRl#^VeA-BuX+MjV@4}9J3ur34X`m)vL$bzwj0Sv+~YF;c0H*P z25uR)nl)EymlU;JsRa%DSe=vbvRcP4?(Q1l!x&NCegFs@NL*<)2$iY@h*3?~z7@o@@6Fo9qHit*IF(WL4M$d+~T)Rf?@x%b7D(GDZpS zT4a0q;8ccCqFw&4w->C%O9thwdi?3y#p)%2d!JLw4u0-lwV&O8--!6-a=HG5@gK{_ zKxTgEDsFCks38O_Ebkm<4VBWNU(|Ioq=yfm+1hb z*Txz<-9)-l>t0s7JvK7(bK5Xifb9UQFf=2U{vsJfFS88p0*ID2(C{0 z@7L?C!P;$@b3QU^k~vpTTG}tWc&u>mt{EsK&w0~m&Q#|_6v*uO&#@_6x~HGcFa~t> z>sn2Ube&$4xd4X+^BX(@-t`PmO_Y*X+jy;_Tw46MyOlUI92aU*nj0kn&6c(j7*bb5 z3}*0U>uI%A&li5x&-*rMl{79Hh|3l$c*Gn-^bPQYQsNKQQH6VW{Cxji&%jH7`sk2l 
zTxy=PJc3J^S1)cvb>{LjiAqI3;p!;&Unqe$W`bKs#tymJVxeY1Du&7*?gVvuV0UO_ zKr~u(N6qzpA?1V9qgDT}HS@L+YRZs%2fCABolOT{(7k#;)R|n&{j}O@C(wWTSMF$$ z>4NA46O=hC#S~ogo@60s!U_wQLZ5y$^8^lcwdGEfE7T(4{W1UiT;bp|HKz{!dGki| zc1!y7C4<(!lW8%D)MNdSjr<9)@Id-Q#Q+A9vk`3$`f8j7mdow z{7I|+vv|FJqS&RF*5Nk6tR5@xQ~u%RYZT+?Q9sQzWd8BcDt^bzI8ENj@gKQh^hb zYT@?!{+5^65S9Y*-f#fO(KiQ71+54quJymuEZZt)^B_?X!z`Z|dRX}zhcH?pN25Me z0RIZ_$W+<}KWupM(4$*^&byH+eW^x%lCS+~Jo5wV&v66{n-?3O=&18BG_@py#T@yw zNfdWNhL(uR(~PmYZ%Nx^&)*5(!F--kDx=2Lk6P>JN6*_FOsm4-+A3iI34u0KLK&=- zLD112av8y@(A7#CEJ7=fNod*$)2s`bde?v7$5M)V!7xTE@M))}TeJ$67V#colABmu zTs-xWB>r8l>-85ZrVXi_iA5O~eJseYM z#5<|3!12e~>q}^n-YsBwWK*7F%T%4|o{&(G0NdGlX%%@rNc;&Vm;SP{!g*UA^R2=z zm#y%_SBy+!!UA6U=oIxWF?moGFU@6(RU`TJ~4ddZSe5~!cyx3 z&r|^?EqyqaiLSq1(yzb&j9<;&4YbgkHmsxP?;U&y215{<{~|PZziT}cxh@Hqa8~ldAq@ZoZ!)J>DLxLk!cXo}fj@jjz z(m4)#Pt#3t#`PJ%6b=1S_kybbvnSvlY=wibU}eA0^ohTGl<>C$cjg42!6rM@-&YeH zE{Lf~S5Xg8+^Ow@I;5iQamq?XXMx=G9v+z=j%()@RAp_$1g|a39`n){FU`x}kK3GK z<;?)k(EuTaX_w72Y5>(m_Rczo(0AU}+)Z|QV1NF9(oAX?L?kg`h!dhBL}iG8B&~pmG6xVyASwcq z01;3iGAS5B7)=O)5R$x8sC{qW_V(-RglIPNtv?&CgCDd>B1`X%h0v&*a~4Y`GFqHgi6fzWvR~Rz2}(R>hjZZ3s|2sx(Uf?ii|^wSYO>cUYh<4_655~k?cZvy$bhyj ze?3@fM52S(lu@~l8{hs08Yar}BpxA4t!(RL*1NG&D>sW!KzZ{TnTV zqrPI0TNgJHSdlsc$+&&KZXbb3Yz=94a^r@AwY{ZdrLc{g2@{}lhl&-V9f|9@)*F8b z2*@`KRo@m^f9hsKX}hOCn|F;_M;adTM!#o&9u`hQLq+O@T8*(egE@IU$7BmK=czCK z<8DKrv38%{o&7OPMv{DJcp%0Zol8x#$c>FqGG%$MWSFZ>35{BvkD4c>oxqlQ$K#7q znw#|}r3BzerB{%LCnM_ z?|EiQBpxAuaM10auV}~PvlbG61MAOksQCcubuy;oBoQhK*3#s-J`6Ocs<+W6sxIeLhN3a<^Lf+)2}`Ih^S4GI3jNv;sCuea&b% z_LNTHc2;hyk5cfB*Xrc8BdYDRvxSwc>e zuSg(ngUd0kx_l!QGkidwK_hPQ#MrXc>f)=B6CcCN3ClF#a+MT0H6>c+HGxC2gp=F} z#s$S19UKkzFWy|Zi8AwHt)8Zf=^nx@ zU7fCS^#bYr0SepHhh_O#JTdz*4<|@JVfwo0)k^F5)x94C4H1l@A|Ynsg6&gMbjca1 zg%vqa-30Cswhr(hWyAeWr_S17dBb@(BFtgh zvlFL)#t3y}?0J}|ALJE~2aD4NN24&{s&ifBII)E35^8|+lz_qd3S;^EkDCe}w*j#Z z%@lhoxDd(=UQSJ`H&he2p;weM!u6sWweg^tgE(ol`3WUlkup1*-hu3YH5&C7njwI} ztZ4M?;>O=!$X&_HtzarYZr;E|qzYFW2{uK@bn9RyA 
zg!tyJ4lDF-Y`m71y&iwN5CRsWM`QwKtX*&PVe-ILR`9xwn@xR6(w%YR@0&(jJ55-9 z-hUza2{uTy<;p~FWlj88q=yjRc{Pgc+b&D<+uM!sDYT5#e-L<|ma*U~90q7u0G`=X zrp1fn!|Pj1Y2LA)u~#AU5PoB+Nyu?f&;8xgnLZz8BknAXGX>>CMUO$3RxVg-L%Q}8 zCpC(7OV=0y=Ea{8#!}wW!tkWz-x4Z%iVe&7cio-gp*Dr}U_omParFH^C+qJF{SdMb zY`Q^|__bKY%Fw(dCev=yXOeO*@)+p&g64uwgS?cfGYLL}A&dJ|oOXTQcEo0dT)n z4L~{~RFX8DuS1tOQ{^T71KG6YFK-BJs<>4~etTqUD65DclVM7*Vd$ykxlGnW7>|F9XZnMK; zfHdR_BrjIL)rGs?YGcDh=Na2iZ5@k7BCmKlez;qF+@rW?ayX0rHW~(-YaG6G{g@Lb z$$1AZKe7YY4{W|%ekEEYlMvMvq&k;{7QLf+n`9tGO$1rlXE7KeFxy`%RcMoq@pD$C zIDax#Yd6BU7Ja#{!bo6m9 z_(c~i6|$;Hs8C+})DYw>cI%$wejdelNknDvYgXz*_Yjpi47`~4(fqw+k=>9r^BJMJ?EfefGsi~ zMT7Q#k|)U2{TI~xeIoh09c49Z_=0woP2wrIJrOR*n*)h38i!>EvfyY=y=g1q2^**y zI=y-wntw0+tsAC&8T_$d#vx^(Z8G@K{)?IRPn?C^0bd#1ibz+uOLS3gdg@W`o}gbm zuK(7{@i=tdoJ$$V7YMRb3&dmk9Vem}7`M!ycxih3x!@ySj*)Pz@TXX{Nh)w1!@nM0 z;4^vl6|&k@nM2ApYXePHGe+*gf7`at`h!4lCAMNC&>4A47)*O4u6l^HEp?KgE%DCT zWpi{TwL1#UEark`bY#G!ewwDMz80MEmBOqT@;H^ZmtVebq}{GhaI`y9m(HX) z$plQj5zs)YCPb7Q(kUGRVBi_9=3QyPJqkERz0T|f(=&569KfznBFg;=*@~LUe#>vL z^5$?j3eL`Y(hSPLWsr7_b_g0g<4`Qw+23**F3RadVpFA9+kjz2j>$=2y604){u={+ zx&D>WV9Tyb&^;q*!QhP8jTcwP5P4l^7@GC^3NO9bk2sv}v34(e^`MMMHoEmgd#t2% zI0XIv!tA$qTm1;6H_Atow*vzK=uWWzs&|n`*;Bk@|=Q@YkPRh*D5dAr>rnWf@JL7S=SCE2l!fFXili_kjH+TWkZ( z=5?>BfEF}?#`&|4JeYF~Fc2cdKoT90SsSFo@}BFO0~sM>1@A$-BH#fOidtKRI;{M) zEMpZor66~DM%YdZ77p_!L)V>6r3L%Tf~n#G%HRvX;BqvmY`1|JOO2Ho0ZV&O|8Uw{ zvR%eil7s3rsYzQVv#3bbZi3=GzIk>sEw~Hxn|1+Jr&$Ww~|x>@zAKoHr0x*qOTlVct_zQ@o#VhhmCH$l4VgD5qW6)RyGoC2RF zIkwY;wH6eeH8!`!7N6n&O7-eHVi?r2wbid&5vZyGbSpd3>#l2r(&noqcIY$-v1TlB z& zAPC||4Tk2DDRR&0XV2ZF*8 z*|dhb6~=8Q-uZ_B<^oA>QA9}Mx8Nptz451~<(h)t1fwQTt*Ej)I@OgoRFQSB4P=q~ z^a6q>`WxghoprWkmvGNGq8A!-5PAvzSg|15>g^V=26_E3jiMOR@5uDv)S)sx*5msf z(aovEw~v_$N_JlV`Y47|e~? 
zN3JsmN!QEjXAF_G%d-sLf3NbS9bHY;q)W0e?36`oyqhfb*@;eTwz*D+NH*TApk$)_ zCz6O1pGQh@oY|!>A%m;~%kHIWB@v0C(t$0;+*e~GAD-{aj*VbiO}@8u6!y7XWStDB zm7I!g$CoSWiC^s*yt$_b=?A8)_Sy z;+@rTH{4l_SWO?Xfb*mv_ZZ%=dR*S9rz$=P;f?i8N)Y11tqx{4@DB%8Mh$Vw~+duNnAP z-tWETJ!{0xjT@(>*WZc4#4|4ziOMqz5Bz8@m&P6r6|PCd50}>K1aTp69^sn_ z_P^fG>YsWA+nk*aV%JW1?wa&xZ=()_QeJIfYRWD3%z!%eqeMDmKy#I@NpZang%A01 zs=k>C^{h3Jv+uN0I;ydfr^-iE%o6E7K`Y z1LrB7a&dh#7n+=?hD_6hrBkNqOb29T1*caj*)RX0*Gk;O$%72HEAz}4jNuGwR#L%m zndoW$$-)w$^8RO#Bk{j_^M5c;v&mC-o7t4zw$+-GN^fk4V7e-R3onHE^gvaK&>wDn zcH++fOZudfIpqTj#@d#^4>PCp(hE5rPENiKo!OUFtpNvU8>03o8)D>AA2~>O0`ss7 zAuG><^Y;e5Ux2?#GB`b*qumD?jt#jTZ3p~|^QyxqyM9m@5Gla&8_}KZWt0w_nX9-l zJ2-VRXp8+kLBH~ppZuHglOB2o;`i$y*U{1oc{HbSTxyZilcn+?9V4BfLT3zPBf4h3 zp@s(s2}<`{wt1;jTH)CU{^aEtjSpEnZH2<)L+`?^r`~&{H;WAYV zLI*a@Sr$!hc~71=Vsmr|xT0IuCY!d>;&qa}bjgt881c=fs|%tS@<~MRhI|sg*djxD z>5SoGVJF7ml`Q|)FzFC@K-#S}pWT~CBoffH!jHLC_q__2Pv>agr>U0Z*f_1EZkVY2 z?x>(G09^%=VK3j?<1;+kZ1p3v8~wK07d~th#NiN=y$ZFa@!GpO1Pz@hc@>NBLpFKqv#9oy>szsz1q=_)t0!dzPRBcf-q?zc&2|cy&c_CJPL_<6hDSkS!dHViz=IQ7763H7zoH`0v$s|}mAc0X z8kmgCmHLht-_#euL-nSER~oIJbeBzfv;xx?m_WXM*a*v?cD^1GSHK4FV)qL_|BdlE zmO>Loa#|QBXZU9mcZ8M%O32Y(_GKa{+_XjRfM75uzLpOHu|2LK#Bsptz?If3A}_RC zc|!7fT$EK!&sA491{rn(UeOr)p(pRJpr1WFBzo!dWFO1KU!l zO-Q71?r9Hk*kT@{y}kX}aCGH!qXJjbTM*a})&WG6uQXJ1Sc!g(N8htu_)|I}wEHE9 zglu1q=bjmq+k4odQT)m2Gx%QWp4?gqh_m{&W^_EW3yf*rt6vR^`fygLiW`;J*J3{M%{`;okNZtABu#Umni| zf!haxc&-QM!fnw1^e{e)zXST%@t-dPN^l~F41MXm-4c+~V;K=cTpO@17XKr<=H-dT z6}$u|X$fi@Vxt1|g-dIJvN5?L)qqa&V<7YGSQ>5WlF*s;3h3{@1c70L0>#kDLWo_? 
zgpN*FeafnWZt1&=(EULN-wgu&1!e9*$G=zu3Oo-w`V9`q3W#&FpOtvsC`b>u5A!ax zR5NI+JhzUO6^yD&VKjHAIby^rsoWO?S4HROWOk$~N=He8S*i@7S$jI&B3T;@ z(vs;C)tMs$HM$)v7Af)N(}JA2{UEpTp~d^tiZ5%qmB{Vc=M=5vr3TGcpKI1`Dob~O zckEcF(o?lkQg`>1*Ykd{$qR~^yi~Y;EbfdeA>K65a-+*kgLNu)aM`1Pt~)|Gz10t4 zhGpOKbhMvXeRMJXI}9%(nckZ*>3&c4)W^L38i|z2$_?hblLF0q?l}G8oZmY>7EIzq z_)=lNU~QMD#fQ}DIDF_*sq(H2TGDyi(d{sjbZ^AKq)KC?<|(VaL4kk5j-zz&8P<-w z(#1-XVw&AmkfxDesl&Qq)65hq*197z_@i?(5LTCa*w;|OCj;7Tca}ZkqTU^ANf`x+ zrAHo|D4O-50(k^hvht%cB2CzTFsiyTrssK?4*f%*bucZAV1{IkC2y}vPc8K{Vj>CE z?j35v63QWdKWQ=es762pVP91Bl@u4#$|{d4^=O|GatdwWIByC3?8P+x#cc`*wr0RQ z=e>=0yyTk{EIdYja;izrW}G|pdOg|0W5)F;rmTn8X}+>5;Sm>$J;-s}4lnQhVV8JV zyN68`8g;uB(6Mo4RS{#M6^KQS;vg54(Z~RSwVC~KuM4G8|7A0Id@@S5ax<)Pr*~vF zo)WLOwciqYi}j}!U+rhzDm`J$M};7}xrY)WJ2yo2lxwqp5+$upEm&*3JS48H?jx>j z2Cq4Z6JS)1N}nIp?f?_r`LP~XF0FIwLb**0t41$-V~|2f>ly-?+J;9b_6|wDC(L)k z&xkF1x;k}x@&_L_-1BI_;Y(OgCFlJTT9JvuO+=azs}DbW2SP7I%YVWpV-rZVYS|>W zU?rZpx-UWwx}ehSD~u$Ilj!yKu_&+3ksB^F`_QNNVrHCkwLOQr+bzxD*HEmCvq_1o zo5q8KLpki2y213+5jt=bm6)7;rA+ps7+C+?b0%-%25p0ZCsj&nRE>_@P@8H**iq~F z+V$JfV369uq}Ruhx#`6dd6hv^n;*D^uJE?4RfdD^VswOjPx()Ta6J?Xaxq zYkjH>RfpqH@1ABdKj}7fj_}$5K3nx>==m(Hc7|#Z4-r&$Zuo5d4?j;ls?pz8U-1-n zgNM~-g^q`9FFUsmZ?)xPIy-Kkg7t)z9^CbfTP6WJ;l{~6j29iP+39!+UF~fHie>X1 zD~%-4WhQn=_fcI|gBvGLXVi<;4?C50sWcl50s@s$&v!c9Y}o`XyQ~ZR%JeTkQz%h( zY4?6m*StA9eAekw00glc?;1xrWX)iwdNoRRR#-@Gmzmku-$m`MwuS|o)k_dPb0 zcRPz8pj)j#T1w9Nhpdm6^j|%y(vRz+6UpWGDYFplWOnRf<6P-K7KJ30E!@hofb=t^ zq~!>mNV1>Ut*lx65B{Gd>CbDOR z(Ur#LqcN%6Ba4tgV=Yhb)+t8jRdR2sqNKHyJ3l5DRoqe;qD` zQ1D|^waF1MI&I*y&jR!G2MfXUokG3uA)6Wktnc0){*+t^9iRX0{|6S%g?@StL`h~q zNBL==yr)wqyC820^w;wu0Juu%{i zn#Ldg0RBIJ?*`c1KYnb^sx6Bc=KucX{pOed{F4of|Mw^VIl}*sWBCtpv=qNHruOvE TT6bGW(9HH7+?%!g_{IMNQ(%%H diff --git a/tests/model_comp.PNG b/tests/model_comp.PNG deleted file mode 100644 index 8325248d8b6aa574984ad272f0ae622dc54b0bfc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 84607 
zcmeFa30#v$w?B@hLai))YwH3+YO7+afP#Q*i5B+-tQ#szR5m3EQI-%0NkkOtf+#46 zED;x|vP8B-2qY*fAWKk$KpsL+mH;6H2uVmnlK&H^ZT0PY@BQ5N{{HXhUi~O0!#vN- zoO9;PIo~;R=KOBgE&87ueXgaYrN3?K#ywhEx=vbJ+6JHLfF+2GlP|#kW`*q8@}pKk z)8aw!%clVwc5TqoDomI=b#ylPea^|PheEWpz6{m;o7LqH!D?x#?`_+-VIS6em^0_b zVsS=)1aqWmxXHA#(%RWgI5E-?B%H9eRy1JHLX1$-v=e4c>lryg<j{ z%!-LG{Kt*z^!c_{agX$VuGURC_o(nu=*iEb*AH~-WO+PD+_%4c`zy3)!J?~+Qf1_y z^Ssl_Uh54@4XXz)kb5T%n!@19IviQ`eWOgCci{2#D4i%xr!KN7}h1@>+z{+4tgn%hE98 zIAhX>uX1zOW@g->B|T=Ul!_bykneH_83(0{1zaJ14S)2>MY2v@@J3S2d?!wk?LXe{ zN2tx{Vm8#jr>AFI)M%0t=it>v4_N?YC8XdLQXIU&p+Y%Q?mI~dY`xjb>hd3Ad`kU3 zwCsVRgx`;UQ0_(x>dLvGa1u|v^`$AuHce!=_bHs(rJ+y+u+?g`gNo*s;2j+(X_{fu zm5NGBe-jD{vO@=55BEN*Yj}8kCEART=GH0|^g|Clprh@ib$c4|aH6u&mqzxYmigho zeednH6Rz&sZp^MqT+(E>nL-*$YrqU}{oEMRO$o%QMqqQ&V_=?}MTxeqQW(`17p9`D z=)f-z;SP_*q(13+HdPoGOECkK*WFobC9~fr%asD%Ow|y@87EolLK>8}!M$q1S9@?( z%B}^)22dMjLT+PVn)-9#D;Z(LrWL}73P<;{ITBT1eHo-CCu%{kO-AlOd2%|f>T zK}_SAmRp3{hjXw3E}g^UNRQOBH@+v|R|Ye{Mynk+qogC876*^Gb_3-lw3;Bi=n=;! z!p5KVk(XCf-v}HNH*}s|Wa!Dt9ELtn%wJZNP}x0}lPJj4&-`F+l$&CMOpHpsXqaW1 zL0}Y@f1udksHQ-Tcmo9FsaKslr$Xt-E3*Z(T;In8rsAZaftNZV7{=zw}`7k zLd<=9d|DhFw`}e(z|RckyF20yKysFxI~GM4yjZt7NJv+nL(?jm^h8Ck&;vuN>$KZe zhaZ=k)0n;^X?J^}9RRh9VAnB%X3*&WECRb&pa|5~9=DoUWMYEyE%)+woRE%xGko!o zE~LU)-afDHVUup~A+}{)LD~!az66j}?wTJ1Guh0LDhX^ zT(U1SN7-XpUMO2EQjZ*(55)64gQWB~>L8By_2m&NJNDaV)UrAirbkz`c;_1EXoG{( zRP7mXmLa6xKyi4dWQ0SiXf&fW#f4?n%c8C*cIG5F3+cK^f)SGT2ZE~Usy-_Y>KHzP zygM9hH99r;mI83M#-8X{Z%I^l_uQ*y*ex%w%&iwE3~DF+K_cD|^IngA4)6o5;1ph|oU-RWe5iV)ikgyn5rnrp6?XxvZIKMm zA(L?yJV;v6MyU*jSSoI)3YSk+oJ%BnZ<4P38LhrQa?yOuB5A4<7u9XQw&J_NhZtqD z?kOl3U96RB_!GH8O}9uzp&fuNBdv^L8Rf~z*ctR8ef z$?g8NsMJ~5XUVvUl;&5989CekM7*2)hvh%1SSc)R5B|9ZPJnsB^JL%pH=@%rBE6Rq zdRiK)5X)+YE%tU896Bt|chx2`!Z3EnHEW}j^VG{^mJ*~-y;?9Yq1IC4I zg4UNb36`0R?^?cnyaN63TozDXyHENoc~V4r%i|`6jXnmn4WT#nc+W8c6h6%jKq(U4 zF>P@fiW(20qpInizvWb?M1kj4tdPE7MjBM4LYw4{!`*YKJnvX~GS~H*X0?#Z3-bCUYLP22!b=qB+?mQ&XttSGLnpfR2m$WMu?E-8;h9b%F 
zBY`_GUI3uXWgL+_u^g_VHvyY#3~!n^9l^0ULAv;FCWGWJ+@vG-lJ!hkts~Sio;do= z&3G{|bv=t|iqfUNwop`^l0A9eGXmeMz%sp$m2N56;k_x6CdlVd!dqe|UcA0}CrwDp z68j8uds8e6wfSgrWn-m-4ZoB?j<=N7s5V8$)Qj)qE3~&U^LFr2=D-r1@uQ}=20BNC znD@RNnH{j)d9ltabyR@=N0gHUSy^@tB({g!kve%j%1TUYC;gW zyeT{%2ubEtzOc;EX7+OPf!v5RIGo`&$V1)!6dIRw9@J99pq4^eJ!3M=_}z>Ht-5G5 zLqWhVOfl%)%+?7Nq|_uu2PpTa&!f9pAuC=RT;>(*(76nM7-Jlmk`&Fm=dUXKy|e=8Xddy>UHJGK zkcx`T`<40P_8S<{5>%tAa&Ub}qYjWp$b0Ff5Zr>hD<%o{G`v$~{!5!OPa>&TS+v$P zCnDm_Ti&0TC02Or!DwFpVM-8(9hSL0Hq5Qmi}7M){3tJvhTJb0e?SXJWgHU~T+gR6 z%W?aa<4Mqmx!iF63tFQ4pqJV=k~uEQ=(W!XsdWB260~8SQy$#wDJz8?kgO@Hk4Fg;(v-AfGFK|6A%L}b^(!W@ zaU0Z9#+H%nhzQ|zW}Q;^+*E{tTwBw%N; z7AN@;NYOI}sU!mu`BB}6Xz83xXn!q`@;dHDDbXxJ{UY&skF7;cLd)dj9bt#+o_GoG zad={?pR)C49jYwhYS?j4=n-Ay9g_z25W8<~l47ZLV=MnA^Lv_wU@rrYM=XHEW=l~^ zaq23-{Xm{8MSpk>-d4a+)s?^{3+qr~et876gJDK9 z0axNkoYhI(cxUAnRRsN}0V1hh#@h$Z@w-AHu##3WUNCIWvP2W!mV-(rg%4!O)(!Uc zN``wHifG>XY5?YX2hF}u&q#wfa&!Ac$=N;Q7aWtKbI=#N1IW>+|<(@FS#1hg1#4j_nnQTTLx+J*j zWI7hT@xgf5j^=qa#s1u(h*x|e>bQu4M@BG+!@cw!i3vw@yLJd^4^`nVEj1?ymr7hB zW!*2BtS;|%Y=w!C`x7yp8qU(a2`<94VqkMuKI6-^gDs1(wlqJlmV`BLfPO|?`0#d$ zBF50Ut$o3JHpmxN&eqSOc_*zVIv$Zf$SM?$sx~0b+N#Q1HBw zf@10E)cmmxz37r>@KX`Zk`enOx=L0(g{vCbV9ec%yM$br>g*2=Md`TcB)#bFSm98L z<$Xizq}Qg>?pZ)(2*ad({9?Zk3|DBf(-IOT<}@W-)pt_f%c!feO|dZ+R6&W}Vs{*+ zQ}Ux=jp4AH>37Ch3I>4@*d{p1u|{LRwpeN1Z0e|ZroLM^(NTH$y4_=oL<^GJY`A3Ay429-AvGHo1QaJ+b&lgS)`R^vD`Rj2 zYM^}V;ZkqftC~@%<2pK-z-5Plmdi~87ZI?$ej|XcE4Ew;;6;>|$>Q_x+dwTV_Zng4 zu0D z?=HKWtMEtXmM09+$pw+lpm#*4K{+3C*+j$Ttssp?jyg`*DcK8H`5r86n#S6>li>Ul zq!vXs?3+R<`kKS`mbvwS-s=5I)FSNs3b%psIhNreq)vS9Be2|Dh;9e1*?aZgC!$P0 zkR+v5Uo;LP{{!|;vtd!fzTl@^B-uuJQj&FVvG6({O3pgdce0SrUMF$tQO7KR<`+h5l$ z;7Dz4bwW()VlicJIA=j>?vt9oI2|<%x5VE307tTFv=&G6l1(ICIN^)~-IJZK0akWV zhzZhX5`K|^u!8#iqKe9i@gD#OCx4H)&4_JWS8Q3DqIU#n?7@VPdO?!baj0oEAf!>J z!3JpP$M{T~>K^DdciFPbq3^>nzHHwiZRX)Fyp-cRkmOhHMId@9gI>%5iJJF6CWxYsVA!>4LkqDfnO#zM*WBF*c_sGR8euayKx)g#PDD+)N z1pyoYh+?A^xBiIDi1Ay7E2Y$>fKsJviptyZo_+dRs<&)C2$^lQmNpVsM9?rwN;lDK z{n}KGEMSM_F$Y>iMNb 
zUc3jn1>3na&*T1wzS7D3ZS-2@w&<>k?%l;JxU#13S5k`wHxJD}7j34gpjV_XLbhG< zLA6bV4Ss#US8k1Z96J%b9IM=u=7GCpZIp^y`O0JYHA3feMMHW(+N(4jrh~_7>xc2v zum2K6$tcHP-u4~s*ky_vUIUbeA>>_sKU&w{UOml`)3YciRH?RzCS+PG^yDz6JJrBK zRK1?q@Fa^@)KPaOWTa>!xbcF0jMIc~_Z-bT;}V-2idN`3%>q9kIU-wWjmI7$p7SSY zbqK1g7wv00bePe+d+j&r1C;f;n)+2GPq{P953PZpNTok0@;TYjS{3rS%Zu}-R2M$E z4n??xJmNr(puysLPlcqk-~!0Y8(%Ff@ptaGL!&EEltwXX4~;NVL|KBW_n|;jGHHR}j*w)71no)O6 zq-$ExikQ-O)BL}FLDb05ZuP@P?ZPq;YNEYp*;ua`_@mfZattaZa z^LNGFlqqby`wmZzwah})kjL~Zw%e*b0g`kd#YFSETVH0S`u((Urr%nDvesPLT7o|R z)0pby4VwClJI@QUSZ{eSYk|RHfXh$@7lbRWtRdRV7Una_GRxm>@Fn6g8?Pgi&?Tp3EuCC~nv>o-S zC#Rtxt0_+2iqS%4u%fD1tYqtE#`6-5JuwE~g+T7~*s8IEeaTt=^0A^CbXk6O&neB& zwHB#jr-l4h^HVY%332jwVS1hsvLq|LMKP76{!Udb9YgIcn&xg$ewTYK zJ=0%lhC1h;yy@tSY%g^BRj%2e|NpL!tnzg-S8|Og;cW?@7KirDHwK(DyM9-Y>tScX zey1MLc7q&uz1D}Iv{M8=gU-6#WOENBmyRF2_jn83S-5zfJn*IaN(%ilDsL0qTpswW zjg|27*w9YR=a=qiBy!cZRRg)3;Q8%x2Y5>q>f&sCspk9iqeIuj3^arwm+X?QbPB&T z>}sh`7xP(LN^iY%XFArLr5~PXy^v4#F-E*}zeqn^C>Hya+;C{oe4c3aIdWIi1>{Ue5^;q})P4LS0`D_0}$|Xc!oqj(L zag&THyrt>>KrS!C=SO&DQPYg@WiP~nJ+-9Fj{iuek7WAaC`~UwoVZ5PF_5YKlW)D` zC!h@Hgay8drOUCrW^s&>JkSM>0w?uRV+8nNAa~bqvpSZ}Zgz*L?Kkd(UA#rMooHQ7 zp_^*d&s>h}&smy+>_j@wuzh4KX7K5Ti_tlzpc24dn5+`L*?$M4Of-4l#$V#aYvYA4 zcVT)T>AnxDJ)O*Np`%kBa~+3qQuMO->kc|BfI9dB;nt(+pJ{J2*U8j#GX8K^-uH*p zlig*#Q}S>6e+G~)k=KEfilbsaER=3;;2BiGjWZ|)+T>sF(sL3Iq-?U&utT=vRW72aeqN<`*1%CJZ_C~P=+Qa@Cj_fq2EAo(7vUUGinj~D8+-!)kx&5Up#^dM|K~sOr~Ewa{3o=t#fa)Ut7HDb8Y5V9|V>7%6)^i znKwgkMW_-dg6r`^YPvvDy%4Ljdi`r~1^;aw9V!i{bcC@uH&&f|f8N-aggpD&on2e| zGm1;EjOt!g-lcmBMIOoHt)`(+%D{6oeRx&{ZWzQ&v)xsNxV*6FM2kN_L3xZ)Ht6BS zW6zd^DsN5C{!t8@H z_Q@ZtmbI`0?<}9n!otF(Gb=jCmZ)8#QuF!0+9B)Q+x|$a8Jpwa=^2CRj|K-gmV?dz zffaPBAnyZafu>QR*0dEBiS(Ja%|N4#aH+p0@SSl{Q?+%v+ak>reE0Sq&qH9ng7@ty zpIL3wX44;ysm6rXV;g3SzCSfxYRZa!p9(QHqn@@v(~VO|gnv`SK zZhynjuYIPI$wxj;w?_TyD^rfSjR zqXQgjLuCw62Xu62SKI2*#p^(O`meI}l)jUQrC=-qHM9-{FaE27yWN*OIzbE|kEEkQ zlK6|T&rkn#+WM{5&BRyzEkryQ{i{>(SI0S{P5%Zd1wo$~hvzSIQqO6xo)f5cYCf6v 
z>V^vAr*#nM#L3s(gyg>SY)rMkn^q-E)>=65+w>3d1M8K)@1^`FI??&h{3cW03((^W z`)$$vd$hyR?imlvdH#{Z9QVLUI397a1t>+aV{foA%K`8}`2~nIA3s z$p6&e|2qY9p>1h*}=&Ibk?{=);B`@VB^1^lKun0YO(4s%W5fHf;GP?@mEfTos7@va zMBm;aUqs0h14Wn8(`FAk*hey!1Qs)>rfoG`VOirbt9zMgU;+n`x;&PhjIAu&V z&*vQ^{sB(;#VgcuyV1Nu!33*o_QbkRQ4Sk^n;tmb zLN`;uO7U{#nxFmsi4a9Qb{y{W^h(xMXG(}2em{sQ2k%1f$kQE45CODs`(p?A6fpe% z@^@eqB?viFW}`JAA}QY3PKOM6{SHnv1P+3@;>`ZUG<@~00e$>`5qU13^{uv>^E+4` zgiMc{k)n*=kB2fC^E~zJdR;RZk!t#GvmvTGqAOwnI}SGWeu&oD#c#*5au~1vSogz? zi54?GCtTdD+w-;EbTainVCa>>c{-Vu)mlGYj5gC|>UAKqUfGpq3!=Hd;`q$x@y0PM!@zet2XUx8 za0>y%^)>szAfo!)W)8R`gzPA?**2g7fq$(qso52NSFjv9yx7R-Sa*ju2!=1jT8$z{ zqj{IFa==||5aM`dr|0}<+h>131D-#Bp7?ZnKNp1a{vRNu|9)2LKIo|D8s0%ipUpSz zceEfJQ8$U%jb6##15dK``GIOi)BJ{X)XE|E{eKT?y}eYM=~wI#o4&F?{lr|`y9?Mm z1G^95IuD8P-S?~i7BA->8w~%9utoGc0oS~dqkO;!@fT)wH1~$sVsWnDq zaLXaHOtDsFf@N$2cO3tg)~c*)bZ->*GhSoQ`Ok-Vx5cG3x}$A_6dkCH>KBse0APJF zbgfrlNU?j0)!zdLUpH{#Yd3Xr**kTwXEFK~MV<@r0&?x}KX(XLIxWH)torL)64x%k z-*iYWG)`$*Nq=aXBA4iM+Qy{zEsAR@eNcmG{|#vTCizN~&y&AJ;6CCrAMu%wg7gue z`O9m^laE^u|M@2Kzn=-UdUWT%>mC-LMRrqvt^1!?!5{a;K4OHL*xmnmjPS$gWZ-Xv zd#{}UQ^x?#(4z+!rHvF+0`FEF7_oz182M{tu*rWnaN`{OMA}<-HW-xqal2dmFT-0t zTEQQ!;Qz%|Fa#R}0Rrye3^rb!1}LT2nb5?JBK%tHe=RmKLB06fY|^H8P~!WMiA4zR z2IKpce%<$L-XU=iEVLrJaB|+@)JYl!n3gTz@?e=t6hWXtp(dA z6%;xgjHjN7(f#b7UbA%F=){Kpe^5JgJdiw|PHNku&PZh}V3=#To|7ojj(G)!BL6vv zT{}_NtCMLnxL!213_S6pc&)|*g19xKdVj#jK`i~|KL?ju6QBJe=wkI%S%+2{t_Qq1 zhnPn(09)Sr`MV(Nr@EOwI0=wB7W0!Yl8G()B5pbulyub#cjmtcLJqTOuzv|cq7hU3 zQ}_F_=xqC+1So}8p%{(z*KVRP-lTSsWr0iw_1DyjC&lyrz^Qt8mtV2J@`GS(?HgMl z$}ss}<b~2(-vSnBSkrVj0LQnrDKj3U^e^dvae_XuD7TJD0&Gy_h zS`UFQ{_(%aQ)K6(@BcMVtsTQ-dGqs$Y1YP3u2XhSjus^4zF>QuX`%WpSo)gbyTeVI zV?X~{Ag-OQZSvCTkYF%kHsg;ciTz%FiVmjHR#o0O48qQ7Hbk$;?~7me zH-U#xjnE;s4aJ8-lCu{$pxE6BeGmjC{}=fFzbW{!wc<%g2h#D5-@!7ly>%_~ky|Zo zKh?=dy&WAO7;}uvX`Md8^_RW)Juk^6;#mPB*HfTc8TQQijrMI#H?A&U14(z!A+tGn6b8xfb*{7tXrh#L+$!=LT{LN&37u^4=GdE=gvzz7BT7N~=GAd6W&6wZoOBoc 
z4T046w3#>lU6F^s8ZH07+@09r9&|pid(iR6opX&oopQvAZIHq|$eic8nQ3uv9kyOP0cN@IM(PUw4930i*j3;~`xlW+ zqNX)jWbc{7$}M1ucM7gzwc@$qUFq+sm6{DTsV63{Q#AQbX8!1G{xIFp|9r`LKaBX& zBHL=~fJ2d9>O2w=Q(1qhSfJ7$ysn?&l_n(}bq^4ycyS&TgfmZ6a$wfV8`eUyS758G zB3Nn&b-&t4Ie{d{Ht`-JUE9{Ep=HeKOSy=Y$h=Ye*D^`wfNj17J=3-m9bgw!&tVjD zCjz)DRP0<2TeEOLi;%Hw9AK$4!Wt^w2%L2|!st(D%e_`E?_ylPAc@&cJ;J3A8$W~N z8cSE`=?YLJDOO9sQ+4^TgHs&p&V@MuELm9**uahexEkZ#*NvV`O8y$T-VWvbneAZMYx zESpC#X4WXea>8C%PuN{S2#XSId889wtn)({4f3>9Cyo+9lEehD$Y4(>I&ldA?!;T= zHw<{>kiuUNagy#<&~hvPah5pQ2K(xB7!)z8Yj@uUdxb6uE{`&(RCMyf${@bf*#uQf zIY(EpZe6$sToQ9zkVZ5R3zFe;oy7RtT*S#FDPOW~$;|n1BW_irTO<+IA=|V3mF!x=X_Cz@kO>LH###8aOp+Vl?(AcQL+%NHZXQVWlfrFt z!gUQyJ}0?^m>`tlM$hT9VQ zKk^UE3ylckf;?)*nAG}YAc?goek?mo3DE7q4mMH4iLR$FNMp{z zx$B6JxW&fGUme*Z&UNMUCQr7PA#d5haOJ$#lg$+!l z+p#F%BjbywMBXJj;e;mLv2~9|5ex}EKI{$;M!h<_06!fI97 z96_8MJXyDu0MFP^#F76$AqrzFegE+54p}vg?4FD%B%! z>rwo!@8#@z4v6VeBVdx4B78|9Y?1-c={Na5!srqQ8QmeZY}3r#w~<*s9KKk9K8$(M zzhF=HBZNrRaoIXGgBKGxxYy&GqI0YM5$R3vVy~HiA2`uVy>IzNzV0WRm|02+QX%R4 z=9g~!NkM&}`vd}Rp@=-(-}lKyGNO|KQEaYbWuksTvDl(OWo$Yo@18qQejE}u56zV* zm-D+MyNYyQWV-C_7Tr2Pfvi0ngz#o1;P3E-ejOphHLOE5gmyZ?rjL|>;G&BJj#$LW z?BnW}gF&#*q(Pe3KmUp@0g2TV8l>n98LDS_?;Q*oC6szgsd#p1KFe34l^d#RfuJENhm!ygZVi675bQx+*8xh#w*h@JdxTkbMNULY8hf!cE}q z9xq)WPdDr`@|-9o;au@DuvU~oX}vqM4yvy7U>Jewf{p4j2QV2V(Xc9ps-rNJ6*Ng$ z^by@ne0M>Sw1MK4hdB8_8PdGc%S1Ok&m=5fVc2ct$tthZMe8@6mGcUWdu`S+;?1H6 zKQ2>j@KFbq!I#q*us9@l;U$}OTxy^FL=>zR-BKlQolK{4+tlRB?6H0f4%W>&To?qTC4)Xm_?D|N!9m8)3!5O{z& z5c*cUlAB+E;ofLF!c7R&y;9kco&Q3r%{9|DGnAFy0>#A+jxz^JVbitGi0h?wm*I@P zG_*jkot2o<;9Q^WQ#<+axabUlh#M(|01=Qxds`)ACn#;4=w4wwyU&G?2ZnhSSSLPgt)ez9Df8MVuYhTzBNnSb(TlV_G6PzqUh&57I{ z9lKHG6CRW{hvc$>(IvEg#$3`u#+T)ibCQ)=&lhw`@ZYxN;_o0KyZT7*meWxgerziG zDC0aSce^ZPlwJv2x8vFfsFF9~aa8m*qM$XCxlw&RfJ|>XOgVuCu+m|zysTT?v;0dt zWg)c_bdP*5YRy_1>aZew72+X3%DYnGfRMGH1G4LtcijQIoKgEp-V)h{?KjUbV>Z44 z=WM#LZC&An!11M#XNBed`=balk|sgIR&-ewL&kl-=dy6pQr7D})`4Abj8^Os8+8_g z$%Kp+v->Ln!_Qgd`CcPt&Wc~BY9ljT*hY@Mt-OWt;2TC2qlVO`!#0nUx_A?kC{v)j 
zN#Q3528f;Rbn=2>z{)EMB}U>fKX}D>NXGW%qrs(BRaQF0A?&8ZW6Qsq;8)Z4OoCef z3Q94>z}-6u`^9TyVfi{PJ*HAe(T8;8sX5KXASaz2r1j@x&&7;ROKl=h+=Er;&#E+c ze*}50XrpBaIKhgIHB923jD#h7C^`!1d0$#vWcbt;Yj6xN=rzlLSGYF0XNJk%5+|xH z{$Pw~xk#ozopP@Dx!pX8VwX+!n^vnichDzg^bGaa zn|Y!@XiA-+7qW9CWU<5P)NcC{DSsr5w7o85t2ii)^10xS$@tu$BVz(4{ii8#J`?D= zWBgUEg}VTGpERvXlCmqe8~2;?w!Q8x1jBk z#0@fW24IHsoHn=KB(a)}I~rQ9c@Fi}{qeXC>vcoW&LMWBYC7MR& zOewYsDFHeCywVWZ*4GmLuqgAh;X9zlb|gYOcpNckn;m9+nPrjB67vw2IY|8=q{1|a z72C*!^~mryVd3lP5~CU25~K_aBzQYYV}?V7M$REDKOIoF z3$;6V6)?qyH#ENNsQcC{MO~6- z*M&olry|Sx@KbZwmUY(8Zo*|de*!=FhS!Qwdw+&Z>;KkhPnhti_8;uA6%n>yY9(d(zKJX&%)Av zLBl2SXULVCNo(t(cN6JNKLXjRuWko;H~VQ>y{av-D&)Pkd7d-+#UIo&J* zIQd5HCxU6VVaVz!2_mYg#i!)Bra$PZ-9>JS4K=KIX)R^1$iCN6UuBd|G>E1fwOzleM z_YroN5c;SAJ$%$|&~Cnv=@R521s8%J*oJHvudM#W5Ih9z=q5PsF*5?Ro~tPL#kDFE zIa|s<%qjxKb#^Ew?co1=U+SG7r=|5j@3;N;+uB_i?%&!!Tg~v zB&z*D{^1PoxtcexFZ?}o?5%HUHHK_vvYXWrUteYYjSUG-Qnse(DR^4 zloO`YJv#e`6rEp}5$Y~qJ{mBefjZ_P=$falRShJJmvtuRP-D%h^(~6biQN3bFfZ)b zI9rkKD-C)8_f+-tGjf9;&t4Fw=U5yyc9H1w-FJ5eoz1RSjvcZpt@h*PE{^# zjmvI@F4qik@r&P6lb-&(bG-8@d8Fa3bk1D8(O~Vy=T&)Qfld9n;Kc@gXNJGND)v;} zhOJjZ#YA2#HhxI};InV-M8(?+D`Q-mqAoj29!+#D5qqngJm6j(ottHXmjruUDI%D2 z-mI09#pT`a#kyXV}I6M z5U3Sb^asv;;%?q)x;VJM2kc(uu{9HKh9ba6`yFv^CZ-YX{^@kJ~ZYburC~~xG+BO9KAU!1l_cXhxs)cp8 z)=Jl;#V{x6^Ym;)3Da4JSIqxCC-T1TfO?EhT)J!$|75(nVxR3k1WuftgC9uNmm5h3 zd#Xfi1o|@rdy#C&-EF6k^Zj5d(ITE_{kY0H!k|pi_6i$z+^6b>@d0q3uc}xj9L9&Z zG+d4j*fm~gLB$y-WJqf9!8YJQcZHBARnt;m{)c4L`;I?G(1y=! 
zq1fnTsL+g{@-ki~yn_c*KYM;lWT|>f9yWuOIaGeF8N})Hf0(qN7>AFzS#c^ABRI)( zV>Ekm5*w+nsipPoce$VbuC;pg$CBUiycueMIoCfpO#Imr-`#)he#yU<1^&aJA@F$c z|HLJj|3^ES_9~hem;BpN;xC;7@~pBF-eraKl`jizItbzZusT2<$o7J)hXr{{I^_5Z`g2LG_6Pu|Xm8n3hUd48!N zq;mY z7w0cC%jEPH)|eLOdr-4~YrL?>Q>>l(oY#=P_mlNeI)bSra7}pXU+t8(-|!09up&WJ|X@BBqcSP;m-mkQ><7Rld@lr^uA-Ir5f=edY`;)bENuTZBJ!T*vknVhtLE*ch$?#J*@fnAh{~IsK20Ig=oE869nr(Ds8sTbA}U1heG^WFiArM zrH+go{Bm+(2dYC^%y*=0+gmRhz%n|@3S17X=bIh}uFLsU zE9#3FjH(Wt3da&NcJ>-d979ii*@3+@y!3nH(N?#vM5$o%;Jl2D*=9dJ|{{-w1()fTs>8vz9bO7hLX2+QG0y)nAYQv zI|J$2VjnwZDcjL?qTS{k(>@fu;zj}l3sW4K{HgGfp1!857n0DDRrqX~PSQ zH*>UZls}pGU?;Hc;I5i&zCBNWQ3j9T9iazQPj8ES)|O=KJL~o>;94-{Q=0ybpV@&h z)xxZVR+Z;rx3iujpRRP4ok&OaElo%wGCD&H!Mmj33KV#O7i{cvd_i!Al}Y8^vV_(L zAqqPfMu8Hfgpg`_BPf_bEIsw^4oQ>)j383JN_sN~@|`*=*o%$kx2v_ZV!oM~h*yV( zCMyS=Zi_tcil_V3)}4_%KT}b1pX8V=ZDhb7+rN6sOG7zC`Z`eOX$G`mo&;Fw&6%b1nAZQ5s-z-8D>o3lD`aaF=c(avmCVnu8)%WH%{q zMQ|le;J}2&>t^jVHN(5kBYHbhckn64ahC_h8B86QXxWN35 z%_JYTi3;`CzWA<5+d6d{D=GbdDR}b3S%)M`T zrJ|m2WX?{NPbhe`o9nDwfhxqAAI+}MT(a?C^-KE=BKcd<&k`xRG9Aag9CGflqkG2d zGB?%uMm1(OZ);u1oQ#W@iP}(tg=WU>Y`c{~v>N^Nbo*Uhg=}7oJQ^=rPZ#^<5I(t* zv95%)vBbNH6#AM=9y^toa5ba0cO(TS`(>LcxpIeOQegJYpbsVU^;x&xq=076qcW9H<)LVelt=HZcpy0U!*u`4w?WUeC3FID-i5d^&JC7$kwqK^z9rIQ*n}661t95g&O6)Zl^1{5j+C}*$ ze7uEl>~&Gc(K4`r_Qo>hiQe;8~#b z%;}5jZgzIx)!TH@zmdc`1>Mic$21kM3Li-l{)D}Jji~1aV44se{rSxA8TjSwwv3bz zV7cHaIO_FFW(*gt(zOMHtWJhkMy`01{-xG8U;R4ct4eCBtSj}j)}Np0ZTIf0 z>aYTHMx=Q66HU`Aer;N_a2NdgHt@4%&Hwc-p8t{JYcF&%)A~s#LQ?K6K;Cc6IF*<2 zCbc;93I9ODeRw7qG#%WL+}Px-qcJFsT+#GvCv}}#;{cp?@c_-b71#Bgis#h~`@VF8 z^3AS-IgY0@4>yOE&TCE;A(KcyFGnL-%rUCw1*bQF)X1~qbWcs2cm{Zow72)I9`y`d z8EAG>rhbBXmM-6xU0+}bPf=JRVc@k{G`s@i!5}!FU#gPL+<_iP__5}U@5#N(HA?u! 
zDb=>+BuUuJ4Recc&M}#r_;^>UhCNFeuPDI0rffqAoE5aF3q_{e`})u%Y&u@CHKn5> zB4jjj{BcnTuW4x(P^&vvffOGlww_b&pT{krVFaIdZO%63W)By)#(_l=cN7KLk(lXs z4zy4~3dr#)@M`ztCl!8g_zd<#=UhYn@zSHX0}#rI(l?X$H6_ffNzr{ToTQ_+ppt?Q zC8LZx5NYAVh5FUFVP%%HWTcHFX-nov>Sk_*!DeQ8SjXmQ;WLXR$&>GMhw5_NO)2PUhV=o627U_;SBhnxm;&#Y`o*lP$Ba` zf^t7doN8(=Y2B^E^j#eRxizW>k5v!$Q&29vOZ&4ulu-4=S0IHBBbh^mmk(SN9C8K; zw2ts#K2RK)B$K@c&&t?Rq_+g^_$o@gfV%5D1&Sj%g$qC4*~KnrwjsL$$U*IhdX}Kj z4F)j1*)>3NFuh*7<5tHP7b^~2Vmn(Vv|GZB3Q6X` z2`ZG)5_HhYmt|p8gqV+_lAN58Jk{_rM^W}BiNH85=4n#D6X)u@0_?hV>(W;Uw4d!( zy}c7RTh~Xq^>}kDiY_Tdx7V4G+8#yqU1xo&>w9RVLX_FPJ8wnHd17OT)8Sz;kL0G4 z%2gPuv(|J^MDgKj1Z{AlMNk$YLi!$GQ5o*lao7mdzZZPF2Z`=}1?rTd4(^uJ(ekEw z`@P?EI4&MIzIy$uqojOOMXrG&S4a1T(UjK480CKOZ!QvueK*(Zs*0)YnXNu1`5M7R zwgjG<+L!N*c7s^SPrrme8z3K<$hvL#75>IR%tqp~x7w-;&)kQ|2dADbg%=K_79Xd1 z$tEz0v7~BAbxqvxTexAz>;z=75LVJi4_CJp=*Dk}>Z3hgax8iH>x)@$ykS;8&*zxC zJds?}p96^;xZ3^AL!e^pdD>t=StLCncxs;sxRn%-uJkj6B?sI@We{*i0vLZ*}x#9qpwil8&K5#?V7X;K|eX9~-V; z+YYyBhnK#fpfrB^)6>xb!b0cV8i&K?lmKyicI(^8`-k_Vn4BKnBzWZzBe<2=`oM4% ze)phuZ9QorMPyoFXq}8L+24t)+PClct}Fk-m6LOy-~Ai+bDr6` zQQ^i$+pbf{yR5#7&Xx4g6c?0e1|edLUBNnfh+-;Y1`c)Spy$>h-N#2H?j6M+N?$Is zbCtZwS0!G%I9m?3sQo0S3w$AAA;Lo&mG7c zqZsK`*VdqBZXn)liupfRO((l)!;%w8892}H3 zFx_DQu;<2oPa*D7t6WmF;D;9UFvG5E7(hU5GglpF9;>1m{@j1p=r5+@JPdLBpxJ~G zrjsPiBNkU}t?9Mbb-GP+{#2*2Z=QEz^aNC zY>{dR6Sta)bMmAb$_U-*=g&)cUtz6c-%RIpp6IJRC>dtbSWNmmh_n%j5j|0 zT#PvkQ*(pWn?S3=H`f@++{&Q+ie_;0xj2_H@RIPx2S?(fEfs6_Og{a@j@FGWdA-N* z3~Z6{(f?AvT`j}f(h6U0?&}BA|2&KI59e4T-`(|L zz9mOBK&5mE{>EZPkF<+eoL4iu0(pAs1XV;b9VZC$e03GUDIC4IIvhQTggq@83n*O% z(I^fppQ%3~V-``--JiSV?^g16whvu$`MO;yfM>-uL6gkS>XLB_Kc|AtaoCs!rCv2> zF6A1VeE>~huFGp=l2U$Wjxi9u^M@u<#YY+7EuA;VF*=>>Ft$z>cVV)M$>$-c)s1dX z)%*!9Bw?**6Yip-;`S1^pVA<%N-uCRqv55IGMIVa5Ht4BGTo_o>H|}@RZOZI-%XMN zVDu1?V~S{k!+bb!ayi~LYCC~k;)sQn}FX+Z8f5M8^F=9UHx&NTW>rNaBpN_o5M9x~z!im*J{NXOi4IZNf(YJ6RdfYsr(a7toigj;O+NQiB z{i)A?sn-=I=viVGUyzLAzBa95T}aA8bOxK=TI_|sHy&*xRn1d|q83d=x9MgH{d#J; 
zv1I~Wv`dZ1vud&sd8>BpNe;Rt4l2Nz%B5xY#-y!PkGGEpN07`r0sys>&Ek_<^r$$m z*5Y94tRCwrQ5dFXK?;z}K&Xm2>8B4(RkoA|(9&t9nw~yqWjzsKO>j>piaU?P`eGyu zNZF_!?x|Z>jjGUy0hnic)N)Imn(aQ*icYbp@<=Am1i+3M`rPd3xfg{%v-~AnHh^cZ zk?}yPbhmrUXpmt+9I+#4yviNH>#&xgETTR!gNQH`^2cT40ot_Qr0Tt$n9F{yhHs)4 zhAh=^-E*~GXCCwav{k3A^7SIdAnIanjIXPYA8wzR;XIl6Kpx5W`x>1CIjso33h_*% zgeOJ0KaT+XCsVx8>&|MQ#Nk7Ix*zi596nIk=CwPg`%Tf$V`5`j({=Hswy25k!9}NO zx~Kr#S!=3vBUeb3mugiLUjA)6vwGz3bbhDJ1Q`3DV77@`GkLy%R@##p<(?vbTc6D0b2jf zoP)36WT$*qtjR|hY6Pm{4Io~b=H+t|Axn^GTfQ7%kGH}u;xr)c;=x30DRL{}O}81x zDw1ts`DwjI(;+$zg~h~$Ys6w))TmFB%`-%J|JKNSiH@pBO+XWit~&p{b}dlnja~lh zhGyK6p5P1sQ8P6#V8~87QsqB`FqIB<6N@5d+%1}Lk5Om3?hqCvn#cgEKN2$_7;$(d zThm;y4)w$hyY?(X(9p~$8xWX~kM zS0u5ZGG&BsYII!_y*Da&5=zaGpsXPW1v-YPr{fka+*8?%$60X){9BX_+{IOQ2`^{P z%rEed&I9#szk(Vpp2Sh39o+m>G!S<*_-e%VRe_mvLSjHqdf-L#P$8?_lhrpz>$Lc^ zo2JttDXu0u{4uabUUtv%Ssy=4x}-2tN|?(+=CJ9YtCxI5hmO=Q-eB@yw$f;ze4J!1 z&zx@+GRKX=C>eS;dXsDe@WGnCHQt?W&u6lVfN@ zbK6Ii2rU<5@pN+9RfA@m(lA4L?%S&rRRJ(JtY4Dbf(QKsr-I~fHTU3g+L;7?9b z+49u6hNB@IgGkQ-`9+b=5=+^bk#*c_95Q}|E%dCsvK3urs7&Wm^OVo@pX@F`0JQ7N zfrefJmssl(%Kazbq0>svAcMA@8@Q!mxT73K@QfV%+qSl|7H`-P`}cNgC%c$oS2yXg zCwPsxMo-3e_*vOYz5RHOhSagGdD3A0uV@APa9=i<*uZPkdp+^G1F53a0`AZlOOI*g zO5ryoqYR64_$AiTq*7?_oZF!^LJ;R&mbB9I+4#(mtg*T)o6o(vgzFyxq!#pW87A|g z%`^xXV;t_VkP%;zuytYJ)gj!QlX~0mmBKiYk)|hO_U$Y~^xe6Ue%rN^(-Cjeb$*vqvuE96@ zqx~&g&Ua#TSzs*Z7FEO!7VPE>#X|h5oj7moh%3~+y~e$sslEW13`q80w8DcLOck=J zxPZzgFAY?wcs@Xi0F}^%k&ykkDMVkQWZX89*!Abs?GJ{66?o+-QZT-cTZThrojV`+ zVyQNTz#|<(7esTU7bjxi6Y^^%lj}v#M-KlTqt}@}|E+A-sh+R&y1@#{%})Nk&RFcw zWDssT{cWCU1{PCDEzLTozT#>?V}@y%T)?_SWon6F!BEk~HhU!~;y=%&`KxM{<+xH$ z_2_!nT;+&j@st}xbn8^qf@+nZPmR?Gll%!%bfZK!(=>>O7hl96o}n)^7_lxc+gdz{ zBesvVrUj7AIOzm2ms%T7@rzRQgcWQBMPXoOmcBkjE!9bOO2(t%;gj~x;fxG!)^J?O za>*#s7qqVhXQC$qGnuj%{AWv23I&S}-noGa_%4l1pO z&L+fDBN*c6nHcmFja`A;_+q{!_PN->Dqlm>mOj~bwA|p7cd?%R(fH2M~qRzkdPV`NnNv92^ZO_{PK39UeXfHpYw}OeH>a=BhK1#rLz7cd(xgGn#e>bH-Y08w)opyvtPtKkl~a ze5j03?IS-p8tuHRX1rsFZ3M_@bz)Z$sBCk%*yUB3uxMf~F 
zH=lOTzcZBD$eMA$_vO1m-RYI55@en(Ygvx;Rop_y6I}~y@)1i7cX-A;s)|2WQ64z- z_JH_@zFfe?zJf#3Nmpd6=Cj#@Rnwi=aw*l!Dm#T2&utWcpCTtE}uS>tP^Gm-*QK@(5 zza9eqg|U~Ix)L;!)0I(yH-J;G&UO25n`WB-wbV7fc96p~K}i+M6t?iEh8UmIrwx-P z#ohNxRyDABdQGopi>iOkT*qt!wr%wm#2h+l&aqjyR7V+#$GuJ*f-`U*V@gdFg`L_3}E68e!Eg!Mweu zlad0&_{Fu(#Njr<3Vf7%_n=y-639?`-9iDT?Rx%Z@NdfS^_v}TLynLjn-G`Dz&FxX z9kb!*pZ&SXdBcHc!9???9!W0Y^Lorf+L@jZe(x#WC9TATcc-}ZsyTgs=CqdT;0X0H z6WHS^h3w#GBL^%*nacd68zz)Acb#m3Ah+;COLHXckezbr5^qv4jJv&x^J1CbXs2xs zPsttW{Sm~rO)IQ|Kx~yxD48;YoA$H5adTGea6xm7zk$)r4P4{h`2<)5rTuS@gxs~VQ(lY=kT@~4d8PQekexp z9>{NyTxOAT0AT<+^Jg84EMe-gTIiw+s7VG+JDRMMsa|gCw)%k5O`6(OP7>gn|8?8` zwme*}U**W7oro4(W+~ehk%i)yE@O2zX5OlHGEnpP;!|;`PA_tWaYWbH{VRY%$pqAU zE8<;<3A%5VhUE6%FzQo|H))RJTS^~q9~AXT1;oB2j1bRV(8c#Q;Asch<`JEKNbX;Y z+RE%VKHcLcwG*}wMVB@Nb~h?3Na&)!v&kY5 zBPwnCE-Fn>zOU><)6InrrTd<4Yki)GlfM6+Mc%TWnAU7KU|3F1#!=mk%!^;dnX+CC zp5$~|MzwoIvC&M*m0~zxj&8H}@Ar|p z4p?Ya9jO9XyE3C}xh4A1_I?pHAl*~Nb;eFO|G=|dRA9XVsBiF>nlNlG?S8nZ)Vuvx zA;p-{tBMii0M{r-S|WYzC81y`8fBAWc1Rq1AdOb#q#o~m6a~hWf`0&s7Ar>0zc?OK zEpfYw5>nxP#h!Xvc)pHlO2L48@Jhdc>RC<<9MRhXY7Uy>YqSeCTU3{bZP%EoaAykC7=1ERk9S_?T7D#W z8b37JvT?3{Jn4uwH`*560TsH59d4{;Ji*LaBE&{Dr>>#-htz1jY4e%GM{eYjVXEQi zMOj%{+)ga*mgt5p?6Ck$YnN(ghBqr|=Gz(?P4x#5BR$`c4q@h5m*bTvz}4~%cL+e* zTv`(Jm7x;7#7glq)%#F&A5G5T<7Kk%H;NwzIfNEw?`C0vchfzNC9jVm73AG;2+j=c z=J^f1s2@k()A$SP^cse9SZPL7yRiOKB}9>@=@eGd*>FC%fh{Q%zq{kkOqeVzDa*5e z#GBHMv)q{rm3j<1E`~(W&NY%*c36Q`(9QXI93~4`4>tm$kUU5!gEo*MxX982T z_f@7u32OZRtTmb%+wZ9?jjbnYdW*KOUbqpw;AFuQlo@L10&&sMSX#tRxY5V#S7zL8 zSBJ5HR7%lAz44cHCSup4#G?Jle}I`ehbwY2IZ zlj_n=PqhAo-0FECn&#IoU}GbFCZV!Gvxz?Zns|$&ZL3HnudMc_691 z`Iw8%fIdSHZb&3c|f@PlyW)i=832C_$FKsP^O2UzEyPf?w zRcx@J6Q34TBCm;>0}}^l;X9Ptwh}^;2_UR22WIF1{`4C!=+iDzuEv`IVKcZt(?Kf4 z5}q%v_)6-_0=&-#)iG?Kjo~L7<6OQhgAB6|=<^{)oqI`&*v9*TloJbHT5ri3ok#kG zurG}Cbkd?Y1>6cUYAs*&F7f7Ivkq>kryD5w&)kQA)_?)q{uG&p#I^DTJ*-QZ89lHn zchFnQ55aiGRT!=>a`I712eL8x)9L570de3r)lX%`|HH|Tx)%1kt!rHdPz zqu(AQmKzGK&n}$&;>-}t&`uSB_Xo~#DO?tY69CpwC!L~~l}Lv%rC4lOCSew^_LJ2- 
z;!yFy&)cJ{q`vEB^#teHaGb5uTcA{zAsRJc_#At`G7h8=vWO-S*&rpr3oCcvUA0wu z^*5rr?JuI5E4UO~{5g7a?*e@@O=N3m&YSW_hP2*ssk|r2NjEbxD_K{84)cFCP82hN z32_E1@(`2v@biqtJ>7!&5Bd2g+I_px)(4Cw@EuN^Qi*!I9PyGa6#`g3EA5eb8l^|( z>Q)H-sYRZLfHkCcPESS+_mcX$nnvL=3+{p8(EQt6Lm{N*oHCXoa@++jXEPkt_@6VieN0UrBplo&U)K;SGq4B?M?0{7p&BEO0T~4-XNI?#(}810gR|jUBaX#qDq~OcWHJ#7J$ic#`y@Tn3ov+oVjTF zj=Q8`s)~(9`qf=Ae-`u_y|55CynB}B@Vg&!ig+t+i<0L5qwX|vslkk?CER8K0YSfd zMA&54inWXYkVoftT+u}%@dTqcf-p+dm2pgB@Yrj%5xQn{#vzmz!7y*iGi!Mo|9&3e zBRe&s4CffRhSyG2zWX(eKcp1GJ+>BozT&LMEN)@sZ`tR61o7_s#pXN~ z>Tkb{T+Y1DkHj+?=jefnLCi$G8;Gp{mTR_j4K_ z=}wXc%r9zE7dcWn$?PDbNHx>Sl0CrCxdB^PlEx{+QDoe~xu?*Yt@w!{7!NiyHnNHu zSN1v(BTolCU4Ub_DF`lw(KkD)M+NU3WDWJzj#Vgk$zx?HI*ryXR?+ycr^~AK!?OqA zs-5vmJq35{h-N;L%8_E-G{4}KE~&BV$FKzu))nAll7R63DO5!13`Yxt>?|>5&h&&O zNz$K)lhP?SgJnd>JUiy=y5tV$v=5y^j%6a5cbD5d#Ys3A+NwbGIA#IDjJ4G8RCWzb zB5;~@fNK}`3EF=j0WM%!Fv*yB?#FS9wqtqr)G4sO_Ad7uFeOau0~U88hS1HYQ_J4| z*R6ZfyF>aqGgXT{D(VcqkgaEgR$umUvDTMS{YUQO%-a)O4v=5^Z+Uj zmvE&0HPRH!Sz`6zH^})XIn2S@;!WSEe?rOYZQTXVeq3hW>S*${t(9*;bd=~21uf6! 
zUnLt~E7z9pq~%&86RCaYrXT-jrqp_SOIn7SKdF3083t@2B+3MW7U&o|yAB!GJA|3Q zr2WrAcTiNco7>uMviiz&!qrlvfRt(^c3bo$KG8~M@yEsn$`=ZbRI&IPP2?M9DY~nQ z8|~wJ%M%E-ycw*KIe(;6Mzno%(t{$0;YTvrD7J5vR3e`!yhH=nL<5rNR0T1Z~ZNU%iL0t z6)=>vcs>~V<~pd8pGbZ_%_u;L_WK41aDrx37Q*-96aM6oCc3zTp*<8&Q0I*j5_<#gR3Gga(n9afo&n?eoy{$1um{L92&Ws_jrrp><1KT%ORV+18mUv^J))4fG8 zK)1QltI3)!243JDqNnJ#ZsPHZ!699QaG5Cx{KB-C96lPo_?8Xjn=4HprZ=Xe9pv_R zxu1XdF*$wzYpj4WeU(fmrCZg*v%%BBF%MgbLW&0LuMtqd<_h-oWu{qjsW0#dsv>!Z zTm5*&NRLGm=36$gxLaz$uQsBEcTQDKYMRc!2`E$86>l zWxJgq(Ojy0DHh5UnL#gXkV=REye*~Cn3F0R=x41}q9CN0&qx%$k3Dieh;QY!@i56l z4b)sw)1L9OIT@$GvsQmZ9?C1L8*Zv)-?(N-<&V?Fb0`)Rd5LZV0(uv#6$#%S3@IlS{<@YcmWwtX>;-MJk25;|+NU;AwvoN~Iy=n`O5BcBy&22y z0o@>!?|>8#BQC?$Wsw4qv|_NwzjLWLS5-gJ7RH0d7-rE<_a@9cn!ks~8%^}Wku+nD z@W$dCIGC$oj-@d+;AYj=e+75hiUHN9K2JB!VR&?48N@)Sk*s;yJ~JSc_+iC@vh7uf8E(Tg2RtH7 zsqWiTN{=}WMp}k0<$N3n_>XLAVwBsDdG5=vG)K5h!3~X|C3W_`y*~0z7(2h&&DPTR zw;T%_%nF7h97WV4W+U`_fjRxb*GnV8uzeLEt*@BeS|!MHD;vhJd4Lt_NfR-rQ!qoz zxNYgiTv{G3n>VHT#ZCsVo`x+jF@cf?*S40C(cE_pd!`zp+)Zf)43jido}u&8!KFwc zE3AWermu{B&s^#%4uGZ4?-uVCfKfh*MP7riIF2Me{iDLo+q;II{59tK?+up7BJT6RCi<9}bVvXMSAiRM(ttuAF#~wI z6OBkzG?r(v&)IzU#p?RVIpu{eKJnxXADGG5Dq#m(WCL=67sLe>#OQU^CciWq+LVC* z;1b9XW|L}o;ho_8jO|&e6)mE^M~3)w1HR4)Rdn!3sYuDI#Vd!Gp$?{v?Me;&D8*gL zTOu0#E5}!w3YozyAOo^J7u_8-!1AjzBn9lgV@Cb{VNgj>(4r}fkkdb>JO&3hXed>U zQ#6`z&NI@VX9zPFb zk5ZCzc)ND|r2i;^E>v>`Z&y5940G6AjQsDob+J7 z5nglkGTF@bf9N}3xFrQoe8~i%@&=ee#L3aet>|3ik?MsW3)%FC+G-!k^_JN5loPWS zLNLM-AQEYn6Pt`3|3cn+@W{$eGSo*#D1|gIT?SAWD=@`0zIS}3bQO6Nt;EITrO+5#tM7{f{8)-CGWcE<$_Y)-`$q3@$R zE{FHs8^GLEGGiBxCzYHw*t+@Y+Y-x=eBmF3D>C<;7#Ige>X|bu30@2eS2#9c3(KVM6pV7BoBOa`xN1iW0okG9nyUJM7b*X7au7_3>1k^g4 z)grzyU1X~HO3+yW`+jQYNC@=gRMhta6&ZexvFcry1qW2#KC#_(0$Uj>srJ1Rm;4jE zdR-_~%71>dq?0I4vtqaH;p#g6s?1*LX`G_?#?IsZnfh77+y&sxrXHMI z7>^iY?rI1ju4}LQ`60WKc@gtm@LMWDZx14)a$S#qq}2RM!W03?yZVT8>N`uE-*K~@ z66r}-c7MgXrAwD)f=lN3;+QL|HjTwPkRPAj1%Bg^wV8rn=3Haj!jGACX4c$XXJp`U z>O|0BM0%XKTe~^m7SdFsGyYxS$n}S7;oC+YZ&i2~0{-N7!v#hblGnd>?BPn?hxK@( 
z#;QO=@tBckKa$;)eYSICj8alVI0X5;`l<`?6GurU#%{m%G~3G4CrHmrU?`p4%dTMt zn|>g-ZESiwBjPVBj^ZCw-Ern|wa|ZxI#?q`%cq)z#;ETKJlxUQsby|Zwe}7qZOf92 zncq#AK|*+sc2q3c8`7~}nEp@;J( zepV+_dWfgKZ z8X-MvI2{%7ShYHr;}N1qS&0>+NE4)cV60-~VSvx6lY@>sT;VhC@;6c|L({IS8SS~d z(?^TQZ>9Plk&dOUc!&~`9KXX@*$-j%cIanT;)yOH>+AiLE~b5#6VV$uR+Qf4y)xa_ zI(g-)HIzMdS;~BGdA|3O@3IwlFb!|wz^D9-QTqMWO53KlM1b%ACO6Uy#of!ztQbb` zH(jW#f3C>W8arHEb(Q1aY!DXC&s_}9Fj4G>2mLPia^1K|!JDKkOV81F?fOqo7(d~r zlD~~fH^=Y~{W;!sND1exeRk8(k*pep`NXX+JLOoCPT#>)U&O#Mzcn?0Yym@BBmET||HQ!^%SaRCL2B); zI68l~z30wVjRCNPMq5|;3HerR z_=H7)lbPm2^Axc|hbK2DKFB2{Ftc^sEQ{mxM|Ksx{#wcp>VXZ7^mDw+{Wr5J55vay zn$2%|GHJK8r?81z5$tIt7IaMGq$b-2qone%n@t5yO(Z^Nlr}$EL#R`GUkr+U;+!D^ zvqMU9nftPKsKoQw+D+Nk&i!zQZ|UFQoyzX&d%rnDsR+hD%!VeXDI0k9)bHR?RBG`= zR^sTT0g{E(ZuG=n66^HAZ~jM!@;*}?zBXv*COgAS6cgRshhd%S9b;18OhHkW`uc9q z>RQ0h$%s(WUP5YHYZ>|;VJF-}QJZy*9Z&ZZCaG$e?59G+8@CPmPB*B{o+@xEM!fj8 zB8Yh>fL;9Er>#r80RMKmP5tovWRK^}+eaORE;+mF27B*YsD#ZkMq3<;c`e?%(-~1F zI=f^2NrZ29MA9uNK~qI1Y)Z|yZQorS9b*ZZ?NXZUlKa6*?M$!V8aLUe-G7(GPIXZ4 zvp?FR9v@p4i>&Hv8J{}YiSax2b}F`*&m4Zf*@m&Qw8X15VXW1eQR&)V`_tG zynOFx(2XCPe^MgkHa)0~e#FB_!%jYToL83Y77+Q_TUPJbZ^P_eOU9aWST2aq4MuFn#5qwsD)^y;JxLQDxw>uWCr4D-b+Y)ns8d zf8V|p0&lBe1Ua2-GxqkmB@EB795?RCNZ*rPa;p<#{lc-{_}trHD8S(kP~NBnp!7XB zqOJy`CdN_?iqW$ks;-2}H=Ay>MAMR zWZoNUkBuse25b1HQ*Jdv4J_-GgLA1G>9C^ws_!Z{t1}OuJntd@?}7iwF4cjJVMG7|3w!Zsw z_*VF%gIxnvhxpjW17`DUvp)hx6I*a}EKxhichdgPaf+RZbS~$@Vt4@=WfOC(V6zTX zu%84mdq-6hy~h{7wkeQ0|Gq8ZxEZY~0|m-yI7wnyjkwiR8Ne23)j^|p`KjaS zxnV1RQDG?Q%U91Q9~IZZ6cmh+VjiYNqzt{ePR;0`Tu;gDSG##X{1@YpcTa`aZ}4b7 zKc6}2-JP#f0w*V>g-tazEoc|d9Y0vug&jsfZ{C1ITIS_bYhYSNpdi=4=hwQi094$2 zUV0&BKn?)NY}ZOwjKR@s(FP-PxLz|=mV$8!kw?t*xeWgaIFlcnT7qky3898l>xIF> zyo5{mb50~-UqvVjH1ThAOfrlVhAMIQoV`a?cggMHo+csl!)?`;AuVyys z`S}tSOp^5*{_MIcm94jhMganD%!}RPhs$P4_f&y=UA6YG+1Lgs)ODqv>qH@UP8lYg zS6eTyl5-BONq9v9HI{9;q+{7nAGp?lCIMAp@^8XzjX>TH&?(`Wr;0 zcc&S$1{lF|QsV?BAbSU2-;xNMD*?d^Vvsb*eyb{1&P4IW_T7XKZoY52sYAm~zstf^kS@9j%Dl+d*jWl7-CBITZ%@3dZSu%3a{E`_lnoh{%1~wT 
zEk%W!zJGy#*{hfPM$OU;u0_db)^ufDfMW!iGmgiP554aQzdo@K&8$}y)sAjLLMn0Q z5!@GeOG7myf2I6)A^tANxp&*?$rtZ)dt&cQ?3L}ymen~tW`eDp(rs6w;27uvECX!9 zg#e;1M0wSkb7@9G?zf8XH{0+G4~*ojTXnVZ;sG0*ZfaujA%H}uMrUhjRjcE~6 zaQom;U}BD5!Pp!utK?a<4eX1GIu`9y8OIL`MmZ;wFWo;e01OlsJ@+j?$VPX#3h5|j1)`mBr*%x6_@V;q$r zrhUw7$y~ymV+vt5xSu3nJbC)=Lywz{xQppsu@^f&^1x?{PX7Z-Up`fq+i3@RcDLj~ zXT-zTq?=zoTvIu8i+#kc{=%f4;H#GVOT6K?*n7OG0X9ugelRKQ8%2B2&^-sAuML{C zM~EPgU@0t>4v*K|bSjNxzOX-zj(+POtNzLnwQ=L8Q@?D@KMATKr@U-<%}otQaY=BC zQ%;y>jn)C~2^;cGfLhK~A>*^pImQ~6*B9D6+Qwu1+#siOmjMRnX`H!<$8O2dacC|r z_j}Z;jQ$ZwXF(ba7iSUXf9 zGCaC+cRI?yww!`GuwhA~jdtJQt2K&t9#EnWI%3iP%QEcqOv3i|g>#F9$5-VzAfVfr zW+evUdl)o=FsNWGB$Hh69!tu*4+|$PtYp&SkySmXewlHZPm0XdiIfnfDysr(IE|Ps z(ak9-kJkPEDwTjlnNn-#H=(`;VoS9jf-gctCrYjEY;(*#hBRus9y-~5s2fMWGham^ zs~HC~{1%frKiu_%f6Wsc^cWYz2^Aj55i&fya ze>s8e_$=dPH1&RX>Do8T=K#h|+zh~@UWu}@Igq8VaC_jIu!%MEI$U)8t)cHt8X)a@3~d>7{l%=jT7RAM30<6a@mjlz_zNwG_p0;>-84<)}x_-#hQ z%x@1KY-kEVbIL_iVWR_GnW?G9IwlZcuq?-8Uri)F(Ld$DErE)E_Y^Y~epfzWoigcp za>YjDJmv2meRSgj7r*?1($_7P%hXbydBX_p&Lvp}XY9%8#F?NbfgR;39zS}3^r|L% zxc>onlo9OT`}E0CB@^*y;28~l$!RAbgHG~;QUwxWhX6@mrm#y5Bc9YRJY_*KLsj;h zgn@VPEc_Gl3?|IEVa5HnL{bhs!?qP2sq5koRlu$N zpT2)ruAZ`qQ3i@h%Xl~1l^#d(^Ep(F6qO{i{I~pZYxpz&FqES=?|K^d{(HBZ1w}rJE>Z+f9lI~IM4mC>-Y3~~=*Vzt@~vNEdkPXME>#C(G5iz8Z?Qq=LhX>&ju0>} zpQw-?1so_C%Quc7Ti}lJQDa8+|>XJ-dFAm;aUS_~)K}=1p!-qP{14 zG=vvj&O}E%YAP@JnXGjt2geWC^1SzwCM@=x{b<0SSXA_d zLHOV&o$ak^2bCM&^*Yob^i70}b|3!=*r5}*U%vFjsQvIEg|a>8-PJp(S|QlYuy|Sm zMU8OJlXQLUup>2w!})_DQ#(fH(vvnH3&XZMcK@}h=$YyWAoA}*U4ADB3#)=UVS~&C zoOE-ENm%j{9sB)3rym3aOxOKGgL*OkZANAs$Sp$&6aujWjw?Ug0fz$3AT)xGFnMdzJ43&RU13BuSrkPj zCOvMr;W?S_7$ki0U6@<=&VT4{Yx`~6@Bxi2G84G3RJEfbIPA&sBBzs!obK;E7Pf67 z|Hm8GOn+5XwT&VsslcCBNDo#<{hY$2kzT>fi+d9||13%+KY-T|KHf8Z`Ak`?I`n6X7G zbI6N!0S?}BGB7?khwTZ_ZemgK+pR+4RPRtt!=}P(rg*(w^7_GUHblcvHp)1Pe}T%e z&M+Au?DPwCdxr-+3gi*mJ$s^KnyMopE#UHMGihA0$DWy&ZCcrt8?>|z?A?2jVYsE& zy$aFv_NR3Z>COR7rkGi6K-qc##rHx?Za^mMrF+ z2i+@=9ON=Jfp@d|#C81IpXK&9#$y9v_0$>^oV)>ls?MteN8cT$^$w&}Bj5po8v5&& 
zx>80EcWOE1-1b(&yx;V6@N|Q4DtNCl{Z$}#i3Mq*?s=z71m}c%o`0ykWc^1%{UT^k zbw~AU$JSKx^G5on2}o$-5Yuuz>%@do`2q=_Q$j-PQI8Mrhp z57uo85^K_iwIpsf*dIh?g!w{IWig{{&1)D35Yyx6Xe`o4GO z_(Un7%6TFQ`;e3_CZWW0Qv)^T9PkhSdAZAMqeL3O>bc}gt3C!E54x|Qs=xQHp_e~bdyEsIj8y$ju6annHRR;lJeUROg3<2z|2~jc zCq(IOd#4VJ69jM(eT;%ZXYWb}9uP+?HzNOud?u*n*vgy{Pmk8SfAI9A`rAgBbBWkj z>HY&FgSp3U2?j3&utmVz{|DwKXVYeKqB6v>+>g2zNXT=s8KIMg3Oi^Z%uGOw<(hK4 zCf9TKUd%U6+MI=Fn86zy#TA2r#j>~yATeL^+w{f%yoN}b7pW>=ES~}KoIh+s4&fkZ zdvZ8KM>m>%*o0q;&R4X$#JS;-j-k35!ZrE%$IyRhuNRh= zYSbDFMf_kOVlG*w8LsQ`3(T`G7^kfctIIC<897q}NdM+_C95N))14KHg~5{>7|GAn z$YzMxdkIl$CqR@OSM+a#{*@ZQxRvgH%(+f-0sVJH2JkpewLSU%ec7ZGJHcG8-$**< z-q?I^c62lYCK(0;Iz2eF)-nMv#of7lKZx&u&vs9Z5)-qdu4-GOF8Fkkbt!O!1eRg zpL+Hs+`FR0Bb?ajEhB=P@PyYhG_*Y|zSapIJ%Q%6OX3L#sTWS5jRlr?*W zWS4z+q)3JA5hF2V9ZPmID#>ojnsw}qZL*B9&+mB$ojRw^^80-KmD0z&@AEwObKlo} zUDr*Jj54p)FatSvnLExhuAG+-P)cgAez)aKiAJ_tfRxU7)#?2=0^d(+pj?Q{JF?vs~S2mOd=L1$n&$d8k3z*8kAgptdFFuORry7v8z zCC3d&aT5Gc^%N$}zzqtSmNP%+Tu#n(o@xhZSNsX<&g@9dbTt)E&*eQi6*arB)ki)L+r}vs_Ll<$OD6uxZ3kr(1KVH>}l1n!lC6DJJrJ9|z%fp)IMQ4<^O# zI;*dwqP9G1B>w%!s}{6KG!W0Zn%}Vbvok289I0iQrrRx#v3S2%Pg8x0%_T>AuRd7! 
zYT->8wlga&sR`-9AM5&|{r3BJ>w-Q+Ry~&!+S15)J_yf#)O2_b7eoH{YldwGy=6g9 zFIQSYN9zb1DNiH$pK~Uc7@XIVcbAMy72mdLP+~@Sb5F3$T*Y#J)}Deqp}7jr8`oZ0 zHuL1H`hi>{z#I`z{{Bv6b6Q?qoN(M+bLq}EjctNS>8|0Aa#Ax0hl3W+?5jOLTby~} zq+&Rsm%L>#EbY5s*fQXA!k|SeMtwM$oUVGrZU~TYTfIXI}_ESK}(7auT;;7 z{`(b+3vo(+qAiQYGH%X`SYq<6pB<(b7_NRLBu;rpUmNf%0q_k^old8uplxXQkYawV z%u!=gC}}U?9}v;RI*tJH`rz5#F}<7alKbi?r>^*X13n|;xFw-%-E#9L9kyk^GnBdl zPL?IWXdoDq>@c(Mdaq;aiz`C(q7L`pzHN%_uOKvlqc{p$?SPt8Mi(tKnedWhU9-nG z(SH)4H;$8_65F^;`S(x8XEE9E#UJgqg|Lh8FmB3wlI79_*Fa`2L1uO>peg+dCzB z*m-p!dX6Ipbj)GXuBD-L+&a;A{hsxJ1Hr~Oci#*o?`;%i#HqD+0i~4 zw@+#72b?(dT#YY9auZzC^8PMUP-m-M1VI)$o+T`klD1*()*l;Y*{9kq2WiALH8gy@ z0?7f577&MSWIlA=j7oUINuiVN5EN(Yx0C`nK=iJmp!^U)KRI5~l0=v_THXyGVc>P_ zvSW6x-9D#I`diGPI}ZpKr{n^u+;n}&Y0ohk6j^a!xf0oD5j0ShG`cmA6BJhf737X= z7v*i6x3o%3COS(E&@wSlGGH(?phRQK&iQv2*umg=?`6Y2TG?e2Ka+AkQPz-N{O$EU z#0M?I=Rb$TzBOzOIP{JaYI*=e@-omT{$9AovKj8x+e63V%Q+#l*!I>#=*`7BUVCtxJ+c0w2N2w_ePtCD%FY9odkRcz)x;c!6W$st z#hgjkUT7Pkytz95zJ)j}%;v!#=Yq6po6@_WsHx_B6iZ^xJpLF?%Sb!$HqB0^qqy_^BdC z>hnz2?Jq`XE+o<=tHdB8a@Io|{3(^@9-uEOXqYX)f&|@7=dRys&z+gL!(1BA2U5@o zMBuk-q3v8L`~Z5X>2LvHbsLnFFcuj$Xup{2+*E7!18o(IvH0gB^;?>6x!XwK%-V`6 z6xD@^oh1MSi2-tEDCTz;UGrzVIZM`_a)?wXTmgX-;&Fa>C8TpB{%jO;?iI&{v99N) z{km%M7DP$UW?q6t@WY5Fr8D+&{{9vpp1Hq-GTLxhAr^~uT%Hl@F18JVBFFV=`37JH z5$ia1YK-A7YTeEV+r3B@911)6RKzyCg0`){x8F{D{pw9-_fcs|VfSCeUF?@1x8KG^ z&-mC&H$_EJg=Wq)zDO2cC?dF|* z?xvD1rT||bZuIw`d@@UG1{k_#BXPBJr-iWswLNd3_U@mF`iFwM-UDv{9=!%fNftoX zp`_ucaB*8azStk%@1Zsk!QSZO_3_;9ksU@PwJ?buYn0izOkTNJthG+!t;wuZ5hH8l z0_!r2DnN*@46F9QfM3 zE#B@qcov)IaFUJ99Ti$5Bqr9~VSV(Fx}%6QIzen&n&1RDvx4RgT3C$VnxAyiEBF+%YFjBj<{RM+yFij4ty>!c!8OJ&*KH1l+WZ)|;kjTxQi}6A-Ij#D$sF z>mH4ah+yO8ZFV|($cQ6N%?4R~{R6v+Et`ogb))};^(2H7dA>vQX%k!V8d+1yl6dwsabqbLWswUCjW2qN;IEI`|4g1RyM;|M z^8`1I8&*T9+^KfaZN5VzY^o8j<&&Y{!VVJ&ta&9g(e|=i)!fx7u4Jewc`_;2 zecz+l-V%G$Ba_}Io@d8Dc`^w=riT-W*(ds@U3khZS`xwmC#@dPc+JNeI_8|@ESjFo zN*{K9d!H6=Qsv)TWKCYy|vsO-V^)bKJE~%G)M#m1bxFx}Q 
zaiK$g;!TjI83fZ!zikiB0q9iusCBV%(Xklg>f3{>;%n&Xl1RysAh;&Nv~7Ax{*7E* zImf!d!Nj!Lm<77bGW^oyhY?9a$zy$O-Qf&=0p9qte8y2Z0s?~h+T50#n>dgHQUeAV z{T0vvi4#_8VG$c%t+?urOK%?nLZ;Kwfe!FCd{K{lzK+i(jpgRJqH9Zydf}pPqq^RkerUqI-9Z;#rkX zVMP0_b|rDzM838dT=Ym6bA8HEotRoALPWGCYjN-C_cg&82MyleBXNt_87W||qqKY%g zH~|7b1xwHG(K~bVk)f#b{WT?ekXn6t_~WT0H-w;LqR&f4wk0(_A`9!2wZO1jz-32t>_Ym$<*1~jgrC)R z+6fc!rqKkc#HhyBY1o zfV5m!U1xDTjufP#BzG=gOPam9h=x#)MY9`)H(O_LTYP}agzv{SM0Mtvqn}@la9%IA zx3}MBtU5?(vrO&YR;#?WDn9|$LBQ|_0ByN~-e_fuH_~GQmEG%eK_0Eo&P1*j)04BV zSeA)*qL$5Sx8`p4RH*+{W?()WUg3EBxc5HF@6z+{T1#psNye5@6zaMxn|P|}7taoN zmby#{KYn^PBIiPk^pqx8u< zDl~XAI$u0&$1$_pO6#B8Jj@tlxz5}Gh<*iA7cydZ{e+Noyb>v3nKjODcjjadxN~2>*^j?2aTAmH65|UWTb0l z=ty+!Z?|8a!YPFuVyW{&rodI(Z(>m^1?!e;N?yzjpsFZX*N9J0sk2gURT($@u1XE& z@W5iN?Qp@0;sb-A<@h3nft-$4IogR9E13b+?t%^gl?u6!iZ75SL07)C32#mmuD%1e;xJXj&{XT6g zmDw0%>h&bc)|+n*Ve^+Vz2+@n1+EK%b}yjk^FZ}(=}~$|UamIe?2|*^Rb%aqb%z1o z6l?KGrLH{|sphpPJEmBP)I1Xch|J?<$gwC6o!lSt54e#jR9pM9^_YBx1tD0jK`HDJar zZHAe3)N6^k^BU!zG*?!BmxY%2{*&a)dvBfZt3658=U`IaB%PqGe$($`uE%00#9_i1 z!w+r)4ns%URDx(50R(JzEBe&JT+*f<;Dd}*Z8b7^Gh@oN1x?!WACZ)jUtKql9EY=R z=Q>emo3M6tEMSy$R|H(y3c2_3XQs#L@1Trw9|&80knQpNl)Q9gJ$Ril&A1UU!Ejfc z`#k7P`#Q4KAf@KQ0dZneHm&bIhY9bu!EJTRM-w60p7t9p6n0W8ThFj|vy*)&*ZPPt zu))qQ=|Gjxo8xJ2d?v2;7bTLF4qC1?+XG%*P8HDyA{Q0d*B|h3C*<1Wx**=>-vu(3 zg)?VqnI#{SQAkIZIZZS^KF*rzGVKbm`eokwJ2_iwww4{aPo3AUr8+Z7%ZY~=oy4j4 z@FkjtPA~-0bUT~eWOQenR!eV4p2p4{TdFTSjH;r3@YA=Ie9D@VLV_lB4F@w?IkGyZ z#W~cbM4F5{03@Xw99A{bo3$io4+mIYy6`*UGg&5*rJhAi*q9E=t==lYzQ$KAv9IR2 z+I9=u#`j5jbppBKotQ}ugeawf_{jrYIU9B&GqpZ!((y#Yz2$15O(vQK@;A#(I>aAbk+rhJ@;ZQ2q}=Bob7Tdej- zZ2>4?C(7eg7IUlN&3PWJt_w8S<@$TCjud%rjdVr7!v2?bC$ayOm%>dFC<8HWY@0P=#E=hBmGqYC5aDd`pHeYSn29e3N+Svs>H#tgPsZ*4Gu+8!~^VRZ;3gYBdHYK=L|z)SscXC zqX~fFKa(J1GG|l;I7Qz7yeK_aE~CWJ?hitwcpwcJfD;z_Pn2ln%HC>ro38&o{2&}N z8Q1F7$I)uKMuQb3YURqlaQe~iy5aIM;RAMIDu1pSw=e%LI`l&VtYI*O0H3}iv|#+O z#m?tYOO|**atTnNPnpuSaadY9nzfx=Ewwkn&YKYFK&NGW!jDfbODnwS;K{mZ(^^1I 
zW!g!^AddV_B1YoK-|5@Q!9Xkbw=*X81C?AAZn)?tPgWtS0O7g*C$O7x%2iW6)(2XP zi5D!THcTz0>{6T)=(zi14Q~TDxTdV$tH3_2V%9r80kwEx4jxF%y!>tPWR$a{N#%Y* zwV6+WRVG13pO5ZTOo(>k>+Qqw;Su(pT#(t=69K_gNArx;ZZ!F}N>@+x+V2S;^W|VN z+`Vq^|HVoqCnA8B=)hq&Eg)8y9$+rfMW9;qt`|xM0LOPsCY$zvEgMdF?nF^9b@UY;V zhkZt``OB9_1Wm}B`dOU_eXhE~jAe1h0SRqvJ`x0Exbt6diS0cW@n9l^B2EzWWM0yj zTxrm*Gwy(No}~Kait97dMmwl{NplVptFD{Sfjfw(rbH3D(gejox$G-rG8f?msp06Le$w!(wj5g1PN{&t)Jot1Kt zd}69`$?aC}l>=DuY0bEoBe^_MF(|y3@qNfkuQ1l18rI^94{bIWFDm_6kj6@#Kcu5vw&z07WN( zrjlm&mo}U20S=ex_lcsFm{Vd_6U5T68bIOW6+lw7f&$_19@nA1D>ao(%xgKprngvb zym|dqdXFrmM{6gw(!av=N1;|a07rn}I3;@`OLH<&+(lo+<|CWX&8C&x3R4tPn)0oG7dI(nbXFMoZ!yF{Jh z`Bhp;C(gAgHHvg%rLTmN9R?M0eJY1X_g#Oq9c7CmK9H$D9t7)&r; zA8WSmGF#_Js(T4K({1X0p#OqeFGJV=zK_mq{>2qXXM!CXQ*%dEQSmXvPqV9jUvxjc z%ZujM{OoB);zD(nMHBG&{PmJ-{m&PHiP}!Q36DphlVoDEbP@HTOcLhfO=7J%7BU%n z70DZGXYDDd&tqar04^RPu9O(c3ZH*f*6K5LD6Vww(F`)`RP4c|@_@^T-Vwob1(1po zhv&0tZ-Pd&7$bHdg*?4)ffwhD1NL<9cVcM)0XHKncR7rycrszgX z!VyeoojsSZ0eE8!BG*hZ>q|LCap1?V%T2&BM=i(f4WK~e)z{ZhIVz+e{&@FsWFonwn@o6mfdwK*u^G!4uMYX*QYM&OA zkEhW)CS@@bk#CJ|9A@Pzz|NRLB~=TDWHf|9RQ3K^x%K`>k}L>nlR{kPud?=Ctd8UcqVJtB@jOBFwsNsu zKOY!P9FaeAQn8i~nd+Mo<3XMrN@pb;;}&mj3m_I%3O^p(=f#ScJwfcA>#BM_g4)YQ zO0iCR)yK9>1C#mXfYk`HaV8tOq;ckG7EkhIx(lA;VGV}D86ACUS?q?8?E|L-@vCQkOD zw3M~Ls)Hd;(!AUmJD_XTWgl;`e`=-oEVt4m4b zck?!2JqqgH6JjvkQ08bTFRY6{1D`uwb>xVnE6B9WjBx{w&Wkra;=_k}D?WX86(KR} zzDV7h0OQpM<9~7+cidLIbK3aRv#{_7(bm>O#KPgXT&+k2mJChA6`1S`%vtXC$ESck zJf~PL^$Lc*)v8G!E5f1kyqKeoLqVf&B8VAD%2C**3TLS0EO(y>{%kS!v%}BcGqA9v z%Ew|9EM+BmIU4ErymH;9eysD?*N#kG1C;!u2)q z$WV=nO$W<%znGWI6~>+|GN(QOU~d?JVVmdU87rcXpJln(5NRYVeH zQ7c-3wrUYj@>#a81Fiv|DuCPzbUpzZi;7;Tg}!6K+gt+<0VywKpzS*WDR=Mj*WGs; z2&ew=!Et2C5;NfW-9@7M3=HoqQ1DNSTbpcRPUG1-8OTSb`;awJh9O)Yc6}1+UW|F# zrph-;YaphtS_+<)7nje89eQ|!{U*T$u*6i=`g~6So_s@_hU7P&==$*LOj%uzW=5-p z?Q`KNffgxskm2d`8CQ<>lHCCu7I=2e;>^}&fz7BQy;ky_N}#l4JmTp4sfx(`Fo-H# z$cwPH{PMF6$3FxGtLkaK(}@H~%VNGbo+D_{B1wP7>ZS8stK3wlH4i9)xzOj@^Sz76 
z=k|Tjx1~@iM15#=+R&h1ujkV%Ur%7Hfbo>DfUIi3D-p6;k~90tfG7Q#`^hxv$QMSm zsi5QnNV*DPqc11TT=ksquOv6lbU4?ifIDqWD(QYo!n1XfW6mOaAs>jPe`lh!7H zrP{l>DANe|Vxs^OT3GqOiWy)h6Cd38BICi8CS2U-}7s6UodxKidM;Ym*trMH{4s#oX8OAmStM>!t&Nm*9 zqY6X_jU#|_?3_@na7rtC4rUO`qc_*P4*{4fK&d$qP~m2diC`B*%28MXvI@j82I}71 zK`;rxww~+ALLp6PKbyahcq=cqeUl*b9iVz7^;h+F+-4figXrL=05XZ=)?N1D^+M7y z#$+;4?|3DY;F(1WRC1f7{5OMuYPW1N=~+r*L)QA77&&AUStFrV@9#ty1r`-bEu5ZA z7ycT+nGx>Ne#Z;OwKBc)M6T#lC(Ob46DM38DSO`l%u>ZN%wosk1S%m()Q7e*qglYF zG=~K6t98F(2H<3${jvU1g>ARXx2rIa$e1HvHPyKf=kD*%H%(vPc>iO7f^wrsB3wTN zx@uo;D7*H1*4KGAHBFDHwD_^yP97DmUTzNDS9z< z+gsur87zqmx}CuSbBp7xludW<{tQaIDByo%bhAm1lT6`JtVrUTVh%Yk+_1)Nf$BVy zA3=vM!>}Zz-KJY%%H=4Bt%5fXnT1_M+;mvOxaqv3^iW2hPn2x6-4EbWHM~~vhm+7j zMiON4Sd{>8OsHOHsi5w2>VF%o*=Kv%9keUBfSl87?14>Ka5!YK)9AMn1S&~DDrVKx zFN*L{iEXIP))zE9wrc_PPa*r9f1C_N`T`1qMxbskxV5TJFJ&3yj616lUyME3F<|=y zoR<=BJRwy-MXmK&^?u8*tHCC{8kZ{NGqi5-EDsR*9sKJ_ z-Yukf9VP?oZAn*3atUtrZQ7S{HKW{%(qN*jd%rly5h_F7rCU5r81<3*EB_|!vGh0sm*i*Gf zxhAtnisHF_41zD(>P##{vzKy;9gEWXbtkh|FI@z0*BsX?C&tep6ZmTF%lOJFIKkPfOeZw%_5)%7C5gs&|l3>&?2VD z?4tr4=(Vl_vyZ36+y;NQM}PY4X!TE_8gg^vKRW{f7(n@{SLRxHDM8v~jf$C-S7Iq9 z9v|OmFqSn=e_tGB1P|59z^GZhTsb%8%*ds>t`^A{&#BP@+TMlt$n4IcqJz+BsM`Cq3A07#L0LG9(~S#F5(zDUFLAUf4Ebl*XIl-<@wR5dvoZT7i{^1TNg{Z4gI!_^>J6m%?~)1gzdWh_8jRWQfaz!F$S{03t3 z)>@ajcZxie`;@tDmID_w>VJF1K;~vV49jodUJ&zH%eLt&&`d)cv5_gLJuWs{R|!a) z?N6S>4P-`v#FWqsE}9!T;~Iq?>XxmWtXM(E&_@YK<8izDkvR(Xb$|rwcy=r22<0-&S18UAyVJV|fy8#{+ z!YntEIN=8}HwG}uNwDMmzWqlJ#Zu4qK~ogITdbpyz4r)^orF{w+hM9+fo1Oqf(1p4-8Q1z4i zcJYQyb~b~Me4s9ZuSkaEsk!dAmG0e#NZQeFt{wU>%cSg84GRxHuA6HZ4%7h+ci>s( zOR0LT)`Pb!J2DXM);+SPy4=+VyepJp$uBSq9Sn*cflP4U_2aE}pM>d8i>)6jS8+>> z>F2O-tnpGGQ(F*Hs(+uu4k$XxUZ_9`XsHf`Tn)Ayxa?^r=GmMhiNN0vbukg6-)xn> zedWNx*iBnbV2V^-24AhqQEKJ$EQuoYzi|ci`LN{=;E7jjgtj2AzgO|4TRmF(-qQvo zd6tj*^WK-?e+W3`zFI|yev_N457u92I3;V9M=xJ#G6 zM*3}ujLSu?pl5?Q+gQa4)qj~dmnd=DHKjJ@6!=J9T&w-C27Sr3thY9gBJn|dDt%)V zP@j!W%6EMIs0sC|m7HEfSL=OfIWG>)#T%nukMVYQLIN%0MrIpv1U|xc9>9lg(S0uD 
z$#Y%VT`7r(WFUhhCoYN`P$lA+W4X06`G69^spMD44A5n7sD-jdgm<|k9O_Y*Z;*V{ zko-PM(%IT7B*G}a;ip+@#GPH2Ia18rHIOBBs=lguNe-Cdoq4sjm~fy?Zb;*mU>}d9 z6Lei*B#4x^2+3ot2V^t5td<%KuxmaQ%0?Ml>Cx>r11bz|EWAt#P0>HLbv33&2btAx zSwkv`nW|6cS};nHUKx966_P*g-fa(q{GMR5$DIQcx)Ey>&&zb&58ZZ2CnHn$DNa;( zvcGPCXr`DGCm5t77TV>jeBqR~+0j>eTXT3gv;k1j&8M0?TK(!2TGM>oGPFzbD_3>k zk^%3pB_`jBCZB~v^+|dgr)7_QrZY2AF|etJ?L!t#!vgZ~=&O4`wJ3j3Gf7E)oDzfC zw+hN_l5uKyunBugfSvC2nNNTK26U?+H?&2746xuIOw@0#n|A}x&`Nn~OJF$(_H_j?ok%OT=Z8d?cURm2xb{XVbSge7 zij8DWOHw-q2M6aG_vYOa7n>T*1iDNc2kQGn*L~gbk5)-`ft+#AZk+~3lO#Y9q+_qS zjOhV(@I%IAYFZlLZ=W3lxl(}_fmyG}D&!K3nH5UTijCHwYh^T1Ts773-i*ZuWJ3ve zSB!FzfinJNf1gE2@sxyYxsRxNpqdG9o6Za=d2Sggfovii*RQ0pam&~XC+s*`7L|@> zk2hHiVh#GSCi7UocNW)guc0M^Gwk?K$`ivH`ugz`B2CLH&BF9A=TX9QLLE{xI+>xc5trW5?;goo?(f= zgHp1vWFmWxYHiAE4tgo-wHt%pcq_rAs>j+KnotK$qsWeuP}&K4)dbfO#2v`1HC~2Z zPfDZjU5FqtfHoEAvp@^3bE8luGC7*Nv1jkW(VqGQRtl0fg=GG=ouL1H2!uQ>A3R08 z&Qd6`!SJ&18}nY#G)=|i&9|b~U!6cO+qU^RUc*mDnS1?|rNBSnb!z^4o@dmKpC=^O zZ&7YG%sS&_yoed^JPsjuK38vIOwq*fi32{P`N zb-b6tzp!l!13uekL%;v%=t;1{z&rrc=(7vqx0Q;2Ea7%fSx#vp%MU*Z@^b%jMZn}PO|AnLZMLD zppEMQZ>sgisGvud_ZdRzl_#JF3O?eAd>vJ_OHpKvv5;>ne4CZlJ=kax6E@Wz&I@!F ziH)&7UP0u$vQns1^&QpX=c0AHcX>Hxp+ir~6HF23&h=31#7(rRE13x!@)=gl@u4&4Fp*CLbD0 z02-FafYW8Q zHwgXa6K$`X%@u8ZCQ@IuLUANCV005T0~#>K#>j_2YonnD7{7R5V9$=j|g_R>m6z*y58^CsB^J%(=Tm2&BbFqd=Y z#~$d{wgJLuCBQ~R5#M~0-*G^m$nprsi(K_x$!)hf+wLQC#dTD1S zmWvA_wm^>Io?8pCAQ{-T%AG!`8gbO3Wo4O9Y{z#CH~nW(I+!;Y~8AKm)?S6v1LygjQ7;#86k$^V4gerAOQ4v$Wc)LKS7c2EyZLh|&|GRCqWU2bUK zV{JB?>e1@;@n_3kpFClkjt-`%7{rDhz;v^`3k}b3=@!AXvO5f|gr_+1`uK>nL<*G( z=@dBn1y<1cjU>H{S6P!`0oOFHkM-#?bF_MxNz|h52>ZM&mSMqi$9a9}BR~#h4lv1k>r*%RFx|dO-te_HbPMD84F0XYUMk8edMrTmNImbGG z-!TnDk;rGDi?wU{DO?4&y*5aHc+8+bu5fC%l?%xEEGN^+X+p2&sR?=@Lu(en;1(#E za*rrn17hR|got=arL&HU$5)K{tPu7@1hAxRD6;MvUBU_#c==^#_mryKh_QM6LQlnf z^6rtZ2m7#gni0ob=h}iKtSugGq5eO;?Y-jYD!YqgIW zUb4p+)8y!}TIx;QX}w?>YEcS$O4rY&gLZVS$-_I$pA9;D{ehq567^YNL_=5Vk|Q+P zB-O_QHH`tEx=raMqATAW^j#wpIsm>9>0K25V?+p*B&52as`W;C@hi0$p^hgpOK?10 
zep_Flx^CVX5pv*+R~c23sL9%+_>;5JE0-CHrdf*^+#Wx`RdU#)j5?f7uN6QSUlgMA zTxPpH$C-GS*fm=U#lpFiR3Bn+t93gMz4+?Exc!CtH*{>Al0wQy+D05l+u6$;SqB!Y zw08~0?dI5IA_Mzt%U4%gD)3{&H7Y1xPX~Xcq&oxC?Df?UX<1t;sdnq=0FzMQ?(*iY*Jg(Re6tH*RbkRCbh%e_`_8W_@2O;&#x-K5h%y{w-#lj zO0Iu!xiR&&6X;?HW-VWMYb$INiQ76i<+tvoF@Yn>T5Zmj zq`vE_;_9ecqs~~4Mkqz|6N|LAT90aAADTq$OCmO&*PG|Bnw&FPRWS)aW$zvBZ*lzE zU|jXP5kjg}b?97Pp*7lF0l6C?u zcc0m#>>NG@lmUK&VjGDJbG&MK4#gV-1WD+Xj8n+`>6PFKRs1;Oryy8w-lVvI%Z+Iu zlL3Ny)>Vdh-YSBR<#HlUIntgOAiPmwuM$wTDUq>`uZr&RTcpmwcbP=S+k0ydtS;!L zu9+Z@YZDh#s3a+WBvTIuN;pB+F#!;qP!clN!RRtax!&`YgmH}tdQ4x zdN8>mDIh2r7?1R8C$~snG#pxGTP(xTaZ+tdG(T5g-|&w=;WLQsdsAHfk zp%2{oLxcCH{On6l(EasfXyBvO`pjT?Q5<94*D&glXbb|bLWPdc#$-67exW?bJB9{A zdu0Z*H+Kwu0S(g;Pzo>u=Kqhx3S7INo5;rwP{88QCUuoQZa<-?XFg1dh1Mq6a8a3F}Lm7?}AVp1uD%tNZGgx01t@!jc*X zFDj6srxVUdyeol^g5Rxv#j0c1LIoObv1Csa0X%xhJ6A~DHL7~LQ%T;cp8Ac-!EMuq zFW8RIN&u1Mwj=Pa;^cetH_ow9OQP8&!$nwl-2l9YI)A3Suy$+v^{1aMnhIEPC+Ize z?0Q3qYY317q#%5?--+Bh)@yo$ZT=lKAiz=o_vHMG#Sgy^;EJa}${G40A9dK2eH`BP zS1IRl4RV+lB2}fDHmy8M{OB33vI!T@*YjE|2*O(1BPy|C^M^c>UFvH9vIid#FSEu< zuZ60FKuMB8-qA0e$p39I`p19;-hCV?UiHN*eHe^UTjSY}mF&H1;)ojMOBij(F8aH> zT1z!O7I&2U@dA)OK?xO;Yq2$^=t5E#c2BLQ&jt?v5RmUP^G!(>4M;HYa#y@7vvLkp z58=@V^^|^GJ-=%W_FppU&-@j`4KnoL@fSkC=G}2ze@O2wm{z1l^xzQaXs*$X1E3Ab zqDr|itSw1#DbzqCLxX4E*DH|4l5ijm8LYCTZTt541mMdBSW$mC z>jRT{hZ_w)C;=kRAG9Jvt@pFmi!@*I!RkY_^%lSw5c<4hBBy5(#GFDWe1~2*HQm>; zH$k521-Ac_%d_xBwC^FvGGSxFTrv7o!HqsqF~8CK6Sx~gAxn~lkDp9U8NN_gRQ%Kz zn5-g+SG{z}8Qg&v+v=}y=pu`w%>%5@QPD1bhZP1INNV%YCnZ<>!b)0SDS75d6Fsgs@V?`;~eCm_b}t1)RCjUJda`WI2;#IJ|Pyo@bqJd%duHa))_Sj1^fhNj%B z4#x*I_G>57ozjVfRAI$Y%%ZSybWtRSIf$Zy<#tAv3wQ)jR?hlun8tOK6~49ocuc+3 zX@Up9-2f{2ik7vT&~8sz_>z|OjR`W&3I1?dGdZjo2jR|4Kun3&c*wQYTURW2$JO;Ho$XCl7QSOt=!kgDY~^j&$$5J5d+ zpF6+it|!A6g!_L&Uca1CQSDzK_^A(gD?n^D8j0Q_+$o6B#S0Hplz~a_m~sFErBPtI zjsZ#QyQGa^s@$trbi$5ubg3sU%o>Sq3(>xWhmsT#NiY8_y8ldu`UV%-{Q6+o#wVkU zp1c%^8AkCLIZPHjR#6gWEDC_BP-fohOzp^WlZ_iB-_|WT>A7BA68{fK+y46e;a6q! 
z{;tXJ=czGoJO)=!H1468Q-tgD_FbQ7(i7-Dsw@`nAc+J+Uu+W3s61yl8>53+RlD}8 zd#VBhc{PrL(KZp8?lUR7CM|z((@!M(o>KZRH%}>zWKz}sxYF@b=(tGC7?${^*sEaE zd6?BB4;~zQ^?*(=%gF+}wwMHVCdAP-YX@U=$FXf=bJ5(u1?sa=%paej;l}2Y_)>+U})ocr(BBg@*){`!uh|g!&d61pYt8)U^WepJbyo z`>hXgep4&{*q^Ky{%#ixfth2#hjl5wwAH8sOC`Z?SUTE!K>F_V{L<9?NP*q(1xWs5 zxlx1wY9J;e-o)IsD;H8g)O#Dr*)e%e1Fd+JZgo<` zqY04Zlc=qeAFu2TOxoo4)056LGAM3J;37Z{OiZY6L?c8NGjG=R6vgj#!oH5jt>T{y zk%UJ(v7pj10M}9AZ!Vc+ezO7c4T|(9gJW>M938~TFcP@BE(O36FTbW!@(PIMv3O7H{~1G8(V)q4%_EG&G0a+I&U$`*OS-{MkqFWATSV3<`1(LQPHd?gONn!MRur zQTykLp=FxcUA(%*4t3okkrU>6qmoICPS|ORS5Gs#13R?yJyZ#EG!oNi9zD9A9&ixtb=IzKpyr{u=lXIh zmP2MQl_+MtLD$T=zr4U@*+!|-qm6L%@_t6hoaO7j8HWN^0T1rP-<)~yQ}l}vZL&JJ z`wt}^AUAy}$yAZe2S0X(mu3(3c1;k}b>`EhdUrl*3F?PWu5WW38xQHduc~q;j`72L zETl(hM6n3Kn!m{oWs=znpxh-4E#bPqnzpfQe&vy*kDG#xtv4_pb^LRJbAD@?fhF z*-sxoK6i-@$W(|vV9cu}H0G7Gd{0O&C1h+8pZjzPrGn>LgFZ`4wgbjX(MrkDrcEmJ zOxKrymz)yflXujXA%im`NN>Gpk6^uuY#b&Q%)6!Caxd9*Uh3HxqyV)t)N+ga2<-HX zUMgxt6a+Ogs=Iviz$$O1QvB!bb^8})_Xls49~FAG5u4Olc3p7`Q^{ept&V|-KIDPK zJR$sRGBJ|mxl^?qkfMN|yNEcw zfr->$5VDdFCa;h<_=&;QSO(brT#6%kY~I9d+ys$*SwSWOngKe~Q?ww=T3zBc1b`vh zT1d_MRjN^2e`Sh83S4x1m{gEEW8oZb6n{72(ztAc^psbv{&CuWqN5@VyaazaWv~uh z)0py!NOh=_+&W~k#SZYMl@3G$u)6EFa28+xjRIhT4uH}_GSvJd=}usT`?dzS_gsFT z-@c=-qT;yuS-QPD1>*A@=iQ4gkI~&&-JMN)>ZLV(jB*cd~yHUMC#y}cKhnNYKx6z)!U9-Q=w@r_};`Ttv`FhV(b1dq&dIe~#|d6v7m+vp9B&z{H0-QGI!fkp9%F ziw^=&F@t=5K}}#y)udM=12;+InZM5{~rgQ+Z3>p zgOTt|`;V)^!+AoiXSAU~PaZ7Z$BmQ@=XJ)N$$x=h3W zzuk7vM9I*1Q-90QKkM~T%ujDVC=vKFYLVDDIT{y9BagHb`hDjq<1p!A>j^CdKm2c3 z{x_FF52EY?twm~y2_9pd*BN1uuJRk)Y>Ao=VNhzsd2@j97meS9+?|L8<`7@{Hr1a0 z?JnSKcqIhJ&1Ceyj0An5;%kYxjyoRCIGDuSgzk(;XHeyWS{vX$&mU##X^N;mJqg+| zcrYq?=zn?*_+O@gfoek-C*H^8y38#8+Gkc<5~8%&VEB@#7}z#3-kLkT4`_;33=Iq4 zVN+8dFNOc{oxH!{6_`*LuG3!_lsM~2O|Wfl1`*$U_U>N6OdZ}`_+x9##7rla&(Ul| z-eX>}z{}`gEeZMnqx(DMON)W6)dJ${2QxTQ3jLUo<1y=nKtF%UaJeLzG&2v; zm!n`+8|FKstOB(%*{t7%#X>X=--DI^F0tUr2c(W$>Ge%p1D4W>>aF8zcWr-_F`E9D z@2)6sPKJ`feR)do+`w88PAMAUrwODz{J^+tj<}js@XSO;40y%Zt8-HQyKE2P0uAH^ 
zA7D`UDKwlUpN6Uht-*xqEMPwfd#i1-(4jVLm2wa=B|zI=bxwRjB3E>i zoC0|#&qsnvDC1Dfrp0hIRavA1M#$IKO9Xce`4q+92+UUd!Pv*(0E)43{4{L+InuZ` zO)+*6mHSrHBIT@AOSG&f8=&O78Sd@p}Cw*NkIaUdB zucBwf3#2ce{zG@6gdZ_a59>xuc$bUpsH*qC7{f!afpcT(m@(zkDpLKTaO%^(+G z(}3}3o9IjPziwC>OsqfmeRF90zu3%!(y1e0>mcddV+r8f#loK{VB7z?zXkAeOQ=mpMaU1#npYdAscEHsVTE(!o>Aa{Rf9h$Z^nR!@h3td@TXF_|AFYzS;qD zzcDtA{jcq9h=i8c?+I!6+K(PW6n-wt^F+TN)PEA2ge4@pi<<<{gl=7(SZh!!cMuhe zcj9rlk?5n*5TXBlS7bDKK+m4MKS7l)q{i>a6!T|jo}yUL|4T1L}N z0oqU`kx1b_>>}B`9#2ddBQ}&}9do?#WKo*%z~usqX(A!R$PD|R;$DX#wEv;2Bnxcd3J(=@9vSBn>p$EW3e#IAsM$rN{Fvb9cVr2G z7LB4DBLxR|Cx2#AwGQjqFgBS38kBB$2nLM)Xs>SF7vrluX%s(L>CBRGTJD z3GO#pPA5jH4{8c0q*C7VdSe_JnZ8BIK=Yb|kwA7}>m!9DgXCUDiA@2o8iAP7z`#XV z=yDCuO4~Jl(z^*yFtu#S?|e-ZpU8a-=Cn8^rY|sqmv%3(Yd*?J*KY1+$7-1sT$hPE zIxLBVhy``!M-jZQ)J3+z^2a2#f7__NWTnl%kq;i-CLw__u_JwLWHK(Lee>SCB(E(~ zHZlE62_GxlpGKsJkG>6Oic*|C zYiw+P6Qka3hLj( zhc0V;_>7R>mEStJv(x~f8IxjRcs$3f8a|qhGWlD$rDBdlW~!@oQak zg6N?r@PMmBqt`Cml^>1)YZ)GG&re9RsJpV~&SEGduZWtRcr{yS%Bi1o@f0Wy1*e0* z(ISl7rq5v1`7YMU?BFiosbh$;XaYH&g)?9dNyR_HIOs_8JIvkzqfXNuV!Dj`S$gxw z)riEbJqcRg6iUQ<qI0#7!N# zy}}AxwHgD9$p;wG8>WpOnfpa+-##Nq6=fdU@*dqZVNCFDYAH$}IBbc<)0s|foj)I% zIrvarq&vUKZ}9)xyV9Vht|uECwvfZlwr;js-!Qw1}()0;QqMt=FPlw z&pq$l@0{<WU9hg%rz2;#Ft$RN5dCrsG2*CK{{WRh z`w&+~pt2}X9JWm7`_7D(^w^s4r{2Kz8eQ5;UIy(gV=`@zEDnQy(XBn%#)brYS0#dP zW*g-Hn%7ypIQ@WmO9m{zYzb`{T|>-H%dq8wR#FcMQOw51#|U=heKLidM0Ejm$l^(B z#i`>ASS_N4d;`*1@mN@Lq!b^mlRl=$eVfRlv>I1{n5Y@MRE>!^J}tZ-T+4{vivN6zO$YuF_a_bvOiBa8W@(; z+Dqr;+YH6ld?>ss;7AGGDg7RLTP!;kH^Lh8JGhj2d;~Kx!Dt9y%zJOC4+E#+e7lF( zI}wY+ejg2&DzoodTjvuBMc%XVN+)C#e5pncc5gG*vZI~lG&ittvX!WJ$ajwQQ!zrK z1TRx4?4VZBkZ8x7N+Yu*_8413VvgM^G01=%_%3V%3F~PPqldJN9^1zg^-Z09gjCjKUy)?Ewjo%CHP5)>jKfSPiA76LY zj;xu=n#V;|H+GB+r@6R=I+0I`38gVYO^m6FYdlRSUxGv0RWW~s~O(Uuw|&98x;WDvY=Inu z3g(%K$@}Jl`Y6brDrNRj5dKY>9`?0Bu9Vj#KJhSRvl-MxNc-OXiML{Iq&UCQdN5b5 zkQGzCR0ei2Wy)jA~qd3X+F+<8)StO(e6A_5`Xp%5*ic5raD3-lP1w^Zu{D4 
zOaNTx`SiZm*AASin+Oj^2^WgkZ(P!9J2p?x=k9DpgTmr#3&9qHU@Hsn%wo@~1(+aF%19e6X+Jj+%dwJq#^5=fa&Htm^er3Q3kCRLW06qhKu zNwWw#SW&=5t5WUEDKuCoDbDQxsuu&bQ>8FE!GKB;Y#Dz^!;As|>8VnJU=t|M^XOe! zauWaylh>$%jW$|tEW{rrzUv>=#lAGiESER;ruX;%+7eVAi3-pDRK<-Z~~a8eRGCh|$JMf5$}?wpyvK8_tiF*55hu z;{{fVlxDBaKN!g&r969eRywC0v*+{S`@Z?5+B&gy9MgzTs=nOi61^|Kp^@Xv+p wNiN<{Wy30v*KGI-be8_v2?CkWr?*YuHebQ1lS#x^vl1Y8a&&cgV)uR0A9j11!T&9Ig8agly&3^Giw7~x)1%9mp{_OL(t8|^FpyS*eFgRckx3 zAHILfz=MY7gy+t`eWNxUzBDx3ACzy%Yx|nc*U=?%&6%VG|1re%%uen!gw@irxajy< zi%tk!zzI}TApf8Exun9)dM7>%n$PI z-!VNn(%KMA9nGoO$gdb%%*GWY&lLnEcg_sra{Tx_b-dK-NFx=vcY{?WkDmU14Je@5BS}zuht5_CHTzJI(1qP7VYM3v&)txFXo^lB*W| z$@&8|uD5S0A*_6T&$yY8^t&yaWkR((u8cU(4u6teP*>PEas1a+#DqF&wK!N~=1mwC zpr@z*qwdL~#|YW)=xmX2)pqLqP=&^6X6EP!qobSp{Di)^ygXDUYtdhZc18aTD{J(VuV1YaznCajub|B`%|cG>&Y?oynZsq| zpL3A9J^a!=yu4yuBE3mxx-8{6PO*T%O>;f%=swFZkK60jt2+;F6^z9F#jT`!)0|qa z?ge>yT?^d>b&dHJy*X8LY|3#i5d|DpHZrTT-roJmtOJ=NV`ab7u7x^hg;P_XH(3BJ zYDd|BNZC3{4T?Go>G-%&Izzi^Pj)0<`g4=+WybTh7@L|F+zNIj4Y3Y$w-}qeXfABK z-2*(wPfg=Q%QVjtI)eON%u~FB8~(XIblqT8Y3YsN$em|dZVo0&Q7|yh^Dl z8QWW(>)+oe+0y$q5%cw{BE>&%Z1kJB{B&CmzfqjtTqabjfy0V|I71y7bnOJqqZbJk ze-U=IkVhjW?!(k610$nozu9XWykP$^{miP5ANeyoO`|M%rX8pKHzy^vWFHF>XO}!- zxgGf7^{;ktOf+{=Pa5;$(kMorg1rWwylNb-?*t!Y9ld*h8-I8(zxq6khBJ_@UDz0=9q|`r|w9!AAtu?;nDN(_C1YB~?e__^C zUq6aUmoW**6}%0dKUOvq>OHSD0=4Z-juEl#Xq!XZzFUMRj`(8pka*JXQUUAxp+Y!l zt9bDmXtDR&(yXXqtJ~z9TdpPo6wcTfnu?HHAgCpip_J z&Q9Cp2p7DGf5^}Qc=x^>XskLe`9;tJmU}_S*@WQ$Pc=$ML7R%LuC>egp%u@eF;WOB z4`C`*%@Q%1=yL zM@GwI^;~54ryi5lqW7mVMiHGN!OkDDhxEy>q#Axw_EJ{?nVAApH-%Z(Bue?HuM(h~ zDV0Q*iR3zewRlOdAE~P9ixT^8*s<$}@9v+9vaofvin34i206*>l^K?a4$fk?b$ew! 
zSZO3pB)2%;!jCdEMxcYIO*o}X2XEDehljWI^^bC9YNW){arg~n?X%62#sC_MEz*Dn z5%ZV3gKKZf_B`5ND{>4WHXAO~-}rT*9FNOqDDIa^N-00V)nW{8OqcB`JLGAEOSgB< z)f)*C|KdcOG*(5NLyVnv2CPEJ=a-uMRdk>l^H_n3!0+l+gok01-_jJoRjw(+ej986 zn-JI4J5eoE#-O{hLFoLRehEzxA%6AUeN^GgIPA{j1@*e8Q}2UF2v|>T!H3I6Ddi1evGJRla|!5Y1S=p0Z$cy_c(#9Z=S~e2S-M|_G}vWHig|TCPE1=qgW08S zdkKf1%!v%hy9A~D3vI9DGu|~O)rY;}^vM3iy6CNOz_pq?%m$6f-yYHr@Ch( z7@h@eM~GDs_0L&-pT=Hp;s1W79bG6OM3R2DedYT0#1|AQJYYTnKlNk0DZ?q3NcaF4 zKRbkMU#jFubsL?`3VEC3c&*i-8lhj_<~Y&Nb0A}1YN%x$yA{r-%jXVe@PJyHfN6XA zi`ZOZVMQP@)neB8t)aCI1DM2}spGm?k#5$6q=T+MJHZ-CmaW%vsV8|4QhG%i*U@7E4RwvqA*$3nUbic@Q8}AyUlIZ^ ztJhjKhh<+CvYAGk4cBpVB%8)05eKIgD(ZHvanVa$zXu{BiOGGzOQLOY#9wt=ff0Iv zJhGjdC=JWWkfy%e*wxEk5O`+~C#&&Lm(5$E#ztMZncTvvQR(ZtP_x{Lcb$F0A`D$UhC?A8|j9UsKtsj7{E=flD^ zyb>J+ErA|<#S6X9H@hE-t>>g&`?A@y?-j>L75DaiD;@54%Mg>}$jk=PmSz-uYJjgg zcjiS6;aSJkPHEc?j-d_8RP$SxtJ|+WCp#9S!d!;;%scCCQ;!7@w_VWEhdAq@^0>(@ zI`-E*DSi-Kuwt_B@)hFKC1;^}E}F`5e=mtg!i>vQmup-BmSN6`8DrQD@D20r6`IS2C;trhJmbYhEn9}5& zZE}o^KhM-;Yc!yZEcYDvQ0YpM@KkwM?J-Z}g8PJ8MS^ae-^Gn7JyS7jFIgV-#Hcv4 z5f>LHN6oZ-Ute8B!(%mkr~ratqk|)-zeU!lI1~DXmp^FnEhq2K<$m9biG!Y9C)V^& z>aFIrHJmk#?ssA)2BF4AYj`_Tj9-;7KR1%4eqGs+x4~Ez&Mq#^8mBc5Y*Fb@k~cf~ zgzDFd*BMV3WMsZw#Cj-5t!C50vAo1I4cQBHjO;l_`-ydnLJR$vHHuG@dgcZx*`MRq zLHJg{TxgBA*`GLQy38U9MhUpj;?Jc&%<^Nd*pm%KHUk|9==QYj5Y)z8b*e>u;87aHubU(S9k}So6 zEx3S0<~G_Jig`q3OGpll&(Ggk@6F_pAY|4M3;eB}0>aJ~SiBFyu=ezu2B%k@H3>BS zhCqQ_*tWhYM7DHWB2+okBuZu?IO6te(MEGtZ;dUBHh8`6+GOnmD@8be|dw^d4Lrcy?;Nj+(nnxQUshr@Im4=&nOa zFIE%^7NL+e?C8;AFo@;4%Pctr)#Zj^AVI}uC$f2X$nYQdwI!wu((1qfgU4u>2NLj^ z?Y07i+iBlFJVm= z`Hb>q`X@1NQD>72-;JkKG1q;x>NVFeI+26raFEouipow)4h@GiTzi^FWq_!TaoG14 zVb-=_bsx)}b)4VXLclTqvq{V9c0@npa&Q9LOD zw=*&r5-b#&F1q6vd~{uIZaDCZ#E2@%&t!Sl)69#L?o8MY*KdkKQ^K+?L z0$0t0J6S6ZBFpEK>ArIkpM0uh=o|fp5x&jnLVf6T!)FF=y-upjT}IdFPCc9GR_3GZ ztC|3SRO=XrX$*nIEILwEiRXrE!04hZ6DNk;Jm*BT&%lSqY1K?j$)s3RyUbDKZDz0W zU6Z)i{AvXn%u~!NUV`BKiK?46gjs1 zU>+2X%-!3eb+bP6KEkKa9&?A|_;Iqn)>48+_x|y$x0!WXIy3)Y_r#UOUl+PYLr# 
z{U8yFg1k+vrI4eZjOT2WG;yA|{Edg>)l715vynfxf)xu(tyiN16cniwMh2Ce^Pit?}&aQ8jYYbm$x|rTAo8YK#%Aq*g z59O2?bN8PjJOP1O)#5%F&5u3nn8*GVj>h-fJW>TAJt2Wy_J=?x#q@hqMOFk6HkY-c zvNaSu`wu zAq@d|`VKYf%GxA1>7|f%TLvPGLT;g|N7*++g9E2%eVx}AXDcKs#lm|yBE;LpM`jQ7 zL~NmF;wCMXbSGJrFq2B&1+5{7SnEY&>M}=4H=?tkEl_Eqiyyfxy0U^85?-o$KgJe( z`K%AV+hP4Cv2%z}{uw1j^|0(N4Y{Co7U_C(H`LV6r2<2CJjfaGbDx{DVrZTx(Kwl*g`+QMGP z_K&VjLd64GBsJF1GYex>IkA?bFUcGD6yh>@XxA@MJNYSOIdZp_22`t}*7+F>-Cps5 zU9liVVT!oQyy5dsWaD@3%NgC!5!0rBVgw)F%r6`|augLkqlFV_xc-G#y5l*24%?N?GMG1+*_8%ilmKIu( zr}M&t)37~lzI6fE(l2hJ=B5%W^#tF)?5%FjxTGlnZ=q7 z*75eM^T^wTPRX2An}{qt0Drjbop>0W9)v#P?pd~c4GSG(j$@QbUbW@Xy-R$%g1kX= z3qkpACOC;r=}mQx6gP{(+EA{FF6VAVq7h>_ikNBS4~lD_#i-~K*)Z6@pr2zm3^%C3 z{;@H-!7jRe{PWh#lbcgw;k`dpppK3%&h0aFa2LI%?jfjKT&lK`rY~vCpx+5yN98ngk~AOVhHY9VzL`O zco|LMZJpr;t{ii+4NDnDe35Uwwsjx3%`Dh#OO0={9a5V~>U`P%mJ`bs+(h3R9Nt>w zYBcw|=IVe-??rRvGC;5i2SI|Qwu3=H}W zDJ=}_WRGy5$qO}T5s_lcna8!8%elq=o$O4!v+10!Rb>?YPd9^BI0|3k3$}fynn_*~ z0me6P-n4dEtgK!8j$H1JG_kyXH;~I>f&@$R(}E$X#krJqfwX=&iFlewl%OV}WzF2KmKKl__5XU^f*6Vi= zlm28kpY?AYJH2%(Y7RryZ|p@o-CrS$1Wc2^qNOaM}Hx2ZlWH?M%dZItU|?WVaj z>R2Ggg5?6vQ~Zl+4E*-=sqFI(UNWzbd1zWeuH<})tTKt_=`3uy?O88ORfflW_@Ig( zFWeqCaocwGaBDaZpAMbFF1)H3V+5D5`y15wG3;1!xG?^e5=V6X!+r3?VcQNmgA8&_ zj%=|ZhtS4KNoOmdXnY(mI(UZ4>#Pj@Boan9Gf0W%K^qj-vWbZ?9I2D=JUGQGn5euy z36*eyna9nIkv_$~4=<^03x4o4j)~61XmZP>Z+#7|gXt8^d{VIMyh9(M=H||pP|LpN z56QSsJ`JcFkfO{7}vBV`e*8;q5D8@>{rC zf@KipLhHrZAG3|~nEz61e>^)ftxxlg(XEW|jH2d7hik@t`5!!vZsI8bhfwCE}~5Xsgpjn$2MO`l6$7zeDX1%#8-*lSEDj~>K z7QlpKp9)l?d6~v~F~aF!*dS@{X;On3TO zp)rqPq~0ivkm~}jA)lNbbH&imhVT4BkQozpkUxGg%edFf9(Fb+n?KlE&4o2QEb|e{ z!*N;91}kPzOnlKS#M3zAp&YzvHz)?bP2of8t4|l=X9|>h355-CZo@EzRFKvl_Y?+-~<5?!kX4AIoMlI$f7m%!V}!9j0p*LslC z(i0oFRoTH+Ko2FEc+y!%cdqwqLSYbw}2gOtkbV@87 zjR{F$nys5%t6zBy>&=&~*WEA0!9;WVq?N-!W|L=CwKRx3aDEv@S;tXuOrVSSGI8fK zRyUD5aAS^B#IUg7C|K+qKlYhP4QgYaJz&-XeL^dYL}rZF6-T(B&$wmSOk`53y>is&X6V?}3diyuwt7h~4nf zK`1R&L+h8QwR05~H}lNOb@oz3N|qW&DNxSm^BmWTE{_S 
z^=r~6CC|jmPMWdlUqzN!WQIodO%-a7HfA?11SFTqPZwM}kpD(v-$VlcC2B_5?6-t- zwZva04T z;9YACWP!Az*r+dBpu6%d3N~rkZ*0c&G#la)U_93Df1#2=mn`8d%QF*#o52!qP{>Yd z+pA61Jx%T4*F-riPV%yEhyC4M_XMPE@h+9h$-@-SE}rCJht;B*2JI1Y8jLw}!{>#_28zm)(C1WteuD{k4?o;w^otcJ-s6FD7peTe$dKa@>EX~PV;MG4aK z84msOO?8@Q&T9Ci6`PX6$F3%*PIg^x>2=$pRIXY@*}IN1_pzNh112PHAR}bc`4?Mjqj#FC$$A&1_u+ zQwCruSNumx9g8m!^nkle2Iv_pj8Z_(3%3HW>s zoP_h%#gJr$sxZbqpvBS;V!A{E*;RBz1{#x2lxN~-PU)q(t- z&O*PiRwt<*vUzb^pH+5ZYoBIxAxiHWz!e)wBtA59NcmWjKv_ic|$utN|K7*Kq)LsS=n z7b$DgrJ+Y`Y6Zs*OGR=73sz;b2{jd@9KPr|g4&Gs@@z)za}XV-)-M9qb~lgDa{Ft0 zS#-wGK_Hz`!R30+FnYUMDhN6~L@v0s{cM)b!~5zL<~XQq!{x$rW^cv@N-jWaEp-+X zO(NRzpxgY^7#d-PbqC+djuj={DO=SuXUH|@)1?}0J%em2nQnB{jHhb_xeKr~aBFZ; zW4x3pXuL&#CtR;L#dO=cDoAe9)>}|&O7Dh74W(r$W$Z%}8vk|&N>DBHo^I6vX|-g3 z0%P(;L!C~Z!g{RjD57$d@;(^7L?BnQ<}{!u5Iz|lOprSNg6xjYtyQ0dt=sUf<=6HT zQN@0;cP2bErR(pGLT9*Ah^u=+28We+dPn-`rU=<~OX`k{US@`d^Y)5t*`nt*<{GhR z!YgR8J|EQpnt}Z)K77e@xS!FSrte(&p-J?8$yDBIpOqh@K}gZHPk9{4bLln<6IS_t zjM(Aeh+Y_6sqChMuO8t}J_|o?8TaHv(itlpxnx3gt*;oOI+Kg+b55J`G^nogk}2)W zc8UHlg(cj@0Yc$v_+qyIv^L9;St)+<{0QI1DnTOuN=4n;YhxcOB1GZ?9@bta(IV0* zYx`LV!>1e>`7RH6NR+>gsXBT~-mwa2N(Vge&Yb;Z{cX5Xilg@Bc~8c#>7g}n{p>7-yl-& zgC1Hh9kI(Vn=1J23_w>{pLo7}gc5lR07R$2;)wKPDt&@|oFj4zE zXOEl=C>!3-D8mmR%GOsGl_sA!3$=ATJIrK=r|xv!R6Sb^%$Tfdyfw(BJ@87kQ1^4kd)0P|;#7(lQW;Zt!ln{C zmDyhi2L#2a=`yu@R^HT#tX1-2IRH$CHkFuBmvu0Q*WaNNU`qYGpD{axe(A zzHP=#O86AU>%uc}sKUX;AXZMB%LLko-fK@{#Ng_qUIuQOR({Spr(e6%cS{QEPn&NV zG_~UVzT{sJ@%D011N2Ek2}?{qqXL!_QmEkCA92h$axi)|(LA`*!$ayd4Q{Ya z6cR`pzSEZ|@_K2pme-|9&s#dZVH-yn;U5!V63FbPWWTJ}NZ%`T)sa@E(i`YCkS~Pl zoC4;i^mtQBRuE05`BK}PR3OmM?j(;dK??7m1zjCkVlmH*3zn{Q9a#!mm!|$Lp?)VS z`@ET5g0usDli#?3?oQR+%WezfC!Z0>6kJj1O! 
zrRP)FGL9~rAQctuM=(zXFPgk^Wig0ukp9SDqu!h^`qscSJqScY%B+5qw)r}lhj*W~ zS1oPKm3Ls~+Bu;EZT#_Mt((Y#@CteBb2b^zkUALK)?`tPmGxEAlmyb#EZWy;B(u|m zQ}EDhYsSI;dHuTu@MM`T;_lm}0p-Q~l zXD=1{ojWJ(tC{S-_2ipxUf-h1pN6~E+YhKWFah-juz-RBu|oVPxYJoZDjWZP{9O9wwl?(IOy5LsAjSW;WiS6Yd^K#r)QVhtT@%bi$DcXM=_ zxA*hv1V6}Mb-cCygnA)GvidSYLl2Siw3$uXOI|j}SGnHOQ)g7Vo|jQosBIpOiB;O3 z-YMq*De&7nDR5U%kqZ!1XL~5=LQf}Iu0XYbVtN8m+fGt{H}0+9#N}qh1lkSzHE*A{ ziCZ5!&JtPndvZjtj(Ip+-saqO!8YTN`yJS4@)@DexqZViNT6-OC!3i|{CZ~B)T~dh z^PM(MbZEGuPiCgA>is#!ev2>$w%0{PZbaWtnJGcKJB2tHFAjI{fYMC_%93kj&NIQJ z#s|=adVVb}t?YF#f1l0Km5Y}uhFQbn4KiInIJ(|P$Y+4Ep3iL!KD1s>F4B>7hb;Ef z5L>e8m%nr?u75S*TNYDrT&9ag7Eeq*vcI0>RqrX4yVncS?yVyee3Sw7R!Jx9bDQ62 z0DfV4lQ@K`@m84OuI}3l`*FedVDsq6w}?o5umen7gAAYH-H^dA`_=ueK1gD}&vOut zUDh0P%pOmXeO`7)cm|s-pHZ*1a-%eyDYFC4x%BySu|aZ>Mj*X{m8im_-&nDy@#K!QCpI}#;EqLnLRwzk3b zyusD3${Fe=Q3~CR-@4(7>x3HJ$rQ%C)=t&D7PL}2huX^ib_kRrRkD{o$@eC``UoUW zmx~;W8$^ge21R81_yy#qf1Ts5%fjYtJ{aMfb&<#x_k9_R?WG_F^BEo!6FAkt1VQ= zsda3c*!t@&paS>o6wJ!zj?FucH9s_ZjEzcqr0Q5hv9+<8`tsbhl;B=!ylw!S%l5nGc;Fhs7N^OHnN=cPTB3U3;GS!INfwUY{&i~ zVd8lDqNsth)@))V&qci-JwI9Iuwa0*nFBpIMIzlxnb7D2jBk1u8ZV1F?T2kqGfwPE3TG zMXh_hXhz!##apBm>joCSOPY29>cIYOFRzP&+SQY)Gz}uuGc^t#jVZ%T=#EUyR?ihH zSm#%tOKN`Oe5}X*(>0$9$=g-LrA^Y|C%_EU2-|2<5zZ%znHG1St&oU8AVS;p%klT^eYayra1B{p1>=H|W2#{BE3U3t6E3%2&dVsH67B^Nk*>Q6pnEdPu3=ZyC+ zx8>v&k(g4YOF|akZlS z;m_Yjm6~?trsat9_AI<3nwgr4;mC|J71+Q`w7mhfgTlsgu(+CGrb7t6P<1kKK5nO!_ET2u5YMa&)Ve;`T}g(jI^R`|b4hl&+~WCPv{v3Tbe zs7tgL{Yjm`7uJ^R*DweIcbe!Ml1IWDf#e{<`vBZeXqYx3ZCQ6iW~He3rD1ALwahXt zyjsmkE<3Uo6jPDl-o4sI$eAMfI|iq7YV{FPBN;Kp-fu(O$^ffH{MHyU&;$qp%gwpS zw0p}9^s|}4b^px*fgae1CVtQ-(Jmt)REc~Bt=;O?EQ$ev2UwmsO|Op|4opgls?O6i zaJgIocnMZz{2X8NKG$4lLpNyU0~PM)zrFon|*<5S@eT8@#u_{Ez*B$Nc`>^ zkpvJH`gN)8dQGw*deZR^*hiX8!bwW$<~_6zSjy)t?STGO;fIFlu_*xS*B#@t^KX_- zy&J7zh|yDvf@u@;0gAIYTBD z=|NCxajIYO&Jf&&?hriR|`!^H^Y7#~D(5MmU!rImx;%z^FCBEQJMRTfcus(UfLYV{4 zU$pP@g_9JNhrVkf9a6LR-HT1V(-2vpm0nzJOkISSfmq6NEzGm>azo2lzBtSdUe?qB 
z;Cea46~xT?q+DFTPxx-XQUl&EU%`u7mKa)NsH>n#T?ABso6{6S5U?3?7+~QY+6zaY z)#U;a1C!OwrrAuW3Q#T%jS!ia-LvJtt@TR4<+l$cW<9(z{$*{PeAW#c!t+Vs)0!&TGGDvpsSctV}7=>a!H_ME&*n1@D?k zk%8>fF`Xw*ScU2yE7Cg3R`OSq{?iec+2m{YGS5J4h_|DjeUSLavuXa{0aP*@-_9q` z9Xr}}^~GP04lIN&KD+4sJk3%ZD01ujDIoa&dhNXm)@q)Jfh(aAgS98D?#+SzH(zSQ zhWHR0ZTs2!?Gia1CMMOuqj!NZ6p`I;@)Q#+C|wXU<(->mdHT;**5gG(pb;j*X|cjX zZYHTtxz&SDr=eoVYiC}XJ!@NU5~(y@?71I6k6fTtKscEm2IKWTIPr}qLAlJwrpuBZ zUXZuJ{wI+>FVbqS6G8S0A7=z}XE{A&4ug%NIZGna_76HAo{OxQSW#NF@|4rf@}SXW zFqXOHm5`PAH8jew`8)e$4C!f`&ePDBQh4D4qQ*@fOyiY8O7Jj^D zq)TSWuX0 z9a>7W=WBXbyjWd>oVWQ&)fq_Ch{cw#M2lyzC`8)xlMaIu)$K;~8nhn&dfeu0tL;We z39`p4>p+w`{3hN!&N1lN9Z4yafSLGz?&7F`uKR1hro`cy#h+__TOnmyHW=P&eqkOwV$X+Ei)CtiTs_!z^(~O$gy0_H zCfG%{9xl?t7cET`pkI3L%W&2m>+TpjhdREvsV&ihs$p2P-C!Gmz3Kk9eGvbqVzFOV zhZt@3@r4abknDFyVvJ1)N^|wD0f!K}h*%KUUWpWk?8~hg_JeCm01O7C+5pk@Ft5oe z-hd$Ia`Hm8Zm1MpHrq_+5PPlnpyENeZW5ya`1gH4?(V}Iz1ff2Ei+S`x=@a*+*7Bb zUtN2cR+Cqa=9W~ho4=%^%ly5)rjHXHK^94K5%)w*$LvXjjJ{XE6lYI};G7VDOD_!4 zh&?DOx3#XwY>sQ2pG`0-(S18{qZSw;@?99To}HGdk|mU9vzR7 zcxZli)I|64wp$5HaBaKQaAI?w%3ay`8UUIGx06>I(mN3Flq5_$th(m zGFkk7a*74ayEH=?q&ftLHe2{tQzJ^D2^pC4@S6y{JHPdseD6Ok2B7sL_k^L!;G=Il z?as#Cc8Pu?a_#I3qafp%!jY`Jnmn5jRn|9#sE>BFP*>#I6JOtUa<7*Um_tOZ*Gz~B z_frgO2DHuLn-3g<0?tVPlT^21uDS=s{Q0~_SNC}LmejLty8W+fsf0QB8l-Xkc`uWS z>ukK@pAPEAyte}R%W>P5)zN!8$swgEbNcYrzoesR{&lB0kAbJGmi|9nvHqL?jT37B&%4$C zTRgEnFF&m~bZ~FuRP3@C^xvmh?|H?0WY(PhfRSL@`x4KRv;VZc-Otm71JqF51DF6; z%K_jdB@N93_Y=UkS9U+syouIi69zo&-bgb}lkMZJeY=Cy#K*u>mwr7|4U80b4sZh3 z6qhy~3h>!`quxu}bo{`P&YkNrPa4D=@O(d~5ClfA{@DFSPV#WyKEPfc-W}0+0izcu z_dd5NXEDUR<%H7*W)WATyZ8#w?t=1EhWjakOgaZgEpI)sR<{h zl9#T}V>3#N>%D}Dk(iMoQ*@;1cwj}2;v%0($EiqlmF@O8xQiwB2X!A!YWktSI3phR z@Zmlfxn@it&e&bq21@CUyhR0Ly3K+4T*iR$Jb8Js&Kiw4eBMnW~ z#T}^i^S}G!GKiIre?H%47L6y->WI z$O{eWrN^lD_DjF>@D*Q6Z63?kB>|QZISpw4V_R;{7fso7O>@}zD{oPq0p4*l2jDL= zJo?d(?`3}zJWS*G2Xd{Z7xdZTk*tr@wSCqruxhZe8#45f>Q{k@LM3T6$f1ba`Q*Ap z=c&6TLMM8LAf{m}L5B{+d7G_GL{EGIfvMoREgZSCJkLsoI?;PhPaR4_<9YO_t~ecN 
z?CL!5(ly#>__iurF)h4=u|Z?he@+3MD{}zQ)2@kMdU=^ONKU&EG<>oF$~LP5%|zOt zIdybeR9b7Ks||E+|G2Ro#0O?m%WN1L`W0Qd@14>#r^xxk#QEwzCnk}4@fMN5MM~cw zvmtjSXaT59s@m2W=i<$38a827O%AJCk4>OM`B$`NHo~TLf(^EU0Hss^sp=12rkc*i ziKRO6wm69x^aDDb%3gaB&_(fN$hd*?{Yf&=c$?d(D?O~v6sP>e&Lvj`+M5&ZD4kSe z%02Ayx%#=O{%MVgP#8O88`5Zrs{xV}n|%21E_5>T?C&V%c?u zH9yMEk*=aEhdL5aJf#tkJc}s!yXsm$VopOIuE;#E<|qx#fa*`{FmMQ98u!!E$Rw{n zlp7M~DyE!rRnSMkXEV^4(w1St3hfLx!$;gxE>&2A@|uFhfTZ=W_?0N9x>E1Np$jGD z=LE#|(Qw}6U}inS$J#0ds7|VwYSigNB-L9pw7#G}>Uw7mvF6Ry^2sMa15ar8(dYAy z8&Av)uV;L&3!N1$v(DrQTK<)Roq$?}L$v0}yJON8E z{)@j;f>6==+tdZXw!Qsj{#YM!++f*MLi$BodSxoB6;9zECz3b>{09y^Gh$cgb+j>P zt+bNiC2Hiyg{GLw!BJ!U{fy!ZK!F{-o}BFt2nzZ~1YlDK&Z*kXJ-)6S43W-d{cxTd@rcF!l@QfBGId1Le;uj?QwEboT+y+TG%O zpYu4Z*)EakQB_(@BWQ+2)i#bT%yreHuUTI2+}-Lap_=d9RjsIWaZ!;8*B~yE$;1y) zOek!rv_t$VdT6~0??|gXLkQYFSvJOZO7rv?E8az)yAun>mt^W$^zrwN#7#0*D~p;o zt*eGcpxhWEAsU+5qkqcWqck(z@myujxyOk`Zr|b(Mh`sfTQVv>09XHbCR(gTyC?E} z8M5t^6=cf65cUU+T<7N<$)fQ)!LtImIhAGK02>2A!ZQaZg3%deNA}%-4 zTCd3vFd5!}XZ{yIT^i?iUXX@HuIArS0hWFTT{xZo<(785eCjVWA?JS{KqU{_FZ_Q( zA?uRB$=B6V*O8=f$4|~7uRN4B)CD7Bi`1aA#fbLQ&t0$@Vq!%7i|xScP1lDS^DLh) zEcc*TCyO2kC8lxWzYZBUVX+Mf$e8oSyKK|234|1zkHpy!S z*y6eye>vX20F9iB=vvLDc~3g(S*;jLx#=x~TTuq0%EGt~s7h2emBqMv%+^RW<;6R~ zwWO8)`;^--C&2`}rudk&!3v~`hs&Xa^DVukl}DIs$4Hs`X&#tz{u>J8lwhH(KYY)6 zH{o=IF3&*jn7v zIPkl5%Fw3(Arr*=3R-AOuWlqjRo6T#Z_~u5kL9PVhA$A^VD{p?P+LSpnj4>5jkJln z=!l!hec*Tul+N%)!s&z^hgJ7WFo8Y%jeafdKM-8nrSZDYI)p(-oN4S76q3qM2tv|Q z)r}6**J)~Ix^W0N8s2~)v-u_9X4YGVqw4r{>jm)Zbzb-*)Xj(1GS}rZ$Ss(OnMCuR zD`tnmBR{~xb;wak)?3X2A`|Wgk5vCqp`pq8S7_M9qO57xbe-ukHx;89H5a#Jq)2)E z2&Fw>7t$ivq7TP3rGyDo0NMTzAFNnF=hbqN37lxq*^vi5JZ!g`(T+5`NOt`B1K2BYQ%)x?i?&9X zZ;yksk7>tEJbncHDus!`&bX-nttq>}jvNr9$l+^~+g}X>5z3!7-Xz z3O6#tS!t`$)lF)6WzgXynx3N`ZErt zd1}Dijp6HB`@b5?+nCoshgS$f5A)_?;7U2FSuuddul(_^Bm+zNKM^sQd|4;X>>O<` z_9wdx7FmXcM@CvQ<{}=j)BCB;7VQQ#T@3%po)VR=j}H9>CK>*lYi^Q(l+r{=4_bfy zonJ$_9|_u7{DXJ@8SkJ3&9}Fx?%zJ``>c|e%g*W-K3lXK=xB2NCldz8?f9fXS=Yx; 
z#g17bB3g-^P}knhG;o1mUKe;b2zm|#W&aGrq|O8q(qzGcY&MpIbFLNVM7>9eYg?n?Z4(vAb7(LDJt>ut0{Lc~t$XR}PNkEe$Exl)FhwWcs- zZ1U6e9Kz$)V&A# zyJ}H$VX-Wn+)XIoOhv16t96$-d=YH!VYJEI+P(LMuS6&4onr}-LtdJ2roUAo_P*P| z#tW-ic_s{QgH8C+3%Zizb(fD=5L<-Tq(%^;@P$UMb=3$qDeso_NLgO;ut(%@AbI2_ zHLuCj*p4NaUBqvGhfnNFiu%vmvg2{3rlJB|Wzb__jPt9x zHzLFmQT+|7MxajfP0@Bmg}Zx>`y9p$lydfeAbe9 zpC5ae4vx>)BM;n;%ElG#-F#&I1oJ#L}5= zWUve1Y2Uo^3Q^_Z6eW59aF|}xCqCUEIcYSuY}`r|Wdto5Rc<&8ULfBVkWh%K8ujyf zhKOK`&Nr%PE9*Dq(=&yAxC6xtvoR+$x9!3wn$%nWqK+Ywz**U=Cw0FzTUc&Js5**$ zdePK~tGqT(e*o?+o2@CzvKryT73uwwWOOZI-Cy3}Hq&+8%wR1rR4S#HLS3LsB)Ql3 zOD#r8i4KVFv-27m;S@=ZB-gR#>&4B6!VMHu``9IRNT*T!8I|sQ{*_8$LjhLA8ZZYS z4xt)&hS7EJh@@)M_N>OK9i>H`xQ8oxhX`kv+Ii;RCx|G|le+&+PPh~+tsrnss!`pf z@TFGdDMQO0^C0)?Uvz`{WAJtmlctvgQgt|hG|**vBl6VF^baolo86H&8l1bRP_|>O zuB(aen773}(3~bhXq<{<_C%tnh}Vius%z<$b&N6zC61rpwbur*Rw_s zPQ3FfLuYDKr0kCuD`g<IhLRMgsr z*qh*&#O#eO-M~AfnS%?1sD*Yv^Jjlc!CgLYs7HS=x}aj%UFCjd?tf3{dmRN;N4whp z;bnKd6i|RI|C6Ni>KbY-Yiqufk5!Xy)9S{*QWg<(chB%1`0^$ZBcGXJb>Xb|RqIfi zPqe?I`k$1AURQO@s{OUB2Zo|=RR2K1TOEYq=GWDk%#&lYpSg8MibRX|sxrRVuWB1a z^U>m0Dr1KOKk0*!@WF~es0bLU7tB@>8aEYk*5qxf%dCgNCE_jNVf#KcAO9_+FlRb* z=1i1N!^Vhv>!moW)pQQ39hgM<2e0Lx;?k}AJfrMGl%+;NU^S$w(I~7m4*+o=i+*3j zz=34n@6XKj2V0=)(NN_KQ;vUZs@>DW7pa^jx6Ash#WwRm{JD{D4?cK#@wZ6i6Ziea zVM*Bd6XwaZF!r|j-d^p0S&u-UQe(n>@JfWL2nuGE2Rq;l$;+#iKtE0c@+ju%zeh#? 
zA(@;vNZrNa4?K-@Ad#_ZuaRr7ignt}XS$2$$aa5xx(S%1KLajvrf+QUq8(<^40drw z>&DQG0MW>w$<_yI<`>6at)oObCu4u5QVo9v*gFq>;{N!G&~o%(3|{$H<~DzC{*o`} zApY_h9f=A%1rPqDu#UvIk{8I<7@@~q)hxeq!@nXp71V|q_PiOaPhq#j`++zXG=E3Z zef7Hg)KHSsq!d^|e#&nz&ntIPS68=-iKb_7U-+*EFKxyfuWZ#E+sSRK%rLuDnK2|w zrWmh#^VD7x+11tR=!;4bEt8X6+_*9kn)iF$Gq`tXxAIn$7kj=1h?RHQ(3NBtB`GT# z@$3McrtH@n_7eK~y2xWz<*fO+$9MD1ay6VSswz>3HGzZ-z%bMah5yCgdqy?6ZDFIj zE$CK2ML{}(fGD9C>Cy!$Hb4kPq<85c1QZmM79vtYRZ)-P;7$<>@{VR~B1NnK=N2gGP!+D?6-N4jh6FeUAQr^^qPz1O;bN>(DU}k`%VM-56 z=qwgUo1K(4RsF7vo{F9NBQ;v`N?NNw?3YodbKSn z#XuIi6PtF?Cyk%;g*nu!EbW;&*^pc$;bLikQBtw(kr z10H`z*@&RCtO~@8@e-7?jfuzh1U1P6d-JiqvA`Fh)&d|89oJ}!`ccc`+mq_TsCk*o z++gsN$O8X&6_1wx_fJVD(W<4+3>o(UDs?cdX@+}E)?Ns#Q$91U9+NMh#ArO>ncA{a zn?+^(aV}AwSW*=5|BqaAUq*$LM}rWb#nRpaQ$V9^U%N>^^$WDU@hanVpJuqJE}h%N zd6+uz8u;Sj_>j6!;?{23$?~RUy2-kJOku9Xn&O3X|M65v*#T#x2Lrj)^4~f6s?xgx z>&~z0F%Z@TyRP$yNN<5h|Gr?TaW`cIjatRZL>gOm>#mk9E)Pw5#WeSBkOL{@eVAa~ zUVG|C1!lSyUuim!k%oI_%)jMKZYt~yeX%_kq6zid2Vka2?~*}7V`EI?3{NF%5?L<% z<%2;ti#5^W#W1M{1S5Mx)NZAfJBEj1BGFk;>WQ_&pM{r{y{{el=}y(t8}U=Ccgrx* z+|ooX=WRb!`Q+MMS;(`aI$CqGHKy}hzFFTvpzF$A_TME$4El$>^U`zlZ&K3~06kN3 znQqIV61DEXa3YieVzozAIJQpoaDXqddR==~dIY#{|02IOz$(Y>{%#eC>OY!MStxPK z^&C?J`u(?#0Ii zcWZmlLwq0?t9B80^^2G9LDA6y#e~l$38K7KA6@tSE4_ZP>V#ym()pBoCaUMhgM?!% z_-qjtKJ_!w|0Y?#nD^Xn8vb`2mjde+4-nMN&g0qsxpqbfZ6zjWu{kG~Phj#e3o1S3 zdpxpJN&d3WFwMx7qR!~h9GV|C{=cXB;$QzQHU6j5JZ78;|2MhY?z9nIi4p(D^nh+80tO|Rh3GP?zPAOv3jA}3qejsJ&BdnQ(Fa~YjcI@H+nQJ$-t!J|i4yzcq1 z)ABIMkQc;CN;YacgoF^q{!ZYV9?%6C}5uK)05?iW~mP zHc(22{#VE8cPdbJ>6zh+{Z9(ylNQau#KE$-{SyQy4jK8d|{jSsO`B8gXy`Cf-zIXJeL zsBN667MrufnfO6iNyZQ1+hdqCxc_M(eUM}a2MN*d1Q9LC4z+%l&rv6d?eQ*0)2koZ zlb)Z_FlDQb%jP3Hqpq&{zDnh($f;||DpzD*iaYHk`Y$C1da$2ov9m>``k~|dp8{OM zuYuxr4*#TLbCOz4w&66YHJi^iZv6h!016iwvnFln5heEy7lbS6#3(iHrh#B=fWLOy z(A2k{)pfr5l#Yg`xFUE&U&MW6s-#SNHB30^pje`VxK69?dcwg2Fu0 z18H48ell!lDp?z+ieFo{r()7v@Q7){-KqT;jXv)ieKT41yuGliJCa3 zZaF4fkfI9*n@hH9moxlo7svBT(pkN@lSZC=r9`&PHi;aX_nUs6v%31jIQ>ENE+CtEa=b#dZA`RD&-!9e=+`;)O38bXvx$a%inU4A 
zFMXAlnse-}>d4tLM4+SsKitPZi4|+Ldzp3b=g;J4!kAut?&TvBOdED?dJ&|qa*+3m zSu+@7Y-5PLqiCuzhti%q&^3QsYzDH2eb^Uv4*OPGETP*$`-eyUCuW|fp}GFKBit?$n`W;<^R!NbGDzYO5@T`Vj(xTO;r}*uMwL#luA?F|tj??~*`585XExPV?_C^13O5em-sCeDXLE zj#YPTj!MzeJ7_b%(RXv882y|wT49GSygA2&N(UN3j3~@G_J$hhx&YLy2?9CB?t1%* zu>m@BAL8{PX5UeH{oslFJtOk*k5Uh-#~sC~j&_IrwJ%W>(UFOKnUwPKSTEuImKI9Pd`jxmz+?d1KBYt}xx6w!E_pXgyK~yFjEna(w zN?@RLZ}!pT%#n3!{sKCC^%R%(i~Trt%cTN_O{_8lfYMREa#u9q|F$@FdoMZ$ufJp zoY%)xA}Mh_)QEG>G0z}u`3o<+xM6zz^z4I}l&=%X&s9^UCtDDW@*4hB%m&f$LjV1* zyB!=)JFNMj#7&F%I6oQ-VzBQQXb!E>b8`6}yW}L;VWrIYV)f-&H9%gU1%jm{LlS*$< zulE!A)pE9h?c)_;vxzzh!CW<{y z95-$Nhz4Y%P4npA{iT<=;45~tPJZ$Ui6tgOY~Du8@$X*ve_`-WRq35f(vuF*9VKeovvmwARpI)O9L=u^x=3>HyJOt zFF|Y78*|t;SRj@ab}mI!8P(WRan^1M`q(Zw30Fg3o0LEN(qBF^IvBbxL7_J@v3lkp z!!xa4a*0qIR)ETg00Zsc@HKtP;p=~X&+MsgVWM;Xwn9a=`w`=Ipf2O;5A!9^_0Xp) znopvtKQ5x`6IX|QtNe$T{7r|oNZ_^5{S8ssvkS0GZ^v6Z7BWyB^W0JG;Zij9qmd-) zpYSK6!=xUL%xYCBqN`X)>OC1x_MKA?o=(+f-)3pIl9euLHi!5;UI-?E?AA6vcKcfJ zH{FD8zz?Kku;HEY&pnskl&#Hw;lNEl6LH6U3$msbIb9es6R8fLKFGzAwcSnsAsbLw zUiv|{J@AOQ5Wm$bfZd6_6KSgSdmd|00JC|$KE)_3)vo%Pp{e$(M{9;Bg=tv-SUIv%5WH(<6DYuo5!G0a{h<(2|5qSVn>gsg0AGjzL znRuDhkZ;`;8n^;*-UIv9QFv<6Xd1Ki#yw^}bG-+)c@;m^Gms&lox$G^sh8i6?10mS z8Ze4;88jE_rcihq;rF>$H}THdegd@0M2^_`k=I6w4G)Ol){G}BD|JY0-OAH}9+It8 zXQ6Uvm*~!bC5A-WG-EHa%Yqd(MLWR3NF#DuO3+PNzoYQh#4fzkYIK?VP5lkbw>l|_2SkN`<^X;MnJ#I2kr_La4nGdw)Td5SVF|@e9{;4Vufj5OwKCi8!+j)F zfed+@UqH3+)lXmWNBY_pEZjLSbBFst?o$WmjpqVn9bNF6Kf)KQw`VSVa@}rHirrhx z;i6x)>)zyXg&outu76YplOn`+x0c**@)o~(khIgy+|ox4cce$o8Ho*(`hE+lm}vm>oXR>J|4LkP(Q1VX zt?qAzZhJlBD@J0i{>D$6>w*v4AM%lX^=wXkU`>%87==H(#dvJqIDh#UYhqPS-1*+O?J8u@bVmeBu=> zMS336KQu)T*1W6iRMKM0*-Z$+uBYdJd%hiJCaUNOBW44UBb9sRlJ~3E@=J97)uQz? 
zf$Xaym92Z@21+1ZVk=wh>?D*=1cb-*eomP0kWVgGR#+ICH$)0nSt4P}FzS&L%{LiH ztBHAYfqTEDj}gOp4lHTnt-tEBXmumR6qfcC8gc)KRII+jBQ&hf<0imm93Q9*I2*<1 zR7|40+6_e;$gY7#+(7v%)KU#4?L`IiZ$+#$5geWZ=kJ-`wU!=nDIWgpKXE@ic$9pT zicN=qOGHC`Bo&y=XMY#>(wDh|F0F!v!szFj{8tI=#9V{>HgVeG_7{utj|tsbU)QSf z;C6>T_j}a5oCYQxTid$3@sORgp(bSxYmc?ZE!DJ-i%KW)4}fI5kckSut;b~EQViB> zz>ziwm$Y(!$!B1vPds_;-zr%Z;5A~sljJ0=Jf`sCOb+7iZ$QiAPYrR%4q*YkVR=5~ z?MuHXp}M?NNnY*t1-h#jb;P>wUiL+$6>7=6>l(M@Ie#aAf0&LW*_kv5tBvqG_r2C8 z4EF#8L+i6qHiR#6{9IL9)!8|6;1JFcz5W}ZZ1Wqh|13QInETM}-oNLd)p9G1tSra2 zCkA!g0#!c1@GlGAw|>D2X1S~O9l2K9RsESVko4eC)9pMq&mg~^c0|&HmlS6KxPPuI z0zQyS-B?SVnZel?5i_zL+mL7yM;VcHVnkHhbh?a4n)ctjYqk$?(AB|RbB-%=8+SxS zHQOH}$ad5u#8FOFN^P)Mfa6&i?v@oi9mF_Jwp(cL=;u$*WTe)(a_?xfTN zFbBmEbbBXii;su$*zVRQWz4573<+AvB6|=sYewNSk*8+$37Q!KU`(Oo6(sy*Y|i>u z@NMvFJ$kD8RZ8y-wL{I2$d7Xwaa5ZNdh<&Pchh}evvH4WKDmKnIh*6uX;Y?X* z&<&F9u_0wYb4x*X>t>xv@ZQV)Q7+#lbrTM?>+Y@J(Jr59&PY*SveE#ZTM$^b6pC{A zq8GIQO2R)P;H@&s+}zwqu2`Mjqm-&CBMYKZdHOqh<-v*)FFlKO^cd8PG7vdlNz2du zPZlV2PRi-$ITJAy*kio%&UZ7M)Ega~&~?x_RKLxeYT8SHV-y<-@ecR##LU_%sH@ai zAQ3g!D%m&oE(Zs;g97ZVE62a&si?k}9vx44bUHoAfBXg8w|P*A zywC{ZNUJX2I`8vmu_7?$1bTPeQ3Rd(;|w6bKkFAZAW23l(07|}lIINB??atu`&1@t zfn2e^a+1=Ly+mRASRWu_#evL62%=&QPT)S2+n=lR<~@zQwWd%MvEyK%J}P5}zZd_Z ztSD>xZF?|nuKu8gYE5||ET1o!^g&Xgoh(gUv;9Vv20Os;fSiI(?MdSqog=tz1i0Ad z-a*9BzVh+l?Tg=w{1GUCbaneS<}rV;+!rZGdajfYXk&U zwt62$H6K`JT8c`br+k!vxN&$p1N=<6Z-!CvrRwo~bMB>w<1YDeBxk8WhF-RSoIP~f z+j>z_B>p&LdyN5?QpS=Cl7f<&?epDOHS-h`0wmxL17SZPPoDm0t^aM&iIGUM0P%i5 zsf9m&_L%WZTuUh5%G0;Qy^C&70}qO3JtuiW*H=koCfBj<^oxuBTe3Q0e)HrAK;mP)nE~**ZK(SiDj-c=QKiwvG7K z6@|Y%$S0VcZecPWRA~N1JEuJ=;HP<(;Wc+tN>kpo3v~$is(lQ`wo>fak7HMLWa|tD z*mH^^g3R~)ac2NafBYjr ztTiFWM53AaZG`tB6Gzj)Yr@N!57~G!njZMmvUcVkY9HDPmvaQYz7qyqHc?PJ(J5s` zt)V#(jgwTkDM^qKQrd6JLs%5ek)PllVySE^H||C%d{7T z2++E2zUMsip*9ZLFB_{@~38qw#=_lGKqFt?KaAI?` z$@@%4_u!aa89-kPhgxeADz7GTh0w7PO^~=exT~ocgydih-^erb@CthYFUFApo6oB} zvYScD3q(-GjubHI=@#tCwOM&9I`KUYJ{_ljf)eObWQzN*kj}*JzFh+awWmf?J 
zR9eUXM3lZDrnyvYg!gB5Y`K{HNJQ8@ymsi=DB|(8W%*)O9)=U>8XQrh48B# z5#nVHhvIY9E zTQlt_?(4BG$0V5Pr(VUBp4s@uTxh~S@De#(IIq{gCV`C~TUsNr3-R;DxYk1O7>w!- zi`wS(R<>8!Kzwz=Z9b|aWtKEoYa|wK6Y&Xxj?ZE>=a&MP6tyuCa@yh zZ(9hSG&ktBn3#@qVWbL!o7997M}-Royczb=IUMT8fB3;2~QKw zBCni_BTa6e-S~z`j_2z^F=h$}pO{`5a-wNt+;}8~cjOP2>TS4Gy*ktm^(wcZfk@>w zD91YkP)vJqztFX{ufa)IWNPhP;(E&K+gCT-M_!&?s0bV+=XQV!&FMvJdm#pftMIE! zDxu@#TI@+`pa=DZ>GEm?rc^Z)=Pyz*dS_A>j|z2Dawf!YhR~tc#W-Qfio3Q=CAc%{73b6o4QOA zF5-z$to`fBfw@Gsg~c+PCy9<1<>74 z+3&(3t|OG8zvnL^+-9qMOQ>!C#_l}jtV6xn$fVsLs*^;c3x+%HgC(;`n<`!acCFN> z?t>tr5Cb3Z#|3)GH)OG~l$`t@blP3M+n_}Je87pvGiXnPl8s-=Zj*9KWP@1?UyI7F z7*gN|O&{PRm(N{;4>9BO)X|^1jBg(NU-b8x-E8-ryn1NNMKjKVy7TpqFR?YCPVW3p zt@AoCdk|5S=c$rj+-GS-2l=Bi@r1*i2X`9zpZJIEEd0RMb|DoIj#b`QMMuL0w65n< zQX6~S)IsO2%2hb}qnd|}Z$A287D_b35avzqA1N`r%H!>-Z0ru9zQVWD6mLtT?SEWP z0Q46Gjkz4^>z&XbR2D_4fbRkTsEUn&YcWR&?o08i9AEZ}|J_Iw95eQE(?o-qzQXpp z@#q7vq5s85m`j17dv>I06GHbE2#yPqX->|mXdOMXd?g2OREc`fW-M3JJfLGvyO_Go z6Z#jEUS4x>*R*`qt0qiVBq*%x2i)QATSql2zwsdyWwk0lBZWLXb@YFsRyR2@D?FCR zG`Hx52wYKTTj$5Vttbi~l84PZo$YeWiQcyA+F`uk@uN%=^q z9`E8O6BrO2d(vTlMI>$nvIpD)E6#0~a!@RW|59kNl{BI9s85lB@gNJ}%wlD^b9kr7 zMMN#B2YPXcf5k z650;{D4u7>tlug~k9>|s8OV9J6SN4pFgX5vSkTy&U$;FEPsJIiY;L;63vKRj?|{UE z_O{=d{#C#bpj1ptPhrdkq;Wwi*+&Jy5q#~fR49BDs&q&Nz{IYscnX|fvm3QkITU@` z|D<$M@-~&V6{La~wf_%#Yn@mb3RWZIqb)!=wc2B7T5mc^+p6nW`h7BKBj;<(3MlhX zM&~}Coai8Z*o$gWJG)a#wh9W*K|LYnKM=4++O*2mSmzw(qqTadJ4OkS#SoH}QfZfr z+W26+_vEj%BF67mY&cM{6z*1h2|c}d*vw(Aff6wW&tiHZhNY^JXTNo?f3S=q2~z2< zMJ8nOei(ppOqfBS?-Nb@fz*GI%b7w|BGrUWzSyaNwn`3i`2a80Q-m5ONhC)syi+3O z7DUJP$30P!X|S0!?>r3xzTaq;ZL;E+3o``Cbq|L&aRtq+L~ncU3?AB@{_u~60!wg_ zq>ZmqXv$aP`7V6p7>!eRd(;Vm`NK~h1OssG?+niOgCq5+>wG&)kBxt(jq|C{a9m%@ zFk)_W?D5e16EV#~Euco47j=xHEpA@>eY9gP1ynUCwEHqS>Smr4IYYGsK~9cvW1taM zW?IIOF3F}}?AKNtd!0~^v#_VWlb1e#eS9Q05hRI!6EB!c(Nh_dHP%8OPaeU;9uS1O zn7r>z`nns~=)2Q2y#2fO}!U@d-Oq6x~i zbz`{|_#CWS-Ck0OzF}-gUHzYk=%g7c!ooOp=`R&hj#U6kYArf}`|=GyqZaoSOVGG@ zc-wSp-zTi+1Jhhm^Ey^`j~QC9MsN~N9vhMT(M9Vc#dkQuPd{Nx=s>TA%9zd{*hJlI 
zACl&(N%FGlTun|>Guor(Q)jxfUVQ~r z7FBoBWX{fd0W`etnodsiQ7M#9lGG7$zi+Kt{6cVKde1p*#Jw=3OZ69u*?=)q(Dqd! zJ?riGt;kb)Ahnob+b0!68*Gu#Or@mk^qga`{rN z(>b-c#`kgp+&*94J%mgK}+d`VxUU@uz}UO$8ld1mB*AM5|(1e#}l zZd@@GS;h}8ysV&Y;dS?CLvQbo+XVM**9*=qiPf85B z%(Jf6<>YFYng}2plfKoemo60!4|m9~6N-$If(7h?s^@;PCHs}tirQJFWWARZ#vkEN zGG=P8FDnJsMSxKaKgN#{x@xv-a7VlQ^Cf$c^vc-fynv)|>5_$e{gW|Br%!V2Hf_6N z{wDfBol{J60SMpl%byn@Kozj>SF5cYgQij^8;*GRnGVms?tANA0FXKQ)mHMaB zy5K>e4zm{LxCT*})PEM~Q_Hd4;pO6J+iN!rzjx>7F1J##jNKP2^gHwx$hh!K9@>ZM~s^lXoV z?vGz66EmE3Pq>OQ{}V<5T3@K08#o`n7gsW~R?Xmbcd4WRC;53$)cL71}Y(Rexn-6#|6!F@`oO65w)}S5_TG zZ_E_>kNDK~9soX{?6w3}$Ge}U=C~mJbs={Q>mw~i?}NhktN6q+`lcOfUy|3PQ}5p_ ze`SMatY5szd=ASSUBcXqhx1YMJ}Nf(xJ+p?Pw8XEE-|F}3+qua&Bt1{Fp{JPx|$rxlOX2^_)bEr#;vLvEd!VaSHK{r%It+ zZ;RmJ!LRm5j7LO^YZlBkuUbAEFyMxc#JtOJINL2yN+}l~xN&#@=Bcq^Ssh#EIQGio z6pjU)O3zATjn)$6)8xv#jI(XvTo88SQruD^w{^VC#-iLk(ke{Y@rcf~j%yDcx83XClY_jW2z6Xt|%TuB9Q`*@n>g5)tWF@^Xg7lK3$B85g0I;2{M!C(5@BR;_G-P}&Lj!`Z-j4Y6 z#^<#j4!2V9scZOP<4n!Z66P|X@mIt7R+SUJApoh`uKzb*zC}&lV^otnqB@arCZk%u zK(4Y&O7b)o3hjdKwWW8P+QA?W{m2PtEjM3s&M=gyCyTg^tDemH^&4W+RHD> zkrgz3*mf@G;tzAswMm*MOu4T?^H3gyQ!uvm%T-wwTGQ?$*Q6%TLo;Cl2lxc@oc@w0 zPXj-(8kC8t3o6|>sEedb6=uL^lJ5EqF{pHtTH|Lj8qZv=qw;@3@g z^PtkpXv3H&o@N1O>wI4{yWrcf7`*qB?>D%0!|!MwcGy<`|$G$ueqQ4mn~AS^Uu} zK)QhmN`h)c$ZsKiFQ63I^||em079P7?KfC~zY4u&c^enqQI=%;Fl87NY6D+NhmPNW z!rK-ReFE$>d|cqLp!5rydntY(Vt#zL9S)6_AaBa)>snrqk zNs)47TmdRv@8MeXrAG>lr^MK?pUY5l%?U}#jSoF1@=!4Lq4wYo28v<_P4s#Tb0h;L zuF97gBhG~FS(V@Dd4U4ibsR#ES&HWrwogp1!bTSj{W#e4%}QbL&FneW5sTLSpVm`UPx zU9W2Lgz$X_Za<|5+RqP>L1?uhre2WLlYQ*M3UjL{lyr;CH*Nr#OgP+#D)kSEUgFsp`C^Y7eoKMn_f4)VrNg(p z(ViQ-2GbhqSmZ{1fL+24!)i$fri+exc~zpQ$RBzFpf3UXyO53Q2zlt)5{2?UNrh{m zw@>@T7=Hv;EWun^!EI}gQ}mbHI4f%6Wv+=XF=BR&KjMzQzQ%Ql^fWoAg16D%w&q~Q z^A<8%kjdD0au{?p2Se7HbCl1#Z{2kb;K)^~i%%u3R=x;GTH&g@NGqcdb-Bet9%-p< z@kwK~P!}9)kYshshuukH3DC^Uh+1rj%J*rf=AfXOYiNA#XJO_Hp)hlIG53me(th|# zXR~7UB1$hY%wY)Sh2sX8LBoEnMMy22G&Tamc>df#zqeXN1>R?^wn844W4=6UYsY22 zwq{62539mptsUU%ZTAZ=4x0N!s 
z`|&lX(}WlYoV)t%AkIbH+}3v+%$?>)&f5hC9iO_71Qdo}*r7v*9A-aICF>ZLyiE!b zbLA>I_Rt*T!FTB6K}I5ULi9vNS{F~wi63=B+T);3XzC(x`HhYt%t`Elm-j@i5d{|A zY3qK*V%BXMwY&Hh`sv!jH$z;@S8Df;rYkr`Sx4-^r=m-zbp<4_^9( zWmT-C9^@2!llR@m-q4c%K1f0DZhM>c$*zDOMNHBFP{iaHyECV|XJ2}Enqg~kThWJs z_mk8OZF*vQ?%!5*sZ5x%UAykZ1t@cLwgb&HCtY{T{Pn zMH01I?!)m<22P4|;jO@i-ZrJ{3W-A)=_F4xD#<8<2|AuU_K;CjG;d;5ia z@&LHXEI)V@CA=d@h2W2(H-P)(_PN{s)uZ;1OkfqCc=uF>>Tr`@PoU{J!C84_Gt=?2 zMdSi%e@88t+dvC-3mx|hvz-J3)eSFmk~F|{^+je65qB+GStj=9T@&E!G`0`??DD3; zgtZKMDQ6utXx9*~`+&K!Hfe!aVmO8yxW-&rtd6Ey3|K>PAuxR$Scnlb{*D~D9xTAq z)?4(3HQI{VK$r>3{rKA~XuI3ECQz9Ww6PbjEPyY>P`SJ2mEk?kdnG4=P-ny(HY3G> zb<6YOt`7`~rxM9NSs7#R4#CMBEs2xqxt&NyJ#dQgq$Fphu)sM+YLSzixy^svH31T~ z@KqZdp;eZ+={uGdOn=^VgDDAerQ#h>Imo7IXIR?EmFLeKrkyw)R+rV> zZLK54bIp|fMzn^9UpyG9N|u`uKjknKi;|)Xu9CDFUM3-t@3&_nqw}@^4J1>PxVHxp zn9fsJRe=1I;KZKrWj1JH^nc{_=0;=VI0eviqH!BBi8xPnfy%w|;Z#J)l<9 zeW9iNChBKHaxO3clK=1k=0S0ztE-M_4pzngmItGp<=l4>T?+`lbj zA3H#6^FU@dT+pHJK`9Lw;G$v0WUJA_B=8u;xD9}Kcdvaz?LPOWtc=4wVF;+}Pm>^2 zTBVjUTh+NupR=ruFgUr+;ZYU3=>b`Ylkt07x~`Gc;9nb{m&gKi@7zH9@-ySzuA?J{ zn8{<0yAK9MWjjszpjw}!K!?eRmZY*-8?N9iu{>M+EUF0tf>19cZtmp z|FT;PL6oPV^}DjAZ*PH6J!*ylqqG}0hI-;IJhb`ZodYlwUN*j(RabR}s# z&*b9FYYJ>(OWFv1SRn(P@%0Lt8eucXTs11c$R#&gGQ51{bvaO31kCzx4bD%5O&zcw zIFOAbWq}e#LJ(mT(#>g$cS7Aw<r+Xth5j(+LVBuV zQ&V{^`uGSbH68v6qp_&6`aDx{4gihU@iPCC?%DB11UCYp{etUtpH?P zBb(@oAb)CDb8^G3hBnBSQ&0`Idt|YF zeO55m;(F3@vO-MlVXpwYLp?7d%~XEFRso$Aw!}42N>fHQVcG1JXdx;ZPKnVscC4B)Pwj{0iuM*)&`Ez-(mzvjyVX;mX6Y zhy-eJR=l7^^KvD@?bF||aN=AW{&p01!AVz_*Wt3H5TA*U0_^k@teF@4ZH%GiEPoyy zsiy3zGYc{GmO&@>3qQQ>>Zpwa1zDq?EefWD1V2Xin6l+GIj^HT7)8(n@qWvoSD1-DIUg0CWqIW zht2B+dplf2=ddti;1y|JW-!kQ=iGBxR}dJbB77YdP)~XVN(^Rb`q;_gGvOd@T??G2 zx6``!t3Yl+iab^cCPk_!Ya*~e?_uS zB>v>c_B-8{>QGEMNCvp;zJj?9ssl+WJRHjlKi*ayz2Oh!OSRcyLl5uc={)e<;>0qc zTY=#B?AuqNXfKgJJVfia+3Q!yNP(N7w@2ZOO69y<;3#g)3ETpV*=~Rv+px#}jAd07 zUKo;NqJUsST_Qe8o}zT6{HG@6V@)8GL%R2da4xd#?b zcJVn}v<(gkYg!I|vYBAZcbGEXX+S=j>T|n32a&-~6lYXh!9Jv#Ro(IXNE?6(-x(sy 
zm(mA%sxp1E@=qv*wUx!pURsT)_n;xE*6WF~vn;v`S27u+PZWzDJiY)!Bm^{Ybn9GWBoT9wHg=FUHccr% zb&I_T4C`PgW`=b{TynOxdV+w)C^s`w3=|1JZqt_poMT({2S1RR^SYl-0VZ=501WLf zUODZv3Sr*k;u~vUIOZ~|DrqE{K~Tr(E#-j9`?yw(;FvqFWAiKpbE(YC=_&(V-#K{5 zG@`oNrMAVW6#d}!!MXnWd$aU!O)|}pc6G-AkO2ea_IltuHd z7LG&H02;oN${!TmYMOLaAL8291NmsqloH%M&_N2>Oww8nch(R}$vn?p$s{{0wL*14gTIXIGi=3lC8kx+5alQ9;DqPv?c5#h33^N9Rmb-EW5V zZFS0-f)W52SM-O;fLryX?^gNu0}AVoIeN~zi+FC>Znhyg>)nV`&18PY`Ny{s*ZW`2 z8~izs@0W`|2#NeZY>pVoZ6CQPa~I^(YFX{VmS_gpp~+!iAK2e0(7zKN#0KMaxp6Z# zv$3rwjG@!hiLq;F$R=m_YgNs+hvQYx?Zo3j$Bx4#Z}C^RSF)73Ta1Ks{J-(;k5jIn z%{f(x+f2J6hw_iwNJN~HYR2HpBa14hofnN9)KVX>_dZ?KZ?rF7s+3U7aLn{kd%jqN zTIk_6H-PK+g;;TQI?b02uc^|Oi&p@91I@4lnGUMB--V3v~ z$EX-gg-Q@~Cly1xq{lXt20A-vZ_kqUT0YITkY;-;t(7CP8CHoMCf$1#fA9H~P$VQh zQ?d5hvC%{>A0~?KNBWm~hR%t$XI7jQ74Cv%RLAEIh}h*l_|s=ksebzySDeCkqrCaK z@Z+$UDwQc86@G)j+Y~uXw?CJorK=^Ktpt1Tg^8tnR@fa7;O1GkjGzPJ@p~feL8I)&- z<~v`{sfVygXil$mH_kZGY0*p9yC*xu?rVUQ;1i8%r>e#}X`Dk!m&LC_7#X!@X3^g5 zMlIS34(TPeeaFJd<63Tp@l@ico4pEzH>SOu8hK}nl)WchZSf!PhnyghFo?3RaX-yr z2K7y9uy0Mh1HSc#l98#_H^f8|4bWbfQHR&%G2N1rM}7IQswxLo@G3su(}#C7L~%FH7lZ$^{jW&f?msUeUIv|qd|Vi*$aES zPK}GUpSo$YbIGj^xQD-gi|E&FrsZx37L%g;Y`EyBH~cZP9BE}IQz{&pm?_3sZaCIR zA>`k*kufFPqiWmEHatjM*=9e^bjrywvw!wU9g7G=BINOK_fUt;>@o3v z!>W`n^6+(r^OWN- zA1+m{y1qX$J@m<0dH7_;I_=B8dS{o=#FYA$s)ji(J=72b8N z`Wm0QV|r5Zm6mEY>%V6(dMWc_)jp24uT5HZU-0A{f34M3Ida2!12D^ zfG;9Ph2X6l9Rsi&^=s9k5)4xtE?!GoQt?4xI&=7t4E#0vd=pGxg6#S1;_^iu1r2!M z8_%hdA8K697<|00>yg_oqHy+w-4{*mQMc6I z!|DCsGV-Exg?GidPnJX}BG~583ic#}@hr@#8BBYsWlR`LSNXaI8j2>|t0WdJrIK}w z*onj9Knu}3)f1GM z-h*-XOBs!(B-Johc#I;S-moVHVrr>r;MIIFD?@@L95?5WJrc+G+zJ6b+aWN!Q2?jN zq@!IdJ6f`F?|bq5C&2Cp&-^u3ANC2q%hfJ*T$PGer#K-l=8dy$*%)c`#M)z8Z_}VS z&}S3U0)C?@Z?3A3C1dViDhR(@e#L;AVLGj;zI%fAY0{E&w9~80B4pROPiI#xjNBal zn6TL6YCLiHv0?av3jM&|Lvb>eQd(Y^dyav9_@m>yE-blTui%5HJ+|l@;+XG3vhl^c zI-kU#rg&o4v!dpdbJSbB&77PmUA4D;aNcm3E-vN@XH&lNRIex_lkm_V#gb^X zKW)D(>;E%DiU0373B>yEwyO@nmj5yRcGs@|TO`PR-|L`@rY9i`mVMm;i@vlN4=Pz( 
zhMwrdY~6we#?ac!fq^fZ*uBZ0*u8DTAi!d`mcHdYen-4)KWFgN_x1k(fPw6U9T;Ez zb7n*KY^=EZRW^2Z+Ef$5sed2I`wMDhoA1@>Ch&rekx0Yd^Fr%0HeX)M)nvE*-Ke7Y>uej;s#oE zE9oQ(8V(w|zE}}5YuemeLVX42<`kSWB*T||O2oHr*Gn?Kdav>n$9EP>=jTF1`pAWP z^HyKv*sVOrT<*@M-t~0u7r+h}#Lz3m<$!xTCTm=N`)buCQjug=c9X}Uj!k}d1J@aaiMrkUzL}O zya`I779SOg47E($CNr2|Ns*RWuvYHCOnI6xGQMe+>o#{aNv*;h0A|o6%@8X1D!wRf(h za%A$lEWdDQ;GWkPo<{6-w!LYZoaR^ZNO;LC+RVH}^M=0elZ5iI=TE%nB?gQ>>fs6s z+k$$e#H~NJJw26Js;_jHn&h>u+gQY-`3l$T&%1A3TUx#8#Cg5K$e|1^D(U3a;O5<} z%#;ceAYgSSH1(j(xLf`gG84M+W;Tm95TnkLwsZyK75=;o=+^7=E?Bh~2L2 zwP|nfGg(vJXv@nMDV|iM=NAy!YPEGM1EnRfv^l4c2g;ps3(L1hwF3J-p0Ku3Nw$Ip z{IzRqZuwRk<$1SqD8MJgslPNao!1`l)^=w%_3?|Jad1~08TmyYsw$szeT^vZ4lqn zVj~)UxA`q8{kh3ZfsOvQbq3PFf_r3HhXV`Z^wjcWX)s}zd9)#kEy*3cdXhR>pWI4A z`qzLpk|mm~ANmZSaqnjWJmpU%d2`4@1lU_L&VaR#V6GP35nM8iS*ij`fWx15I->UG zYpc4^W{KIo6XvCklEo?=|-(k)_ z7e;vKr}q9i{9~~M+IX6X_x{r>O(70~D;5CxRxQH6J7-`v2e>SNsENPf3zxXRJnnUceX(JR zWc^Bc>@VKXy~iR>uO4I4MjgB1Rx>Dk>d3{iuKT-o`)lOU#=d7Wrhj&q=Vs2$3$Iyj zpFH-LDKiCAl-z+zeP0u6=IVRJ11}Ki2=UM6rs!sKh4zaq^}~e=nAC~P%sDADwS`uZ zlH*9nz_m8eZF_#$^iAe?Im((UB9hynzcnYr{_ITYkcNFvMsTma zdTfST92nW#TWqE7Y%HCuuG;Rt*xzHcRN-(G{9`rcT%F9Qsp!SE4p%^olgYh@I|a)z zIzlP6H3PV~fk%yNb&Www6-(tUCgZntBGdqmae>QXH9VlMbnv1x=VVe~oy6->y%6AK zOl1qC4F}X@S~PmYNWeIB`%m?E^;%Xbkf)bSn!?pU0Im<=x)uJLy#VQ-?NVWG` zwR!~XqnW3IIJg}qtQK^OE!wR!&!IB!9IEe0W6r&DQLVQAmUcO-nTCrRw-)3Q)KG{# z5^9`T-aGHf5ugr)De$E>L`%g*)nj9laQR2jHQW7D(fx&}^ulbFVdJx0*B)s{BAp*0 z$&yJB2zqLV=HePcTC$GbU&CHuW6QnZFCJ!F!P7U zEpVAmR#|ljih%L2*{q+_BhImvCQ0BI=)s3RatQBnQN=)w+3GQsD$rX=gV;pLiNtmY zZwqds@AINn9E5MFPY`lqw4lQJ1hVTL?}-FQsAfT6UAwbphel-VBf>>5ot74wwI{;# zTH^65WZE0UzpStONhejx&7Q zZd3*jk3UBp-;}TKe;{54ZoP>3I=KjdliB{nQSFld|0Vw~(PaLrafP+|W|{XP>Ofe` z?p2i;QR0CaFV_IkNc?_U2W5&^epJEeR%S5T*PIVZL^DCXi7xz!yYe4IoVL zXb%UkC+NXV{!!TcrzL{7QsbkM7|Fu@a|n_pn3jpEl#tk%fk-dV++$b=lTILF`waN+>tie z;GwQjEtHMPESe7S7_=7kT!}AojIkYvEuQS7D77n>m=Ti@$*EaIaJ6{F)>=>SzOr8A zjS3Mv*nrGiN(jM>lvNw7%rJ+MN-u9Z~>w^r3ey?o 
ziA*(%lOGAINzZT4Ex#u~i!-{sE2948rO3gJRZPP1=$IP2wU2i1iycQ(FAdxP^abqn z=5OQ^bzz#;)aSP|J!1~LFFe&G^$p%_%@?XsWvKnCo;)C|C2i6sLf1^aF*3+RL^DHJ z&kP2IJ{Z&$5pnf^7D8MN*gg$r>yfE-Kn<4W?}X*ebY*v!HMTH2)L+)=j~DEa(zdhu zKkU7ERFmiSHf-zZ@ojD8P#Nj~v`|H61(~N-MN}qblvyQ!$Pghym_qbe3xa@%5W|!z zQ<;@87>0n#oIsQaVF-x^k_Z7}fRF?-d{1mkmD6Lt-&)`LruF{ia)sQvpMCG)+SlIq zetaPWoN&!39?x(LwMycK6DeNfu(ayiCk0oE ze+<_b;uW((w>1i0|LW;OuPw$@$5AG(q=fCb!%{#lJgGJvYjn~}LzAByYl(*AQgQQI zwZ$#iD-$zDLzW85tO#o6_*&k;q)|JVSPc_SAi{OM*YCOAIMa@~)M;vMo# zur6;d=S+A);PS{V#lJb6u=Gn!C*chf1#!b)y5e=S>qr;zCiDWMe0UiI4;sa2#W*oY z7huV(xJyRFvX>Al`OA@_n$f}=43b+>9d3=~`N+@RFJ{U!On=)b(?2-qpXc_KOm#B` z2GVK3tH{Rm1N%+(lrZdnZUP>z=7HA9t!FI1b*L>i{jv!^)EZ+6L4?Z%pzFgI0_U#P z>}{JUM?CXRI#4x~0FJaAIf#hx7Wkq@M5qL925G8|(4Th!{2*>JZ?ra4bge1U7s)J( z8^=tZMj1np7$#(JcDcExA`V3&`vauFoftA+J~+E4Z&;%$r|2jq^a)>iHZpN$GJd0@TTD0Ay90;oEv?qg@5Tw z!+y}^&>feS5l)Cgt;Y;Ul*%tbOr34@#Y z7My=Il`ES(X#BrMw@wLL@_DA7F7pm|o1qI_f*#6hmHy)C1W}zdk`z%n&E*bF{k+!H zte1TlMZm1zbLA~#8IYp8%@tTe=puMpaLLLjd)-K>0mlqjnEpqn{vYaPI4Nh~kmociHpgZU`$! 
z;Hl>RS#C(kdFS(|NV7^wk`r~iGv*~zoU-b?)#c=&P z9;1r(bvMd{9?)mvo(iTHh(`LR?3t$UB+*nVzHG@1AHJnZV?o!*!#zSyy3noTyK}$l z+VTvD81J?hl;tTOm+pKyJj>MoQ(Yg>D66`;h-&XhiioYU_K!WE+NSE) z6rAf}s|h6$8XyA0k&bCDt+za8LLan>8>}n@Ka)Yma9U()Ww=o;uZ%v*9^cJwv{%< zy848kCuqG)Xb~*I!dIta`-@fjmS1ExBOp+_|w;uzuSbgu2#GL5aB8;CFbt~X?{uMxPnGY zgNAjFxpq)adQ=YCSea-){%c>D zpMQIzGsx(R5zyco^m*cU$e6FZ7yaC`KJTmt;Oqw&^4qHReStN)`@v;>)oHkVNONT61R;eC=#;=8$v^i1jUfVG1dOzBmeC zq{`r&^yn97>q1rKv(rC~@;dH*Bd`U_2)8FSuh;|yc+@wZ{Lxd}+uJp_lv!O;ed_sF zgsoddo40oED6vM3b@>AZEA%?~Rc9y;6{H08I(8xF-n;|} z9-S;>60}=mm-tu4AMFest-teIE(_!*#}5o%!d$2iomlp#0{W zOc5DOJN77X*v@e%+mWE7m3Br%ZJUG8&q~-DYn1@Go*lQ zBrD_-0|POh;H--0+CX@hqC6lMI-NHX)eRywFgvh7ru9%{Ht!YlKvR>sdw^Vj#EZbm z?luI@Rto2Vj0lh#kgpHUQ`*g2+mOH0CQbebRkQ_L7=j7RnOzL}Ex7{{sc-JTV?@j< zx~j9y+Z}(WC?>(nHbDYd-ZGA)oqre6J3Vvb!e9ugv8k`1?_FB>=Qr;L-Ha>NNB?$0 zfK0EeLRu2W1Z=Ff6G5j6H+Z-H`LTD&p4qz-gX{|`Bnu{glipdgioP&7Z(remd(!uE z%K58*ofmy@9Dk4wz{$jPO2Iz}H;}&MOXd1XQ>g#`r@!Y6nF))SV4`W<5-56hkKJF; zPJeZdf9AMXm0p?a<33DE&Ci*4s4cg%VPNI~=}Xlf@7jo2?>;i{aHSlOE_-g=r#(Kp zRR7Ax=W2I@45j(+!ToyeE6#h(W5V{`S!6xmkk$Ov1z`r>=S*C&bmgmo#eI8oH*o7QNDbiK ze(mLu&j6?Rr_#AdRW;zj`?~;CK2Ms5f3wqu@$mPgw({~@IQZ%e(V2NRT25u)dMseLIBoOx0(Jq^zy)pgeiM?xe&;WJ5fl+W-+) zC2La#Fl+AWsd({!p5w0XZOZyC9m}fdlgk>psfvb&xPN$hcJ81eAYG4jBR2u0Y};jk z)L0(%=w5`4aH30b6nWd>C&Y5vJZRi2nB5NB$Z=0?{uU@+) z#YM>}jv(NOZ(i6W%fYV|kkD=zvHnqXBGnh5qU$Q1b8iBVqqS)79Z89VlC!_)S+jtW zcLrD0nBtbH%Dh6q^1iTY-cW*!9LZDJk!fCEg-y(AD7^>~r9XwVP%zQqPyFwq(qaQO zHPWtEJZ@_>q2#6x0hm$g<{N;`bXQBYw<$}hc8sX)Bkxo(0mfaFn5^_+ zDj!ytOb2>;76jMM*ds9ecKB6hWB|VZ105K21_1$VDjq>vkfy}mYr^5Q|H>U-MT zpT*nt_b6ni`*u2#vNZHr(GCvV2WU$~dcp;$rd~daAcqVc9cgaJ33LOMJ&hW4&rK(W zR=nLy9KCl_)dgf&KW)SKTKHXxRB!ef1IVJOUONrn5Xc|{f2pHS^(<6J$`?%MY|=P% zflZLxZe>pwD+L^@MvWA99;ug0-EzJ&l^l)lDr05e~rR!l-(A##cZ> z&zhSA{&5@IZE!}a(J}RX1&ckc17yNzcnP4EZvDCPNEdXCk;nN^{P%;GqIbm^{7&1S zL;vo2f`I$|@rYXbYfr(L_5rXD$f!BPo|P|;q}MlA;zmCUY_xfE%&hIlEYCW4StQoV zFtwjt8%RA}FzOSt_WCR^^DN*Osk)A-x@2jnR%Pc?8^b*9o1i-8f<-)}TV}Z-FsMCr 
zobg)r!(d=!_q&J61p;E&=cXDvg6RQk&1AEB6KrTxwf4XAKMQK)f&xMZrdkJv4|#x$ zI<`T)&XwT~Hh>MKzMb~c~dBRzXG!%t-p%r8;2{kx-CEuZuB`c zZnz+*V*~&FFr01%gMaS*FF@OWgxvQEQSTkep9t=cic4pZ#-=d2QRF$L=%kEFIu}W^ zZ@fHrp}@iq_DiG?6d4v2g>eAl{MAFJ|Cn}CfZD{09b$uLT6gZme<(ojtMuG8wL(_w5tA__V zwxT)h$dT9mvo%rr&D>HQ03Lm#wV|4uj`vx-bDU>*DR6Z5n-R^%Q2`0S|6g=+=(aBa zRhM1^>_33Bz1H9%wJUo9dfvj{Zotp?bDf{H<&hYfy*>9s_DkrRMb| zQVJED5S!8Ff^$@sH1jb4NzHZ4Zv*K3o`f{o_1g>s#mS-DQV@GCVY9W2>h4oKDNSsS zO5x?zg+E;7hVlRJ4g7Bj{7)rd?g_$zB_pxp6ebOpJx%BeT|L$ocBhKdb2NQf`LT~~ z9UCp_g4R!6|<$e~d`Gfx> z2|jOm>Od-9|G()f$x$9zQIOb88CO7N*BG#L^u2E|60?wBJjZdGdS>&}Kla;dHbIS{ zW6gsLUQLdg+AfjfXGg~x!&>lKtJWS1KR0>bujtc$S8bJ&1e>jF;aHYu#QAY?23|m1 zZq2A^>ap);V1vt8o(b4QIb`5Q^o-HKoRG35yY!{WY*^ibo_-ZD3b0IbAF+cwi}`od z+SI*Ne17VG712&_&?piZK(P?fbSA9U629Deh^7CsHc;_hd%|BzHQR;}W1Ho$4Os8uwD8q&tLs}AF{yV-N0 zvtQK@*Y?swgQqcCGB-hx*}39KdW$6Ulh9JnkFDeEwA@1+QQVr=cevT2tglU(MC%O^)+$`+J>pyg9snljzSgrf8zUqEmM(mE;U#?W8P_jIl zJbrj1Dn0%BG$?hZ7{F))>A<;-W(PHXApFkMs#IJ{pM5E#q|+Xk_N*M&y>qq!poGg6 znQ(ZdP}Dn`TS&gQ;eXG64gkreGayZUdJn)uu}-4BJ}{GWjit^@IY4Fr5Weomvpf`% zUCw*vYv~hbYaV0|{>de1e_~U1!b+O*kWEtU&sm1Je|R3%wGCBcuIC(~I^H=5mOK(5 zQyl?yLOAux=K(41?-T(fJ7YfHjT{r2Vg${%hSs)O)U#F}gSSbJsfrqVW{!$i6+iK?-b)M_ zgoPTzZ`T(9>83!M&qhrj(7(R|Q2PVC?(ybc;z&Ib9r<=1Hw`fTju{Sw1O`C4hSXoY zyV=Umo*97#7-d^!JE6MkcuL5|wbn!V-d8mBjlt!q_@lkbeY(_v1NyiF`o_~MmXS;) z-Z8-=TVm0eFSyjyROCvA_EufsVb>aT4p>xmjGm|pmt3{8-a?eY_V1r_>qm+|o%w1c zDi`R;`BNa@C(yPAQ}$-cB`cfqmiRxkf-!tX|9YvmJ~0mWsR8h+J>y=0$oVj|GK{{z zBv&vVRnHNKuRJSRGs&-?vSApj#F|+}vPb$MYfJ5V-PV?|MhV!O6sco@pq-K96OHpg zUqsz0+*53;v>QTxoAb3ek9mrlVxDEFaG$LRv8Qp?KcV5}XX-e+ER8#a3`OJ!`~bT3 zlkxqj_Lwtr$kUHkR%4xjoU}%ViR-VQ5q9Pc*ON~{9A-^bH|t90>co)hvf(z=KvIw( z_nA7H$u^hzNBn0cpBm)kLa6!U2TUegA1sUj$>m!5OKn|wB+g0tvyKYL+2G*irumnI zGk_)nq}ze8)1cAbNpJtMX~ER<4BNbsiP9NWX_6V-Cv{&B+efwZC9D_Ik<=H-(-gJL zE&(5rfMC-zZpAyU!9h`qcBxPdueNO(JhjS znG)4y2Bl)Qvsqryv7_iFVQhVv%)B0r<#rOsC!a&#xyt<0iU^mHQ3S*rBy*4#q5{J}xu_EbE*yicybR{@}YFp+Y|+jzEaOjAxEW+ZUgGTYJn#@7a$ 
z4RlLO9OF_~vEK1{Jw`K6VY;(jQ8npxVpoT`B4^skM)-C%#oW;*kvT0Zd398lx^%Hi zvUZ}&Kkfhxr*ez`w3sxFUTyyd)xtZKnAnITEesj&H9?82A=ERV+%uqvQ-Uc3QAe+w ziWui-;8mfN_1YfZg@_kWkV2XF=Wc<4Ys&bO_dQ{+iX72^lae|Uj?(YDKYhyEFPEJ* z4h?K-?kw1g3Trn_kTE0}7;QfAhm}MA>9KEbq);`*b`m?LS`w`y1sEJdGgMc@a$ls> zNzQe7C};A8#j^1E(_QL|1h^PloOmP>W9{W5CMd83sTW}WFk!qfN2E9n-W$6gJKIU* z@MMbe=RY+Hw-JbdV^T2{c7a$pl&SEnket;^TU5*JfgiZjG%Td6c0|o&@kdlUVrH&$ zhKf~gL6u-86yF)+Oe?t#w;6G9@AOuir1dw~Inmp6L15xScQjp0Apdr*5C8GL_qfI_ z*%Tollnl7Dp4WJKUSMYlwnx+dh4llSknw-Y!WFi zQY3;WDIKeqyQn&&yiCVjZ$I(bNF{60>lFW_{j@2<@%dJ{NLrNVT}E|t=y=TuO-t)Q zb`EBJ`Lp$+OsTIa>R`X~HGk*auLeSAy}F(u)F>W8lKH|@u0ybzSWuDTVK~rlUEd$I z)G_H1p(L=8IhHv1OqRNO7E=rNtX^I$$EijUN>gMMNYKSo(Ocu23T=W0C;>8#SdZoP z9hRno+CeffE%6Cn^3ygr)p_LM{5nOw8n>Dnz|l3S@rT zcRM~0?k0R7nAxg2Y(G&hu2e*JtNlyHVtp-^=K?njVfOLZjSH>qKmk-mblr^FBycpU1fg#CdEZJn3uQqO$!!LTlb;sHy znEGlVRvov*u>-90WpvDXUQ>w>ZL208Rii814I$O49MF}FrHy=Hdurn+f2S}$zyVu) z+b`?TI^-l@Ia1vOgX*ec@}m@L#}nOi*2|~g8I!fOYs48rEv+LkGo!7VoU#E4ey@Oi z5I3%ltfVn|qWIP0iej}vRm?$b+Iblvk3u8XW>JwO%(KTLxz6zXZh!I45p<~xOv z@IhVqgyZCd?`YQ)RsDPfmN_*aPt4A70e>Y>)u-DjALsxt8KZQX8xlu~W&JN}(|CtZ z;WBlXOq1381W%$o9!fZgz=EtRzf>Rns3bb)1h=BG==wryxHe6t@D_ho##Xorq3Z@> z;y#{cy7#(xs3tLdunqPnsnG(p-~o0=TNvV@_wkSa;Ny1YcnUlIfwl&O9%#VU9m0*e zY~pA>1d#`=GJP`m>Y28}CtJ~XmZ0o({?azJq_v&0UMT3g{$ zK}yFJ3tI8+&*+Ml|C0KDpT}N>kCr;O2i_Mb0qZk^D{q8t-$0YUa~vOr#Ah)}-6@kr zDg0;YHG0;fb6wV2xt56-vz>M}AAjzHmi!}K#sw+-H|S^<$G<3aB<`%^ z$3^{fsqeX@rEAhqWkH;X)&@|-hxMUNDEDB)cD$4MH=G^=S;Dw5>hfl0=|k|T-5hgG z&?{pj>~fJs)OiG-`@6p{PxDX{zwY*bOZ->#6g~Ju?Ss}q;I1lbxyaX-qE(CUf0R8r$1G*H3Kv~uJA%QU z7m{YL%i0#zIfAZyTm(=ja`uM4=PC_(jPVV04lNPg5w#KNnI)8jqVJJkcZ6ax>tO~jr&q{r!r+_=F^U+P6 zyWwN9tG9}p*6UO&YLDg2aAw(qJA+k+3hqgKz5EzbepJs~wtIZ6#Mb{702t`*_d-JP zG8P_%odbVnE5B7fKQzLs0B8HMpQD^FHl6D0ub+-wIugmp-!zF4g6!cCBgg!^g(D#$ zFwBv=jY)K5O6H_T?mZwP3Pks+bw6rP5C83a2J>X}?qI!szhW*u7nzDrFMYa{+dJ^7 zfjqL~GS=)`&d2#O_d%q|8E_oZWg#iX>$Fkx%di8`qpeTkQ?~i1@X2a{H<2+K`7t`% z|D%TcfeSxtQ-Rk8Y7~B$HjH6V0tFheR3puq@9-%_d*3+VPsOgfs_+4@Q|YCzJCTLuE{bBZbzvv 
zU829l5HZlQsHk7G4@oj*Y}zh*)QA1>ag`kuD3+^P0oKRXiQ*AQj*MOXcpU7uNHC;T zspvL86&5F}?y&SP++zI8NA2~O)?__fVgIwhn*ukPcm8-w{@;8fe+70A_0XDDyn2i* zKK4`Hvy%Uyr9X7p`8)=i&O4tpk4^D!Kdtjq#z%#XA z2lnvMGT;$u*MWyXWY}6vuOGym{J5cFN~t4GR)U!U(746hBV8)iu7JI8V!IkInUcT! zxWSNz*>gM99YOHsbRC>K2s?eGtCy{3hlfk9H2y38&J`+?r6?vwqvO7`Z&;0 zAIk&oA+~RYr4b;pQsn z^x6&_%#*zaW?-wKqBJss960Lj=MM+gsei$}(mr@HL69B+LG*?dM^C*u#ESBtV}bQL zUbGms=yiyR)=H8!!#bqc%C1AERVcQr&?C~Jv&TE&&9wO8*YM>+s%g7qk7W?cGT1m? zXuK?Biz%dDD!;cb)4G9W$*~5|11c##6eX$U?b|nwhgG=Lg_U|wb~^b3A`o(Za3w_{l#?C57KjR;Vuf zx37(GxOxfWKEq2ZuTLe!nSsp~8vR#nLLI%qzTD))3-o4qw}<^AbgdcYwRTyfv|Y&T z;3c3D6{H+Tu&z^ub>Ai6p#xid;DA^HCvYxzbV+c7W=y-qzS$_`+8Trhf5Gx+W4~XZ z@9~A#_en(xUBX^z7pE`U!H~L{vbL=LWV2b*J9&*zP_~ z34-Im>V)xhWr;39V%)x(Fg?Jnfs0!yh`S{?SOuw3DKijI76YC&6j_1`cRZmIoeu?zOecBm?4jo_MEC#Dtg}}Sm z6MHzoeTW+mZGaZCKO2LjmCO$#LPRRQhF2V{D)oQmdsFyz1h?(yw|^AoT&0)wkwMjpD<$Z~ZuGk=kXbh4?}jQ$@!P-?4tND~(jO^W2_~#~R?r=BZ$- zJ)Hh+1MFvs%j1{63sd_W=K)l>Pww$e@{g34x&Y#i^3#4OHN(kPRD$za-TVV9A@Agvf?1a&~xI&c3IS8YnqZG za8S=a>aRPaTtK=EOsFh;IZL`dfHOF%=dRufP{ALza(x(NapAUU_^pWPzy#gi$^E?q zY007c6>il^k)3bOo^K%SJ%DHrr_R3(B6-uG#`h#xPw!>lW=ocCn<_@9R$P42JoV<`vS_I^{NBnayZhE$jd;VwU$}==`Z4ol^ zbYP`{W?FQhh&g-SZ1RzgzRT{`Tn9S6#fAa2mf&g*QSYS^Xt9VRkQb`U2-HpfPa;_r zUmvPz%;8C-tjX4sT6)r$Ngjqo8-(`jPJ&xC`^+~9W zPv0`Lsj#rZ@afXx-TjPF(sxP0W42*y_tNqE+@I&#zI2Hf@Bb_)-jN9L)-5_w8f{jS zl#w%P91QdxeAV0+RzlvrjEtS809C53!1#0IGJUwQPuSO(0*-6YJ}ph>L{^O8Z~s)d zJ&(O+cqJY8G?VV9^{p~`(R`A2Fv9lktr$2xvfF^D7 zC3)oRiig7a$Bb0`TwL9tJdz9XFFX##IfEkjN=V{MYC3WV-Dtyr5soMJ++X7I($!UU z%Vd5$|Nbt3vc`OvlqTx8&H6m-_@-Y%<+bJxhL<98%XOCb?>pPLe&z<_pdnNze^^^T zAD~ae(mnS&?SrlxK4EA(WQ~u(!rB9a%m>*$1z}G-3!2%!IKRicIW;3dHUO;;X|6wb zcmuus^c6r_#u!OU7eHO*A4X~)ddYKkpcrGXs%BuNMvnr*X#7^v*w@hXOQCq4F>1L% zUcGSsVisl)b0$lW##FQ7o)g63s7WRK@k7F-pXC3fI{;wm0t%KEZcYJZl5+-|_@8(e zd9R9J^TLpx-K5VRw8CzKMOKxV>|wQRC9>e15D&OQ~Oe6nj3>k7NH=w;M=!_W8PNHS3X zM2dTJOuc>Hj%62Gbxo@7OajF>^Z?L_LBX-DPACAxB`Ms$TMSUEq7P3_Te@W-oA6-~ah`|*6%*qQ8wO~*qYa~MqhI12 
zrshai+?%njoT+5T-txr`Gl4d}&T?<$8^PQsS7kQ}rHuXfruynT0|P!G$S(#Ng8kl+ ztMmV~2@<`AubY28!3=~*4>7+GC@>KQ8R6pH64wm+o&M&kZQA(EG;IkO%B~POwa04!pRHd3J{3sKt zW!!usfb|RDY}Eq;&73sN_b?s2gC-iopa7R#2oxiKKb&EAg!KN(f-f$Ynm(Q=d0DQZ zZ;X0Z;|*h`;SD%F?Xq=a@Sr5YH5^@1E{%M4y)u@Hb-&8!0i7zpz=Iv})J% z`A&}Fs(;?Rm5~T5fvoFx6I9}^tSKQ6#%w5L+qD(Tdr(S3rI~vr!U1R#&!EM+T^b5+ zH}$x>8r+g|x-pQ^P*QDm*{OWQdxt~wu(s~e-=32FyQ?@G>TK^wQA4u_!XmDPq~W`z zm-L5jFT?^jv~m|zm&3ciT+Z8CG1ImiY;i@K(kz*qs7D73hhEb;tYKC8Z+&^?>^su{ zFOsC~SP~P^^5r*oPM$H1+esSWac43W_OsXPJKJKHVp)kRfk1CtopRjwSOHKxx!KKL zU1^nMr@}ibOvEag_{DtHY_|=NhrbaB^hIh1+rMwupXO)1SB|a+Mh~EFlIAhH>-xfg zYh<9ZW#5cuLr4GbXB#_5PX^ky8wS-rzo%V=lDV<}%cI4=7F-gaInvsF_7&J>j+Tn9{dAui6 zDM>Yf!c&4>jG8c3GEqUbvTyL;%n4CVyi$J41xv8W3aE|s=Gi!o@t_anw&BTl8r@C> zSXDUyZ65=vFXd&Y;%Txe+lpYYw)F~YHT}W++o{TWz6nj6@k&^4Dn339cRcDd`bz5| zd8A#q!y)7s>F!ykll59I83!*W=-SLDnbE8yje8LIh(Pu_aIDX4hq@Een28U7hL5TE zY;2fq7KS%=o#USY3|!%zA6#Va1_-XQvvKHln3J2$(7;8VA_q)@?&LjvlGNf z`UGSo^dH{>&Fg=L8FQfZurM9CCYNL!WL99M-55Gf&!)P~he5O5Wc4*HDgy^H&cuqn zy8L0W*m?fI^vKOG{X@6oR>(#u)(W|t)S1UKD=h8h&_+_q>*O+-cCt=#aVy;^00p!c zD1&rDNVKBlq7;9VrHjIK)pup)m#?!qbvvw;0qecwtV~IIW1raj1kG94X1y2Aeud%p zY-PSHX^Q8>X`GyORC>o_3FhP{f^uH_zfg_r?tX=AR0D7B3aKuC5T&Ur0ZO6Ga-^)L zCLymoTu@Ui<5qVK=udjjcK^7jw9)7>KBS18L;$UOYk>D-re&zWE6uhRXB832kV3z$ zuv0fE5Sya*-I4eE1eN6V%UNg}#}2v3$`fnXMKbay1F32VHLJ5T_f=)ErxoC~MVTh9 zGe2a(5Cb(o!^`{N3A(fn1xyEjMC5Hl@t(0H92XwXf+}>VF17fEB!?~9TQGx%fcm(Qlu#)HSv2|s7eOk zKf^W=;KAXGXS+~VUvoT@)vOlIb*V29tcxbI8sq~fT9$}BtQ`CTmCl#UCwzzfHTE_( zOk%qqD3dI7+-)bTXzFIKPfY*?-S-UjFNz}veJWnYy}FUYvsUEP(&(Zp#eq*5RBI&; zQ6(c5=OPPrzky=`^|6JU^|5QKKrwd`C|O>);QqLcc0Z-gM46)qRP|`v255AGays9@ zR#m*Nih`NYGL=*L4KNQ{HqS5JX)HmZi4@#C0dI z(o%Wj2t{_YYmAsq6Co(caM5*Hi8rT6Xx^Q$47_oTS}w8WB#BmgYkQlOFec-}OQWiF zR{Yno3s1^vO-F-iTmS~hD4EHF&4kB>M};9 z@bi*!H7(8+eA^wz6X&slf1lU7lZtR6`{(MRynLADih_V`wOuL>-jPbFe3x~-UiM@@ zbsrta^S!C3=_sk6mi3Q%<($ZQQk8wj_2+!oPI?imA@Ka7(DzP^XZ}E&Ygzdbc2ycF zA%0NOsOryqQcq3@%yMm!)TmNVx2a%< 
zjXPfm;m$KCg_GV+WzLdSRjO|rVOSxD$^&)D6lL&Vt6ICaNhpa%H1(oXeua#!u$5*h zfY?x0uXO1ZWyU6OtlF7Jy3Bzga^IftzEWnre+*tBeD?3&RNj>d%diQzF#o1$bNzhN z%k$Uxw%bs7^Qo(nX301ok&Nmq0&9KfkN>lu@TYuUR%l>cR?K;1%o)LUpd@>`?85gs z>J=AHmfCBoNnU29SSNn#|Hae+l#h>Y_$~o6(CSvvsxjrbHiZ1hwjd4lRn^Wrb@1qR)Yt*7$ zi`e{=?DqS1gpfr{)1KE0kAU7GN8pOVpK%yy|C3%S`+$issCaH3X?g)%eA~YCRfmah zM1M}x+S;1T@kGzYXwX`1s}Bb4H|(CFr>C2Z)zYg`3y;ff(za`jckupM^adG)c-SGd z?}Bx6%XQj@p<80~b~lf9XZKEz-K6y>438G0AdlPNAqX_plZ={}N$kz&?cYA~=|6RH z=qTr$x3i{B1k1^zS)WgTaCuJJy}aQ7k*e7>cVaO2Vwm^t>;RWH0KJ-9;7sQ(^E8$t zbr-V9)xI@Ulnj*R-_}{Y>qgj)Y*>F8^!l-%^UZ85brEjizW9Z~!lFJb*|#OLVkXWd zQYnP=Fof0UdaUfoN)}F*S?g*Xc5MGXIw$*3+$BOm3e{@ul`t34UB7}w8#4GS7@VsS zkoo@2CU0XCYJEQ~3-m3lYr1XU8s^Y?+`(J3Oh7yA7 ztZ;Q0dpg)KhLDGa=+!V$imdglOKE)#=A%^YS~k#>lG*W1X}Au`Uwe#U+-? zoX>hy#r*sAVR&C`VZ%N;)4erB)FPAHnk4E-<%*xrroh)JC) zKM!_%$*wqle>@gdx}@Ly<=Fz&Y@HYU>ARrCU})p|<)(~}5L5V($i?m|&YSHX)6aoe zD^4TPds$BDewL=r3ywtB~d&#gDoK~9>3;xJ7FQyv`TXd zYO`Ntwo`q7W}Gj`?^bqi<_&Sw`k!!Yv-#&*?Hz=@dYBVO0$SBNlwb!FCfxV8E982v zQ7~u^Rpo?7@c=E~@vis2 zvZ?B_sj@P5lCa^rYzO$s1J3lSUz&9T`%)qUtHm*0_fI95UpB?^JGXP_+fbPuiEo?} zMfWqisHVcO#L28Qj!dT2PHNmrVj}NR`E;~R6w6L90-&2`nHENZACk1t12?u+?Sq*V zB*@%46u|#B@l2}ynp;k;dOpzY+a0v)2IC`r5n<+O_*MWGq0sE5hL>x3TNMKcj~28J z6GP9dDORd-j_MuX3;&Mx3h1j6`E;$Pmh*kPZP=K9ae(%*7$72pEAfH-#Sej81`>5hH(C4&E+6~C>f2JP51NzAnGYeAuEtmRAe8d3g z>IU;%4@zpb5~?xja8{>aK5pG{?V&VfdHW;((rHlrJ}-c_GIMDg9fp4EVLQG%DM{vo zYq@`V(6H82JX8f1>+mRN$%E&QFX|W)2gukrQG|N7DtXn@68Oq>4l2{)>y9fk$H*`s ziX)$8c6;wY72JhbUw+g7So9nTgnl$fW=E0#`~&m#lkBkGfj`o+ZWFx!O1<6%I5FDk`V(@6gO7!yFnGumVvCb*cH=FpK`Ko6CqQeJ}I#T0L^hRDO(A)7W^Zg4P{Tc z;~BobYnM%U-+!Yvov~Qdn7zDc;Vo;kzp887>#85K`g70L0Xe3%tkD43YqNYd z?s^?<^0i?|U~gQTgzLYmqT1Qr{98Z|s|qDypH&EZgbz!o4J1q&c33a$Zam8jUW6U8 zf&w^@rsgqS1;DPgO8YkMqZD>*8#zxu+zr`OTb^Vay&*J zY5mwS3%P{g`QNIuy!@SC+$$viX8_-O%UCGEC9g2C+$ZJxy*;pt&yoDaqvg}yE?;iwc-+OOF!lvts{~!EhSF)7SeRnVY#nUX` z=3ZENm73j~`SxbQ6osq3E;@VMHl|%OLd#Dkdw#-2XX;3~Q}_7XtK;r-7&6p@!zWOw 
z{mW6kmih73WvN_p9J^w!Z|&!|V-c&C9iG8yq^V1$^1P`Ru|-Y*cWwk~vL~xyU?DB#dgt1&1&3 zM6q`z9YqOb{SGR)`Yd68k_5cSt;a+k(tCdh!bTlD@__ZlklgPBG}7geN~JPmyHzZ} zqUmOV%uDy0zP9Cz$vrAbWxW+9SQ{0qTA!?O&Q{T*6Pd49k`yzaEc&MN5?JllK#}2B z*VHL_+sXhsTM(XvN+>d~#4*R0M_GjS&$ztx^YYsp8arBv)5xR_aCbuFErs2yfBVR6NIPAmnH%T_X4N)F z7R`p%w+w3Wu7TsVD|w??G>paY0fhl(cueA&i9%+a=kZ1q>q;Bv8)IFBGKsw08dDfn zxFUSXCZ6J0l=4mre2j}adLX7EBso@M4=*|l&u}GB=Y?oqHZ$(b2xK%Cl}m6lYv2S} zi;!av8J1@)C1_OLZ04>B!g`o*HcL!jV+O~y4xrGXVBCDmTwwaL+)3W~c%|-H3`%C& zRc!Adk#Nk%P*@5X((Yc5at8LxO<}lqkl8>m488tKmYbfU?6|_5*X9mRA4nv;Dq8eoC zc*^ZB0wxo!_7BHbF)J|?MnXNzmuog!U4@<6i6(WrGJ=0xZ(;bWS>)VzO)j8tr#KMX>QTa^20IToMt*NCIk0N|;6&1~>4w$tJvB?%GHba43pj-E<6>kLT)_iZcH?|> z{IbYRCtrNt|GvPiq6UWD&C2r$UX0={gq&bA<2@OTr&VhGjZcaukl3F!2bW*6ykg!T zaI%2|ZXKQ8nzr&|ckK#kiJq)Y>}aUHV-QuN6Q!Ru!&DrXm3Le~(qK^dJ(n85xp!Gug`o=Q{663o!r z<;B&(+!2*wdj<68vyp{Kr*m90;fs+GMa_v;d4v$rm@zOB7GJ%?$q#}i6*24YCrx>% zh;uR_+_Q$(_62QK%XYS?1|r6OGXsHW)ITFafDcowTthES{3Ou6JcNIG(Sx$(3_ z`=UQdC>lT4p57Y}4@^~UxDe#+rlyx#j~ux4p$9UKBrW?UQ%xwm|Cn?%`RO}rU_^HE6?uE#pVcP z^cjQj&KGMB_rp#&*QmjsTL~_gtUdfbq!toy0$ne|MP$xHC6BJnJ!@J!RO7FR>@~IA zw!qf5#|73k3VuL6;$;> z9)AAq1n*6{^5n|3H)H{^WscdpruMUd*&y;l)&LVgz8DN7;zm;^nehYXdx57p z7bWBaDkOP>xG)w|Ps4u?J;1f2XLn`*2Y8ovL2wc4l4l$+mqhqmWa?RVD@|&5qHepf zWQuH8&a-}|ZoP9i8$}L17b6~zw;-gQ%4ARQ7%NP;$iyRaR=7MS-bE22sqwl7*2NH3 zC89A6-&l8@!C8Z@&j*(lJ zpK>NTNjZ3e;K9P2MC4<8L($Pg?MbOz1a59}%ny3rMY8Q;tu<7HQ~xV;qjc77P{zvh z6UdE!ZaI2b%hKg@!&TKRN;O~d$;P;<_W|lPx6(cz07qYaogZ`;Qp6 zG1dKlfASXy;QwOjTwUPd4G3K~cd8rNo5^hH9Nat{?zchvR~Hbc1_F9B!tZ^RhIi>u z0LP11)7qS%CYyo9^cQf3F`SZ4d1`KHcHnh&#yy|hU!|%L}NUy)rrg?aHPs{ zZ=t*_F zujy?$bBE!lLSzkwC?H3|1&xe_0zXvEp-*DGejDqy@-0`}IzDIto@X<|h{6S=;#rY8 z89s@=ZOeFEX?Yv(8T>oH$1Rj&UmjF`f9(GxJ17kgT8OZ!QiViay6+j|fS?N9Z-dvf6#b2R#jvHxRVg*LO#vr4y>JSY5_7?+lai;=j zN)7`L!=6lEvqjk;nCpI!?+^VnswiTfTr%Cc#4c%-HlmTwVz(0pGpWUoj8doRCsVQ(i|A3 zMO+SR@Y~XK>i^8;<_y(aX`mjrxw7 zsRC?5wl~P#2tDM#R@#VY-9@VPxrqXX#AAd@G9qBJPzjz;ZMs0->16MiZ>*&mN?gD5 
z#ZZIYSWhmJMP-jqn|lXOHy+e!HC|u4SPQOMJ5*HJ+!H4B25WrLxhkCohz_@5NN3zn z4N?!6vu~P1&GP=DVq1 zd6wu(f^egH#R}ZddPj${{a4RL(^`yl_0@Uh*ufC3ry;UN-bKJwk%mNv*O(zQ1H@z` z@!yZ`d;TBooq0@?XB@|w*+Lc*9cSaPVHO9NWKPD>5-CtCP{vWNLOUR3R)K~>+m$lAwsc)N%ANw6cJuFLf9(C^{o_58 z=e_bi&*%5~`h?+%+Lc=re+9c;G=2B$FYrQ0t^qY(S}5QICqQ|+Rg{hC4GdyHK2|qC99ZJ1+u@)RrCWRYhGx(L zj7!&#TBcavVGdZAPVKb9y9i0sQ$=MYJDp+6T%w!;z<<60VAH*mSj?nrZ8_6AU{RlO zlEN|H;heE~k%TcBNCX%g05KyzCB;y@oczEEDRqHY(X?H%-?~8rKyMyY2dwz`Y&*GJ z@ymw1&(lR;j|bjge%UtedQaR*f5lDF5jf<@djJ55wE`?09*p0)*r6JgaPDNbC287X5c}WHI zjt7TU{tR}!tQUDIwN9Oszp)9+PqoQa%=gd7Gf^^a`UAhe9NTahmR47$;Wl0%<2(y zcM_@!35c-?6z;AT&8^}Vp$_-tcpsqD2P&Q$KgPi2v=fU06~geeZ(AADt!MimvF7H) zGrA9xeV={N?eg!~EFmia$}Uv!S`&LUvt^8- zC6D=Mo;KAQQZ>@qn><36CZ__?S(l^l#UyY2M@fyqn+O2D(^q3KB(}JS%lm`rnX8-> z&(>9_E3qc7)m;OrKnbI}9=P&@cscX?JLu%aIvC!Wc#k^Iu&u5q-JYZ#jjf(+YrE%O z5|R~D>{~vFojnI{!R0G5fMCF0-{|J$B&Ak1t}dG^6AcJq-D(oyv-wZ*iDNuI^x^1g zAQ(++u(PQJ(F0U?%b=Kd3y9|mNFK)5_H~#BM+dE~P*Bi_glBPT)uh`I`2fDz*`71w zB_u>+9V3HD<7!eP)!(nA*N*yWeOJGGCWQao=NPC4M?R_LR*iGlg_4xC_kjR%b_lox zcAk~t9~ZEzFR&agvBru%|1+&yBRTW7tCQ~H$q4PVk<^4TESt}c%e0p+*gplSHEv~f zvDN|@UJh{CE=QKbc>>c0pq!^dS^M#-061zq;_Bo3o?E^fhfcVh;4F2G&Y=zcsz%+B z4d#H%q5=)!&Yt^PV1|#Us%=}-$I8f-;*9fpyjU=+JOeSCYS7VTfa0&8ngPf~t3eCW zzOFTpjc;OVfB?oHz{CG|Ddc*ePk6j%(|zrj4L*qBsnk*j)>@E-%quV_h7j$yFH6K^ z44G5(g|g!t@Xf93$`@rVBarx>ofjaPdG0Oe1IRSM@vmI8Bm}6@N=Z9Hf&x5Ih=`pU zj>HxC(Bk@{{0Gc{Me)sFzo{|yZ^k*JfRlfJP55poPg3_ZFn8>R`|Qo9LBWa1g7zGs zaomlP*$b4J1{|u%CMXC{8X8Zo*#8gm-)R%l-J$LMLFox2k3LxG0QaOB5)t+VEja$G FzW_XJ^)dhe diff --git a/tests/stability_tests.py b/tests/stability_tests.py deleted file mode 100644 index 727745c..0000000 --- a/tests/stability_tests.py +++ /dev/null @@ -1,396 +0,0 @@ -''' -====================================================================================================================== ----------------------------------------------------- STABILITY TESTING --------------------------------------------- 
-====================================================================================================================== -TEST stability of partitioning and binning --> binning IS stable, but partitioning is not --> as a result, number of binns and their size is not stable --> therefore, incidence replacement will differ --> which means that the logit will be trained on different data, giving different coefficients and AUC --> which leads to instability - i) forward selection stops because there are no positive coefs - ii) it has an effect on AUC - it can be lower so we will drop the variable even if it is useful --> I found this behavior only for one variables, but I belive it will be case of more -''' - -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -import seaborn as sns -from sklearn.linear_model import LogisticRegression -from sklearn import metrics -import math -pd.set_option("display.max_columns",50) - -data_path = 'C:/Local/pers/Documents/GitHub/COBRA/datasets/data.csv' -data_types_path = 'C:/Local/pers/Documents/GitHub/COBRA/datasets/data_types.csv' - - - -def __eqfreq(var, train, autobins=True, nbins=10, precision=0, twobins=True, catchLarge=True): - ''' - Special method for binning continuous variables into bins - ---------------------------------------------------- - var: input pd.Serie with continuous columns - train: mask with rows which belongs to train - autobins: adapts number of bins - nbins: number of bins - precision: precision to form meaningful bins - twobins: if only two bins are found, iterate to find more - catchLarge: check when groups are too big - ---------------------------------------------------- - - This function is a reworked version of pd.qcut to satisfy our particular needs - - Takes for var a continuous pd.Series as input and returns a pd.Series with bin-labels (e.g. 
[4,6[ ) - - Train takes a series/list of booleans (note: we define bins based on the training set) - - Autobins reduces the number of bins (starting from nbins) as a function of the number of missings - - Nbins is the wished number of bins - - Precision=0 results in integer bin-labels if possible - - twobins=True forces the function to output at least two bins - - catchLarge tests if some groups (or missing group) are very large, and if so catches and outputs two groups - - note: catchLarge makes twobins irrelevant - ''' - - # Test for large groups and if one exists pass them with two bins: Large_group,Other - if catchLarge: - catchPercentage=1-(1/nbins) - groupCount = var[train].groupby(by=var[train]).count() - maxGroupPerc = groupCount.max()/len(var[train]) - missingPerc = sum(var[train].isnull())/len(var[train]) - if maxGroupPerc>=catchPercentage: - largeGroup = groupCount.sort_values(ascending=False).index[0] - x_binned = var.copy() - x_binned.name = 'B_'+var.name - x_binned[x_binned!=largeGroup]='Other' - cutpoints=None - info = (var.name+": One large group, outputting 2 groups") - return x_binned, cutpoints, info - elif missingPerc>=catchPercentage: - x_binned = var.copy() - x_binned.name = 'B_'+var.name - x_binned[x_binned.isnull()]='Missing' - x_binned[x_binned!='Missing']='Other' - cutpoints=None - info = (var.name+": One large missing group, outputting 2 groups") - return x_binned, cutpoints, info - # Adapt number of bins as a function of number of missings - if autobins: - length = len(var[train]) - missing_total = var[train].isnull().sum() - missing_perten = missing_total/length*10 - nbins = max(round(10-missing_perten)*nbins/10 ,1) - # Store the name and index of the variable - name = var.name - series_index = var.index - # Transform var and train to a np.array and list respectively, which is needed for some particular function&methods - x = np.asarray(var) - train = list(train) - # First step in finding the bins is determining what the quantiles are 
(named as cutpoints) - # If the quantile lies between 2 points we use lin interpolation to determine it - cutpoints = var[train].quantile(np.linspace(0,1,nbins+1),interpolation = 'linear') - # If the variable results only in 2 unique quantiles (due to skewness) increase number of quantiles until more than 2 bins can be formed - if twobins: - extrasteps = 1 - # Include a max. extrasteps to avoid infinite loop - while (len(cutpoints.unique())<=2) & (extrasteps<20): - cutpoints = var[train].quantile(np.linspace(0,1,nbins+1+extrasteps),interpolation = 'linear') - extrasteps+=1 - # We store which rows of the variable x lies under/above the lowest/highest cutpoint - # Without np.errstate(): xcutpoints.max() can give if x contains nan values (missings) - # However the function will result in False in both >&< cases, which is a correct result, so the warning can be ignored - with np.errstate(invalid='ignore'): - under_lowestbin = x < cutpoints.min() - above_highestbin= x > cutpoints.max() - - - def _binnedx_from_cutpoints(x, cutpoints, precision, under_lowestbin, above_highestbin): - ### Attributes the correct bin ........................ - ### Function that, based on the cutpoints, seeks the lowest precision necessary to have meaningful bins - ### e.g. (5.5,5.5] ==> (5.51,5.54] - ### Attributes those bins to each value of x, to achieve a binned version of x - - # Store unique cutpoints (e.g. from 1,3,3,5 to 1,3,5) to avoid inconsistensies when bin-label making - # Indeed, bins [...,1], (1,3], (3,3], (3,5], (5,...] do not make much sense - # While, bins [...,1], (1,3], (3,5], (5,...] 
do make sense - unique_cutpoints = cutpoints.unique() - # If there are only 2 unique cutpoints (and thus only one bin will be returned), - # keep original values and code missings as 'Missing' - if len(unique_cutpoints) <= 2: - cutpoints = None - x_binned = pd.Series(x) - x_binned[x_binned.isnull()] = 'Missing' - info = (var.name+": Only one resulting bin, keeping original values instead") - return x_binned, cutpoints, info - # Store info on whether or not the number of resulting bins equals the desired number of bins - elif len(unique_cutpoints) < len(cutpoints): - info = (var.name+": Resulting # bins < whished # bins") - else: - info = (var.name+": Resulting # bins as desired") - # Finally, recode the cutpoints (which can have doubles) as the unique cutpoints - cutpoints = unique_cutpoints - - # Store missing values in the variable as a mask, and create a flag to test if there are any missing in the variable - na_mask = np.isnan(x) - has_nas = na_mask.any() - # Attribute to every x-value the index of the cutpoint (from the sorted cutpoint list) which is equal or higher than - # the x-value, effectively encompasing that x-value. - # e.g. for x=6 and for sorted_cutpoint_list=[0,3,5,8,...] the resulting_index=3 - ids = cutpoints.searchsorted(x, side='left') - # x-values equal to the lowest cutpoint will recieve a ids value of 0 - # but our code to attribute bins to x-values based on ids (see end of this subfunction) requires a min. value of 1 - ids[x == cutpoints[0]] = 1 - # Idem as previous: x-values below the lowest cutpoint should recieve a min. value of 1 - if under_lowestbin.any(): - ids[under_lowestbin] = 1 - # Similar as previous: x-values above the highest cutpoint should recieve the max. 
allowed ids - if above_highestbin.any(): - max_ids_allowed = ids[(above_highestbin == False) & (na_mask==False)].max() - ids[above_highestbin] = max_ids_allowed - # Maximal ids can now be defined if we neglect ids of missing values - max_ids = ids[na_mask==False].max() - - # Based on the cutpoints create bin-labels - # Iteratively go through each precision (= number of decimals) until meaningful bins are formed - # If theoretical bin is ]5.51689,5.83654] we will prefer ]5.5,5.8] as output bin - increases = 0 - original_precision = precision - while True: - try: - bins = _format_bins(cutpoints, precision) - except ValueError: - increases += 1 - precision += 1 - #if increases >= 5: - #warnings.warn("Modifying precision from "+str(original_precision)+" to "+str(precision)+" to achieve discretization") - #print("Modifying precision from "+str(original_precision)+" to "+str(precision)+" to achieve discretization") - else: - break - - # Make array of bins to allow vector-like attribution - bins = np.asarray(bins, dtype=object) - # If x has nas: for each na-value, set the ids-value to max_ids+1 - # this will allow na-values to be attributed the highest bin which we define right below - if has_nas: - np.putmask(ids, na_mask, max_ids+1) - # The highest bin is defined as 'Missing' - bins = np.append(bins,'Missing') - # ids-1 is used as index in the bin-labels list to attribute a bin-label to each x. Example: - # x=6 sorted_cutpoint_list=[0,3,5,8,...] ids=3 levels=[[0,3],(3,5],(5,8],...] - # The correct bin level for x is (5,8] which has index 2 which is equal to the ids-1 - x_binned = bins[ids-1] - return x_binned, cutpoints, info - - - def _format_bins(cutpoints, prec): - # Based on the quantile list create bins. Raise error if values are similar within one bin. 
- # On error _binnedx_from_cutpoints will increase precision - - fmt = lambda v: _format_label(v, precision=prec) - bins = [] - for a, b in zip(cutpoints, cutpoints[1:]): - fa, fb = fmt(a), fmt(b) - - if a != b and fa == fb: - raise ValueError('precision too low') - - formatted = '(%s, %s]' % (fa, fb) - bins.append(formatted) - - bins[0] = '[...,' + bins[0].split(",")[-1] - bins[-1] = bins[-1].split(",")[0] + ',...]' - return bins - - - def _format_label(x, precision): - # For a specific precision, returns the value formatted with the appropriate amount of numbers after comma and correct brackets - - if isinstance(x,float): - frac, whole = np.modf(x) - sgn = '-' if x < 0 else '' - whole = abs(whole) - if frac != 0.0: - val = '{0:.{1}f}'.format(frac, precision) - val = _trim_zeros(val) - if '.' in val: - return sgn + '.'.join(('%d' % whole, val.split('.')[1])) - else: - if '0' in val: - return sgn + '%0.f' % whole - else: - return sgn + '%0.f' % (whole+1) - else: - return sgn + '%0.f' % whole - else: - return str(x) - - - def _trim_zeros(x): - # Removes unnecessary zeros and commas - while len(x) > 1 and x[-1] == '0': - x = x[:-1] - if len(x) > 1 and x[-1] == '.': - x = x[:-1] - return x - - x_binned, cutpoints, info = _binnedx_from_cutpoints(x, cutpoints, precision=precision, under_lowestbin=under_lowestbin, above_highestbin=above_highestbin) - x_binned = pd.Series(x_binned, index=series_index, name="B_"+name) - return x_binned, cutpoints, info - -def __increp(b_var, target, train): - ''' - Method for incidence replacement - Returns replaced pd.Serie - ---------------------------------------------------- - b_var: input pd.Serie to be replaced - target: pd.Serie with target variable - train: pd.Serie with parition variable - ---------------------------------------------------- - ''' - - #get variable name - name = b_var.name - #get overall incidence - incidence_mean = target[train].mean() - #get incidence per group - incidences = target[train].groupby(b_var).mean() 
- #construct dataframe with incidences - idf = pd.DataFrame(incidences).reset_index() - #get values that are in the data but not in the labels - bin_labels = incidences.index - newgroups = list(set(b_var.unique()) ^ set(bin_labels)) - #if newgroups, add mean incidence to incidence dataframe for each new group - if len(newgroups)>0: - #make dataframe: - ngdf = pd.DataFrame(newgroups) - ngdf.columns = [name] - ngdf["TARGET"] = incidence_mean - #dataframe with incidences: - idf = idf.append(ngdf) - #dataframe with the variable - vdf = pd.DataFrame(b_var) - #discretized variable by merge - d_var = pd.merge(vdf,idf,how='left',on=name)["TARGET"] - return pd.Series(d_var, name="D_"+name[2:]) - - - - -# -# LOAD CSV -# -df = pd.read_csv(data_path, header=0, sep=None, engine='python') - -_partitioning_settings = {'train':0.5, - 'selection':0.3, - 'validation':0.2} - -df_simulation = pd.DataFrame(None,columns=[ - 'variable', - 'iteration', - 'length', - 'coef', - 'AUC' - ]) - -cont_clmns = ['age', 'fnlwgt', 'education-num','capital-gain', 'capital-loss', 'hours-per-week', - 'scont_1', 'scont_2', 'scont_3', 'scont_4', 'scont_5', 'scont_6', 'scont_7', 'scont_8', - 'scont_9', 'scont_10', 'scat_1','scat_2', 'scat_3', 'scat_4', 'scat_5', 'sflag_1', - 'sflag_2', 'sflag_3','sflag_4', 'sflag_5'] - -row = 0 -for clmn in cont_clmns: - - df_prep = df[['TARGET',clmn]] - - for i in range(1,51): - #PARTITION - df_prep = df_prep.iloc[np.random.permutation(len(df_prep))].sort_values(by='TARGET', ascending=False).reset_index(drop=True) - partition = [] - sorted_target=df_prep['TARGET'] #Just the target since it is allready sorted (see above) - for target in [sorted_target.iloc[0],sorted_target.iloc[-1]]: - target_length = (sorted_target==target).sum() - - for part, size in _partitioning_settings.items(): - partition.extend([part]*math.ceil(target_length*size)) - - df_prep["PARTITION"] = partition[:len(df_prep)] - - #Binns - result = __eqfreq(var=df_prep[clmn], - 
train=df_prep["PARTITION"]=="train", - autobins=True, - nbins=5, - precision=0, - twobins=True, - # TRUE OPTION STILL PRODUCES ERROR IN SORTNUMERIC function AND SCORING procedure !!!!!!!!! - catchLarge=False) - - bin_serie = pd.Series(result[0]) - # uncommemt this to see the counts - they change! - #print(bin_serie.groupby(bin_serie).count()) - - #REPLACE INCIDENCE - inc_rep = __increp(b_var=bin_serie, - target=df_prep['TARGET'], - train=df_prep['PARTITION']=="train") - - df_prep['D_'+clmn] = inc_rep - - #PREDICT - y_train = df_prep['TARGET'][df_prep['PARTITION'] == 'train'].as_matrix() - x_train = df_prep['D_'+clmn][df_prep['PARTITION'] == 'train'].as_matrix().reshape(-1,1) - - logit = LogisticRegression(fit_intercept=True, C=1e9, solver = 'liblinear') - logit.fit(y=y_train, X=x_train) - y_pred_train = logit.predict_proba(x_train) - - AUC_train = metrics.roc_auc_score(y_true=y_train, y_score=y_pred_train[:,1]) - - coefs = logit.coef_[0] - - df_simulation.loc[row] = [ - clmn, - i, - len(np.unique(result[0])), - coefs[0], - np.round(AUC_train,3) - ] - row +=1 - - #print('ITERATION {}, lenght: {}, coef: {}, AUC: {}.'.format(i, len(np.unique(result[0])),coefs,np.round(AUC_train,3))) - - -print(df_simulation.groupby(['variable','length'])['length'].count()) -print(df_simulation.groupby(['variable'])['coef'].mean()) -print(df_simulation.groupby(['variable'])['coef'].std()) -print(df_simulation.groupby(['variable'])['AUC'].mean()) -print(df_simulation.groupby(['variable'])['AUC'].std()) - - - - - -''' -df_transformed.groupby('D_scont_1')['D_scont_1'].count() -df_transformed.groupby('B_scont_1')['B_scont_1'].count() - -res = pd.Series(result[0]) -res.groupby(res).count() -''' - - - - - - - - - - - - - - - - - - diff --git a/tests/tests.py b/tests/tests.py deleted file mode 100644 index 1c35c0a..0000000 --- a/tests/tests.py +++ /dev/null @@ -1,394 +0,0 @@ -''' 
-====================================================================================================================== ---------------------------------------------------------- TESTING --------------------------------------------------- -====================================================================================================================== -This is my (Honza) script to test and develop in Cobra -import sys -sys.path.append('C:/Local/pers/Documents/GitHub/COBRA/source_code') -''' - -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -import seaborn as sns -pd.set_option("display.max_columns",50) - -data_path = 'C:/Local/pers/Documents/GitHub/COBRA/datasets/data.csv' -data_types_path = 'C:/Local/pers/Documents/GitHub/COBRA/datasets/data_types.csv' - -''' -TO-DO --sometimes error - no variables with positive coef. Even if error is thrown, return stuff! --the functions can be applied in a vectorized way --further improve the forward selection - -TEST -''' - - -'''=================== TEST COBRA INTERFACE ===================''' -import cobra.cobra as c - -build = c.COBRA(data_path, - data_types_path, - partition_train=0.5, - partition_select=0.3, - partition_valid=0.2, - sampling_1=1, - sampling_0=1, - discret_nbins=5, - regroup_sign=0.001, - rseed=0) -df_transformed = build.transform() - - -#I want to try more unisel -df_unisel, df_corr = build.fit_univariate(df_transformed, - preselect_auc=0.53, - preselect_overtrain=5) - -build.plotPredictorQuality(df_unisel) -build.plotCorrMatrix(df_corr) -build.plotIncidence(df_transformed, 'age') - -#I want to try more models -#first model -df_model1 = build.fit_model(df_transformed, - df_unisel, - modeling_nsteps=30, - forced_vars=['scont_1', 'scont_2'], - excluded_vars=None, - name='All variables') - -build.plotAUC(df_model1) -build.plotVariableImportance(df_model1, 5) -build.plotCumulatives([(df_model1,3)], df_transformed) - - -#second model -df_model2 = build.fit_model(df_transformed, - 
df_unisel, - modeling_nsteps=30, - forced_vars=None, - excluded_vars=None, - name='Experiment') - -build.plotAUC(df_model2) -build.plotVariableImportance(df_model2, 6) -build.plotCumulatives([(df_model2, 5)], df_transformed) - -#Model comparison -build.plotAUCComparison([(df_model1,3), (df_model2,5)]) -build.plotCumulatives([(df_model1,3), (df_model2,5)], df_transformed) - -'''=================== PROFILING & SPEED ===================''' -import cProfile -import timeit - -cProfile.run('build.fit_univariate(df_transformed,preselect_auc=0.53, preselect_overtrain=5)') - -def classWrapper(): - build = c.COBRA(data_path, - data_types_path, - partition_train=0.5, - partition_select=0.3, - partition_valid=0.2, - sampling_1=1, - sampling_0=1, - discret_nbins=5, - regroup_sign=0.001, - rseed=0) - df_transformed = build.transform() - - df_unisel, df_corr = build.fit_univariate(df_transformed, - preselect_auc=0.53, - preselect_overtrain=5) - - df_model1 = build.fit_model(df_transformed, - df_unisel, - modeling_nsteps=30, - forced_vars=None, - excluded_vars=None, - name='All variables') - -%timeit classWrapper() -''' -Improvements: -------------- -- 06/03/2018, JBE - 11 s ± 267 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - - -''' - - - -def _getTrainSelectValidXY(df): - ''' - Method split given DF into train/test/validation set in respect to X and Y. 
- Returns dictionary with DFs - ---------------------------------------------------- - df: transformed dataset - ---------------------------------------------------- - ''' - - dvars = [n for n in df.columns if n[:2] == 'D_'] - - mask_train = df['PARTITION']=="train" - mask_selection = df['PARTITION']=="selection" - mask_validation = df['PARTITION']=="validation" - - y_train = df.loc[mask_train,'TARGET'] - y_selection = df.loc[mask_selection,'TARGET'] - y_validation = df.loc[mask_validation,'TARGET'] - - x_train = df.loc[mask_train,dvars] - x_selection = df.loc[mask_selection,dvars] - x_validation = df.loc[mask_validation,dvars] - - dict_out = {'y_train':y_train, 'y_selection':y_selection, 'y_validation':y_validation, - 'x_train':x_train, 'x_selection':x_selection, 'x_validation':x_validation} - - return dict_out - -_partition_dict = _getTrainSelectValidXY(df_transformed) - -''' -============================================================================================================= -======================================= FORWARD SELECTION =================================================== -============================================================================================================= --Only boolean target -''' -from sklearn.linear_model import LogisticRegression -from sklearn import metrics -import numpy as np - -df_sel = df_unisel -forced_vars = ['scont_1', 'scont_2'] -excluded_vars = None -positive_only = True - -#if None, replace by empty list -if not excluded_vars: - excluded_vars = [] - -if not forced_vars: - forced_vars = [] - -#Sort -df_sel = df_sel.sort_values(by='AUC selection', ascending=False) - -#Build list of variables to be used for Forward selection -preselected_vars = df_sel['variable'][df_sel['preselection'] == True].tolist() -preselected_vars = [var for var in preselected_vars if var not in forced_vars+excluded_vars] -all_vars = ['D_' + var for var in forced_vars + preselected_vars] - - - -''' ------------------- MAIN LOOP 
------------------ -''' -df_forward_selection = pd.DataFrame(None,columns=[ - 'step', - 'coef', - 'all_coefs_positive', - 'AUC_train', - 'AUC_selection', - 'AUC_validation', - 'predictors_subset', - 'last_var_added', - 'AUC_train_rank', - 'selected_model', - 'pred_training', - 'pred_selection', - 'pred_validation' - ]) - -f_position_forced = lambda i, forced, all_vars: len(forced) if i <= len(forced) else len(all_vars) - -n_steps = min(30,len(all_vars)) -predictors = [] -row = 0 - -for step in range(1,n_steps): - print('*******************Iter {}*******************'.format(step)) - - pos = f_position_forced(step, forced_vars, all_vars) - remaining_predictors = [var for var in all_vars[:pos] if var not in predictors] - - for predictor in remaining_predictors: - predictors_subset = predictors + [predictor] - #Train - train model - logit = LogisticRegression(fit_intercept=True, C=1e9, solver = 'liblinear') - logit.fit(y=_partition_dict['y_train'], X=_partition_dict['x_train'][predictors_subset]) - - #Train - predict and AUC - y_pred_train = logit.predict_proba(_partition_dict['x_train'][predictors_subset]) - AUC_train = metrics.roc_auc_score(y_true=_partition_dict['y_train'], y_score=y_pred_train[:,1]) - - #Selection - predict and AUC - y_pred_selection = logit.predict_proba(_partition_dict['x_selection'][predictors_subset]) - AUC_selection = metrics.roc_auc_score(y_true=_partition_dict['y_selection'], y_score=y_pred_selection[:,1]) - - #Validation - predict and AUC - y_pred_validation = logit.predict_proba(_partition_dict['x_validation'][predictors_subset]) - AUC_validation = metrics.roc_auc_score(y_true=_partition_dict['y_validation'], y_score=y_pred_validation[:,1]) - - #check if coefs are positive - all_coefs_positive = (logit.coef_[0] >= 0).all() - - df_forward_selection.loc[row] = [ - step, - logit.coef_, - all_coefs_positive, - AUC_train, - AUC_selection, - AUC_validation, - predictors_subset, - predictors_subset[-1], - 0, - False, - y_pred_train, - 
y_pred_selection, - y_pred_validation - ] - row +=1 - - #Only positive coefs - if positive_only: - if len(df_forward_selection[(df_forward_selection['all_coefs_positive'] == True) & (df_forward_selection['step'] == step)]) == 0: - raise ValueError("No models with only positive coefficients","NormalStop") - - - ##Find best model - #Sort AUC by size - df_forward_selection['AUC_train_rank'] = df_forward_selection.groupby('step')['AUC_train'].rank(ascending=False) - - #Find model where AUC is highest AND all coefs are positive - convert to boolean flag - df_forward_selection['selected_model'] = df_forward_selection[df_forward_selection['all_coefs_positive'] == True].groupby(['step'])['AUC_train'].transform(max) - df_forward_selection['selected_model'] = (df_forward_selection['selected_model'] == df_forward_selection['AUC_train']) - else: - ##Highest AUC, regardless of coefs - df_forward_selection['selected_model'] = (df_forward_selection.groupby(['step'])['AUC_train'].transform(max) == df_forward_selection['AUC_train']) - - ##Add next predictor - add_variable = df_forward_selection.loc[(df_forward_selection['selected_model'] == True) & (df_forward_selection['step'] == step), 'last_var_added'].iloc[0] - predictors.append(add_variable) - - clmns_out = ['step', 'coef', 'AUC_train', 'AUC_selection', 'AUC_validation', 'predictors_subset', 'last_var_added', - 'pred_training','pred_selection','pred_validation'] - -df_tst = df_forward_selection[clmns_out][df_forward_selection['selected_model'] == True] - - - - -''' -============================================================================================================= -=========================================== CSV IMPORT ====================================================== -============================================================================================================= --Only boolean target -''' - - - - -#Loads Data types -types_exist = True - -#load data_types -try: - df_types = 
pd.read_csv(data_types_path, header=None) - df_types.columns = ['variable','data_type'] -except FileNotFoundError: - types_exist = False - df_types = pd.DataFrame() - -#load data -df = pd.read_csv(data_path, header=0, sep=None, engine='python') - - -#change datatypes -if types_exist: - for row in df_types.itertuples(): #0:index, 1:variable, 2:data_type - if row[2] == 'int': - df[row[1]] = df[row[1]].astype(np.int64) - if row[2] in ['str', 'bool']: - df[row[1]] = df[row[1]].apply(str) - -return df, df_types - - - -types_exists = True -data_types_path = 'C:/Local/pers/Documents/GitHub/COBRA/datasets/data_types.csv' - -if types_exists: - try: - df_dtypes = pd.read_csv(data_types_path, header=None) - df_dtypes.columns = ['variable','data_type'] - - - - dict_dtypes = {k: getattr(__builtins__, v) for k, v in df_dtypes.to_dict('dict')['data_type'].items()} - except FileNotFoundError: - raise - - df = pd.read_csv(data_path, header=0, sep=',', engine='c', converters=dict_dtypes) - -else: - - - -set(np.unique(df_dtypes['data_type'])) < set(['int','str','bool']) - - - - - - - - - - - - -df_types = pd.read_csv(data_types_path, header=None) -df_types.columns = ['variable','data_type'] -dict_types = df_types.to_dict('dict') - -tst = dict_types['data_type'] - -df = pd.read_csv(data_path, header=0, sep=None, engine='python', converters=dict_types['data_type']) - -string = 'str' -string = string[1:] -print(string) - - -import re -string = 'str' -string = re.sub(r'^"|"$', '', string) -print(string) - -d2 = {k: re.sub(r'^"|"$', '', v) for k, v in tst.items()} -d2 = {k: getattr(__builtins__, v) for k, v in tst.items()} - - - - - - - - - - - - - - - - - - - From 447e7fae280f2b97d7a1f3f51d2d338b1a034ede Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Thu, 19 Dec 2019 09:20:12 +0100 Subject: [PATCH 22/98] Add more unittests for KBinsDiscretizer --- tests/preprocessing/test_kbins_discretizer.py | 57 ++++++++++++++++--- 1 file changed, 49 insertions(+), 8 deletions(-) diff --git 
a/tests/preprocessing/test_kbins_discretizer.py b/tests/preprocessing/test_kbins_discretizer.py index 4c39e90..ced0ddc 100644 --- a/tests/preprocessing/test_kbins_discretizer.py +++ b/tests/preprocessing/test_kbins_discretizer.py @@ -42,11 +42,7 @@ def test_attributes_to_dict(self): ["n_bins", "strategy", "closed", "auto_adapt_bins", "starting_precision", "label_format", "change_endpoint_format", - "_bins_by_column"], - ids=["n_bins", "strategy", "closed", - "auto_adapt_bins", "starting_precision", - "label_format", "change_endpoint_format", - "_bins_by_column"]) + "_bins_by_column"]) def test_set_attributes_from_dict(self, attribute): discretizer = KBinsDiscretizer() @@ -76,12 +72,57 @@ def test_set_attributes_from_dict(self, attribute): assert actual == expected + # no further tests here as this is just a wrapper around _fit_column! + @pytest.mark.parametrize("strategy, expectation", + [("trees", pytest.raises(ValueError)), + ("quantile", does_not_raise())]) + def test_fit_exception(self, strategy, expectation): + discretizer = KBinsDiscretizer(strategy=strategy) + + data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) + + with expectation: + discretizer.fit(data, ["variable"]) + + # no further tests here as this is just a wrapper around _transform_column! 
+ @pytest.mark.parametrize("scenario, expectation", + [("raise", pytest.raises(ValueError)), + ("regular_test", does_not_raise()), + ("constant_data", does_not_raise())]) + def test_transform(self, scenario, expectation): + + discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform") + + data = pd.DataFrame({"variable": ([1] * 10)}) + expected = data.copy() + + if scenario == "regular_test": + # overwrite data and expected with DataFrame containing + # a non-constant variable + data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) + expected = data.copy() + + discretizer.fit(data, ["variable"]) + + categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"] + expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4 + + ["3.0 - 6.0"]*3 + + ["6.0 - 9.0"]*3 + + ["Missing"], + categories=categories, + ordered=True) + elif scenario == "constant_data": + discretizer.fit(data, ["variable"]) + + with expectation: + actual = discretizer.transform(data, ["variable"]) + pd.testing.assert_frame_equal(actual, expected) + ################# Test for private methods ################# @pytest.mark.parametrize("n_bins, expectation", [(1, pytest.raises(ValueError)), (10.5, pytest.raises(ValueError)), - (2, does_not_raise())], - ids=["invalid_int", "float", "normal"]) + (2, does_not_raise())]) def test_validate_n_bins_exception(self, n_bins, expectation): with expectation: assert KBinsDiscretizer()._validate_n_bins(n_bins=n_bins) is None @@ -89,7 +130,7 @@ def test_validate_n_bins_exception(self, n_bins, expectation): def test_transform_column(self): data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) - discretizer = KBinsDiscretizer(n_bins=3, strategy="unform") + discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform") bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)] From c0dcc5a400b096249dff474e7905d9f8f8fa2979 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Thu, 19 Dec 2019 17:03:18 +0100 Subject: [PATCH 23/98] Refactor and rename 
categorical_regrouper Refactor is required to make it compatible with the other modules. The following changes were made: - Rewritten logic to improve readability - Made regrouping optional so that only Missing values are replaced and datatypes are changed to "category" - Great care was taken to avoid accidental modification of original data (as variables in Python are passed by reference!) --- cobra/preprocessing/__init__.py | 4 +- .../categorical_data_processor.py | 358 ++++++++++++++++++ .../test_categorical_data_processor.py | 59 +++ 3 files changed, 419 insertions(+), 2 deletions(-) create mode 100644 cobra/preprocessing/categorical_data_processor.py create mode 100644 tests/preprocessing/test_categorical_data_processor.py diff --git a/cobra/preprocessing/__init__.py b/cobra/preprocessing/__init__.py index 98c3ef9..2008235 100644 --- a/cobra/preprocessing/__init__.py +++ b/cobra/preprocessing/__init__.py @@ -1,7 +1,7 @@ from .kbins_discretizer import KBinsDiscretizer from .target_encoder import TargetEncoder -from .categorical_regrouper import CategoryRegrouper +from .categorical_data_processor import CategoricalDataProcessor __all__ = ['KBinsDiscretizer', 'TargetEncoder', - 'CategoryRegrouper'] \ No newline at end of file + 'CategoricalDataProcessor'] diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py new file mode 100644 index 0000000..705d018 --- /dev/null +++ b/cobra/preprocessing/categorical_data_processor.py @@ -0,0 +1,358 @@ +""" +This class implements the Python Prediction's way of dealing with +categorical data preprocessing.
There are two steps involved here: +- An optional regrouping of the different categories based on category size + and significance of the category +- Missing value replacement with the additional category "Missing" + +Authors: +- Geert Verstraeten (methodology) +- Jan Benisek (implementation) +- Matthias Roels (implementation) +""" +# standard lib imports +import re +from typing import Optional + +import logging +log = logging.getLogger(__name__) + +# third party imports +import numpy as np +import pandas as pd +from scipy import stats + +from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError + + +class CategoricalDataProcessor(BaseEstimator): + """ + Regroups categories in categorical variables based on significance + with target variable. + + Attributes + ---------- + category_size_threshold : int, optional + minimal size of a category to keep it as a separate category + forced_categories : dict, optional + Map to prevent certain categories from being group into "Other" + for each colum - dict of the form {col:[forced vars]}. + keep_missing : bool + Whether or not to keep missing as a separate category + p_value_threshold : float + Significance threshold for regroupping. + regroup : bool + Whether or not to regroup categories + regroup_name : str + New name of the non-significant regrouped variables + scale_contingency_table : bool + Whether contingency table should be scaled before chi^2.' 
+ """ + + def __init__(self, regroup: bool=True, regroup_name: str="Other", + keep_missing: bool=True, + category_size_threshold: Optional[int]=None, + p_value_threshold: float=0.001, + scale_contingency_table: bool=True, + forced_categories: Optional[dict]=None): + + self.regroup = regroup + self.regroup_name = regroup_name + self.keep_missing = keep_missing + self.category_size_threshold = category_size_threshold + self.p_value_threshold = p_value_threshold + self.scale_contingency_table = scale_contingency_table + self.forced_categories = forced_categories + + # dict to store fitted output in + self._combined_categories_by_column = {} + + def fit(self, data: pd.DataFrame, column_names: list, + target_column: str): + """Fit the CategoricalDataProcessor + + Parameters + ---------- + data : pd.DataFrame + data used to compute the mapping to encode the categorical + variables with. + column_names : list + Columns of data to be processed + target_column : str + Column name of the target + """ + + if not self.regroup: + # We do not need to fit anything if regroup is set to False! 
+ log.info("regroup was set to False, so no fitting is required") + return None + + for column_name in column_names: + + if column_name not in data.columns: + log.warning("DataFrame has no column '{}', so it will be " + "skipped in fitting" .format(column_name)) + continue + + combined_cats = self._fit_column(data, column_name, target_column) + + # Add to _combined_categories_by_column for later use + self._combined_categories_by_column[column_name] = combined_cats + + def _fit_column(self, data: pd.DataFrame, column_name: str, + target_column) -> list: + """Compute which categories to regroup into "Other" for a particular + column + + Parameters + ---------- + data : pd.DataFrame + Description + column_name : str + Description + + Returns + ------- + list + list of categories to combine into a category "Other" + """ + + X = data[column_name] + y = data[target_column] + incidence = y.mean() + + combined_categories = set() + + # replace missings and get unique categories as a list + X = CategoricalDataProcessor._replace_missings(X) + unique_categories = list(X.unique()) + + # get small categories and add them to the merged category list + small_categories = (CategoricalDataProcessor + ._get_small_categories( + X, + incidence, + self.category_size_threshold)) + combined_categories = combined_categories.union(small_categories) + + for category in unique_categories: + if category in small_categories: + continue + + pval = (CategoricalDataProcessor + ._compute_p_value(X, y, category, + self.scale_contingency_table)) + + # if not significant, add it to the list + if pval > self.p_value_threshold: + combined_categories.add(category) + + # Remove missing category from combined_categories if required + if self.keep_missing: + combined_categories.discard("Missing") + + return combined_categories + + def transform(self, data: pd.DataFrame, + column_names: list) -> pd.DataFrame: + """Summary + + Parameters + ---------- + data : pd.DataFrame + Data to be discretized + 
column_names : list + Columns of data to be discretized + + Returns + ------- + pd.DataFrame + data with additional discretized variables + """ + + if self.regroup and len(self._combined_categories_by_column) == 0: + msg = ("{} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + for column_name in column_names: + + data = self._transform_column(data, column_name) + + return data + + def _transform_column(self, data: pd.DataFrame, + column_name: str) -> pd.DataFrame: + """Given a DataFrame, a column name and a list of categories to + combine, create an additional column which combines these categories + into "Other" + + Parameters + ---------- + data : pd.DataFrame + Original data to be tranformed + column_name : str + name of the column to transform + + Returns + ------- + pd.DataFrame + original DataFrame with an added processed column + """ + + column_name_clean = column_name + "_processed" + data[column_name_clean] = data[column_name] + + # Fill missings first + data[column_name_clean] = (CategoricalDataProcessor + ._replace_missings(data, + column_name_clean)) + + if self.regroup: + categories = self._combined_categories_by_column.get(column_name) + + data[column_name_clean] = (CategoricalDataProcessor + ._replace_categories( + data[column_name_clean], + categories)) + + # change data to categorical + data[column_name_clean] = data[column_name_clean].astype("category") + + return data + + def fit_transform(self, data: pd.DataFrame, + column_names: list) -> pd.DataFrame: + """Summary + + Parameters + ---------- + data : pd.DataFrame + Data to be discretized + column_names : list + Columns of data to be discretized + + Returns + ------- + pd.DataFrame + data with additional discretized variables + """ + self.fit(data, column_names) + return self.transform(data, column_names) + + @staticmethod + def _get_small_categories(predictor_series: pd.Series, + 
incidence: float, + category_size_threshold: int) -> set: + """Fetch categories with a size below a certain threshold. + Note that we use an additional weighting with the overall incidence + + Parameters + ---------- + predictor_series : pd.Series + Description + incidence : float + global train incidence + category_size_threshold : int + minimal size of a category to keep it as a separate category + + Returns + ------- + set + List a categories with a count below a certain threshold + """ + category_counts = predictor_series.groupby(predictor_series).size() + factor = max(incidence, 1 - incidence) + + # Get all categories with a count below a threshold + bool_mask = (category_counts*factor) <= category_size_threshold + return set(category_counts[bool_mask].index.tolist()) + + @staticmethod + def _replace_missings(data: pd.DataFrame, + column_names: Optional[list]=None) -> pd.DataFrame: + """Replace missing values (incl empty strings) + + Parameters + ---------- + data : pd.DataFrame + data to replace missings in + column_names: list, optional + list of predictors to replace missings in + + Returns + ------- + list + list of unique values in the data + """ + # replace missings (incl. 
empty string) + regex = re.compile("^\\s+|\\s+$") + + temp = None + if column_names: + temp = data[column_names] + else: + temp = data.copy() + temp = temp.fillna("Missing") + temp = temp.replace(regex, "") + temp = temp.replace("", "Missing") + + return temp + + @staticmethod + def _compute_p_value(X: pd.Series, y: pd.Series, category: str, + scale_contingency_table: bool) -> float: + """Summary + + Parameters + ---------- + X : pd.Series + Description + y : pd.Series + Description + category : str + Description + scale_contingency_table : bool + Description + + Returns + ------- + float + Description + """ + df = pd.concat([X, y], axis=1) + df["other_categories"] = np.where(X == category, 0, 1) + + contigency_table = pd.crosstab(index=df['other_categories'], columns=y, + margins=False) + + # if true, we scale the "other" categories + if scale_contingency_table: + size_other_cats = contigency_table.iloc[1].sum() + incidence_mean = y.mean() + + contigency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats + contigency_table.iloc[1, 1] = incidence_mean * size_other_cats + contigency_table = contigency_table.values.astype(np.int64) + + return stats.chi2_contingency(contigency_table, correction=False)[1] + + @staticmethod + def _replace_categories(data: pd.Series, categories: set) -> pd.Series: + """replace categories in set with "Other" + + Parameters + ---------- + data : pd.Series + Description + categories : set + Description + + Returns + ------- + pd.Series + Description + """ + return data.apply(lambda x: x if x not in categories else "Other") diff --git a/tests/preprocessing/test_categorical_data_processor.py b/tests/preprocessing/test_categorical_data_processor.py new file mode 100644 index 0000000..5155127 --- /dev/null +++ b/tests/preprocessing/test_categorical_data_processor.py @@ -0,0 +1,59 @@ +import pytest + +import numpy as np +import pandas as pd + +from cobra.preprocessing import CategoricalDataProcessor + + +class TestCategoricalDataProcessor: 
+ + @pytest.mark.parametrize("scale_contingency_table, expected", + [(False, 0.013288667), + (True, 0.434373)]) + def test_compute_p_value(self, scale_contingency_table, expected): + + X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) + y = pd.Series(data=([0]*35 + [1]*35 + [0]*15 + [1]*5 + [0]*8 + [1]*2)) + category = "c1" + + actual = (CategoricalDataProcessor + ._compute_p_value(X, y, category, scale_contingency_table)) + + assert pytest.approx(actual) == expected + + def test_get_small_categories(self): + + data = pd.Series(data=(["c1"]*50 + ["c2"]*25 + ["c3"]*15 + ["c4"]*5)) + incidence = 0.35 + threshold = 10 # to make it easy to manualy compute + expected = {"c3", "c4"} + + actual = (CategoricalDataProcessor + ._get_small_categories(data, incidence, threshold)) + + assert actual == expected + + def test_replace_missings(self): + + data = pd.DataFrame({"variable": ["c1", "c2", np.nan, "", " "]}) + expected = pd.DataFrame({"variable": ["c1", "c2", "Missing", "Missing", + "Missing"] + }) + actual = (CategoricalDataProcessor + ._replace_missings(data, ["variable"])) + + pd.testing.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("combined_categories, expected", + [({"c3", "c4"}, + pd.Series(data=["c1", "c2", "Other", "Other"])), + ({}, pd.Series(data=["c1", "c2", "c3", "c4"]))]) + def test_replace_categories(self, combined_categories, expected): + + data = pd.Series(data=["c1", "c2", "c3", "c4"]) + + actual = (CategoricalDataProcessor + ._replace_categories(data, combined_categories)) + + pd.testing.assert_series_equal(actual, expected) From 77a43363e96e1aeca92462cb3a8fce24d6fb677a Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 24 Dec 2019 10:12:12 +0100 Subject: [PATCH 24/98] Added forced category option to CategoricalDataProcessor --- .../categorical_data_processor.py | 33 +- cobra/preprocessing/categorical_regrouper.py | 391 ------------------ 2 files changed, 23 insertions(+), 401 deletions(-) delete mode 100644 
cobra/preprocessing/categorical_regrouper.py diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 705d018..a77851f 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -1,9 +1,10 @@ """ This class implements the Python Prediction's way of dealing with -categorical data preprocessing. There are two steps involved here: +categorical data preprocessing. There are three steps involved here: - An optional regrouping of the different categories based on category size - and significance of the category + and significance of the category w.r.t. the target - Missing value replacement with the additional category "Missing" +- Change of dtype to "category" (could potentially lead to memory optimization) Authors: - Geert Verstraeten (methodology) @@ -33,9 +34,9 @@ class CategoricalDataProcessor(BaseEstimator): Attributes ---------- - category_size_threshold : int, optional + category_size_threshold : int minimal size of a category to keep it as a separate category - forced_categories : dict, optional + forced_categories : dict Map to prevent certain categories from being group into "Other" for each colum - dict of the form {col:[forced vars]}. 
keep_missing : bool @@ -52,10 +53,10 @@ class CategoricalDataProcessor(BaseEstimator): def __init__(self, regroup: bool=True, regroup_name: str="Other", keep_missing: bool=True, - category_size_threshold: Optional[int]=None, + category_size_threshold: int=5, p_value_threshold: float=0.001, scale_contingency_table: bool=True, - forced_categories: Optional[dict]=None): + forced_categories: dict={}): self.regroup = regroup self.regroup_name = regroup_name @@ -97,11 +98,15 @@ def fit(self, data: pd.DataFrame, column_names: list, combined_cats = self._fit_column(data, column_name, target_column) + # Remove forced categories + forced_cats = self.forced_categories.get(column_name, set()) + combined_cats = combined_cats.difference(forced_cats) + # Add to _combined_categories_by_column for later use self._combined_categories_by_column[column_name] = combined_cats def _fit_column(self, data: pd.DataFrame, column_name: str, - target_column) -> list: + target_column) -> set: """Compute which categories to regroup into "Other" for a particular column @@ -117,15 +122,13 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, list list of categories to combine into a category "Other" """ - - X = data[column_name] y = data[target_column] incidence = y.mean() combined_categories = set() # replace missings and get unique categories as a list - X = CategoricalDataProcessor._replace_missings(X) + X = CategoricalDataProcessor._replace_missings(data[column_name]) unique_categories = list(X.unique()) # get small categories and add them to the merged category list @@ -179,6 +182,11 @@ def transform(self, data: pd.DataFrame, for column_name in column_names: + if column_name not in data.columns: + log.warning("Unknown column '{}' will be skipped" + .format(column_name)) + continue + data = self._transform_column(data, column_name) return data @@ -213,6 +221,11 @@ def _transform_column(self, data: pd.DataFrame, if self.regroup: categories = 
self._combined_categories_by_column.get(column_name) + if not categories: + log.warning("Column '{}' is not in fitted output " + "and will be skipped".format(column_name)) + return data + data[column_name_clean] = (CategoricalDataProcessor ._replace_categories( data[column_name_clean], diff --git a/cobra/preprocessing/categorical_regrouper.py b/cobra/preprocessing/categorical_regrouper.py deleted file mode 100644 index a6276d4..0000000 --- a/cobra/preprocessing/categorical_regrouper.py +++ /dev/null @@ -1,391 +0,0 @@ - -import pandas as pd -import numpy as np -from scipy import stats -from typing import Dict -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.exceptions import NotFittedError -import logging - -log = logging.getLogger(__name__) - - -class CategoryRegrouper(BaseEstimator, TransformerMixin): - """ - Regroups categories in categorical variables based on significance - with target variable. - - Parameters - ---------- - scale_cont : bool, default=True - Whether contingency table should be scaled before chi^2.' - - pval_thresh : float, default=0.001 - Significance threshold for regroupping. - - regroup_rename : str, default='non-significant' - New name of non-significant regroupped variables. - - missing_rename : str, default='Missing' - New name of missing categories. - - keep_missing : bool, default=Falsse - Whether missing category should be kept in the result. - - forced_categories : Dict, default=None - Dictionary to force categories - - for each colum dict of {col:[forced vars]}. - - Attributes - ---------- - all_category_map_ : Dict - Dictionary with mapping for each variable. 
- """ - def __init__(self, scale_cont: bool = True, - pval_thresh: float = 0.001, - regroup_rename: str = "non-significant", - missing_rename: str = "Missing", - keep_missing: bool = False, - forced_categories: Dict = None): - self.scale_cont = scale_cont - self.pval_thresh = pval_thresh - self.regroup_rename = regroup_rename - self.missing_rename = missing_rename - self.keep_missing = keep_missing - self.forced_categories = forced_categories - - def fit(self, X: pd.DataFrame, - y: pd.Series, - columns: list = []): - """ - Method regroups categories whole DataFrame. - - Parameters - ---------- - X : pd.DataFrame - Dataframe with all the columns. - - y: pd.Series - Series with target variable. - - columns : list, default=[] - Columns to be regrouped. - - Raises - ------ - ValueError - In case X and y are not of the same length. - - Returns - ------- - None - Only fits the instance of the class. - """ - self.all_category_map_ = {} - - if len(X.index) != len(y.index): - raise ValueError("The length of X is {}, but the length of y is {}" - .format(len(X.index), len(y.index))) - - if not columns: - columns = CategoryRegrouper._get_categorical_columns(X) - log.warning("All object-type columns have been selected") - - for column in columns: - if column not in X.columns: - log.warning("DataFrame has no column '{}', so it will be " - "skipped in fitting" .format(column)) - continue - - self.all_category_map_[column] = self._fit_column(X=X, - y=y, - column=column) - - def _fit_column(self, X: pd.DataFrame, - y: pd.Series, - column: str) -> Dict: - """ - Method regroups categories in given column. - - Parameters - ---------- - X : pd.Series - Series with one column to be transformed. - - y: pd.Series - Series with target variable - - column : str - Column to be regrouped. - - Raises - ------ - ValueError - in case input column is not a string. - - Returns - ------- - Dict - Returns dictionary as {old category : new category} for - specific column. 
- """ - category_map = {} - keep_categories = [] - incidence_mean = y.mean() - - # Rename target - y.rename("TARGET", inplace=True) - - # Replace missings - X = self._replaceMissings(X=X, column=column, - replace_with=self.missing_rename) - - all_uq_categories = X[column].unique().tolist() - - # Remove small categories - categories = self._removeCategories(X=X, y=y, column=column) - - # Inspect remaining categories and test significance - for category in categories: - df_aux = pd.concat([X[column], y], axis=1) - df_aux['other_cats'] = np.where(df_aux[column] == category, 0, 1) - cont_table = pd.crosstab(index=df_aux['other_cats'], - columns=df_aux['TARGET'], - margins=False) - - # if true, we scale the "other" categories - if self.scale_cont: - size_other_cats = cont_table.iloc[1].sum() - cont_table.iloc[1, 0] = (1-incidence_mean)*size_other_cats - cont_table.iloc[1, 1] = incidence_mean*size_other_cats - cont_table = cont_table.values.astype(np.int64) - - pval = stats.chi2_contingency(cont_table, correction=False)[1] - - # If significant, keep it - if pval <= self.pval_thresh: - keep_categories.append(category) - - # Keep "Missing" even if it wasn't selected if - # it is in the original categories and set to True - if ((self.missing_rename not in keep_categories) and - (self.missing_rename in all_uq_categories) and self.keep_missing): - keep_categories.append(self.missing_rename) - - # Keep forced categories - if self.forced_categories is not None: - # If doesnt exists, give warning - forced = [col for col in self.forced_categories[column] - if col in all_uq_categories] - - # Extend list and remove duplicates - keep_categories = list(set(keep_categories.extend(forced))) - - difference = set(forced) - set(self.forced_categories[column]) - if len(difference) > 0: - log.warning("Following forced categories: {} " - "are not in column: {}.".format(difference, - column)) - - # Return dictionary as {old column : new column} - for category in all_uq_categories: - if 
category in keep_categories: - category_map[category] = category - else: - category_map[category] = self.regroup_rename - - return category_map - - def transform(self, X: pd.DataFrame, - columns: list = []) -> pd.DataFrame: - """ - Method transforms specified columns. - - Parameters - ---------- - X : pd.DataFrame - Dataframe with all the columns. - - columns : list, default=[] - Columns to be regrouped. - - Raises - ------ - NotFittedError - If fit() method has not been called. - - ValueError - If columns to be transformed have not been fitted. - - Returns - ------- - pd.DataFrame - Returns transformed DataFrame with new columns as "col_regrouped". - """ - if len(self.all_category_map_) == 0: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - - raise NotFittedError(msg.format(self.__class__.__name__)) - - fitted_columns = list(self.all_category_map_.keys()) - - # if specified columns not in fitted Dict, raise error - if not set(columns).issubset(set(fitted_columns)): - diff_cols = set(columns).difference(set(fitted_columns)) - raise ValueError("Following columns are not fitted: " - "{}".format(diff_cols)) - - X_tr = X.copy() - for column in columns: - X_tr[column + "_regrouped"] = self._transform_column(X=X, - column=column) - - return X_tr - - def _transform_column(self, X: pd.DataFrame, - column: str) -> pd.Series: - """ - Method transforms specified columns. - - Parameters - ---------- - X : pd.DataFrame - Dataframe with all the columns. - - column : str - Column to be regrouped. - - Returns - ------- - pd.Series - Returns DataFrame with regrouped variable as category datatype. 
- """ - X_tr = X[column].copy() - X_tr[column + "_regrouped"] = X_tr.replace( - to_replace=self.all_category_map_[column]) - - X_tr[column + "_regrouped"] = X_tr[column + - "_regrouped"].astype('category') - - return X_tr[column + "_regrouped"] - - def fit_transform(self, X: pd.DataFrame, - y: pd.Series, - columns: list = []) -> pd.DataFrame: - """ - Auxiliary method fits and transforms specified columns. - - Parameters - ---------- - X : pd.DataFrame - Dataframe with all the columns. - - y : pd.Series - Series with target variable - - column : list, default=[] - Columns to be regrouped. - - Returns - ------- - pd.DataFrame - Returns DataFrame with regrouped variable as category datatype. - """ - self.fit(X=X, y=y, columns=columns) - - X_tr = self.transform(X=X, columns=columns) - - return X_tr - - def _replaceMissings(self, X: pd.DataFrame, - column: str, - replace_with: str = 'Missing') -> pd.DataFrame: - """ - Method replaces missing and empty cells with `Missing` (default) in - a pd.DataFrame. - - Parameters - ---------- - X : pd.DataFrame - Dataframe where a value will be replaced if empty or nan. - - column : str - Column to be analyzed for missings. - - replace_with : str default='Missing' - String to replace the missings. - - Raises - ------ - ValueError - In case input column is not a string. - - Returns - ------- - pd.DataFrame - Modified dataframe with replaced missings. - """ - if X[column].dtype != 'O' or X[column].dtype != 'object': - raise TypeError("column {} must be a string".format(column)) - - X[column].fillna(replace_with, inplace=True) - X[column] = X[column].astype(str).str.strip() - X[column].replace('', replace_with, inplace=True) - - return X - - def _removeCategories(self, X: pd.DataFrame, - y: pd.Series, - column: str, - threshold: int = 5) -> np.ndarray: - """ - Method removes category which fail to meet certain condition - - Parameters - ---------- - X : pd.DataFrame - Dataframe with columns to be inspected for group removal. 
- - y : pd.Series - Series with target. - - column : str - Column to be analyzed group removal. - - threshold : int default=5 - Threshold for group removal. - - Returns - ------- - np.ndarray - Numpy array with groups to be kept. - """ - category_cnts = pd.DataFrame(X.groupby(column)[column].count()) - train_inc = y.mean() - factor = max(train_inc, 1-train_inc) - keep_categories = category_cnts.where((category_cnts*factor) > - threshold) - - return np.array(keep_categories.index.tolist()) - - @staticmethod - def _get_categorical_columns(data: pd.DataFrame) -> list: - """Get the columns containing categorical data - (dtype "object" or "category") - - Parameters - ---------- - data : pd.DataFrame - Dataframe from which categorical variables - will be extracted. - - Returns - ------- - list - List of column names containing categorical data. - """ - object_columns = data.dtypes[data.dtypes == object].index - categorical_columns = data.dtypes[data.dtypes == "category"].index - - return list(set(object_columns).union(set(categorical_columns))) From 49b922781b4d26c444a5a8e086b4d6e5a8b70a0d Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 24 Dec 2019 10:13:01 +0100 Subject: [PATCH 25/98] Add additional log statement to target_encoder --- cobra/preprocessing/target_encoder.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 86ff882..328d431 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -197,6 +197,10 @@ def transform(self, data: pd.DataFrame, for column in column_names: if column not in data.columns: + log.warning("Unknown column '{}' will be skipped" + .format(column)) + continue + elif column not in self._mapping: log.warning("Column '{}' is not in fitted output " "and will be skipped".format(column)) continue From 287270711a285ee1fd2647b992f97ad661d044d5 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 24 Dec 2019 
10:15:13 +0100 Subject: [PATCH 26/98] Remove legacy code from repo as part of clean-up --- legacy_code/modeling[171005].ipynb | 1383 ----------------------- legacy_code/notes on backend.txt | 4 - legacy_code/scorecode.py | 131 --- legacy_code/scoring[171005].ipynb | 649 ----------- legacy_code/script_sprint2_.py | 991 ----------------- legacy_code/script_sprint3.py | 662 ----------- legacy_code/script_sprint4_1.py | 38 - legacy_code/script_sprint4_2.py | 458 -------- legacy_code/script_sprint4_3.py | 25 - legacy_code/univariate[171005].ipynb | 1516 -------------------------- 10 files changed, 5857 deletions(-) delete mode 100644 legacy_code/modeling[171005].ipynb delete mode 100644 legacy_code/notes on backend.txt delete mode 100644 legacy_code/scorecode.py delete mode 100644 legacy_code/scoring[171005].ipynb delete mode 100644 legacy_code/script_sprint2_.py delete mode 100644 legacy_code/script_sprint3.py delete mode 100644 legacy_code/script_sprint4_1.py delete mode 100644 legacy_code/script_sprint4_2.py delete mode 100644 legacy_code/script_sprint4_3.py delete mode 100644 legacy_code/univariate[171005].ipynb diff --git a/legacy_code/modeling[171005].ipynb b/legacy_code/modeling[171005].ipynb deleted file mode 100644 index ef957db..0000000 --- a/legacy_code/modeling[171005].ipynb +++ /dev/null @@ -1,1383 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Modeling" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### General Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import time\n", - "import math\n", - "import random\n", - "import csv\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import statsmodels.api as sm\n", - "import matplotlib.pyplot 
as plt" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from scipy import stats\n", - "from sklearn import metrics\n", - "from sklearn.linear_model import LogisticRegression\n", - "from itertools import chain " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Miscellaneous" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "log = []" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# When code is in script, we define the path of the script's parent folder location as the root directory\n", - "# From this root we can travel to the relevant folders with minimal adjustment\n", - "try:\n", - " root = os.path.dirname(os.path.realpath(__file__))\n", - " root = \"/\".join(root.split('\\\\')[:-1])\n", - " log.append('Dynamic paths'+'\\n')\n", - "except:\n", - " root = 'C:/wamp64/www/python_predictions_4/assets/scripts'\n", - " log.append('Static paths'+'\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# To allow pandas dataframes to display more columns\n", - "pd.set_option(\"display.max_columns\",50)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Read data and organize" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Read-in univariate output with asssumed ID, TARGET, PARTITION and D_VARS" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "df_univariate_path = root+\"/data/univariate/df_univariate.csv\"\n", - "df_in = 
pd.read_csv(df_univariate_path, sep=\";\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Reference X and Y for each partition individually" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "dvars = [n for n in df_in.columns if n[:2] == 'D_']" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "mask_train = df_in.PARTITION==\"train\"\n", - "mask_selection = df_in.PARTITION==\"selection\"\n", - "mask_validation = df_in.PARTITION==\"validation\"" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "y_train = df_in.loc[mask_train,'TARGET']\n", - "y_selection = df_in.loc[mask_selection,'TARGET']\n", - "y_validation = df_in.loc[mask_validation,'TARGET']" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "x_train = df_in.loc[mask_train,dvars]\n", - "x_selection = df_in.loc[mask_selection,dvars]\n", - "x_validation = df_in.loc[mask_validation,dvars]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Analysis settings" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "df_settings = pd.read_csv(root+'/python/analysis_settings.csv', sep=',', index_col=0, header=None).T" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Modeltab info" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "df_modeltab = pd.read_csv(root+'/data/univariate/modeltab_info.csv',sep=';', index_col=0, header=None).T\n", - "modelrun = df_modeltab.run[1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Variable selections" - ] - }, - { - "cell_type": 
"code", - "execution_count": 14, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "df_selections = pd.read_csv(root+'/data/univariate/variable_selections.csv',sep=';')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model making and recording" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define functions" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Function to make logistic model on a predefined set of predictors + compute train AUC of resulting model \n", - "def processSubset(predictors_subset):\n", - " from sklearn.linear_model import LogisticRegression\n", - " from sklearn import metrics\n", - " # Fit model on predictors_subset and retrieve performance metric\n", - " model = LogisticRegression(fit_intercept=True, C=1e9, solver = 'liblinear')\n", - " modelfit = model.fit(y=y_train, X=x_train[predictors_subset])\n", - " # Position of the TARGET==1 class\n", - " pos = [i for i,h in enumerate(modelfit.classes_) if h==1]\n", - " # Prediction probabilities for the TARGET==1\n", - " y_pred = modelfit.predict_proba(x_train[predictors_subset])[:,pos]\n", - " auc = metrics.roc_auc_score(y_true=y_train, y_score=y_pred)\n", - " return {\"modelfit\":modelfit,\"auc\":auc,\"predictor_names\":predictors_subset,\"predictor_lastadd\":predictors_subset[-1]}" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Function for computing AUC of all sets (train, selection & validation)\n", - "def getAuc(df_without_auc):\n", - " import pandas as pd\n", - " from sklearn import metrics\n", - " df_with_auc = df_without_auc[:]\n", - " for x,y,part in [(x_train,y_train,'train'),\n", - " (x_selection,y_selection,'selection'),\n", - " 
(x_validation,y_validation,'validation')]:\n", - " pos = [i for i,h in enumerate(df_without_auc.modelfit.classes_) if h==1]\n", - " y_pred = df_without_auc.modelfit.predict_proba(x[df_without_auc['predictor_names']])[:,pos]\n", - " df_with_auc[\"auc_\"+part] = metrics.roc_auc_score(y_true=y, y_score=y_pred)\n", - " df_with_auc[\"pred_\"+part] = y_pred\n", - " return(df_with_auc)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "# Forward selection function that uses processSubset and getAuc\n", - "def forward(current_predictors, pool_predictors, positive_only=True):\n", - " import pandas as pd\n", - " import numpy as np\n", - " tic = time.time()\n", - " \n", - " #Pull out predictors we still need to process\n", - " remaining_predictors = [p for p in pool_predictors if p not in current_predictors]\n", - " # If there are no more predictors left to use, raise an error we can easily identify as normal\n", - " if len(remaining_predictors)==0:\n", - " raise ValueError(\"No more predictors left to use\",\"NormalStop\")\n", - " \n", - " #Create a model for each combination of: current predictor(s) + one of the remaining predictors\n", - " #Keep track of the submodels and their performance\n", - " #If error skip to next and do not include in comparison table\n", - " results = []\n", - " errorcount = 0\n", - " for p in remaining_predictors:\n", - " try:\n", - " results.append(processSubset(current_predictors+[p]))\n", - " except:\n", - " errorcount += 1 \n", - " models = pd.DataFrame(results)\n", - " \n", - " # If we require all coefficients to be positive...\n", - " if positive_only:\n", - " #Create a flag for each submodel to test if all coefficients are positive \n", - " all_positive = pd.Series(None, index=models.index)\n", - " for i in range(0,len(models)):\n", - " all_positive[i] = (models.modelfit[i].coef_ >= 0 ).all()\n", - " \n", - " # if no model exist with only positive coefficients raise error we can 
easily identify as normal\n", - " if (all_positive==0).all():\n", - " raise ValueError(\"No models with only positive coefficients\",\"NormalStop\")\n", - " \n", - " #Choose model with best performance and only positive coefficients\n", - " best_model = models.loc[models[all_positive==1].auc.argmax()]\n", - " best_model = getAuc(best_model)\n", - " \n", - " # If we don't require all coefficients to be positive... \n", - " else:\n", - " #Choose model with best performance\n", - " best_model = models.loc[models.auc.argmax()]\n", - " best_model = getAuc(best_model)\n", - "\n", - " \n", - " tac = time.time()\n", - " info = (\"Processed \"\n", - " + str(models.shape[0])\n", - " + \" models on \"\n", - " + str(len(current_predictors)+1) \n", - " + \" predictors in \" \n", - " + str(round(tac-tic,2)) \n", - " +\" sec with \" \n", - " + str(errorcount) \n", - " +\" errors\")\n", - " \n", - " return best_model, info" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create recipient vars" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "best_models = pd.DataFrame(columns=[\"modelfit\",\n", - " \"predictor_names\",\n", - " \"predictor_lastadd\",\n", - " \"auc_train\",\n", - " \"auc_selection\",\n", - " \"auc_validation\",\n", - " \"pred_train\",\n", - " \"pred_selection\",\n", - " \"pred_validation\"])\n", - "predictors = []" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define number of steps depending on settings and total number of predictors" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "step_setting = int(df_settings.modeling_nsteps)\n", - "n_steps = min(step_setting,len(x_train.columns))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define which variables to pass, force and filter" - ] - }, - { - "cell_type": "code", - 
"execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "mask_pass = (df_selections.preselect == 1) & (df_selections[modelrun]==0)\n", - "varname_list_pass = 'D_'+df_selections.loc[mask_pass,'variable']\n", - "length_pass = len(varname_list_pass)\n", - "mask_force = (df_selections.preselect == 1) & (df_selections[modelrun]==1)\n", - "varname_list_force = 'D_'+df_selections.loc[mask_force,'variable']\n", - "length_force = len(varname_list_force)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Execute forward modeling process" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "tic = time.time()\n", - "use_predictors = varname_list_force #x_train.columns\n", - "for i in range(1,n_steps+1):\n", - " try:\n", - " # Use predictors to be forced first. Once through the list, append the remaining variables to be passed.\n", - " use_predictors = varname_list_force.append(varname_list_pass[[i>length_force]*length_pass]).reset_index(drop=True)\n", - " result = forward(current_predictors=predictors\n", - " ,pool_predictors= use_predictors\n", - " ,positive_only=True)\n", - " best_models.loc[i] = result[0]\n", - " predictors = best_models.loc[i].predictor_names\n", - " log.append(result[1])\n", - " except Exception as e:\n", - " # Normal errors (i.e. 
no more predictors to be used / no models with only positive coefficients)\n", - " if e.args[-1]=='NormalStop':\n", - " log.append(\"Stopped modeling at \"+str(i)+\" predictors: \"+ e.args[-2])\n", - " # Other unknown errors\n", - " else:\n", - " log.append(\"Stopped modeling at \"+str(i)+\" predictors: unknown error\")\n", - " break\n", - "toc = time.time()\n", - "log.append(\"Forward selection modeling: \" + str(round((toc-tic)/60,0)) + \" min\"+\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# WIP" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "i = range(1,n_steps+1)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "use_predictors = varname_list_force.append(varname_list_pass[[i>length_force]*length_pass]).reset_index(drop=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "use_predictors" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "current_predictors=predictors\n", - "pool_predictors= use_predictors\n", - "positive_only=True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "remaining_predictors = [p for p in pool_predictors if p not in current_predictors]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "results = []\n", - "errorcount = 0" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "for p in remaining_predictors:\n", - " try:\n", - " results.append(processSubset(current_predictors+[p]))\n", - " except:\n", - " errorcount += 1 \n", - "models 
= pd.DataFrame(results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "all_positive = pd.Series(None, index=models.index)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "for i in range(0,len(models)):\n", - " all_positive[i] = (models.modelfit[i].coef_ >= 0 ).all()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "if (all_positive==0).all():\n", - " raise ValueError(\"No models with only positive coefficients\",\"NormalStop\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "best_model = models.loc[models[all_positive==1].auc.argmax()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "best_model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "df_without_auc = best_model\n", - "df_with_auc = df_without_auc[:]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pos = [i for i,h in enumerate(df_without_auc.modelfit.classes_) if h==1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "x = x_validation\n", - "y = y_validation\n", - "part = 'validation'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "y.mean() # THIS IS WHY THERE IS AN ERROR. ONLY ZEROS IN THE VALIDATION SET.............. ????!!!!!!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = df_without_auc.modelfit.predict_proba(x[df_without_auc['predictor_names']])[:,pos]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "metrics.roc_auc_score(y_true=y, y_score=y_pred)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# WIP/" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Optimal model criterion" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define functions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def comparefit(p,g=2):\n", - " # We fit a second degree (g=2) polyline through our auccurve \n", - " # This serves as a starting base for finding our optimal stopping point\n", - " import numpy as np\n", - " import pandas as pd\n", - " z = np.polyfit(p.index, p, g)\n", - " f = np.poly1d(z)\n", - " y_new = f(p.index)\n", - " return pd.Series(y_new,index=p.index)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def slopepoint(p,p_fit,thresh_ratio=0.2):\n", - " # We take the polyline from comparefit and look for the point of which the slope lies just below some percentage of the max. 
slope\n", - " slopes = [p_fit[i+1]-p_fit[i] for i in range(1,len(p_fit))]\n", - " slopes = pd.Series(slopes, index=range(1,len(p_fit)))\n", - " thresh = slopes.max()*thresh_ratio\n", - " p_best_index = (slopes[slopes>thresh])[-1:].index\n", - " p_best = p.loc[p_best_index]\n", - " return p_best" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def moveright(p,p_fit,p_best,n_steps=5,dampening=0.01):\n", - " # We look nsteps right on the polyline (starting from the slopepoint) and take the point with largest difference with real line\n", - " # We move to that point if that difference is larger than some multiplication of the difference at the slopepoint\n", - " # That multiplication gets larger as current the current difference gets smaller with a certain amount of dampening. \n", - " # The rationale behind this is as follows: \n", - " # if the current difference is already large than the larger difference will definitely be noteworthy\n", - " # if however the current difference is near zero than there needs to be much larger difference to be noteworthy\n", - " in_index = p_best.index.values[0]\n", - " lower = (in_index-1)\n", - " upper = (in_index+n_steps-1)\n", - " p_diff = p[lower:upper]-p_fit[lower:upper]\n", - " out_index = p_diff.argmax()\n", - " factor = 1/abs(p_diff[in_index])\n", - " if (p_diff[out_index]>p_diff[in_index]+(abs(p_diff[in_index])*factor*dampening)):\n", - " p_best_new = pd.Series(p[out_index],index=[out_index])\n", - " else:\n", - " p_best_new = p_best\n", - " return p_best_new" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def moveleft(p,p_fit,p_best,rangeloss=0.1, diffshare=0.8): #diff_min=0.005):\n", - " # Starting from whatever point we end up with (either the slopepoint or a move to the right)\n", - " # We look left on the polyline and take the point for which 
the real line is largest (current point included)\n", - " # We move left if we stay within [a specific % loss of range] AND [a minimum % of current difference]\n", - " # i.e. we don't won't to go to low compared to the overall real line\n", - " # and we don't won't to move to a point that does not make a significant increase in AUC (i.e. difference between polyline and real line)\n", - " p_left = p[:p_best.index.values[0]]\n", - " p_best = p_left[p_left==p_left.max()]\n", - " p_diff = p-p_fit\n", - " p_range = p.max()-p.min()\n", - " s = p[(p >= p_best.values[0]-(rangeloss*p_range)) \n", - " & (p.index <= p_best.index.values[0]) \n", - " & (p_diff>=diffshare*p_diff[p_left.index[-1]])\n", - " ]\n", - " p_best_new = s[s.index == s.index.values.min()]\n", - " return p_best_new" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Execute functions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "points = best_models.auc_selection\n", - "points_fit = comparefit(p=points, g=2)\n", - "points_slope = slopepoint(p=points, p_fit=points_fit, thresh_ratio=0.2)\n", - "points_right = moveright(p=points, p_fit=points_fit, p_best=points_slope, n_steps=5, dampening=0.01)\n", - "points_left = moveleft(p=points, p_fit=points_fit, p_best=points_right, rangeloss=0.1, diffshare=0.8)\n", - "\n", - "optimal_nvars = points_left.index.values[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Inspect" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "%matplotlib inline\n", - "from pylab import rcParams\n", - "rcParams['figure.figsize'] = 10, 5\n", - "\n", - "plt.plot( points.index , points , color=\"blue\")\n", - "plt.plot( points_fit.index , points_fit , color=\"red\")\n", - "plt.plot( points_slope.index, points_slope,'o', color=\"lightgreen\", markersize=12)\n", - "plt.plot( points_right.index, points_right,'o', color=\"black\" , markersize=8)\n", - 
"plt.plot( points_left.index , points_left ,'o', color=\"gold\" , markersize=4)\n", - "\n", - "axes = plt.gca()\n", - "axes.set_ylim([0.45,1])\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cumulative gains/response" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define functions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Compute cumulative response/gains\n", - "def cumulatives(y,yhat,perc_as_int=False,dec=2):\n", - " nrows = len(y)\n", - " npositives = y.sum()\n", - " y_yhat = pd.DataFrame({\"y\":y, \"yhat\":yhat}).sort_values(by='yhat', ascending=False).reset_index(drop=True)\n", - " cresp = []\n", - " cgains = [0]\n", - " for stop in (np.linspace(0.01,1,100)*nrows).astype(int):\n", - " cresp.append(round(y_yhat.loc[:stop,'y'].mean()*max(100*int(perc_as_int),1),dec))\n", - " cgains.append(round(y_yhat.loc[:stop,'y'].sum()/npositives*max(100*int(perc_as_int),1),dec))\n", - " return cresp,cgains" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Execute functions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cresp_all = [None]\n", - "cgains_all = [None]\n", - "for i in range(1,len(best_models)+1):\n", - " out = cumulatives(y=y_selection\n", - " ,yhat=best_models.pred_selection[i][:,0]\n", - " ,perc_as_int=True\n", - " ,dec=2)\n", - " cresp_all.append(out[0]) \n", - " cgains_all.append(out[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Inspect" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "%matplotlib inline\n", - "from pylab import rcParams\n", - "rcParams['figure.figsize'] = 10, 5\n", - "\n", - "cmap = plt.get_cmap('hot')\n", - "colors = [cmap(i) for i in np.linspace(0, 1, 
n_steps)]\n", - "for i in range(1,len(best_models)):\n", - " plt.plot(range(1,101), cresp_all[i], color=colors[i-1])\n", - "plt.plot(range(1,101), cresp_all[-1], color=\"black\")\n", - " \n", - "axes = plt.gca()\n", - "axes.set_ylim([0,max(max(l) for l in np.array(cresp_all)[1:])])\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "%matplotlib inline\n", - "from pylab import rcParams\n", - "rcParams['figure.figsize'] = 10, 5\n", - "\n", - "cmap = plt.get_cmap('hot')\n", - "colors = [cmap(i) for i in np.linspace(0, 1, n_steps)]\n", - "for i in range(1,len(best_models)):\n", - " plt.plot(range(0,101), cgains_all[i], color=colors[i-1])\n", - "plt.plot(range(0,101), cgains_all[-1], color=\"black\")\n", - " \n", - "axes = plt.gca()\n", - "axes.set_ylim([0,max(max(l) for l in np.array(cgains_all)[1:])])\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Variable Importance" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Compute variable importance based on correlation between predictor and prediction (on selection set)\n", - "def getImportance(model):\n", - " from scipy import stats\n", - " \n", - " predictors = [pred[2:] for pred in model.predictor_names]\n", - " pearcorr = []\n", - " for predictor in predictors:\n", - " pearsonr = stats.pearsonr(x_selection.loc[:,'D_'+predictor].values, model.pred_selection[:,0])\n", - " pearcorr.append(pearsonr[0].round(2))\n", - " df_result = pd.DataFrame({'variable':predictors,'importance':pearcorr}, columns=['variable','importance'])\n", - " return df_result" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Execute function" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "importance_all=[None]\n", - "for i in best_models.index:\n", - " importance_all.append(getImportance(best_models.loc[i,:]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Inspect" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "scrolled": true - }, - "source": [ - "%matplotlib inline\n", - "from pylab import rcParams\n", - "rcParams['figure.figsize'] = 10, 5\n", - "\n", - "#nvars = optimal_nvars\n", - "nvars = len(best_models)\n", - "\n", - "fig, ax = plt.subplots()\n", - "predictors = importance_all[nvars].variable\n", - "y_pos = np.arange(len(predictors))\n", - "importance = importance_all[nvars].importance\n", - "\n", - "ax.barh(y_pos, importance, align='center',\n", - " color='darkblue', ecolor='black')\n", - "ax.set_yticks(y_pos)\n", - "ax.set_yticklabels(predictors)\n", - "ax.invert_yaxis()\n", - "ax.set_xlabel('Importance')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Coefficients" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Store all variable names + coefficients for every best model (with 1,2,3,... 
variables) \n", - "vars_out = []\n", - "coef_out = []\n", - "nmod_out = []\n", - "for i in best_models.index:\n", - " modout = best_models.loc[i,:]\n", - " vars_out_st = ['Intercept']+[var[2:] for var in modout.predictor_names]\n", - " vars_out.append(vars_out_st)\n", - " coef_out_st = list(modout.modelfit.intercept_)+list(+ modout.modelfit.coef_[0])\n", - " coef_out.append(coef_out_st)\n", - " nmod_out.append([i]*(i+1))\n", - " \n", - "vars_out = list(chain.from_iterable(vars_out))\n", - "coef_out = list(chain.from_iterable(coef_out))\n", - "nmod_out = list(chain.from_iterable(nmod_out))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_coeff = pd.DataFrame({'nstep':nmod_out,'varname':vars_out,'coeff':coef_out}, columns=['nstep','varname','coeff'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Export Files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "nmods = len(best_models)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Auc curve" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "filename = root+\"/data/modeling/\"+modelrun+\"_auccurve.csv\"\n", - "with open(filename, 'w') as csvfile:\n", - " write=csv.writer(csvfile, delimiter =';')\n", - " write.writerow([\"optimal\" ,optimal_nvars])\n", - " write.writerow([\"selected\",optimal_nvars])\n", - " write.writerow([\"variable\",\"train\", \"selection\",\"validation\"])\n", - " write.writerows([best_models.predictor_lastadd[i][2:]\n", - " , best_models.auc_train[i].round(3) \n", - " , best_models.auc_selection[i].round(3)\n", - " , best_models.auc_validation[i].round(3) ] for i in range(1,nmods+1))" - ] - }, - { - "cell_type": "markdown", - "metadata": 
{}, - "source": [ - "Cresp" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "for v in range(1,nmods+1):\n", - " filename = root+\"/data/modeling/\"+modelrun+\"_cresp_\"+str(v)+\".csv\"\n", - " with open(filename, 'w') as csvfile:\n", - " write=csv.writer(csvfile, delimiter =';') \n", - " write.writerows([i+1, cresp_all[v][i]] for i in range(0,100))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Cgains" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "for v in range(1,nmods+1):\n", - " filename = root+\"/data/modeling/\"+modelrun+\"_cgains_\"+str(v)+\".csv\"\n", - " with open(filename, 'w') as csvfile:\n", - " write=csv.writer(csvfile, delimiter =';') \n", - " write.writerows([i, cgains_all[v][i]] for i in range(0,101))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Variable importance" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for v in range(1,nmods+1):\n", - " filename = root+\"/data/modeling/\"+modelrun+\"_importance_\"+str(v)+\".csv\"\n", - " with open(filename, 'w') as csvfile:\n", - " write=csv.writer(csvfile, delimiter =';') \n", - " write.writerow(['variable','importance'])\n", - " write.writerows([importance_all[v].iloc[i,0],importance_all[v].iloc[i,1]] for i in range(v))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Model coefficients" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "out_path = root+\"/data/modeling/\"+modelrun+\"_modelcoeff.csv\"\n", - "df_coeff.to_csv(path_or_buf=out_path, sep=';', index=False, encoding='utf-8', line_terminator='\\n', quoting=csv.QUOTE_NONNUMERIC)" - ] - }, - { - "cell_type": "markdown", - "metadata": 
{}, - "source": [ - "Log messages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "log.append(\"-- Modeling phase completed --\"+\"\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "log_file = open(root+\"/python/\"+modelrun+\"_modeling.log\",'w')\n", - "log_file.write('\\n'.join(log))\n", - "log_file.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Stop script" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "print(\"ok\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# WIP" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Scoring all rows" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Scoring of all rows\n", - "import re\n", - "tic = time.time()\n", - "df_score = pd.DataFrame([])\n", - "df_score['ID'] = df_in['ID']\n", - "scores = []\n", - "for i in range(len(df_in)):\n", - " ### METHOD 1: using function\n", - " score = [optifit.predict_proba(df_in[optivars])[i,:][-1]]\n", - " ### METHOD 2: with coefficients (same method as in scoring)\n", - " #exponent = optiint + ((df_in[optivars].iloc[i,:])*(opticoef[0])).sum()\n", - " #score = [(math.exp(exponent)) / (1+math.exp(exponent))]\n", - " \n", - " scores.extend(score)\n", - " try:\n", - " zeros = re.findall('[0]+$',str(i))\n", - " if len(zeros[0])>=3:\n", - " print(i)\n", - " except:\n", - " a=1\n", - "df_score['score']=pd.Series(scores)\n", - "tac = time.time()\n", - "print((tac-tic)/60)\n", - "\n", - "\n", - "df_in.to_csv('df_mod.csv', sep=';', index=False, encoding='utf-8', 
line_terminator='\\n')\n", - "df_score.to_csv(path_or_buf='scores_modeling.csv', sep=';', index=False, encoding='utf-8', line_terminator='\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# /WIP" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/legacy_code/notes on backend.txt b/legacy_code/notes on backend.txt deleted file mode 100644 index 7a05ea6..0000000 --- a/legacy_code/notes on backend.txt +++ /dev/null @@ -1,4 +0,0 @@ -The notebooks are written so they can be transformed to a script (that goes in the WAMP local folder) without any reworking necessary -In jupyter notebook: File > Download as > .py file -or -Via cmd line: jupyter nbconvert --to Python univariate.ipynb \ No newline at end of file diff --git a/legacy_code/scorecode.py b/legacy_code/scorecode.py deleted file mode 100644 index 4a4dd31..0000000 --- a/legacy_code/scorecode.py +++ /dev/null @@ -1,131 +0,0 @@ -### Importing libraries & basetable to score -# Importing Libraries -import time -import math -import csv -import re -import pandas as pd -import numpy as np -# Importing Types -typevariables=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'TARGET', 'ID', 'scont_1', 'scont_2', 'scont_3', 'scont_4', 'scont_5', 'scont_6', 'scont_7', 'scont_8', 'scont_9', 'scont_10', 'scat_1', 'scat_2', 'scat_3', 'scat_4', 'scat_5', 'sflag_1', 'sflag_2', 'sflag_3', 'sflag_4', 'sflag_5'] -typetypes=['int', 'str', 'int', 'str', 'int', 'str', 'str', 'str', 
'str', 'str', 'int', 'int', 'int', 'str', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'str', 'str', 'str', 'str', 'str', 'int', 'int', 'int', 'int', 'int'] -df_types=pd.DataFrame({'var':typevariables,'type':typetypes},columns=['var','type']) -df_types_copy = df_types.copy() -bool_mask = df_types_copy.loc[:,'type']!='bool' -df_types_copy.loc[bool_mask,'type'] = [getattr(__builtins__, type_str) for type_str in df_types_copy.loc[bool_mask,'type']] -df_types_copy.loc[bool_mask==False,'type'] = getattr(__builtins__, 'str') -types = df_types_copy.set_index('var').T.to_dict('records') -# Importing Basetable with similar typing as in univariate analysis -df_base = pd.read_csv('df_base.csv',header=0,sep=None,engine='python',converters=types[0]) - -### Creating dataframe containing model rules -modvariables=['Intercept', 'relationship', 'education', 'capital-gain', 'occupation', 'hours-per-week', 'marital-status', 'workclass'] -modcoefficients=[-7.089251945907992, 2.857566239388173, 3.9559266007760656, 3.805293440526549, 3.0368562075260153, 2.8679282172225298, 3.8409050507426072, 1.270430053023763] -df_modrules=pd.DataFrame({'varname':modvariables,'coeff':modcoefficients}) - -### Creating dataframe containing incidence translation rules -prepvariables=['relationship', 'relationship', 'relationship', 'relationship', 'relationship', 'relationship', 'education', 'education', 'education', 'education', 'education', 'education', 'education', 'education', 'education', 'education', 'education', 'education', 'education', 'education', 'education', 'capital-gain', 'capital-gain', 'occupation', 'occupation', 'occupation', 'occupation', 'occupation', 'occupation', 'occupation', 'occupation', 'occupation', 'occupation', 'occupation', 'occupation', 'hours-per-week', 'hours-per-week', 'hours-per-week', 'hours-per-week', 'marital-status', 'marital-status', 'marital-status', 'marital-status', 'marital-status', 'marital-status', 'marital-status', 
'workclass', 'workclass', 'workclass', 'workclass', 'workclass', 'workclass', 'workclass'] -prepbins=['Husband', 'Wife', 'Not-in-family', 'Own-child', 'Unmarried', 'Other-relative', 'Some-college', '7th-8th', 'HS-grad', 'Bachelors', '5th-6th', 'Doctorate', 'Non-significants', 'Masters', 'Prof-school', '10th', '11th', '12th', '9th', '1st-4th', 'Preschool', '[..., 2105]', '(2105,...]', 'Exec-managerial', 'Non-significants', 'Sales', '?', 'Prof-specialty', 'Farming-fishing', 'Handlers-cleaners', 'Other-service', 'Protective-serv', 'Machine-op-inspct', 'Adm-clerical', 'Priv-house-serv', '(48,...]', '(40, 48]', '(35, 40]', '[..., 35]', 'Married-civ-spouse', 'Divorced', 'Never-married', 'Married-spouse-absent', 'Separated', 'Non-significants', 'Widowed', 'Private', 'Self-emp-not-inc', '?', 'Local-gov', 'Self-emp-inc', 'Non-significants', 'Federal-gov'] -prepincids=[0.4474003641513251, 0.4803571428571429, 0.10295278698878887, 0.015203145478374838, 0.056633663366336635, 0.04037685060565276, 0.18898367952522252, 0.0641025641025641, 0.16431451612903225, 0.42006504878659, 0.05303030303030303, 0.7353951890034365, 0.2429441062534588, 0.5435276305828918, 0.7382198952879581, 0.05128205128205128, 0.05584281282316442, 0.08433734939759037, 0.04702970297029703, 0.060869565217391314, 0.022727272727272728, 0.20500310476359446, 0.6513859275053305, 0.4710552431359577, 0.23353535353535354, 0.2739825581395349, 0.10049893086243764, 0.4519454605919521, 0.11946308724832215, 0.06824644549763033, 0.043478260869565216, 0.3169734151329243, 0.11971372804163954, 0.14123893805309734, 0.015503875968992246, 0.4302445038011095, 0.3498478922207736, 0.207983367983368, 0.08491107286288009, 0.4459652889604581, 0.10560344827586207, 0.044271796769022084, 0.09846153846153846, 0.07431551499348109, 0.4736842105263158, 0.07603092783505154, 0.2187813366365835, 0.2839756592292089, 0.1010028653295129, 0.2814526588845655, 0.554320987654321, 0.26436781609195403, 0.3900709219858156] -df_prep = 
pd.DataFrame({'var':prepvariables,'bin':prepbins,'incid':prepincids}, dtype=object) -df_prep.loc[:,'incid']=df_prep.loc[:,'incid'].astype('float64') - -### Grouping basetable predictors along their types and trimming basetable accordingly -predictors = list(df_modrules.loc[df_modrules.varname!='Intercept','varname'].values) -not_predictors = [column for column in df_base.columns if column not in predictors] -mask_FloatOrInt = (df_types.type=='int')|(df_types.type=='float') -numeric_headers=[var for var in df_types.loc[mask_FloatOrInt,'var'].values if var in predictors] -object_headers=[var for var in df_types.loc[df_types.type=='str','var'].values if var in predictors] -bool_headers=[var for var in df_types.loc[df_types.type=='bool','var'].values if var in predictors] -df_base = df_base[predictors+['ID']] - -### Preprocessing the basetable -# Strip quot function -def strip_quot(x_in): - try: - x_out = x_in.strip().strip('"').strip("'") - except: - x_out=x_in - return x_out -# Lower/upper function -def lower_upper(x_in): - if ((x_in.lower() == 'id')|(x_in.lower() == 'target')): - x_out = x_in.upper() - else: - x_out = x_in.lower() - return x_out -# maskmissing function in str/bool columns -def maskmissing(var): - crit1 = var.isnull() - modvar = pd.Series([str(value).strip() for value in var]) - crit2 = modvar==pd.Series(['']*len(var)) - return crit1 | crit2 -# Apply preprocessing functions -df_base = df_base.rename(columns=strip_quot) -df_base = df_base.rename(columns=lower_upper) -df_base = df_base.applymap(strip_quot) -for header in object_headers+bool_headers: - mask = maskmissing(df_base[header]) - df_base.loc[mask,header]='Missing' - -### Incidence replacement -# Recipient dataframe -df_out = pd.DataFrame() -df_out['ID']=df_base['ID'] -# Incidence replacement for string columns -for header in object_headers+bool_headers: - mask = df_prep.loc[:,'var']==header - bins = df_prep.loc[mask,'bin'] - incidences = df_prep.loc[mask,'incid'] - nonsig_bins = [] - 
nonsig_incidences = [] - if (bins == 'Non-significants').any(): - nonsig_bins = [binn for binn in df_base[header].unique() if binn not in list(bins)] - nonsig_incidences = list(incidences[bins=='Non-significants'])*len(nonsig_bins) - keys = list(bins) - keys.extend(nonsig_bins) - values = list(incidences) - values.extend(nonsig_incidences) - keys_and_values = zip(keys,values) - transdic = dict(keys_and_values) - items_to_translate = df_base[header] - df_out.loc[:,'D_'+header]= pd.Series([transdic[item] for item in items_to_translate]) -# Incidence replacement for numeric columns -for header in numeric_headers: - mask = df_prep.loc[:,'var']==header - bins = df_prep.loc[mask,'bin'] - incidences = df_prep.loc[mask,'incid'] - index_missing = bins.index[bins=='Missing'] - incidence_missing = incidences[index_missing] - upper_values = pd.Series([]) - for i,binn in enumerate(bins.values): - upper_value = binn.split(',')[-1] - try: - upper_value = re.findall('[0-9]+',upper_value)[0] - except: - upper_value = math.inf - upper_values[i] = upper_value - upper_values.index = bins.index - upper_values.drop(index_missing, inplace=True) - upper_values = upper_values.astype(float) - upper_values.sort_values(inplace=True) - upper_values_incidences = incidences[upper_values.index] - upper_values.reset_index(drop=True, inplace=True) - upper_values_incidences.reset_index(drop=True, inplace=True) - mask_npnan = df_base.loc[:,header].isnull() - lowest_memberships = upper_values.searchsorted(df_base.loc[:,header],side='left') - incidences_to_attribute = upper_values_incidences[lowest_memberships].reset_index(drop=True) - incidences_to_attribute[mask_npnan] = incidence_missing - df_out['D_'+header] = incidences_to_attribute - -### Scoring -df_scores = pd.DataFrame([]) -df_scores['ID'] = df_out['ID'] -scores = [] -intercept=-7.08925194591 -coefficients=np.array([2.857566239388173, 3.9559266007760656, 3.805293440526549, 3.0368562075260153, 2.8679282172225298, 3.8409050507426072, 
1.270430053023763]) -productsums = (df_out['D_'+pd.Series(predictors)]*coefficients).sum(axis=1) -exponents = intercept + productsums -scores = exponents.apply(func=lambda x:(math.exp(x)) / (1+math.exp(x))) -df_scores['score']=scores - diff --git a/legacy_code/scoring[171005].ipynb b/legacy_code/scoring[171005].ipynb deleted file mode 100644 index 76723bf..0000000 --- a/legacy_code/scoring[171005].ipynb +++ /dev/null @@ -1,649 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Scoring" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Purpose of this script is to output (an)other script(s) with self-contained code to score out a basetable\n", - "\n", - "Call this script: the scriptmaker\n", - "\n", - "Call the created self-contained script: the scoring script" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Import libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import time\n", - "import math\n", - "import csv\n", - "import re\n", - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Miscellaneous" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "log = []" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# When code is in script, we define the path of the script's parent folder location as the root directory\n", - "# From this root we can travel to the relevant folders with minimal adjustment\n", - "try:\n", - " root = os.path.dirname(os.path.realpath(__file__))\n", - " root = \"/\".join(root.split('\\\\')[:-1])\n", - " log.append('Dynamic paths'+'\\n')\n", - "except:\n", - " 
root = 'C:/wamp64/www/python_predictions_4/assets/scripts'\n", - " log.append('Static paths'+'\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Read the data and create variables to be exported" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### 1A - Retrieve Modeltab info to find out for which modeltab we want a scoring script for\n", - "Intermediate step for 1C" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "df_modeltab = pd.read_csv(root+'/data/univariate/modeltab_info.csv',sep=';', index_col=0, header=None).T\n", - "modeltabtoscore = df_modeltab.score[1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### 1B - Retrieve number of vars selected to identify which n-th model of the 'modeltab'-models we are interested in\n", - "Intermediate step for 1C" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "df_auccurve_path = root+\"/data/modeling/\"+modeltabtoscore+\"_auccurve.csv\"\n", - "df_auccurve = open(df_auccurve_path).read()\n", - "selected_nvars = int(re.findall(r\"selected;[0-9]+\",df_auccurve)[0].split(';')[-1])\n", - "selected_nvars = len(pd.read_csv(df_auccurve_path,skiprows=3, sep=';')) # USED FOR TESTING ALL VARS, TO BE DELETED !!!!!!!!!!!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### 1C - Retrieve model coefficients of the specific n-th model of the 'modeltab'-models\n", - "\n", - "Result is to be stored in text in the scoring script, to achieve self-containment" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "df_modelcoeff_path = root+\"/data/modeling/\"+modeltabtoscore+\"_modelcoeff.csv\"\n", - "df_modelcoeff = pd.read_csv(df_modelcoeff_path, sep=';')\n", - "mask = df_modelcoeff.nstep == int(selected_nvars)\n", - "df_modrules = df_modelcoeff.loc[mask,:] # TO BE STORED in SCORING SCRIPT -------------------------------------------------------" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### 2 - Retrieve Data types of the predictors\n", - "\n", - "Result is to be stored in text in the scoring script, to achieve self-containment" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "types_path = root+\"/python/data_types.csv\"\n", - "df_types = pd.read_csv(types_path, header=None)\n", - "df_types.columns=['var','type'] # TO BE STORED in SCORING SCRIPT ---------------------------------------------------------------" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### 3 - Univariate table output for deriving the translation from original VAR to discretisized D_VAR\n", - "Result is to be stored in text in the scoring script, to achieve self-containment" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "df_univariate_path = root+\"/data/univariate/df_univariate.csv\"\n", - "# we create str converters for all the B_variables coming from the univariate table\n", - "# we need this because bool & str variables (defined by ...types.csv) from the basetable are converted to objects\n", - "# and will have to be 
compared to the B_variables values, which better have the same type\n", - "# e.g. we have a varflag in our basetable which is converted to an object, but assume B_varflag is not converted and will be automatically read as float 1.0/0.0\n", - "# in our incidence replacement we will thus be comparing '1'/'0' with 1.0/0.0, which won't work \n", - "uni_iterable = [(variable,getattr(__builtins__, 'str')) for variable in 'B_'+df_modrules.varname[1:].values]\n", - "uni_dict = dict(uni_iterable)\n", - "df_uni = pd.read_csv(df_univariate_path, sep=\";\", converters=uni_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "dcolumns = ['D_'+name for name in df_modrules.varname[1:]]\n", - "bcolumns = ['B_'+name for name in df_modrules.varname[1:]]\n", - "gvar = []\n", - "gincid = []\n", - "gbin = []\n", - "\n", - "for i in range(len(dcolumns)):\n", - " # Select B_varname and D_varname\n", - " # Then take unique combinations of B_var and D_var in the univariate dataframe\n", - " # These combinations give the incidence value to attribute to the (possibly discretisized/regrouped) variables\n", - " columns_set = dcolumns[i:i+1]+bcolumns[i:i+1]\n", - " df_dupli = df_uni.loc[:,columns_set].drop_duplicates()\n", - " n_occurences = len(df_dupli) \n", - " \n", - " gvar.extend([df_dupli.columns[0][2:]]*n_occurences)\n", - " gincid.extend(df_dupli.iloc[:,0].values)\n", - " gbin.extend(df_dupli.iloc[:,1].values)\n", - "\n", - " \n", - "df_prep = pd.DataFrame({'var':gvar,'bin':gbin,'incid':gincid} \n", - " ,columns=['var','bin','incid']) # TO BE STORED in SCORING SCRIPT --------------------------------" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Writing scoring scripts" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### for R" - ] - }, - { - "cell_type": 
"code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "score_code = open(root+\"/Python/scorecode.R\",'w')\n", - "\n", - "score_code.write(\"### Importing libraries & basetable to score\\n\")\n", - "score_code.write(\"# Importing libraries\\n\")\n", - "score_code.write(\"#library(dplyr)\\n\")\n", - "score_code.write(\"# Importing Types\\n\")\n", - "score_code.write(\"typevariables=c\"+str([var for var in df_types.loc[:,'var']]).replace(\"[\",\"(\").replace(\"]\",\")\")+\"\\n\")\n", - "score_code.write(\"typetypes=c\"+str([vartype for vartype in df_types.loc[:,'type']]).replace(\"[\",\"(\").replace(\"]\",\")\")+\"\\n\")\n", - "score_code.write(\"df_types=data.frame(var=typevariables,type=typetypes, stringsAsFactors='False')\\n\")\n", - "score_code.write(\"df_types_copy = df_types\\n\")\n", - "score_code.write(\"df_types_copy$type[df_types_copy$type=='int'|df_types_copy$type=='float']='numeric'\\n\")\n", - "score_code.write(\"df_types_copy$type[df_types_copy$type=='str'|df_types_copy$type=='bool']='character'\\n\")\n", - "score_code.write(\"coltypes = df_types_copy$type\\n\")\n", - "score_code.write(\"names(coltypes) = df_types_copy$var\\n\")\n", - "score_code.write(\"# Importing Basetable (with similar typing as in univariate analysis)\\n\")\n", - "score_code.write(\"df_base = read.csv('df_base.csv', check.names='False', colClasses=coltypes )\\n\")\n", - "\n", - "score_code.write(\"### Creating dataframe containing model rules\\n\")\n", - "score_code.write(\"modvariables=c\"+str([var for var in df_modrules.loc[:,'varname']]).replace(\"[\",\"(\").replace(\"]\",\")\")+\"\\n\")\n", - "score_code.write(\"modcoefficients=c\"+str([coeff for coeff in df_modrules.loc[:,'coeff']]).replace(\"[\",\"(\").replace(\"]\",\")\")+\"\\n\")\n", - "score_code.write(\"df_modrules=data.frame(varname=modvariables,coeff=modcoefficients, stringsAsFactors='False')\\n\")\n", - "score_code.write(\"\\n\")\n", - "\n", - "score_code.write(\"### Creating dataframe 
containing incidence translation rules\\n\")\n", - "score_code.write(\"prepvariables=c\"+str([var for var in df_prep.loc[:,'var']]).replace(\"[\",\"(\").replace(\"]\",\")\")+\"\\n\")\n", - "score_code.write(\"prepbins=c\"+str([bin for bin in df_prep.loc[:,'bin']]).replace(\"[\",\"(\").replace(\"]\",\")\")+\"\\n\")\n", - "score_code.write(\"prepincids=c\"+str([bin for bin in df_prep.loc[:,'incid']]).replace(\"[\",\"(\").replace(\"]\",\")\")+\"\\n\")\n", - "score_code.write(\"df_prep =data.frame(var=prepvariables,bin=prepbins,incid=prepincids, stringsAsFactors='False')\\n\")\n", - "score_code.write(\"\\n\")\n", - "\n", - "score_code.write(\"### Grouping basetable predictors along their types and trimming basetable accordingly\\n\")\n", - "score_code.write(\"predictors = df_modrules$varname[df_modrules$varname!='Intercept']\\n\")\n", - "score_code.write(\"not_predictors = subset(colnames(df_base),!(colnames(df_base) %in% predictors))\\n\")\n", - "score_code.write(\"mask_FloatOrInt = df_types$type=='int'|df_types$type=='float'\\n\")\n", - "score_code.write(\"numeric_headers = subset(df_types$var[mask_FloatOrInt], df_types$var[mask_FloatOrInt] %in% predictors)\\n\")\n", - "score_code.write(\"object_headers = subset(df_types$var[df_types$type=='str'], df_types$var[df_types$type=='str'] %in% predictors)\\n\")\n", - "score_code.write(\"bool_headers = subset(df_types$var[df_types$type=='bool'], df_types$var[df_types$type=='bool'] %in% predictors)\\n\")\n", - "score_code.write(\"df_base = df_base[c(predictors,'ID')]\\n\")\n", - "score_code.write(\"\\n\")\n", - "\n", - "score_code.write(\"### Preprocessing the basetable\\n\")\n", - "score_code.write(\"# Strip quot function\\n\")\n", - "score_code.write(\"strip_quot<-function(x){\\n\")\n", - "score_code.write(' x = gsub(\"')\n", - "score_code.write(\"'\")\n", - "score_code.write('\",\"\",x)\\n')\n", - "score_code.write(\" x = gsub('\")\n", - "score_code.write('\"')\n", - "score_code.write(\"','',x)\\n\") \n", - 
"score_code.write(\" x = trimws(x)\\n\")\n", - "score_code.write(\" return(x)\\n\")\n", - "score_code.write(\"}\\n\")\n", - "score_code.write(\"# Lower/upper function\\n\")\n", - "score_code.write(\"lower_upper<-function(x){\\n\")\n", - "score_code.write(\" if (tolower(x)=='id'|tolower(x)=='target'){\\n\")\n", - "score_code.write(\" x = toupper(x)\\n\")\n", - "score_code.write(\" }\\n\")\n", - "score_code.write(\" else {\\n\")\n", - "score_code.write(\" x = tolower(x)\\n\")\n", - "score_code.write(\" }\\n\")\n", - "score_code.write(\"}\\n\")\n", - "score_code.write(\"# maskmissing function in str/bool columns\\n\")\n", - "score_code.write(\"maskmissing<-function(var){\\n\")\n", - "score_code.write(\" crit1 = is.na(var)\\n\")\n", - "score_code.write(\" crit2 = var==''\\n\")\n", - "score_code.write(\" return(crit1|crit2)\\n\")\n", - "score_code.write(\"}\\n\")\n", - "score_code.write(\"# Apply preprocessing functions\\n\")\n", - "score_code.write(\"colnames(df_base) = sapply(colnames(df_base), lower_upper)\\n\")\n", - "score_code.write(\"colnames(df_base) = sapply(colnames(df_base), strip_quot)\\n\")\n", - "score_code.write(\"df_base[] = lapply(df_base, strip_quot)\\n\")\n", - "score_code.write(\"for (predictor in c(object_headers,bool_headers)){\\n\")\n", - "score_code.write(\" df_base[maskmissing(df_base[predictor]),predictor]='Missing'\\n\")\n", - "score_code.write(\"}\\n\")\n", - "score_code.write(\"\\n\")\n", - "\n", - "score_code.write(\"### Incidence replacement\\n\")\n", - "score_code.write(\"# Recipient dataframe\\n\")\n", - "score_code.write(\"df_out = data.frame(ID=df_base$ID)\\n\")\n", - "score_code.write(\"# Incidence replacement for string columns\\n\")\n", - "score_code.write(\"for (header in c(object_headers,bool_headers)){\\n\")\n", - "score_code.write(\" mask = df_prep$var==header\\n\")\n", - "score_code.write(\" bins = df_prep[mask,'bin']\\n\")\n", - "score_code.write(\" incidences = df_prep[mask,'incid']\\n\")\n", - "score_code.write(\" 
nonsig_bins = c()\\n\")\n", - "score_code.write(\" nonsig_incidences = c()\\n\")\n", - "score_code.write(\" if (sum(bins == 'Non-significants')>0) {\\n\")\n", - "score_code.write(\" nonsig_bins = subset(unique(df_base[,header]), !(unique(df_base[,header]) %in% bins))\\n\")\n", - "score_code.write(\" nonsig_incidences = rep(incidences[bins=='Non-significants'],length(nonsig_bins))\\n\")\n", - "score_code.write(\" }\\n\")\n", - "score_code.write(\" keys = c(bins,nonsig_bins)\\n\")\n", - "score_code.write(\" values = c(incidences,nonsig_incidences)\\n\")\n", - "score_code.write(\" df_out[paste('D_',header, sep='')] = values[match(df_base[,header], keys)]\\n\")\n", - "score_code.write(\"}\\n\")\n", - "score_code.write(\"# Incidence replacement for numeric columns\\n\")\n", - "score_code.write(\"for (header in numeric_headers){\\n\")\n", - "score_code.write(\" mask = df_prep$var==header\\n\")\n", - "score_code.write(\" bins = df_prep[mask,'bin']\\n\")\n", - "score_code.write(\" incidences = df_prep[mask,'incid']\\n\")\n", - "score_code.write(\" index_missing = which(bins=='Missing')\\n\")\n", - "score_code.write(\" incidence_missing = incidences[index_missing]\\n\")\n", - "score_code.write(\" upper_values = c()\\n\")\n", - "score_code.write(\" last <- function(x) { return( x[length(x)] ) }\\n\")\n", - "score_code.write(\" for (binn in bins){\\n\")\n", - "score_code.write(\" upper_value = last(unlist(strsplit(binn,',')))\\n\")\n", - "score_code.write(\" upper_value = tryCatch(as.numeric(gsub('([0-9]+).*$', '\\\\\")\n", - "score_code.write(\"\\\\\")\n", - "score_code.write(\"1',upper_value)), warning=function(e) Inf)\\n\")\n", - "score_code.write(\" upper_values = c(upper_values,upper_value)\\n\")\n", - "score_code.write(\" }\\n\")\n", - "score_code.write(\" if(!identical(index_missing,integer(0))) upper_values = upper_values[-index_missing]\\n\")\n", - "score_code.write(\" if(!identical(index_missing,integer(0))) incidences = incidences[-index_missing]\\n\")\n", - 
"score_code.write(\" upper_values_incidences = incidences[order(upper_values)]\\n\")\n", - "score_code.write(\" upper_values = upper_values[order(upper_values)]\\n\")\n", - "#score_code.write(\" incidence_replaced_values = c()\\n\")\n", - "#score_code.write(\" for (original_value in as.numeric(df_base[,header])){\\n\")\n", - "#score_code.write(\" if (is.na(original_value)){\\n\")\n", - "#score_code.write(\" incidence_to_attribute = incidence_missing\\n\")\n", - "#score_code.write(\" }\\n\")\n", - "#score_code.write(\" else {\\n\")\n", - "#score_code.write(\" lowest_membership = min(which(original_value<=upper_values))\\n\")\n", - "#score_code.write(\" incidence_to_attribute = upper_values_incidences[lowest_membership]\\n\")\n", - "#score_code.write(\" }\\n\")\n", - "#score_code.write(\" incidence_replaced_values = c(incidence_replaced_values,incidence_to_attribute)\\n\")\n", - "#score_code.write(\" }\\n\")\n", - "#score_code.write(\" df_out[paste('D_',header, sep='')] = incidence_replaced_values\\n\")\n", - "score_code.write(\" mask_nan = is.na(df_base[,header])\\n\")\n", - "score_code.write(\" lowest_memberships = findInterval(as.numeric(df_base[,header]), upper_values * (1 + .Machine$double.eps)) + 1\\n\")\n", - "score_code.write(\" incidences_to_attribute = upper_values_incidences[lowest_memberships]\\n\")\n", - "score_code.write(\" incidences_to_attribute[mask_nan] = incidence_missing\\n\")\n", - "score_code.write(\" df_out[paste('D_',header, sep='')] = incidences_to_attribute\\n\")\n", - "score_code.write(\"}\\n\")\n", - "score_code.write(\"\\n\")\n", - "\n", - "score_code.write(\"### Scoring\\n\")\n", - "score_code.write(\"df_scores = data.frame(ID=as.numeric(as.character(df_out$ID)))\\n\")\n", - "score_code.write(\"scores = c()\\n\")\n", - "score_code.write(\"intercept=\"+str(df_modrules.coeff.values[0])+\"\\n\")\n", - "score_code.write(\"coefficients=c\"+str([coeff for coeff in df_modrules.coeff][1:]).replace(\"[\",\"(\").replace(\"]\",\")\")+\"\\n\")\n", - 
"score_code.write(\"productsums = rowSums(t(t(df_out[,paste('D_',predictors,sep='')])*coefficients))\\n\")\n", - "score_code.write(\"exponents = intercept + productsums\\n\")\n", - "score_code.write(\"scores = sapply(exponents, FUN = function(x) (exp(x)) / (1+exp(x)))\\n\")\n", - "score_code.write(\"df_scores['score']=scores\\n\")\n", - "score_code.write(\"\\n\")\n", - "\n", - "score_code.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### for Python" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "score_code = open(root+\"/Python/scorecode.py\",'w')\n", - "\n", - "score_code.write(\"### Importing libraries & basetable to score\\n\")\n", - "score_code.write(\"# Importing Libraries\\n\")\n", - "score_code.write(\"import time\\nimport math\\nimport csv\\nimport re\\nimport pandas as pd\\nimport numpy as np\\n\")\n", - "score_code.write(\"# Importing Types\\n\")\n", - "score_code.write(\"typevariables=\"+str([var for var in df_types.loc[:,'var']])+\"\\n\")\n", - "score_code.write(\"typetypes=\"+str([vartype for vartype in df_types.loc[:,'type']])+\"\\n\")\n", - "score_code.write(\"df_types=pd.DataFrame({'var':typevariables,'type':typetypes},columns=['var','type'])\\n\")\n", - "score_code.write(\"df_types_copy = df_types.copy()\\n\")\n", - "score_code.write(\"bool_mask = df_types_copy.loc[:,'type']!='bool'\\n\")\n", - "score_code.write(\"df_types_copy.loc[bool_mask,'type'] = [getattr(__builtins__, type_str) for type_str in df_types_copy.loc[bool_mask,'type']]\\n\")\n", - "score_code.write(\"df_types_copy.loc[bool_mask==False,'type'] = getattr(__builtins__, 'str')\\n\")\n", - "score_code.write(\"types = df_types_copy.set_index('var').T.to_dict('records')\\n\") \n", - "score_code.write(\"# Importing Basetable with similar typing as in univariate analysis\\n\")\n", - "score_code.write(\"df_base = 
pd.read_csv('df_base.csv',header=0,sep=None,engine='python',converters=types[0])\\n\")\n", - "score_code.write(\"\\n\")\n", - "\n", - "score_code.write(\"### Creating dataframe containing model rules\\n\")\n", - "score_code.write(\"modvariables=\"+str([var for var in df_modrules.loc[:,'varname']])+\"\\n\")\n", - "score_code.write(\"modcoefficients=\"+str([coeff for coeff in df_modrules.loc[:,'coeff']])+\"\\n\")\n", - "score_code.write(\"df_modrules=pd.DataFrame({'varname':modvariables,'coeff':modcoefficients})\\n\")\n", - "score_code.write(\"\\n\")\n", - "\n", - "score_code.write(\"### Creating dataframe containing incidence translation rules\\n\")\n", - "score_code.write(\"prepvariables=\"+str([var for var in df_prep.loc[:,'var']])+\"\\n\")\n", - "score_code.write(\"prepbins=\"+str([bin for bin in df_prep.loc[:,'bin']])+\"\\n\")\n", - "score_code.write(\"prepincids=\"+str([bin for bin in df_prep.loc[:,'incid']])+\"\\n\")\n", - "score_code.write(\"df_prep = pd.DataFrame({'var':prepvariables,'bin':prepbins,'incid':prepincids}, dtype=object)\\n\")\n", - "score_code.write(\"df_prep.loc[:,'incid']=df_prep.loc[:,'incid'].astype('float64')\\n\")\n", - "score_code.write(\"\\n\")\n", - "\n", - "score_code.write(\"### Grouping basetable predictors along their types and trimming basetable accordingly\\n\")\n", - "score_code.write(\"predictors = list(df_modrules.loc[df_modrules.varname!='Intercept','varname'].values)\\n\")\n", - "score_code.write(\"not_predictors = [column for column in df_base.columns if column not in predictors]\\n\")\n", - "score_code.write(\"mask_FloatOrInt = (df_types.type=='int')|(df_types.type=='float')\\n\")\n", - "score_code.write(\"numeric_headers=[var for var in df_types.loc[mask_FloatOrInt,'var'].values if var in predictors]\\n\")\n", - "score_code.write(\"object_headers=[var for var in df_types.loc[df_types.type=='str','var'].values if var in predictors]\\n\")\n", - "score_code.write(\"bool_headers=[var for var in 
df_types.loc[df_types.type=='bool','var'].values if var in predictors]\\n\")\n", - "score_code.write(\"df_base = df_base[predictors+['ID']]\\n\")\n", - "score_code.write(\"\\n\")\n", - "\n", - "score_code.write(\"### Preprocessing the basetable\\n\")\n", - "score_code.write(\"# Strip quot function\\n\")\n", - "score_code.write(\"def strip_quot(x_in):\\n\")\n", - "score_code.write(\" try:\\n\")\n", - "score_code.write(\" x_out = x_in.strip().strip('\")\n", - "score_code.write('\"')\n", - "score_code.write(\"').strip(\")\n", - "score_code.write('\"')\n", - "score_code.write(\"'\")\n", - "score_code.write('\")\\n')\n", - "score_code.write(\" except:\\n\")\n", - "score_code.write(\" x_out=x_in\\n\")\n", - "score_code.write(\" return x_out\\n\")\n", - "score_code.write(\"# Lower/upper function\\n\")\n", - "score_code.write(\"def lower_upper(x_in):\\n\")\n", - "score_code.write(\" if ((x_in.lower() == 'id')|(x_in.lower() == 'target')):\\n\")\n", - "score_code.write(\" x_out = x_in.upper()\\n\")\n", - "score_code.write(\" else:\\n\")\n", - "score_code.write(\" x_out = x_in.lower()\\n\")\n", - "score_code.write(\" return x_out\\n\")\n", - "score_code.write(\"# maskmissing function in str/bool columns\\n\")\n", - "score_code.write(\"def maskmissing(var):\\n\")\n", - "score_code.write(\" crit1 = var.isnull()\\n\")\n", - "score_code.write(\" modvar = pd.Series([str(value).strip() for value in var])\\n\")\n", - "score_code.write(\" crit2 = modvar==pd.Series(['']*len(var))\\n\")\n", - "score_code.write(\" return crit1 | crit2\\n\")\n", - "score_code.write(\"# Apply preprocessing functions\\n\")\n", - "score_code.write(\"df_base = df_base.rename(columns=strip_quot)\\n\")\n", - "score_code.write(\"df_base = df_base.rename(columns=lower_upper)\\n\")\n", - "score_code.write(\"df_base = df_base.applymap(strip_quot)\\n\")\n", - "score_code.write(\"for header in object_headers+bool_headers:\\n\")\n", - "score_code.write(\" mask = maskmissing(df_base[header])\\n\")\n", - 
"score_code.write(\" df_base.loc[mask,header]='Missing'\\n\")\n", - "score_code.write(\"\\n\")\n", - " \n", - "score_code.write(\"### Incidence replacement\\n\")\n", - "score_code.write(\"# Recipient dataframe\\n\")\n", - "score_code.write(\"df_out = pd.DataFrame()\\n\")\n", - "score_code.write(\"df_out['ID']=df_base['ID']\\n\")\n", - "score_code.write(\"# Incidence replacement for string columns\\n\")\n", - "score_code.write(\"for header in object_headers+bool_headers:\\n\")\n", - "score_code.write(\" mask = df_prep.loc[:,'var']==header\\n\")\n", - "score_code.write(\" bins = df_prep.loc[mask,'bin']\\n\")\n", - "score_code.write(\" incidences = df_prep.loc[mask,'incid']\\n\")\n", - "score_code.write(\" nonsig_bins = []\\n\")\n", - "score_code.write(\" nonsig_incidences = []\\n\")\n", - "score_code.write(\" if (bins == 'Non-significants').any():\\n\")\n", - "score_code.write(\" nonsig_bins = [binn for binn in df_base[header].unique() if binn not in list(bins)]\\n\")\n", - "score_code.write(\" nonsig_incidences = list(incidences[bins=='Non-significants'])*len(nonsig_bins)\\n\")\n", - "score_code.write(\" keys = list(bins)\\n\")\n", - "score_code.write(\" keys.extend(nonsig_bins)\\n\")\n", - "score_code.write(\" values = list(incidences)\\n\")\n", - "score_code.write(\" values.extend(nonsig_incidences)\\n\")\n", - "score_code.write(\" keys_and_values = zip(keys,values)\\n\")\n", - "score_code.write(\" transdic = dict(keys_and_values)\\n\")\n", - "score_code.write(\" items_to_translate = df_base[header] \\n\")\n", - "score_code.write(\" df_out.loc[:,'D_'+header]= pd.Series([transdic[item] for item in items_to_translate])\\n\")\n", - "score_code.write(\"# Incidence replacement for numeric columns\\n\")\n", - "score_code.write(\"for header in numeric_headers:\\n\")\n", - "score_code.write(\" mask = df_prep.loc[:,'var']==header\\n\")\n", - "score_code.write(\" bins = df_prep.loc[mask,'bin']\\n\")\n", - "score_code.write(\" incidences = df_prep.loc[mask,'incid']\\n\")\n", 
- "score_code.write(\" index_missing = bins.index[bins=='Missing']\\n\")\n", - "score_code.write(\" incidence_missing = incidences[index_missing]\\n\")\n", - "score_code.write(\" upper_values = pd.Series([])\\n\")\n", - "score_code.write(\" for i,binn in enumerate(bins.values):\\n\")\n", - "score_code.write(\" upper_value = binn.split(',')[-1]\\n\")\n", - "score_code.write(\" try:\\n\")\n", - "score_code.write(\" upper_value = re.findall('[0-9]+',upper_value)[0]\\n\")\n", - "score_code.write(\" except:\\n\")\n", - "score_code.write(\" upper_value = math.inf\\n\")\n", - "score_code.write(\" upper_values[i] = upper_value\\n\")\n", - "score_code.write(\" upper_values.index = bins.index\\n\")\n", - "score_code.write(\" upper_values.drop(index_missing, inplace=True)\\n\")\n", - "score_code.write(\" upper_values = upper_values.astype(float)\\n\")\n", - "score_code.write(\" upper_values.sort_values(inplace=True)\\n\")\n", - "score_code.write(\" upper_values_incidences = incidences[upper_values.index]\\n\")\n", - "score_code.write(\" upper_values.reset_index(drop=True, inplace=True)\\n\")\n", - "score_code.write(\" upper_values_incidences.reset_index(drop=True, inplace=True)\\n\")\n", - "#score_code.write(\" incidence_replaced_values = np.array([])\\n\")\n", - "#score_code.write(\" for original_value in df_base[header]:\\n\")\n", - "#score_code.write(\" lowest_membership = upper_values.index[original_value<=upper_values].min()\\n\")\n", - "#score_code.write(\" try:\\n\")\n", - "#score_code.write(\" incidence_to_attribute = upper_values_incidences[lowest_membership]\\n\")\n", - "#score_code.write(\" except:\\n\")\n", - "#score_code.write(\" if np.isnan(original_value):\\n\")\n", - "#score_code.write(\" incidence_to_attribute = incidence_missing\\n\")\n", - "#score_code.write(\" else:\\n\")\n", - "#score_code.write(\" incidence_to_attribute = np.nan\\n\")\n", - "#score_code.write(\" incidence_replaced_values = 
np.append(incidence_replaced_values,incidence_to_attribute)\\n\")\n", - "#score_code.write(\" df_out['D_'+header] = pd.Series(incidence_replaced_values)\\n\")\n", - "score_code.write(\" mask_npnan = df_base.loc[:,header].isnull()\\n\")\n", - "score_code.write(\" lowest_memberships = upper_values.searchsorted(df_base.loc[:,header],side='left')\\n\")\n", - "score_code.write(\" incidences_to_attribute = upper_values_incidences[lowest_memberships].reset_index(drop=True)\\n\")\n", - "score_code.write(\" incidences_to_attribute[mask_npnan] = incidence_missing\\n\")\n", - "score_code.write(\" df_out['D_'+header] = incidences_to_attribute\\n\")\n", - "score_code.write(\"\\n\") \n", - "\n", - " \n", - "score_code.write(\"### Scoring\\n\")\n", - "score_code.write(\"df_scores = pd.DataFrame([])\\n\")\n", - "score_code.write(\"df_scores['ID'] = df_out['ID']\\n\")\n", - "score_code.write(\"scores = []\\n\")\n", - "score_code.write(\"intercept=\"+str(df_modrules.coeff.values[0])+\"\\n\")\n", - "score_code.write(\"coefficients=np.array(\"+str([coeff for coeff in df_modrules.coeff][1:])+\")\\n\")\n", - "score_code.write(\"productsums = (df_out['D_'+pd.Series(predictors)]*coefficients).sum(axis=1)\\n\")\n", - "score_code.write(\"exponents = intercept + productsums\\n\")\n", - "score_code.write(\"scores = exponents.apply(func=lambda x:(math.exp(x)) / (1+math.exp(x)))\\n\")\n", - "score_code.write(\"df_scores['score']=scores\\n\")\n", - "score_code.write(\"\\n\")\n", - "\n", - "score_code.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### for Sas" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# ..." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ok\n" - ] - } - ], - "source": [ - "print('ok')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/legacy_code/script_sprint2_.py b/legacy_code/script_sprint2_.py deleted file mode 100644 index dbf045d..0000000 --- a/legacy_code/script_sprint2_.py +++ /dev/null @@ -1,991 +0,0 @@ - -# coding: utf-8 - -# # Univariate analysis - -# ### General Imports - -# In[1]: - -import math -import csv -import warnings -import time -import os -import itertools -import scipy.integrate -import re - - -# In[2]: - -import numpy as np -import pandas as pd - - -# In[3]: - -from scipy import stats -from itertools import chain -from sklearn import metrics - - -# --- - -# ### Miscellaneous - -# In[4]: - -log = [] - - -# In[5]: - -# When code is in script, we define the path of the script's parent folder location as the 'root' directory -# From this 'root' we can travel to the relevant folders with minimal adjustment -try: - root = os.path.dirname(os.path.realpath(__file__)) - root = "/".join(root.split('\\')[:-1]) - log.append('Dynamic paths'+'\n') -except: - root = 'C:/wamp64/www/python_predictions_4/assets/scripts' - log.append('Static paths'+'\n') - - -# In[6]: - -# To allow pandas dataframes to display more columns -pd.set_option("display.max_columns",50) - - -# --- - -# ### Read data and organize - -# Basetable and its types - -# In[7]: - -# A Types csv file CAN be 
defined to be used to convert variables (of the basetable, see below) to the desired data types -# The Types csv files should include one column with variable names and one column with desired types (e.g. int,float,str,bool) -# If no Types csv file is provided no convertions will be forced. In that case 'Python' will guess the data type of each column -types_path = root+"/python/data_types.csv" -types_exist = True - -try: - df_types = pd.read_csv(types_path, header=None) - bool_mask = df_types[1]!='bool' - # Extract the functions based on the given type (e.g. 'str' -> str, 'int' -> int), for proper convertion - df_types.loc[bool_mask,1] = [getattr(__builtins__, type_str) for type_str in df_types.loc[bool_mask,1]] - # A type 'bool' is also attributed the function str, for convertion - df_types.loc[bool_mask==False,1] = getattr(__builtins__, 'str') - #types = df_types[bool_mask].set_index(0).T.to_dict('records') - types = df_types.set_index(0).T.to_dict('records') -except FileNotFoundError: - types = [dict()] - types_exist = False - - -# In[8]: - -# The basetable csv file should have column names as its first row -# The columns names should include 'TARGET', 'ID' -data_path = root+"/python/data.csv" - -df_in = pd.read_csv(data_path - ,header=0 - ,sep=None - ,engine='python' - ,converters=types[0]) - -# If no Types csv file was provided pd.read_csv guessed the types, we now output these types in a csv for re-use & later use -if types_exist == False: - filename = root+"/python/data_types.csv" - funtotype = lambda x:re.findall('[a-z]+',str(x))[0].replace('object','str') - with open(filename, 'w') as csvfile: - write=csv.writer(csvfile, delimiter =',') - write.writerows([column - ,funtotype(df_in[column].dtype)] for column in df_in.columns) - - -# In[9]: - -# Function to remove quotes from variable names and/or variable values -def strip_quot(x_in): - try: - x_out = x_in.strip().strip('"').strip("'") - except: - x_out=x_in - return x_out - -# Function to put 'id' and 
'target' variable names in uppercase, all other variable names are put in lowercase -# This is coded as to visually differentiate predictors from other variables -# But another combination of upper/lower is possible as well, e.g. all variable names in uppercase -def lower_upper(x_in): - if ((x_in.lower() == 'id')|(x_in.lower() == 'target')): - x_out = x_in.upper() - else: - x_out = x_in.lower() - return x_out - -# Function to group variable names based on the data type of the variable -# Could as well use the types in Types.csv -def get_headers(dataframe,type): - return dataframe.select_dtypes(include=[type]).columns.values - - -# In[10]: - -# Clean up quotes from column names -df_in = df_in.rename(columns=strip_quot) - -# Perform uppercase/lowercase transformation to column names -df_in = df_in.rename(columns=lower_upper) - -# Clean up quotes from column values -df_in = df_in.applymap(strip_quot) - - -# In[11]: - -# Group variable (names) based on the respective data type of each variable -# With this information we know which variables are destined for equifrequency, regrouping or simply passing (see further) -other_headers = [n for n in ["TARGET","ID"]] -try: - bool_headers = [n for n in df_types.loc[bool_mask==False,0].values if n not in other_headers] -except: - bool_headers = [] -object_headers = [n for n in get_headers(df_in,'object') if n not in other_headers+bool_headers] -numeric_headers = [n for n in get_headers(df_in,'number') if n not in other_headers+bool_headers] - - -# Analysis settings - -# In[12]: - -# Import settings defined by the user -df_settings = pd.read_csv(root+'/python/analysis_settings.csv', sep=',', index_col=0, header=None).T - - -# --- - -# ### Partitioning -# Shuffle and sort on TARGET -df_in = df_in.iloc[np.random.permutation(len(df_in))].sort_values(by='TARGET', ascending=False).reset_index(drop=True) - -# Create dic of partitioning settings -partition_dic = {'train':df_settings.loc[:,'partitioning_train'] - 
,'selection':df_settings.loc[:,'partitioning_selec'] - ,'validation':df_settings.loc[:,'partitioning_valid'] - } - -# Create a partition variable -partition = [] -sorted_target=df_in.TARGET #Just the target since it is allready sorted (see above) -for target in [sorted_target.iloc[0],sorted_target.iloc[-1]]: - target_length = (sorted_target==target).sum() - for part in partition_dic: - partition.extend( [part]*math.ceil(target_length*partition_dic[part][1]/100) ) - -# Attach partition variable to dataframe -df_in["PARTITION"] = partition[:len(df_in)] - -# Sampling based on analysis settings (if both sampling_settings are set to 100, all data is used) -sampling_settings = {1:df_settings.sampling_1, 0:df_settings.sampling_0} -drop_index = [] -for sample in sampling_settings: - if sampling_settings[sample].values<100: - sample_length = int(round((df_in.TARGET==sample).sum() * sampling_settings[sample]/100)) - for part in partition_dic: - part_length = int(round(sample_length * partition_dic[part] / 100)) - drop_index_part = df_in[(df_in.TARGET==sample) & (df_in.PARTITION==part)].index[part_length:] - drop_index.extend(drop_index_part) - #drop_index = df_in[df_in.TARGET==sample].index[sample_length:] -df_in.drop(drop_index,inplace=True) -df_in.reset_index(drop=True, inplace=True) - - -# --- - -# ### Output Container - -# In[15]: - -# Create output dataframe which will contain transformed variables -df_out = df_in.loc[:,["ID","TARGET","PARTITION"]].copy() - - -# --- - -# ### Preprocessing of continuous variables - -# Discretization function for Continuous variables - -# In[16]: - -### This function is a reworked version of pd.qcut to satisfy our particular needs -### Takes for var a continuous pd.Series as input and returns a pd.Series with bin-labels (e.g. 
[4,6[ ) -### Train takes a series/list of booleans (note: we define bins based on the training set) -### Autobins reduces the number of bins (starting from nbins) as a function of the number of missings -### Nbins is the wished number of bins -### Precision=0 results in integer bin-labels if possible -### twobins=True forces the function to output at least two bins -### catchLarge tests if some groups (or missing group) are very large, and if so catches and outputs two groups -#### note: catchLarge makes twobins irrelevant - -def eqfreq(var, train, autobins=True, nbins=10, precision=0, twobins=True, catchLarge=True): - - - # Test for large groups and if one exists pass them with two bins: Large_group,Other - if catchLarge: - catchPercentage=1-(1/nbins) - groupCount = var[train].groupby(by=var[train]).count() - maxGroupPerc = groupCount.max()/len(var[train]) - missingPerc = sum(var[train].isnull())/len(var[train]) - if maxGroupPerc>=catchPercentage: - largeGroup = groupCount.sort_values(ascending=False).index[0] - x_binned = var.copy() - x_binned.name = 'B_'+var.name - x_binned[x_binned!=largeGroup]='Other' - cutpoints=None - info = (var.name+": One large group, outputting 2 groups") - return x_binned, cutpoints, info - elif missingPerc>=catchPercentage: - x_binned = var.copy() - x_binned.name = 'B_'+var.name - x_binned[x_binned.isnull()]='Missing' - x_binned[x_binned!='Missing']='Other' - cutpoints=None - info = (var.name+": One large missing group, outputting 2 groups") - return x_binned, cutpoints, info - # Adapt number of bins as a function of number of missings - if autobins: - length = len(var[train]) - missing_total = var[train].isnull().sum() - missing_perten = missing_total/length*10 - nbins = max(round(10-missing_perten)*nbins/10 ,1) - # Store the name and index of the variable - name = var.name - series_index = var.index - # Transform var and train to a np.array and list respectively, which is needed for some particular function&methods - x = 
np.asarray(var) - train = list(train) - # First step in finding the bins is determining what the quantiles are (named as cutpoints) - # If the quantile lies between 2 points we use lin interpolation to determine it - cutpoints = var[train].quantile(np.linspace(0,1,nbins+1),interpolation = 'linear') - # If the variable results only in 2 unique quantiles (due to skewness) increase number of quantiles until more than 2 bins can be formed - if twobins: - extrasteps = 1 - # Include a max. extrasteps to avoid infinite loop - while (len(cutpoints.unique())<=2) & (extrasteps<20): - cutpoints = var[train].quantile(np.linspace(0,1,nbins+1+extrasteps),interpolation = 'linear') - extrasteps+=1 - # We store which rows of the variable x lies under/above the lowest/highest cutpoint - # Without np.errstate(): xcutpoints.max() can give if x contains nan values (missings) - # However the function will result in False in both >&< cases, which is a correct result, so the warning can be ignored - with np.errstate(invalid='ignore'): - under_lowestbin = x < cutpoints.min() - above_highestbin= x > cutpoints.max() - - - def _binnedx_from_cutpoints(x, cutpoints, precision, under_lowestbin, above_highestbin): - ### Attributes the correct bin ........................ - ### Function that, based on the cutpoints, seeks the lowest precision necessary to have meaningful bins - ### e.g. (5.5,5.5] ==> (5.51,5.54] - ### Attributes those bins to each value of x, to achieve a binned version of x - - # Store unique cutpoints (e.g. from 1,3,3,5 to 1,3,5) to avoid inconsistensies when bin-label making - # Indeed, bins [...,1], (1,3], (3,3], (3,5], (5,...] do not make much sense - # While, bins [...,1], (1,3], (3,5], (5,...] 
do make sense - unique_cutpoints = cutpoints.unique() - # If there are only 2 unique cutpoints (and thus only one bin will be returned), - # keep original values and code missings as 'Missing' - if len(unique_cutpoints) <= 2: - cutpoints = None - x_binned = pd.Series(x) - x_binned[x_binned.isnull()] = 'Missing' - info = (var.name+": Only one resulting bin, keeping original values instead") - return x_binned, cutpoints, info - # Store info on whether or not the number of resulting bins equals the desired number of bins - elif len(unique_cutpoints) < len(cutpoints): - info = (var.name+": Resulting # bins < whished # bins") - else: - info = (var.name+": Resulting # bins as desired") - # Finally, recode the cutpoints (which can have doubles) as the unique cutpoints - cutpoints = unique_cutpoints - - # Store missing values in the variable as a mask, and create a flag to test if there are any missing in the variable - na_mask = np.isnan(x) - has_nas = na_mask.any() - # Attribute to every x-value the index of the cutpoint (from the sorted cutpoint list) which is equal or higher than - # the x-value, effectively encompasing that x-value. - # e.g. for x=6 and for sorted_cutpoint_list=[0,3,5,8,...] the resulting_index=3 - ids = cutpoints.searchsorted(x, side='left') - # x-values equal to the lowest cutpoint will recieve a ids value of 0 - # but our code to attribute bins to x-values based on ids (see end of this subfunction) requires a min. value of 1 - ids[x == cutpoints[0]] = 1 - # Idem as previous: x-values below the lowest cutpoint should recieve a min. value of 1 - if under_lowestbin.any(): - ids[under_lowestbin] = 1 - # Similar as previous: x-values above the highest cutpoint should recieve the max. 
allowed ids - if above_highestbin.any(): - max_ids_allowed = ids[(above_highestbin == False) & (na_mask==False)].max() - ids[above_highestbin] = max_ids_allowed - # Maximal ids can now be defined if we neglect ids of missing values - max_ids = ids[na_mask==False].max() - - # Based on the cutpoints create bin-labels - # Iteratively go through each precision (= number of decimals) until meaningful bins are formed - # If theoretical bin is ]5.51689,5.83654] we will prefer ]5.5,5.8] as output bin - increases = 0 - original_precision = precision - while True: - try: - bins = _format_bins(cutpoints, precision) - except ValueError: - increases += 1 - precision += 1 - #if increases >= 5: - #warnings.warn("Modifying precision from "+str(original_precision)+" to "+str(precision)+" to achieve discretization") - #print("Modifying precision from "+str(original_precision)+" to "+str(precision)+" to achieve discretization") - else: - break - - # Make array of bins to allow vector-like attribution - bins = np.asarray(bins, dtype=object) - # If x has nas: for each na-value, set the ids-value to max_ids+1 - # this will allow na-values to be attributed the highest bin which we define right below - if has_nas: - np.putmask(ids, na_mask, max_ids+1) - # The highest bin is defined as 'Missing' - bins = np.append(bins,'Missing') - # ids-1 is used as index in the bin-labels list to attribute a bin-label to each x. Example: - # x=6 sorted_cutpoint_list=[0,3,5,8,...] ids=3 levels=[[0,3],(3,5],(5,8],...] - # The correct bin level for x is (5,8] which has index 2 which is equal to the ids-1 - x_binned = bins[ids-1] - return x_binned, cutpoints, info - - - def _format_bins(cutpoints, prec): - # Based on the quantile list create bins. Raise error if values are similar within one bin. 
- # On error _binnedx_from_cutpoints will increase precision - - fmt = lambda v: _format_label(v, precision=prec) - bins = [] - for a, b in zip(cutpoints, cutpoints[1:]): - fa, fb = fmt(a), fmt(b) - - if a != b and fa == fb: - raise ValueError('precision too low') - - formatted = '(%s, %s]' % (fa, fb) - bins.append(formatted) - - bins[0] = '[...,' + bins[0].split(",")[-1] - bins[-1] = bins[-1].split(",")[0] + ',...]' - return bins - - - def _format_label(x, precision): - # For a specific precision, returns the value formatted with the appropriate amount of numbers after comma and correct brackets - - if isinstance(x,float): - frac, whole = np.modf(x) - sgn = '-' if x < 0 else '' - whole = abs(whole) - if frac != 0.0: - val = '{0:.{1}f}'.format(frac, precision) - val = _trim_zeros(val) - if '.' in val: - return sgn + '.'.join(('%d' % whole, val.split('.')[1])) - else: - if '0' in val: - return sgn + '%0.f' % whole - else: - return sgn + '%0.f' % (whole+1) - else: - return sgn + '%0.f' % whole - else: - return str(x) - - - def _trim_zeros(x): - # Removes unnecessary zeros and commas - while len(x) > 1 and x[-1] == '0': - x = x[:-1] - if len(x) > 1 and x[-1] == '.': - x = x[:-1] - return x - - - x_binned, cutpoints, info = _binnedx_from_cutpoints(x, cutpoints, precision=precision, under_lowestbin=under_lowestbin, above_highestbin=above_highestbin) - x_binned = pd.Series(x_binned, index=series_index, name="B_"+name) - return x_binned, cutpoints, info - - -# # WIP - -# for n in numeric_headers: -# result = eqfreq(var=df_in[n] -# ,train=df_in["PARTITION"]=="train" -# ,autobins=True -# ,nbins=int(df_settings.discretization_nbins) -# ,precision=0 -# ,twobins=True -# ,catchLarge=True) -# print(n) -# print(result[0].unique()) -# print('\n') - -# # /WIP - -# Apply function to continuous variables - -# In[17]: - -tic = time.time() -# We loop only through the numeric variables -for n in numeric_headers: - # Perform the equifrequency function - result = eqfreq(var=df_in[n] - 
,train=df_in["PARTITION"]=="train" - ,autobins=True - ,nbins=int(df_settings.discretization_nbins) - ,precision=0 - ,twobins=True - ,catchLarge=False) # TRUE OPTION STILL PRODUCES ERROR IN SORTNUMERIC function AND SCORING procedure !!!!!!!!! - df_out = pd.concat([df_out,result[0]], axis=1) - log.append(result[2]) -toc = time.time() -log.append("Discretisation: "+str(toc-tic)+" sec"+"\n") - - -# --- - -# ### Preprocessing of categorical variables - -# Function for labeling missing/empty values - -# In[18]: - -# Check which values of a var are empty strings or null values -def maskmissing(var): - # Check if values are null - crit1 = var.isnull() - # Check if values are empty strings - modvar = pd.Series([str(value).strip() for value in var]) - crit2 = modvar==pd.Series(['']*len(var)) - #crit2 = var==pd.Series(['']*len(var)) - #crit3 = var==pd.Series([' ']*len(var)) - return crit1 | crit2 - - -# Regrouping Function for nominal/ordinal variables - -# In[19]: - -# Regrouping function for categorical variables -# Each group is tested with a chi² for relevant incidence differences in comparison to a rest-group -# The rest group has the size of the remaining groups and an 'overall average incidence' (if dummy=True) or -# 'remaining groups average incidence' (if dummy=False) -# Groups with a pvalue above the threshold are relabled to a single group - -def regroup(var,target,train,pval_thresh=0.01,dummy=True,keep='Missing',rename='Other'): - - # Define the chi² test condition - # Groups that do not meet the condition are not analyzed and will be unconditionally relabled - def _chi2cond_(var=var,target=target,train=train): - varcounts = var[train].groupby(by=var).count() - train_inc = target[train].sum()/len(target[train]) - factor = max(train_inc, 1-train_inc) - analyze_mask = (varcounts*factor)>5 - analyze_groups = analyze_mask.index[analyze_mask].values - return analyze_groups - - # Compute overal incidence mean - incidence_mean = target[train].mean() - # Create container 
of which groups will be kept, compared to the groups which will be relabled - keepgroups = [] - # Cycle and test each group that meets the chi² condition - for group in _chi2cond_(): - # Container for target 0/1 observations of the group under scrutiny - obs_group = [] - # Counts of the target 0/1 occurences for the group under scrutiny - obs_group.append(((target[train]==0)&(var[train]==group)).sum()) - obs_group.append(((target[train]==1)&(var[train]==group)).sum()) - obs_group = np.array(obs_group) - # Container for target 0/1 observations of the remaining groups together - obs_other = [] - # Counts of the target 0/1 occurences for the remaining groups together - obs_other.append(((target[train]==0)&(var[train]!=group)).sum()) - obs_other.append(((target[train]==1)&(var[train]!=group)).sum()) - obs_other = np.array(obs_other) - # If dummy=True, we scale the two groups of target 0/1 occurences such that the incidence is equal to the overall incidence - # The size of the two groups of target 0/1 occurences is still equal to the size of the remaining groups - if dummy: - obs_other_size = obs_other.sum() - obs_other[0]=(1-incidence_mean)*obs_other_size # 0(1) index coincides with target = 0(1) - obs_other[1]=( incidence_mean)*obs_other_size - obs = np.array([obs_group,obs_other]) - # Place at least 1 observation to avoid error in chi2 test - obs[obs==0] = 1 - # Perform chi² test - pval = stats.chi2_contingency(obs, correction=False)[1] - # If pval outperforms threshold, append the group in the keepgroups list - if pval<=pval_thresh: - keepgroups.append(group) - #elif group==keep: - # keepgroups.append(group) - # If the specific group to be kept (e.g. 
'Missing') didn't pass the test, append it to the keepgroups list - if keep not in keepgroups: - keepgroups.append(keep) - # Makes a list of all groups not in the keepgroups list - regroup_mask = [val not in keepgroups for val in var.values] - var_regroup = var.copy() - # Rename those groups - var_regroup[regroup_mask] = rename - var_regroup.name = "B_"+var.name - info = (var.name+": from "+str(len(var.unique()))+" to "+str(len(var_regroup.unique()))) - return var_regroup, info - - -# Apply function to nominal/ordinal variables - -# In[20]: - -tic = time.time() -# We loop only through the categorical variables -for h in object_headers: - # We label missing and empty values for categorical variables as 'Missing' - # Note the interaction with the 'keep' parameter of the regroup function. - mask = maskmissing(df_in[h]) - df_in.loc[mask,h]='Missing' - # Perform regrouping function - result = regroup(var=df_in[h] - ,target=df_in.loc[:,'TARGET'] - ,train=df_in.PARTITION=='train' - ,pval_thresh=float(df_settings.regrouping_signif) - ,dummy=True - ,keep='Missing' - ,rename='Non-significants') - df_out = pd.concat([df_out,result[0]],axis=1) - log.append(result[1]) -toc = time.time() -log.append("Regrouping: "+str(toc-tic)+" sec"+"\n") - - -# --- - -# ### Preprocessing of boolean variables - -# Defining Function to pass variables as is - -# In[21]: - -# We could just rename them or put them with the regoup function, but for now let's keep consistent with the other functions -def passvar(var): - var_pass = var.copy() - var_pass.name = "B_"+var.name - info = ("Passing "+var.name) - return var_pass, info - - -# Executing function - -# In[22]: - -tic = time.time() -# We loop only through the boolean variables -for b in bool_headers: - # We label missing and empty values for boolean variables as 'Missing' - mask = maskmissing(df_in[b]) - df_in.loc[mask,b]='Missing' - # Perform the passvar function - result = passvar(var=df_in[b]) - df_out = pd.concat([df_out,result[0]],axis=1) - 
log.append(result[1]) -toc = time.time() -log.append("Passing: "+str(toc-tic)+" sec"+"\n") - - -# --- - -# ### Incidence Replacement - -# Function for incidence replacement - -# In[23]: - -def increp(b_var, target, train): - #get variable name - name = b_var.name - #get overall incidence - incidence_mean = target[train].mean() - #get incidence per group - incidences = target[train].groupby(b_var).mean() - #construct dataframe with incidences - idf = pd.DataFrame(incidences).reset_index() - #get values that are in the data but not in the labels - bin_labels = incidences.index - newgroups = list(set(b_var.unique()) ^ set(bin_labels)) - #if newgroups, add mean incidence to incidence dataframe for each new group - if len(newgroups)>0: - #make dataframe: - ngdf = pd.DataFrame(newgroups) - ngdf.columns = [name] - ngdf["TARGET"] = incidence_mean - #dataframe with incidences: - idf = idf.append(ngdf) - #dataframe with the variable - vdf = pd.DataFrame(b_var) - #discretized variable by merge - d_var = pd.merge(vdf,idf,how='left',on=name)["TARGET"] - return pd.Series(d_var, name="D_"+name[2:]) - - -# Apply function for incidence replacement - -# In[24]: - -# We define the columns destined for incidence replacement -headers_for_incidrep = [h for h in df_out.columns if ((h not in ['ID','TARGET','PARTITION']) & (h[:2]=="B_"))] - - -# In[25]: - -tic = time.time() -# We loop only through the columns destined for incidence replacement -for n in headers_for_incidrep: - # Perform increp function - result = increp(b_var=df_out[n] - ,target=df_out.TARGET - ,train=df_out.PARTITION=="train") - df_out = pd.concat([df_out,result], axis=1) - log.append(n+ " processed") -toc = time.time() -log.append("Incidence replacement: "+str(toc-tic)+" sec"+"\n") - - -# --- - -# ### Calculate AUCS - -# Function for auc calculation - -# In[26]: - -def getauc(var, target, partition): - - y = np.array(target[partition]) - pred = np.array(var[partition]) - pred = pred.astype(np.float64) - fpr, tpr, 
thresholds = metrics.roc_curve(y,pred, pos_label=1) - - return metrics.auc(fpr, tpr) - - -# Applying function for auc calculation - -# In[27]: - -# We define the columns for which an AUC score should be computed -headers_for_auc = [h for h in df_out.columns if ((h not in ['ID','TARGET','PARTITION']) & (h[:2]=="D_"))] - - -# In[28]: - -auc_list_all = [] -parts = ["train","selection"] -tic = time.time() -# We loop only through those columns for which an AUC score should be computed -for header in headers_for_auc: - auc_list_var = [header[2:]] - # We loop through the two sets ('train' and 'selection') for which an AUC score is needed - for part in parts: - # Perform getauc function - auc_value = getauc(var=df_out[header] - ,target=df_out.TARGET - ,partition=df_out.PARTITION==part) - auc_list_var.append(auc_value.round(2)) #We round auc values to 2 decimals - auc_list_all.append(auc_list_var) - log.append(header + " processed") -# We create a supplementary dataframe destined for Cobra input -df_auc = pd.DataFrame(auc_list_all,columns=['variable','AUC train','AUC test']) -toc = time.time() -log.append("Auc: "+str(toc-tic)+" sec"+"\n") - - -# --- - -# ### Preselection - -# In[29]: - -tic = time.time() -# We identify those variables for which the AUC score is above the user-defined threshold -auc_thresh = df_auc.loc[:,'AUC test'] > float(df_settings.preselection_auc) -# We identify those variables for which the AUC score difference between 'train' and 'selection' is within the user-defined ratio -auc_overtrain = (df_auc.loc[:,'AUC train']*100 - df_auc.loc[:,'AUC test']*100) < float(df_settings.preselection_overtrain) -# Only those variables passing the 2 criteria above are preselected -preselect = auc_thresh & auc_overtrain - - -# In[30]: - -# We create a supplementary dataframe destined for Cobra input -df_variable_selections = pd.DataFrame({'variable':df_auc.variable - ,'preselect':preselect.astype(int) - ,'Default':np.zeros(len(preselect)).astype(int) - ,'Alternative 
1':np.zeros(len(preselect)).astype(int) - ,'Alternative 2':np.zeros(len(preselect)).astype(int) - ,'Alternative 3':np.zeros(len(preselect)).astype(int) - ,'Alternative 4':np.zeros(len(preselect)).astype(int) - ,'Alternative 5':np.zeros(len(preselect)).astype(int)} - ,columns=['variable' - ,'preselect' - ,'Default' - ,'Alternative 1' - ,'Alternative 2' - ,'Alternative 3' - ,'Alternative 4' - ,'Alternative 5']) - - -# In[31]: - -for i,var in enumerate(df_variable_selections.variable): - log.append(var+" "+np.array(['passed','filtered'])[df_variable_selections.preselect][i]) -toc = time.time() -log.append("Preselection: "+str(toc-tic)+" sec"+"\n") - - -# --- - -# ### Calculate Correlations - -# In[32]: - -# We define the columns for which a correlation score should be computed -headers_for_corr = [h for h in df_out.columns if ((h not in ['ID','TARGET','PARTITION']) & (h[:2]=="D_"))] - - -# In[33]: - -train = df_out.PARTITION=="train" -tic = time.time() -dataforcorr = np.transpose(np.matrix(df_out.loc[train,headers_for_corr],dtype=float)) -with np.errstate(invalid='ignore', divide='ignore'): - mat_corr = np.corrcoef(dataforcorr) -toc = time.time() -log.append("Correlations: "+str(toc-tic)+" sec"+"\n") - - -# In[34]: - -df_corr = pd.DataFrame(mat_corr) -df_corr.columns = headers_for_corr -df_corr.index = headers_for_corr -df_corr.fillna(0, inplace=True) - -# --- - -# ### Export files - -# Table of all Auc values - -# In[35]: - -auc_path = root+'/data/univariate/aucs.csv' -df_auc = df_auc.sort_values(by=['AUC test','AUC train'], ascending=False).reset_index(drop=True) -df_auc.to_csv(path_or_buf=auc_path - ,sep=';' - ,index=False - ,encoding='utf-8' - ,line_terminator='\n') - - -# Tables of Incidences & Correlations per variable - -# In[36]: - -# Function for sorting cont.variables, whether or not they have undergone discritization -def sortnumeric(dataframe): - - lowestnumber = 0 - # If the variable was discretisized - if '[...' 
in [str(l)[:4] for l in dataframe.group.values]: - unsorted_labels = dataframe.group.values - label_items=[] - for label in unsorted_labels: - # For each bin label, retain the first value - label_items.append(label.split(",")[0].strip("[").strip("(")) - label_items=np.asarray(label_items) - # Special cases that are not numeric are given numbers - lowestnumber = label_items[(label_items!="...")&(label_items!="Missing")].astype('float64').min() - label_items[label_items=='...']= lowestnumber-1 - label_items[label_items=='Missing']= lowestnumber-2 - # argsort based on the numbers - rank = label_items.astype('float64').argsort() - return rank - - # If the variable wasn't discretisized, simply argsort on the numbers - else: - label_items = dataframe.group.values - if len(label_items)>1: - lowestnumber = label_items[label_items.astype('O')!="Missing"].astype('float64').min() - label_items[label_items.astype('O')=='Missing']= lowestnumber-2 - rank = label_items.astype('float64').argsort() - return rank - - -# In[37]: - -# Function for sorting cont.variables, whether or not they have undergone discritization -def sortnumeric_old(dataframe): - - # If the variable was discretisized - if dataframe.group.dtype=='object': #or# if np.array([str(unsorted_labels[i])[0] in ["[","(","M"] for i in range(0,len(unsorted_labels))]).all(): - unsorted_labels = dataframe.group.values - label_items=[] - for label in unsorted_labels: - # For each bin label, retain the first value - label_items.append(label.split(",")[0].strip("[").strip("(")) - label_items=np.asarray(label_items) - # Special cases that are not numeric are given numbers - lowestnumber = label_items[(label_items!="...")&(label_items!="Missing")].astype('float64').min() - label_items[label_items=='...']= lowestnumber-1 - label_items[label_items=='Missing']= lowestnumber-2 - # argsort based on the numbers - rank = label_items.astype('float64').argsort() - return rank - - # If the variable wasn't discretisized, simply argsort on 
the numbers - else: - rank = dataframe.group.values.argsort() - return rank - - -# In[38]: - -# Function for sorting cat. variables -def sortobject(dataframe): - # Sort dataframe on increasing incidence values - unsorted_incidences = dataframe.incidence.values - rank = unsorted_incidences.argsort() - return rank - - -# In[39]: - -n_decimals = 2 -average = round(df_out.TARGET[df_out.PARTITION=="train"].mean(),n_decimals) - -headers_to_output = list(df_auc['variable']) -for i,varname in enumerate(headers_to_output): - b_varname = 'B_'+varname - d_varname ='D_'+varname - #INCIDENCE CSV's - incidence_path = root+"/data/univariate/incidence_"+str(varname)+".csv" - groups_and_incidences = df_out.TARGET[df_out.PARTITION=='train'].groupby(df_out[b_varname]).mean() - n_groups= len(groups_and_incidences) - group = groups_and_incidences.index - incidence = groups_and_incidences.values.round(n_decimals) - size = df_out.TARGET[df_out.PARTITION=='train'].groupby(df_out[b_varname]).size().astype(float).values - df_incidence = pd.DataFrame( {'group':group - ,'incidence':incidence - ,'size':size - ,'average':average} - ,columns=['group','incidence','size','average']) - if varname in numeric_headers: - df_incidence = df_incidence.iloc[sortnumeric(df_incidence),:] - elif varname in object_headers: - df_incidence = df_incidence.iloc[sortobject(df_incidence),:] - else: - a=1 - #df_incidence = df_incidence.iloc[sortother(df_incidence),:] - df_incidence.to_csv(path_or_buf=incidence_path - ,sep=';' - ,index=False - ,encoding='utf-8' - ,line_terminator='\n') #quoting=csv.QUOTE_NONNUMERIC - - #CORRELATION CSV's - correlation_path = root+"/data/univariate/correlations_"+str(varname)+".csv" - Variable = [v.strip("D_") for v in df_corr[d_varname].index] - Correlation = abs(df_corr[d_varname].values).round(n_decimals) - Sign = np.array(["+","-"])[(df_corr[d_varname].values<0).astype(int)] - AUC = np.array([df_auc.loc[df_auc['variable']== v,'AUC test'].values[0] for v in 
Variable]).round(n_decimals) - df_correlation = pd.DataFrame({"Variable":Variable - ,"Correlation":Correlation - ,"Sign":Sign - ,"AUC": AUC} - ,columns=["Variable","Correlation","Sign","AUC"]) - df_correlation.sort_values(by='Correlation', ascending=False, inplace=True) - df_correlation = df_correlation.loc[df_correlation.Variable!=varname,:] - df_correlation.to_csv(path_or_buf=correlation_path - ,sep=';' - ,index=False - ,encoding='utf-8' - ,line_terminator='\n') # quoting=csv.QUOTE_NONNUMERIC - - -# Variable Preselections - -# In[ ]: - -selections_path = root+'/data/univariate/variable_selections.csv' -df_variable_selections.to_csv(path_or_buf=selections_path - ,sep=';' - ,index=False - ,encoding='utf-8' - ,line_terminator='\n') - - -# Result dataframe for Modeling input - -# In[ ]: - -out_path = root+"/data/univariate/df_univariate.csv" -df_out.to_csv(path_or_buf=out_path, sep=';', index=False, encoding='utf-8', line_terminator='\n', quoting=csv.QUOTE_NONNUMERIC) - - -# Modeltab reset - -# In[ ]: - -# Generate modeltab info -filename = root+"/data/univariate/modeltab_info.csv" -with open(filename, 'w') as csvfile: - write=csv.writer(csvfile, delimiter =';') - write.writerow(["key","value"]) - write.writerow(["run","Default"]) - write.writerow(["new","Alternative 1"]) - write.writerow(["new_template","Default"]) - write.writerow(["champ","Default"]) - write.writerow(["score","Default"]) - - -# Log messages - -# In[ ]: - -log.append("-- Univariate analysis completed --"+"\n") - - -# In[ ]: - -log_file = open(root+'/python/univariate.log','w') -log_file.write('\n'.join(log)) -log_file.close() - - -# --- - -# ### Stop script - -# In[ ]: - -print("ok") - - -# --- diff --git a/legacy_code/script_sprint3.py b/legacy_code/script_sprint3.py deleted file mode 100644 index 256e96b..0000000 --- a/legacy_code/script_sprint3.py +++ /dev/null @@ -1,662 +0,0 @@ - -# coding: utf-8 - -# # Modeling - -# ### General Imports - -# In[1]: - -import time -import math -import random 
-import csv -import os - - -# In[2]: - -import pandas as pd -import numpy as np -import statsmodels.api as sm -import matplotlib.pyplot as plt - - -# In[3]: - -from scipy import stats -from sklearn import metrics -from sklearn.linear_model import LogisticRegression -from itertools import chain - - -# --- - -# ### Miscellaneous - -# In[4]: - -log = [] - - -# In[5]: - -# When code is in script, we define the path of the script's parent folder location as the root directory -# From this root we can travel to the relevant folders with minimal adjustment -try: - root = os.path.dirname(os.path.realpath(__file__)) - root = "/".join(root.split('\\')[:-1]) - log.append('Dynamic paths'+'\n') -except: - root = 'C:/wamp64/www/python_predictions_4/assets/scripts' - log.append('Static paths'+'\n') - - -# In[6]: - -# To allow pandas dataframes to display more columns -pd.set_option("display.max_columns",50) - - -# --- - -# ### Read data and organize - -# Read-in univariate output with asssumed ID, TARGET, PARTITION and D_VARS - -# In[7]: - -df_univariate_path = root+"/data/univariate/df_univariate.csv" -df_in = pd.read_csv(df_univariate_path, sep=";") - - -# Reference X and Y for each partition individually - -# In[8]: - -dvars = [n for n in df_in.columns if n[:2] == 'D_'] - - -# In[9]: - -mask_train = df_in.PARTITION=="train" -mask_selection = df_in.PARTITION=="selection" -mask_validation = df_in.PARTITION=="validation" - - -# In[10]: - -y_train = df_in.loc[mask_train,'TARGET'] -y_selection = df_in.loc[mask_selection,'TARGET'] -y_validation = df_in.loc[mask_validation,'TARGET'] - - -# In[11]: - -x_train = df_in.loc[mask_train,dvars] -x_selection = df_in.loc[mask_selection,dvars] -x_validation = df_in.loc[mask_validation,dvars] - - -# Analysis settings - -# In[12]: - -df_settings = pd.read_csv(root+'/python/analysis_settings.csv', sep=',', index_col=0, header=None).T - - -# Modeltab info - -# In[13]: - -df_modeltab = pd.read_csv(root+'/data/univariate/modeltab_info.csv',sep=';', 
index_col=0, header=None).T -modelrun = df_modeltab.run[1] - - -# Variable selections - -# In[14]: - -df_selections = pd.read_csv(root+'/data/univariate/variable_selections.csv',sep=';') - - -# --- - -# ### Model making and recording - -# Define functions - -# In[15]: - -# Function to make logistic model on a predefined set of predictors + compute train AUC of resulting model -def processSubset(predictors_subset): - from sklearn.linear_model import LogisticRegression - from sklearn import metrics - # Fit model on predictors_subset and retrieve performance metric - model = LogisticRegression(fit_intercept=True, C=1e9, solver = 'liblinear') - modelfit = model.fit(y=y_train, X=x_train[predictors_subset]) - # Position of the TARGET==1 class - pos = [i for i,h in enumerate(modelfit.classes_) if h==1] - # Prediction probabilities for the TARGET==1 - y_pred = modelfit.predict_proba(x_train[predictors_subset])[:,pos] - auc = metrics.roc_auc_score(y_true=y_train, y_score=y_pred) - return {"modelfit":modelfit,"auc":auc,"predictor_names":predictors_subset,"predictor_lastadd":predictors_subset[-1]} - - -# In[16]: - -# Function for computing AUC of all sets (train, selection & validation) -def getAuc(df_without_auc): - import pandas as pd - from sklearn import metrics - df_with_auc = df_without_auc[:] - for x,y,part in [(x_train,y_train,'train'), - (x_selection,y_selection,'selection'), - (x_validation,y_validation,'validation')]: - pos = [i for i,h in enumerate(df_without_auc.modelfit.classes_) if h==1] - y_pred = df_without_auc.modelfit.predict_proba(x[df_without_auc['predictor_names']])[:,pos] - df_with_auc["auc_"+part] = metrics.roc_auc_score(y_true=y, y_score=y_pred) - df_with_auc["pred_"+part] = y_pred - return(df_with_auc) - - -# In[17]: - -# Forward selection function that uses processSubset and getAuc -def forward(current_predictors, pool_predictors, positive_only=True): - import pandas as pd - import numpy as np - tic = time.time() - - #Pull out predictors we still 
need to process - remaining_predictors = [p for p in pool_predictors if p not in current_predictors] - # If there are no more predictors left to use, raise an error we can easily identify as normal - if len(remaining_predictors)==0: - raise ValueError("No more predictors left to use","NormalStop") - - #Create a model for each combination of: current predictor(s) + one of the remaining predictors - #Keep track of the submodels and their performance - #If error skip to next and do not include in comparison table - results = [] - errorcount = 0 - for p in remaining_predictors: - try: - results.append(processSubset(current_predictors+[p])) - except: - errorcount += 1 - models = pd.DataFrame(results) - - # If we require all coefficients to be positive... - if positive_only: - #Create a flag for each submodel to test if all coefficients are positive - all_positive = pd.Series(None, index=models.index) - for i in range(0,len(models)): - all_positive[i] = (models.modelfit[i].coef_ >= 0 ).all() - - # if no model exist with only positive coefficients raise error we can easily identify as normal - if (all_positive==0).all(): - raise ValueError("No models with only positive coefficients","NormalStop") - - #Choose model with best performance and only positive coefficients - best_model = models.loc[models[all_positive==1].auc.argmax()] - best_model = getAuc(best_model) - - # If we don't require all coefficients to be positive... 
- else: - #Choose model with best performance - best_model = models.loc[models.auc.argmax()] - best_model = getAuc(best_model) - - - tac = time.time() - info = ("Processed " - + str(models.shape[0]) - + " models on " - + str(len(current_predictors)+1) - + " predictors in " - + str(round(tac-tic,2)) - +" sec with " - + str(errorcount) - +" errors") - - return best_model, info - - -# Create recipient vars - -# In[18]: - -best_models = pd.DataFrame(columns=["modelfit", - "predictor_names", - "predictor_lastadd", - "auc_train", - "auc_selection", - "auc_validation", - "pred_train", - "pred_selection", - "pred_validation"]) -predictors = [] - - -# Define number of steps depending on settings and total number of predictors - -# In[19]: - -step_setting = int(df_settings.modeling_nsteps) -n_steps = min(step_setting,len(x_train.columns)) - - -# Define which variables to pass, force and filter - -# In[20]: - -mask_pass = (df_selections.preselect == 1) & (df_selections[modelrun]==0) -varname_list_pass = 'D_'+df_selections.loc[mask_pass,'variable'] -length_pass = len(varname_list_pass) - -mask_force = (df_selections.preselect == 1) & (df_selections[modelrun]==1) -varname_list_force = 'D_'+df_selections.loc[mask_force,'variable'] -length_force = len(varname_list_force) - - -# Execute forward modeling process - -# In[21]: - -tic = time.time() -use_predictors = varname_list_force #x_train.columns -for i in range(1,n_steps+1): - try: - # Use predictors to be forced first. Once through the list, append the remaining variables to be passed. - use_predictors = varname_list_force.append(varname_list_pass[[i>length_force]*length_pass]).reset_index(drop=True) - result = forward(current_predictors=predictors - ,pool_predictors= use_predictors - ,positive_only=True) - best_models.loc[i] = result[0] - predictors = best_models.loc[i].predictor_names - log.append(result[1]) - except Exception as e: - # Normal errors (i.e. 
no more predictors to be used / no models with only positive coefficients) - if e.args[-1]=='NormalStop': - log.append("Stopped modeling at "+str(i)+" predictors: "+ e.args[-2]) - # Other unknown errors - else: - log.append("Stopped modeling at "+str(i)+" predictors: unknown error") - break -toc = time.time() -log.append("Forward selection modeling: " + str(round((toc-tic)/60,0)) + " min"+"\n") - - -# --- - -# ### Optimal model criterion - -# Define functions - -# In[22]: - -def comparefit(p,g=2): - # We fit a second degree (g=2) polyline through our auccurve - # This serves as a starting base for finding our optimal stopping point - import numpy as np - import pandas as pd - z = np.polyfit(p.index, p, g) - f = np.poly1d(z) - y_new = f(p.index) - return pd.Series(y_new,index=p.index) - - -# In[23]: - -def slopepoint(p,p_fit,thresh_ratio=0.2): - # We take the polyline from comparefit and look for the point of which the slope lies just below some percentage of the max. slope - slopes = [p_fit[i+1]-p_fit[i] for i in range(1,len(p_fit))] - slopes = pd.Series(slopes, index=range(1,len(p_fit))) - thresh = slopes.max()*thresh_ratio - p_best_index = (slopes[slopes>thresh])[-1:].index - p_best = p.loc[p_best_index] - return p_best - - -# In[24]: - -def moveright(p,p_fit,p_best,n_steps=5,dampening=0.01): - # We look nsteps right on the polyline (starting from the slopepoint) and take the point with largest difference with real line - # We move to that point if that difference is larger than some multiplication of the difference at the slopepoint - # That multiplication gets larger as current the current difference gets smaller with a certain amount of dampening. 
- # The rationale behind this is as follows: - # if the current difference is already large than the larger difference will definitely be noteworthy - # if however the current difference is near zero than there needs to be much larger difference to be noteworthy - in_index = p_best.index.values[0] - lower = (in_index-1) - upper = (in_index+n_steps-1) - p_diff = p[lower:upper]-p_fit[lower:upper] - out_index = p_diff.argmax() - factor = 1/abs(p_diff[in_index]) - if (p_diff[out_index]>p_diff[in_index]+(abs(p_diff[in_index])*factor*dampening)): - p_best_new = pd.Series(p[out_index],index=[out_index]) - else: - p_best_new = p_best - return p_best_new - - -# In[25]: - -def moveleft(p,p_fit,p_best,rangeloss=0.1, diffshare=0.8): #diff_min=0.005): - # Starting from whatever point we end up with (either the slopepoint or a move to the right) - # We look left on the polyline and take the point for which the real line is largest (current point included) - # We move left if we stay within [a specific % loss of range] AND [a minimum % of current difference] - # i.e. we don't won't to go to low compared to the overall real line - # and we don't won't to move to a point that does not make a significant increase in AUC (i.e. 
difference between polyline and real line) - p_left = p[:p_best.index.values[0]] - p_best = p_left[p_left==p_left.max()] - p_diff = p-p_fit - p_range = p.max()-p.min() - s = p[(p >= p_best.values[0]-(rangeloss*p_range)) - & (p.index <= p_best.index.values[0]) - & (p_diff>=diffshare*p_diff[p_left.index[-1]]) - ] - p_best_new = s[s.index == s.index.values.min()] - return p_best_new - - -# Execute functions - -# In[26]: - -points = best_models.auc_selection -points_fit = comparefit(p=points, g=2) -points_slope = slopepoint(p=points, p_fit=points_fit, thresh_ratio=0.2) -points_right = moveright(p=points, p_fit=points_fit, p_best=points_slope, n_steps=5, dampening=0.01) -points_left = moveleft(p=points, p_fit=points_fit, p_best=points_right, rangeloss=0.1, diffshare=0.8) - -optimal_nvars = points_left.index.values[0] - - -# Inspect - -# %matplotlib inline -# from pylab import rcParams -# rcParams['figure.figsize'] = 10, 5 -# -# plt.plot( points.index , points , color="blue") -# plt.plot( points_fit.index , points_fit , color="red") -# plt.plot( points_slope.index, points_slope,'o', color="lightgreen", markersize=12) -# plt.plot( points_right.index, points_right,'o', color="black" , markersize=8) -# plt.plot( points_left.index , points_left ,'o', color="gold" , markersize=4) -# -# axes = plt.gca() -# axes.set_ylim([0.45,1]) -# plt.show() - -# --- - -# ### Cumulative gains/response - -# Define functions - -# In[28]: - -# Compute cumulative response/gains -def cumulatives(y,yhat,perc_as_int=False,dec=2): - nrows = len(y) - npositives = y.sum() - y_yhat = pd.DataFrame({"y":y, "yhat":yhat}).sort_values(by='yhat', ascending=False).reset_index(drop=True) - cresp = [] - cgains = [0] - for stop in (np.linspace(0.01,1,100)*nrows).astype(int): - cresp.append(round(y_yhat.loc[:stop,'y'].mean()*max(100*int(perc_as_int),1),dec)) - cgains.append(round(y_yhat.loc[:stop,'y'].sum()/npositives*max(100*int(perc_as_int),1),dec)) - return cresp,cgains - - -# Execute functions - -# In[29]: - 
-cresp_all = [None] -cgains_all = [None] -for i in range(1,len(best_models)+1): - out = cumulatives(y=y_selection - ,yhat=best_models.pred_selection[i][:,0] - ,perc_as_int=True - ,dec=2) - cresp_all.append(out[0]) - cgains_all.append(out[1]) - - -# Inspect - -# %matplotlib inline -# from pylab import rcParams -# rcParams['figure.figsize'] = 10, 5 -# -# cmap = plt.get_cmap('hot') -# colors = [cmap(i) for i in np.linspace(0, 1, n_steps)] -# for i in range(1,len(best_models)): -# plt.plot(range(1,101), cresp_all[i], color=colors[i-1]) -# plt.plot(range(1,101), cresp_all[-1], color="black") -# -# axes = plt.gca() -# axes.set_ylim([0,max(max(l) for l in np.array(cresp_all)[1:])]) -# plt.show() - -# %matplotlib inline -# from pylab import rcParams -# rcParams['figure.figsize'] = 10, 5 -# -# cmap = plt.get_cmap('hot') -# colors = [cmap(i) for i in np.linspace(0, 1, n_steps)] -# for i in range(1,len(best_models)): -# plt.plot(range(0,101), cgains_all[i], color=colors[i-1]) -# plt.plot(range(0,101), cgains_all[-1], color="black") -# -# axes = plt.gca() -# axes.set_ylim([0,max(max(l) for l in np.array(cgains_all)[1:])]) -# plt.show() - -# --- - -# ### Variable Importance - -# Define function - -# In[32]: - -# Compute variable importance based on correlation between predictor and prediction (on selection set) -def getImportance(model): - from scipy import stats - - predictors = [pred[2:] for pred in model.predictor_names] - pearcorr = [] - for predictor in predictors: - pearsonr = stats.pearsonr(x_selection.loc[:,'D_'+predictor].values, model.pred_selection[:,0]) - pearcorr.append(pearsonr[0].round(2)) - df_result = pd.DataFrame({'variable':predictors,'importance':pearcorr}, columns=['variable','importance']) - return df_result - - -# Execute function - -# In[33]: - -importance_all=[None] -for i in best_models.index: - importance_all.append(getImportance(best_models.loc[i,:])) - - -# Inspect - -# %matplotlib inline -# from pylab import rcParams -# rcParams['figure.figsize'] = 
10, 5 -# -# #nvars = optimal_nvars -# nvars = len(best_models) -# -# fig, ax = plt.subplots() -# predictors = importance_all[nvars].variable -# y_pos = np.arange(len(predictors)) -# importance = importance_all[nvars].importance -# -# ax.barh(y_pos, importance, align='center', -# color='darkblue', ecolor='black') -# ax.set_yticks(y_pos) -# ax.set_yticklabels(predictors) -# ax.invert_yaxis() -# ax.set_xlabel('Importance') -# plt.show() - -# --- - -# ### Model Coefficients - -# In[35]: - -# Store all variable names + coefficients for every best model (with 1,2,3,... variables) -vars_out = [] -coef_out = [] -nmod_out = [] -for i in best_models.index: - modout = best_models.loc[i,:] - vars_out_st = ['Intercept']+[var[2:] for var in modout.predictor_names] - vars_out.append(vars_out_st) - coef_out_st = list(modout.modelfit.intercept_)+list(+ modout.modelfit.coef_[0]) - coef_out.append(coef_out_st) - nmod_out.append([i]*(i+1)) - -vars_out = list(chain.from_iterable(vars_out)) -coef_out = list(chain.from_iterable(coef_out)) -nmod_out = list(chain.from_iterable(nmod_out)) - - -# In[36]: - -df_coeff = pd.DataFrame({'nstep':nmod_out,'varname':vars_out,'coeff':coef_out}, columns=['nstep','varname','coeff']) - - -# --- - -# ### Export Files - -# In[37]: - -nmods = len(best_models) - - -# Auc curve - -# In[38]: - -filename = root+"/data/modeling/"+modelrun+"_auccurve.csv" -with open(filename, 'w') as csvfile: - write=csv.writer(csvfile, delimiter =';') - write.writerow(["optimal" ,optimal_nvars]) - write.writerow(["selected",optimal_nvars]) - write.writerow(["variable","train", "selection","validation"]) - write.writerows([best_models.predictor_lastadd[i][2:] - , best_models.auc_train[i].round(3) - , best_models.auc_selection[i].round(3) - , best_models.auc_validation[i].round(3) ] for i in range(1,nmods+1)) - - -# Cresp - -# In[39]: - -for v in range(1,nmods+1): - filename = root+"/data/modeling/"+modelrun+"_cresp_"+str(v)+".csv" - with open(filename, 'w') as csvfile: - 
write=csv.writer(csvfile, delimiter =';') - write.writerows([i+1, cresp_all[v][i]] for i in range(0,100)) - - -# Cgains - -# In[40]: - -for v in range(1,nmods+1): - filename = root+"/data/modeling/"+modelrun+"_cgains_"+str(v)+".csv" - with open(filename, 'w') as csvfile: - write=csv.writer(csvfile, delimiter =';') - write.writerows([i, cgains_all[v][i]] for i in range(0,101)) - - -# Variable importance - -# In[41]: - -for v in range(1,nmods+1): - filename = root+"/data/modeling/"+modelrun+"_importance_"+str(v)+".csv" - with open(filename, 'w') as csvfile: - write=csv.writer(csvfile, delimiter =';') - write.writerow(['variable','importance']) - write.writerows([importance_all[v].iloc[i,0],importance_all[v].iloc[i,1]] for i in range(v)) - - -# Model coefficients - -# In[42]: - -out_path = root+"/data/modeling/"+modelrun+"_modelcoeff.csv" -df_coeff.to_csv(path_or_buf=out_path, sep=';', index=False, encoding='utf-8', line_terminator='\n', quoting=csv.QUOTE_NONNUMERIC) - - -# Log messages - -# In[43]: - -log.append("-- Modeling phase completed --"+"\n") - - -# In[44]: - -log_file = open(root+"/python/"+modelrun+"_modeling.log",'w') -log_file.write('\n'.join(log)) -log_file.close() - - -# --- - -# ### Stop script - -# In[45]: - -print("ok") - - -# --- - -# # WIP - -# Scoring all rows - -# # Scoring of all rows -# import re -# tic = time.time() -# df_score = pd.DataFrame([]) -# df_score['ID'] = df_in['ID'] -# scores = [] -# for i in range(len(df_in)): -# ### METHOD 1: using function -# score = [optifit.predict_proba(df_in[optivars])[i,:][-1]] -# ### METHOD 2: with coefficients (same method as in scoring) -# #exponent = optiint + ((df_in[optivars].iloc[i,:])*(opticoef[0])).sum() -# #score = [(math.exp(exponent)) / (1+math.exp(exponent))] -# -# scores.extend(score) -# try: -# zeros = re.findall('[0]+$',str(i)) -# if len(zeros[0])>=3: -# print(i) -# except: -# a=1 -# df_score['score']=pd.Series(scores) -# tac = time.time() -# print((tac-tic)/60) -# -# -# 
df_in.to_csv('df_mod.csv', sep=';', index=False, encoding='utf-8', line_terminator='\n') -# df_score.to_csv(path_or_buf='scores_modeling.csv', sep=';', index=False, encoding='utf-8', line_terminator='\n') - -# # /WIP diff --git a/legacy_code/script_sprint4_1.py b/legacy_code/script_sprint4_1.py deleted file mode 100644 index af1fce9..0000000 --- a/legacy_code/script_sprint4_1.py +++ /dev/null @@ -1,38 +0,0 @@ -import pandas as pd -import numpy as np - - - - -filename_modeltab = "C:\/wamp64\/www\/python_predictions_4\/assets\/scripts\/data\/univariate\/modeltab_info.csv" -filename_varsel = "C:\/wamp64\www\/python_predictions_4\/assets\/scripts\/data\/univariate\/variable_selections.csv" - -# Variable_selections.csv is the file that needs to be modified -df_varsel = pd.read_csv(filename_varsel, sep=";") - -# From modeltab_info.csv we get the information which column in variable_selections.csv needs to be modified -# as well as which model-tab_auccurve.csv needs to be imported to use as template-filler -df_tab = pd.read_csv(filename_modeltab, sep=";") -new = df_tab.loc[df_tab.key=="new", "value"].reset_index(drop=True)[0] -new_template = df_tab.loc[df_tab.key=="new_template", "value"].reset_index(drop=True)[0] - -# The correct model-tab_auccurve.csv is imported from which the variables to be forced are read -# This list can be deduced from the line "selected" and the column "variable" -template_list = [] -if new_template.upper()!="SCRATCH": - filename_auccurve = "C:\/wamp64\/www\/python_predictions_4\/assets\/scripts\/data\/modeling\/"+new_template+"_auccurve.csv" - df_usersel = pd.read_csv(filename_auccurve, sep=";", nrows =2, header=None, names=["option","value"]) - nvars = df_usersel.loc[df_usersel.option == "selected", "value"].reset_index(drop=True)[0] - df_template = pd.read_csv(filename_auccurve, sep=";", skiprows=2) - template_list = list(df_template.loc[:nvars-1, "variable"]) -mask_force= np.array([var in template_list for var in df_varsel.variable]) - - -# 
Apply template to variable_selections.csv -df_varsel.loc[:,[new]] = 0 -df_varsel.loc[mask_force,[new]] = 1 - -# Export to csv -df_varsel.to_csv(filename_varsel, sep=";", index=False) - -print("ok") diff --git a/legacy_code/script_sprint4_2.py b/legacy_code/script_sprint4_2.py deleted file mode 100644 index 5579a72..0000000 --- a/legacy_code/script_sprint4_2.py +++ /dev/null @@ -1,458 +0,0 @@ - -# coding: utf-8 - -# # Scoring - -# Purpose of this script is to output (an)other script(s) with self-contained code to score out a basetable -# -# Call this script: the scriptmaker -# -# Call the created self-contained script: the scoring script - -# ### Import libraries - -# In[1]: - -import time -import math -import csv -import re -import pandas as pd -import numpy as np - - -# --- - -# ### Miscellaneous - -# In[2]: - -log = [] - - -# In[3]: - -# When code is in script, we define the path of the script's parent folder location as the root directory -# From this root we can travel to the relevant folders with minimal adjustment -try: - root = os.path.dirname(os.path.realpath(__file__)) - root = "/".join(root.split('\\')[:-1]) - log.append('Dynamic paths'+'\n') -except: - root = 'C:/wamp64/www/python_predictions_4/assets/scripts' - log.append('Static paths'+'\n') - - -# --- - -# ### Read the data and create variables to be exported - -# ##### 1A - Retrieve Modeltab info to find out for which modeltab we want a scoring script for -# Intermediate step for 1C - -# In[4]: - -df_modeltab = pd.read_csv(root+'/data/univariate/modeltab_info.csv',sep=';', index_col=0, header=None).T -modeltabtoscore = df_modeltab.score[1] - - -# ##### 1B - Retrieve number of vars selected to identify which n-th model of the 'modeltab'-models we are interested in -# Intermediate step for 1C - -# In[5]: - -df_auccurve_path = root+"/data/modeling/"+modeltabtoscore+"_auccurve.csv" -df_auccurve = open(df_auccurve_path).read() -selected_nvars = 
int(re.findall(r"selected;[0-9]+",df_auccurve)[0].split(';')[-1]) -selected_nvars = len(pd.read_csv(df_auccurve_path,skiprows=3, sep=';')) # USED FOR TESTING ALL VARS, TO BE DELETED !!!!!!!!!!! - - -# ##### 1C - Retrieve model coefficients of the specific n-th model of the 'modeltab'-models -# -# Result is to be stored in text in the scoring script, to achieve self-containment - -# In[6]: - -df_modelcoeff_path = root+"/data/modeling/"+modeltabtoscore+"_modelcoeff.csv" -df_modelcoeff = pd.read_csv(df_modelcoeff_path, sep=';') -mask = df_modelcoeff.nstep == int(selected_nvars) -df_modrules = df_modelcoeff.loc[mask,:] # TO BE STORED in SCORING SCRIPT ------------------------------------------------------- - - -# ##### 2 - Retrieve Data types of the predictors -# -# Result is to be stored in text in the scoring script, to achieve self-containment - -# In[7]: - -types_path = root+"/python/data_types.csv" -df_types = pd.read_csv(types_path, header=None) -df_types.columns=['var','type'] # TO BE STORED in SCORING SCRIPT --------------------------------------------------------------- - - -# ##### 3 - Univariate table output for deriving the translation from original VAR to discretisized D_VAR -# Result is to be stored in text in the scoring script, to achieve self-containment - -# In[8]: - -df_univariate_path = root+"/data/univariate/df_univariate.csv" -# we create str converters for all the B_variables coming from the univariate table -# we need this because bool & str variables (defined by ...types.csv) from the basetable are converted to objects -# and will have to be compared to the B_variables values, which better have the same type -# e.g. 
we have a varflag in our basetable which is converted to an object, but assume B_varflag is not converted and will be automatically read as float 1.0/0.0 -# in our incidence replacement we will thus be comparing '1'/'0' with 1.0/0.0, which won't work -uni_iterable = [(variable,getattr(__builtins__, 'str')) for variable in 'B_'+df_modrules.varname[1:].values] -uni_dict = dict(uni_iterable) -df_uni = pd.read_csv(df_univariate_path, sep=";", converters=uni_dict) - - -# In[9]: - -dcolumns = ['D_'+name for name in df_modrules.varname[1:]] -bcolumns = ['B_'+name for name in df_modrules.varname[1:]] -gvar = [] -gincid = [] -gbin = [] - -for i in range(len(dcolumns)): - # Select B_varname and D_varname - # Then take unique combinations of B_var and D_var in the univariate dataframe - # These combinations give the incidence value to attribute to the (possibly discretisized/regrouped) variables - columns_set = dcolumns[i:i+1]+bcolumns[i:i+1] - df_dupli = df_uni.loc[:,columns_set].drop_duplicates() - n_occurences = len(df_dupli) - - gvar.extend([df_dupli.columns[0][2:]]*n_occurences) - gincid.extend(df_dupli.iloc[:,0].values) - gbin.extend(df_dupli.iloc[:,1].values) - - -df_prep = pd.DataFrame({'var':gvar,'bin':gbin,'incid':gincid} - ,columns=['var','bin','incid']) # TO BE STORED in SCORING SCRIPT -------------------------------- - - -# --- - -# ### Writing scoring scripts - -# ##### for R - -# In[10]: - -score_code = open(root+"/Python/scorecode.R",'w') - -score_code.write("### Importing libraries & basetable to score\n") -score_code.write("# Importing libraries\n") -score_code.write("#library(dplyr)\n") -score_code.write("# Importing Types\n") -score_code.write("typevariables=c"+str([var for var in df_types.loc[:,'var']]).replace("[","(").replace("]",")")+"\n") -score_code.write("typetypes=c"+str([vartype for vartype in df_types.loc[:,'type']]).replace("[","(").replace("]",")")+"\n") -score_code.write("df_types=data.frame(var=typevariables,type=typetypes, 
stringsAsFactors='False')\n") -score_code.write("df_types_copy = df_types\n") -score_code.write("df_types_copy$type[df_types_copy$type=='int'|df_types_copy$type=='float']='numeric'\n") -score_code.write("df_types_copy$type[df_types_copy$type=='str'|df_types_copy$type=='bool']='character'\n") -score_code.write("coltypes = df_types_copy$type\n") -score_code.write("names(coltypes) = df_types_copy$var\n") -score_code.write("# Importing Basetable (with similar typing as in univariate analysis)\n") -score_code.write("df_base = read.csv('df_base.csv', check.names='False', colClasses=coltypes )\n") - -score_code.write("### Creating dataframe containing model rules\n") -score_code.write("modvariables=c"+str([var for var in df_modrules.loc[:,'varname']]).replace("[","(").replace("]",")")+"\n") -score_code.write("modcoefficients=c"+str([coeff for coeff in df_modrules.loc[:,'coeff']]).replace("[","(").replace("]",")")+"\n") -score_code.write("df_modrules=data.frame(varname=modvariables,coeff=modcoefficients, stringsAsFactors='False')\n") -score_code.write("\n") - -score_code.write("### Creating dataframe containing incidence translation rules\n") -score_code.write("prepvariables=c"+str([var for var in df_prep.loc[:,'var']]).replace("[","(").replace("]",")")+"\n") -score_code.write("prepbins=c"+str([bin for bin in df_prep.loc[:,'bin']]).replace("[","(").replace("]",")")+"\n") -score_code.write("prepincids=c"+str([bin for bin in df_prep.loc[:,'incid']]).replace("[","(").replace("]",")")+"\n") -score_code.write("df_prep =data.frame(var=prepvariables,bin=prepbins,incid=prepincids, stringsAsFactors='False')\n") -score_code.write("\n") - -score_code.write("### Grouping basetable predictors along their types and trimming basetable accordingly\n") -score_code.write("predictors = df_modrules$varname[df_modrules$varname!='Intercept']\n") -score_code.write("not_predictors = subset(colnames(df_base),!(colnames(df_base) %in% predictors))\n") -score_code.write("mask_FloatOrInt = 
df_types$type=='int'|df_types$type=='float'\n") -score_code.write("numeric_headers = subset(df_types$var[mask_FloatOrInt], df_types$var[mask_FloatOrInt] %in% predictors)\n") -score_code.write("object_headers = subset(df_types$var[df_types$type=='str'], df_types$var[df_types$type=='str'] %in% predictors)\n") -score_code.write("bool_headers = subset(df_types$var[df_types$type=='bool'], df_types$var[df_types$type=='bool'] %in% predictors)\n") -score_code.write("df_base = df_base[c(predictors,'ID')]\n") -score_code.write("\n") - -score_code.write("### Preprocessing the basetable\n") -score_code.write("# Strip quot function\n") -score_code.write("strip_quot<-function(x){\n") -score_code.write(' x = gsub("') -score_code.write("'") -score_code.write('","",x)\n') -score_code.write(" x = gsub('") -score_code.write('"') -score_code.write("','',x)\n") -score_code.write(" x = trimws(x)\n") -score_code.write(" return(x)\n") -score_code.write("}\n") -score_code.write("# Lower/upper function\n") -score_code.write("lower_upper<-function(x){\n") -score_code.write(" if (tolower(x)=='id'|tolower(x)=='target'){\n") -score_code.write(" x = toupper(x)\n") -score_code.write(" }\n") -score_code.write(" else {\n") -score_code.write(" x = tolower(x)\n") -score_code.write(" }\n") -score_code.write("}\n") -score_code.write("# maskmissing function in str/bool columns\n") -score_code.write("maskmissing<-function(var){\n") -score_code.write(" crit1 = is.na(var)\n") -score_code.write(" crit2 = var==''\n") -score_code.write(" return(crit1|crit2)\n") -score_code.write("}\n") -score_code.write("# Apply preprocessing functions\n") -score_code.write("colnames(df_base) = sapply(colnames(df_base), lower_upper)\n") -score_code.write("colnames(df_base) = sapply(colnames(df_base), strip_quot)\n") -score_code.write("df_base[] = lapply(df_base, strip_quot)\n") -score_code.write("for (predictor in c(object_headers,bool_headers)){\n") -score_code.write(" 
df_base[maskmissing(df_base[predictor]),predictor]='Missing'\n") -score_code.write("}\n") -score_code.write("\n") - -score_code.write("### Incidence replacement\n") -score_code.write("# Recipient dataframe\n") -score_code.write("df_out = data.frame(ID=df_base$ID)\n") -score_code.write("# Incidence replacement for string columns\n") -score_code.write("for (header in c(object_headers,bool_headers)){\n") -score_code.write(" mask = df_prep$var==header\n") -score_code.write(" bins = df_prep[mask,'bin']\n") -score_code.write(" incidences = df_prep[mask,'incid']\n") -score_code.write(" nonsig_bins = c()\n") -score_code.write(" nonsig_incidences = c()\n") -score_code.write(" if (sum(bins == 'Non-significants')>0) {\n") -score_code.write(" nonsig_bins = subset(unique(df_base[,header]), !(unique(df_base[,header]) %in% bins))\n") -score_code.write(" nonsig_incidences = rep(incidences[bins=='Non-significants'],length(nonsig_bins))\n") -score_code.write(" }\n") -score_code.write(" keys = c(bins,nonsig_bins)\n") -score_code.write(" values = c(incidences,nonsig_incidences)\n") -score_code.write(" df_out[paste('D_',header, sep='')] = values[match(df_base[,header], keys)]\n") -score_code.write("}\n") -score_code.write("# Incidence replacement for numeric columns\n") -score_code.write("for (header in numeric_headers){\n") -score_code.write(" mask = df_prep$var==header\n") -score_code.write(" bins = df_prep[mask,'bin']\n") -score_code.write(" incidences = df_prep[mask,'incid']\n") -score_code.write(" index_missing = which(bins=='Missing')\n") -score_code.write(" incidence_missing = incidences[index_missing]\n") -score_code.write(" upper_values = c()\n") -score_code.write(" last <- function(x) { return( x[length(x)] ) }\n") -score_code.write(" for (binn in bins){\n") -score_code.write(" upper_value = last(unlist(strsplit(binn,',')))\n") -score_code.write(" upper_value = tryCatch(as.numeric(gsub('([0-9]+).*$', '\\") -score_code.write("\\") -score_code.write("1',upper_value)), 
warning=function(e) Inf)\n") -score_code.write(" upper_values = c(upper_values,upper_value)\n") -score_code.write(" }\n") -score_code.write(" if(!identical(index_missing,integer(0))) upper_values = upper_values[-index_missing]\n") -score_code.write(" if(!identical(index_missing,integer(0))) incidences = incidences[-index_missing]\n") -score_code.write(" upper_values_incidences = incidences[order(upper_values)]\n") -score_code.write(" upper_values = upper_values[order(upper_values)]\n") -#score_code.write(" incidence_replaced_values = c()\n") -#score_code.write(" for (original_value in as.numeric(df_base[,header])){\n") -#score_code.write(" if (is.na(original_value)){\n") -#score_code.write(" incidence_to_attribute = incidence_missing\n") -#score_code.write(" }\n") -#score_code.write(" else {\n") -#score_code.write(" lowest_membership = min(which(original_value<=upper_values))\n") -#score_code.write(" incidence_to_attribute = upper_values_incidences[lowest_membership]\n") -#score_code.write(" }\n") -#score_code.write(" incidence_replaced_values = c(incidence_replaced_values,incidence_to_attribute)\n") -#score_code.write(" }\n") -#score_code.write(" df_out[paste('D_',header, sep='')] = incidence_replaced_values\n") -score_code.write(" mask_nan = is.na(df_base[,header])\n") -score_code.write(" lowest_memberships = findInterval(as.numeric(df_base[,header]), upper_values * (1 + .Machine$double.eps)) + 1\n") -score_code.write(" incidences_to_attribute = upper_values_incidences[lowest_memberships]\n") -score_code.write(" incidences_to_attribute[mask_nan] = incidence_missing\n") -score_code.write(" df_out[paste('D_',header, sep='')] = incidences_to_attribute\n") -score_code.write("}\n") -score_code.write("\n") - -score_code.write("### Scoring\n") -score_code.write("df_scores = data.frame(ID=as.numeric(as.character(df_out$ID)))\n") -score_code.write("scores = c()\n") -score_code.write("intercept="+str(df_modrules.coeff.values[0])+"\n") 
-score_code.write("coefficients=c"+str([coeff for coeff in df_modrules.coeff][1:]).replace("[","(").replace("]",")")+"\n") -score_code.write("productsums = rowSums(t(t(df_out[,paste('D_',predictors,sep='')])*coefficients))\n") -score_code.write("exponents = intercept + productsums\n") -score_code.write("scores = sapply(exponents, FUN = function(x) (exp(x)) / (1+exp(x)))\n") -score_code.write("df_scores['score']=scores\n") -score_code.write("\n") - -score_code.close() - - -# ##### for Python - -# In[11]: - -score_code = open(root+"/Python/scorecode.py",'w') - -score_code.write("### Importing libraries & basetable to score\n") -score_code.write("# Importing Libraries\n") -score_code.write("import time\nimport math\nimport csv\nimport re\nimport pandas as pd\nimport numpy as np\n") -score_code.write("# Importing Types\n") -score_code.write("typevariables="+str([var for var in df_types.loc[:,'var']])+"\n") -score_code.write("typetypes="+str([vartype for vartype in df_types.loc[:,'type']])+"\n") -score_code.write("df_types=pd.DataFrame({'var':typevariables,'type':typetypes},columns=['var','type'])\n") -score_code.write("df_types_copy = df_types.copy()\n") -score_code.write("bool_mask = df_types_copy.loc[:,'type']!='bool'\n") -score_code.write("df_types_copy.loc[bool_mask,'type'] = [getattr(__builtins__, type_str) for type_str in df_types_copy.loc[bool_mask,'type']]\n") -score_code.write("df_types_copy.loc[bool_mask==False,'type'] = getattr(__builtins__, 'str')\n") -score_code.write("types = df_types_copy.set_index('var').T.to_dict('records')\n") -score_code.write("# Importing Basetable with similar typing as in univariate analysis\n") -score_code.write("df_base = pd.read_csv('df_base.csv',header=0,sep=None,engine='python',converters=types[0])\n") -score_code.write("\n") - -score_code.write("### Creating dataframe containing model rules\n") -score_code.write("modvariables="+str([var for var in df_modrules.loc[:,'varname']])+"\n") 
-score_code.write("modcoefficients="+str([coeff for coeff in df_modrules.loc[:,'coeff']])+"\n") -score_code.write("df_modrules=pd.DataFrame({'varname':modvariables,'coeff':modcoefficients})\n") -score_code.write("\n") - -score_code.write("### Creating dataframe containing incidence translation rules\n") -score_code.write("prepvariables="+str([var for var in df_prep.loc[:,'var']])+"\n") -score_code.write("prepbins="+str([bin for bin in df_prep.loc[:,'bin']])+"\n") -score_code.write("prepincids="+str([bin for bin in df_prep.loc[:,'incid']])+"\n") -score_code.write("df_prep = pd.DataFrame({'var':prepvariables,'bin':prepbins,'incid':prepincids}, dtype=object)\n") -score_code.write("df_prep.loc[:,'incid']=df_prep.loc[:,'incid'].astype('float64')\n") -score_code.write("\n") - -score_code.write("### Grouping basetable predictors along their types and trimming basetable accordingly\n") -score_code.write("predictors = list(df_modrules.loc[df_modrules.varname!='Intercept','varname'].values)\n") -score_code.write("not_predictors = [column for column in df_base.columns if column not in predictors]\n") -score_code.write("mask_FloatOrInt = (df_types.type=='int')|(df_types.type=='float')\n") -score_code.write("numeric_headers=[var for var in df_types.loc[mask_FloatOrInt,'var'].values if var in predictors]\n") -score_code.write("object_headers=[var for var in df_types.loc[df_types.type=='str','var'].values if var in predictors]\n") -score_code.write("bool_headers=[var for var in df_types.loc[df_types.type=='bool','var'].values if var in predictors]\n") -score_code.write("df_base = df_base[predictors+['ID']]\n") -score_code.write("\n") - -score_code.write("### Preprocessing the basetable\n") -score_code.write("# Strip quot function\n") -score_code.write("def strip_quot(x_in):\n") -score_code.write(" try:\n") -score_code.write(" x_out = x_in.strip().strip('") -score_code.write('"') -score_code.write("').strip(") -score_code.write('"') -score_code.write("'") -score_code.write('")\n') 
-score_code.write(" except:\n") -score_code.write(" x_out=x_in\n") -score_code.write(" return x_out\n") -score_code.write("# Lower/upper function\n") -score_code.write("def lower_upper(x_in):\n") -score_code.write(" if ((x_in.lower() == 'id')|(x_in.lower() == 'target')):\n") -score_code.write(" x_out = x_in.upper()\n") -score_code.write(" else:\n") -score_code.write(" x_out = x_in.lower()\n") -score_code.write(" return x_out\n") -score_code.write("# maskmissing function in str/bool columns\n") -score_code.write("def maskmissing(var):\n") -score_code.write(" crit1 = var.isnull()\n") -score_code.write(" modvar = pd.Series([str(value).strip() for value in var])\n") -score_code.write(" crit2 = modvar==pd.Series(['']*len(var))\n") -score_code.write(" return crit1 | crit2\n") -score_code.write("# Apply preprocessing functions\n") -score_code.write("df_base = df_base.rename(columns=strip_quot)\n") -score_code.write("df_base = df_base.rename(columns=lower_upper)\n") -score_code.write("df_base = df_base.applymap(strip_quot)\n") -score_code.write("for header in object_headers+bool_headers:\n") -score_code.write(" mask = maskmissing(df_base[header])\n") -score_code.write(" df_base.loc[mask,header]='Missing'\n") -score_code.write("\n") - -score_code.write("### Incidence replacement\n") -score_code.write("# Recipient dataframe\n") -score_code.write("df_out = pd.DataFrame()\n") -score_code.write("df_out['ID']=df_base['ID']\n") -score_code.write("# Incidence replacement for string columns\n") -score_code.write("for header in object_headers+bool_headers:\n") -score_code.write(" mask = df_prep.loc[:,'var']==header\n") -score_code.write(" bins = df_prep.loc[mask,'bin']\n") -score_code.write(" incidences = df_prep.loc[mask,'incid']\n") -score_code.write(" nonsig_bins = []\n") -score_code.write(" nonsig_incidences = []\n") -score_code.write(" if (bins == 'Non-significants').any():\n") -score_code.write(" nonsig_bins = [binn for binn in df_base[header].unique() if binn not in 
list(bins)]\n") -score_code.write(" nonsig_incidences = list(incidences[bins=='Non-significants'])*len(nonsig_bins)\n") -score_code.write(" keys = list(bins)\n") -score_code.write(" keys.extend(nonsig_bins)\n") -score_code.write(" values = list(incidences)\n") -score_code.write(" values.extend(nonsig_incidences)\n") -score_code.write(" keys_and_values = zip(keys,values)\n") -score_code.write(" transdic = dict(keys_and_values)\n") -score_code.write(" items_to_translate = df_base[header] \n") -score_code.write(" df_out.loc[:,'D_'+header]= pd.Series([transdic[item] for item in items_to_translate])\n") -score_code.write("# Incidence replacement for numeric columns\n") -score_code.write("for header in numeric_headers:\n") -score_code.write(" mask = df_prep.loc[:,'var']==header\n") -score_code.write(" bins = df_prep.loc[mask,'bin']\n") -score_code.write(" incidences = df_prep.loc[mask,'incid']\n") -score_code.write(" index_missing = bins.index[bins=='Missing']\n") -score_code.write(" incidence_missing = incidences[index_missing]\n") -score_code.write(" upper_values = pd.Series([])\n") -score_code.write(" for i,binn in enumerate(bins.values):\n") -score_code.write(" upper_value = binn.split(',')[-1]\n") -score_code.write(" try:\n") -score_code.write(" upper_value = re.findall('[0-9]+',upper_value)[0]\n") -score_code.write(" except:\n") -score_code.write(" upper_value = math.inf\n") -score_code.write(" upper_values[i] = upper_value\n") -score_code.write(" upper_values.index = bins.index\n") -score_code.write(" upper_values.drop(index_missing, inplace=True)\n") -score_code.write(" upper_values = upper_values.astype(float)\n") -score_code.write(" upper_values.sort_values(inplace=True)\n") -score_code.write(" upper_values_incidences = incidences[upper_values.index]\n") -score_code.write(" upper_values.reset_index(drop=True, inplace=True)\n") -score_code.write(" upper_values_incidences.reset_index(drop=True, inplace=True)\n") -#score_code.write(" incidence_replaced_values = 
np.array([])\n") -#score_code.write(" for original_value in df_base[header]:\n") -#score_code.write(" lowest_membership = upper_values.index[original_value<=upper_values].min()\n") -#score_code.write(" try:\n") -#score_code.write(" incidence_to_attribute = upper_values_incidences[lowest_membership]\n") -#score_code.write(" except:\n") -#score_code.write(" if np.isnan(original_value):\n") -#score_code.write(" incidence_to_attribute = incidence_missing\n") -#score_code.write(" else:\n") -#score_code.write(" incidence_to_attribute = np.nan\n") -#score_code.write(" incidence_replaced_values = np.append(incidence_replaced_values,incidence_to_attribute)\n") -#score_code.write(" df_out['D_'+header] = pd.Series(incidence_replaced_values)\n") -score_code.write(" mask_npnan = df_base.loc[:,header].isnull()\n") -score_code.write(" lowest_memberships = upper_values.searchsorted(df_base.loc[:,header],side='left')\n") -score_code.write(" incidences_to_attribute = upper_values_incidences[lowest_memberships].reset_index(drop=True)\n") -score_code.write(" incidences_to_attribute[mask_npnan] = incidence_missing\n") -score_code.write(" df_out['D_'+header] = incidences_to_attribute\n") -score_code.write("\n") - - -score_code.write("### Scoring\n") -score_code.write("df_scores = pd.DataFrame([])\n") -score_code.write("df_scores['ID'] = df_out['ID']\n") -score_code.write("scores = []\n") -score_code.write("intercept="+str(df_modrules.coeff.values[0])+"\n") -score_code.write("coefficients=np.array("+str([coeff for coeff in df_modrules.coeff][1:])+")\n") -score_code.write("productsums = (df_out['D_'+pd.Series(predictors)]*coefficients).sum(axis=1)\n") -score_code.write("exponents = intercept + productsums\n") -score_code.write("scores = exponents.apply(func=lambda x:(math.exp(x)) / (1+math.exp(x)))\n") -score_code.write("df_scores['score']=scores\n") -score_code.write("\n") - -score_code.close() - - -# ##### for Sas - -# In[12]: - -# ... 
- -print('ok') - -# --- diff --git a/legacy_code/script_sprint4_3.py b/legacy_code/script_sprint4_3.py deleted file mode 100644 index 6860046..0000000 --- a/legacy_code/script_sprint4_3.py +++ /dev/null @@ -1,25 +0,0 @@ -import pandas as pd -import os - -try: - root = os.path.dirname(os.path.realpath(__file__)) - root = "/".join(root.split('\\')[:-1]) -except: - root = 'C:\/wamp64\/www\/python_predictions_4\/assets\/scripts\/python' - -auc_path = root + '\/data\/univariate\/aucs.csv' -df_in = pd.read_csv(auc_path, sep=';') -df_sortqual = df_in.sort_values(by=['AUC test','AUC train'], ascending=False).reset_index(drop=True) -df_sortname = df_in.sort_values(by=['variable']).reset_index(drop=True).reset_index(drop=True) -if (df_in.variable == df_sortqual.variable).all(): - df_out = df_sortname -else: - df_out = df_sortqual - -df_out.to_csv(path_or_buf=auc_path - ,sep=';' - ,index=False - ,encoding='utf-8' - ,line_terminator='\n') - -print('ok') diff --git a/legacy_code/univariate[171005].ipynb b/legacy_code/univariate[171005].ipynb deleted file mode 100644 index 3556837..0000000 --- a/legacy_code/univariate[171005].ipynb +++ /dev/null @@ -1,1516 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Univariate analysis" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### General Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import math\n", - "import csv\n", - "import warnings\n", - "import time\n", - "import os\n", - "import itertools\n", - "import scipy.integrate\n", - "import re" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from scipy 
import stats\n", - "from itertools import chain\n", - "from sklearn import metrics" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Miscellaneous" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "log = []" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# When code is in script, we define the path of the script's parent folder location as the 'root' directory\n", - "# From this 'root' we can travel to the relevant folders with minimal adjustment\n", - "try:\n", - " root = os.path.dirname(os.path.realpath(__file__))\n", - " root = \"/\".join(root.split('\\\\')[:-1])\n", - " log.append('Dynamic paths'+'\\n')\n", - "except:\n", - " root = 'C:/wamp64/www/python_predictions_4/assets/scripts'\n", - " log.append('Static paths'+'\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# To allow pandas dataframes to display more columns\n", - "pd.set_option(\"display.max_columns\",50)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Read data and organize" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Basetable and its types" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# A Types csv file CAN be defined to be used to convert variables (of the basetable, see below) to the desired data types\n", - "# The Types csv files should include one column with variable names and one column with desired types (e.g. int,float,str,bool)\n", - "# If no Types csv file is provided no convertions will be forced. 
In that case 'Python' will guess the data type of each column \n", - "types_path = root+\"/python/data_types.csv\"\n", - "types_exist = True\n", - "\n", - "try: \n", - " df_types = pd.read_csv(types_path, header=None)\n", - " bool_mask = df_types[1]!='bool'\n", - " # Extract the functions based on the given type (e.g. 'str' -> str, 'int' -> int), for proper convertion \n", - " df_types.loc[bool_mask,1] = [getattr(__builtins__, type_str) for type_str in df_types.loc[bool_mask,1]]\n", - " # A type 'bool' is also attributed the function str, for convertion\n", - " df_types.loc[bool_mask==False,1] = getattr(__builtins__, 'str')\n", - " #types = df_types[bool_mask].set_index(0).T.to_dict('records')\n", - " types = df_types.set_index(0).T.to_dict('records')\n", - "except FileNotFoundError:\n", - " types = [dict()]\n", - " types_exist = False" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# The basetable csv file should have column names as its first row\n", - "# The columns names should include 'TARGET', 'ID'\n", - "data_path = root+\"/python/data.csv\"\n", - "\n", - "df_in = pd.read_csv(data_path\n", - " ,header=0\n", - " ,sep=None\n", - " ,engine='python'\n", - " ,converters=types[0])\n", - "\n", - "# If no Types csv file was provided pd.read_csv guessed the types, we now output these types in a csv for re-use & later use\n", - "if types_exist == False:\n", - " filename = root+\"/python/data_types.csv\"\n", - " funtotype = lambda x:re.findall('[a-z]+',str(x))[0].replace('object','str')\n", - " with open(filename, 'w') as csvfile:\n", - " write=csv.writer(csvfile, delimiter =',')\n", - " write.writerows([column\n", - " ,funtotype(df_in[column].dtype)] for column in df_in.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# Function to remove quotes from variable names and/or variable values\n", - "def strip_quot(x_in):\n", - " try:\n", 
- " x_out = x_in.strip().strip('\"').strip(\"'\")\n", - " except:\n", - " x_out=x_in\n", - " return x_out\n", - "\n", - "# Function to put 'id' and 'target' variable names in uppercase, all other variable names are put in lowercase\n", - "# This is coded as to visually differentiate predictors from other variables\n", - "# But another combination of upper/lower is possible as well, e.g. all variable names in uppercase\n", - "def lower_upper(x_in):\n", - " if ((x_in.lower() == 'id')|(x_in.lower() == 'target')):\n", - " x_out = x_in.upper()\n", - " else:\n", - " x_out = x_in.lower()\n", - " return x_out\n", - "\n", - "# Function to group variable names based on the data type of the variable\n", - "# Could as well use the types in Types.csv\n", - "def get_headers(dataframe,type): \n", - " return dataframe.select_dtypes(include=[type]).columns.values" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Clean up quotes from column names\n", - "df_in = df_in.rename(columns=strip_quot)\n", - "\n", - "# Perform uppercase/lowercase transformation to column names\n", - "df_in = df_in.rename(columns=lower_upper)\n", - "\n", - "# Clean up quotes from column values\n", - "df_in = df_in.applymap(strip_quot)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Group variable (names) based on the respective data type of each variable\n", - "# With this information we know which variables are destined for equifrequency, regrouping or simply passing (see further)\n", - "other_headers = [n for n in [\"TARGET\",\"ID\"]]\n", - "try:\n", - " bool_headers = [n for n in df_types.loc[bool_mask==False,0].values if n not in other_headers]\n", - "except:\n", - " bool_headers = []\n", - "object_headers = [n for n in get_headers(df_in,'object') if n not in other_headers+bool_headers]\n", - "numeric_headers = [n for n in get_headers(df_in,'number') if n not in 
other_headers+bool_headers]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Analysis settings" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "# Import settings defined by the user\n", - "df_settings = pd.read_csv(root+'/python/analysis_settings.csv', sep=',', index_col=0, header=None).T" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Partitioning " - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Function to make partition sets (based on desired user settings) for both targets (0 & 1) \n", - "def partitionList(train_setting,selection_setting,validation_setting,sorted_target):\n", - " settings = {'train':train_setting,'selection':selection_setting,'validation':validation_setting}\n", - " parts = ['train','selection','validation']\n", - " result = []\n", - " for target in [sorted_target.iloc[0],sorted_target.iloc[-1]]:\n", - " target_length = (sorted_target==target).sum()\n", - " for part in parts:\n", - " result.extend( [part]*math.ceil(target_length*settings[part][1]/100) ) \n", - " return result" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "# Shuffle and sort on TARGET\n", - "df_in = df_in.iloc[np.random.permutation(len(df_in))].sort_values(by='TARGET', ascending=False).reset_index(drop=True)\n", - "\n", - "# Create partition based on analysis_setting.csv\n", - "partition = partitionList(train_setting=df_settings.loc[:,'partitioning_train']\n", - " ,selection_setting=df_settings.loc[:,'partitioning_selec']\n", - " ,validation_setting=df_settings.loc[:,'partitioning_valid']\n", - " ,sorted_target=df_in.TARGET) \n", - "\n", - "# Attach to dataframe\n", - "df_in[\"PARTITION\"] = partition[:len(df_in)]\n", - 
"\n", - "# Sampling based on analysis settings (if both sampling_settings are set to 100, all data is used)\n", - "sampling_settings = {1:df_settings.sampling_1, 0:df_settings.sampling_0}\n", - "if (int(sampling_settings[1])<100) | (int(sampling_settings[0])<100):\n", - " for sample in sampling_settings:\n", - " sample_length = int(round((df_in.TARGET==sample).sum() * sampling_settings[sample]/100))\n", - " drop_index = df_in[df_in.TARGET==sample].index[sample_length:]\n", - " df_in.drop(drop_index,inplace=True)\n", - " df_in.reset_index(drop=True, inplace=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Output Container" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Create output dataframe which will contain transformed variables\n", - "df_out = df_in.loc[:,[\"ID\",\"TARGET\",\"PARTITION\"]].copy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Preprocessing of continuous variables" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Discretization function for Continuous variables" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "### This function is a reworked version of pd.qcut to satisfy our particular needs\n", - "### Takes for var a continuous pd.Series as input and returns a pd.Series with bin-labels (e.g. 
[4,6[ )\n", - "### Train takes a series/list of booleans (note: we define bins based on the training set)\n", - "### Autobins reduces the number of bins (starting from nbins) as a function of the number of missings\n", - "### Nbins is the wished number of bins\n", - "### Precision=0 results in integer bin-labels if possible\n", - "### twobins=True forces the function to output at least two bins\n", - "### catchLarge tests if some groups (or missing group) are very large, and if so catches and outputs two groups\n", - "#### note: catchLarge makes twobins irrelevant\n", - "\n", - "def eqfreq(var, train, autobins=True, nbins=10, precision=0, twobins=True, catchLarge=True):\n", - " \n", - " \n", - " # Test for large groups and if one exists pass them with two bins: Large_group,Other\n", - " if catchLarge:\n", - " catchPercentage=1-(1/nbins)\n", - " groupCount = var[train].groupby(by=var[train]).count()\n", - " maxGroupPerc = groupCount.max()/len(var[train])\n", - " missingPerc = sum(var[train].isnull())/len(var[train])\n", - " if maxGroupPerc>=catchPercentage:\n", - " largeGroup = groupCount.sort_values(ascending=False).index[0]\n", - " x_binned = var.copy()\n", - " x_binned.name = 'B_'+var.name\n", - " x_binned[x_binned!=largeGroup]='Other'\n", - " cutpoints=None\n", - " info = (var.name+\": One large group, outputting 2 groups\")\n", - " return x_binned, cutpoints, info\n", - " elif missingPerc>=catchPercentage:\n", - " x_binned = var.copy()\n", - " x_binned.name = 'B_'+var.name\n", - " x_binned[x_binned.isnull()]='Missing'\n", - " x_binned[x_binned!='Missing']='Other'\n", - " cutpoints=None\n", - " info = (var.name+\": One large missing group, outputting 2 groups\")\n", - " return x_binned, cutpoints, info\n", - " # Adapt number of bins as a function of number of missings\n", - " if autobins:\n", - " length = len(var[train])\n", - " missing_total = var[train].isnull().sum()\n", - " missing_perten = missing_total/length*10\n", - " nbins = 
max(round(10-missing_perten)*nbins/10 ,1)\n", - " # Store the name and index of the variable\n", - " name = var.name\n", - " series_index = var.index\n", - " # Transform var and train to a np.array and list respectively, which is needed for some particular function&methods\n", - " x = np.asarray(var)\n", - " train = list(train)\n", - " # First step in finding the bins is determining what the quantiles are (named as cutpoints)\n", - " # If the quantile lies between 2 points we use lin interpolation to determine it\n", - " cutpoints = var[train].quantile(np.linspace(0,1,nbins+1),interpolation = 'linear')\n", - " # If the variable results only in 2 unique quantiles (due to skewness) increase number of quantiles until more than 2 bins can be formed\n", - " if twobins:\n", - " extrasteps = 1\n", - " # Include a max. extrasteps to avoid infinite loop\n", - " while (len(cutpoints.unique())<=2) & (extrasteps<20):\n", - " cutpoints = var[train].quantile(np.linspace(0,1,nbins+1+extrasteps),interpolation = 'linear')\n", - " extrasteps+=1\n", - " # We store which rows of the variable x lies under/above the lowest/highest cutpoint \n", - " # Without np.errstate(): xcutpoints.max() can give if x contains nan values (missings)\n", - " # However the function will result in False in both >&< cases, which is a correct result, so the warning can be ignored\n", - " with np.errstate(invalid='ignore'):\n", - " under_lowestbin = x < cutpoints.min()\n", - " above_highestbin= x > cutpoints.max()\n", - "\n", - "\n", - " def _binnedx_from_cutpoints(x, cutpoints, precision, under_lowestbin, above_highestbin):\n", - " ### Attributes the correct bin ........................\n", - " ### Function that, based on the cutpoints, seeks the lowest precision necessary to have meaningful bins\n", - " ### e.g. (5.5,5.5] ==> (5.51,5.54]\n", - " ### Attributes those bins to each value of x, to achieve a binned version of x \n", - " \n", - " # Store unique cutpoints (e.g. 
from 1,3,3,5 to 1,3,5) to avoid inconsistensies when bin-label making\n", - " # Indeed, bins [...,1], (1,3], (3,3], (3,5], (5,...] do not make much sense\n", - " # While, bins [...,1], (1,3], (3,5], (5,...] do make sense\n", - " unique_cutpoints = cutpoints.unique()\n", - " # If there are only 2 unique cutpoints (and thus only one bin will be returned), \n", - " # keep original values and code missings as 'Missing'\n", - " if len(unique_cutpoints) <= 2:\n", - " cutpoints = None\n", - " x_binned = pd.Series(x)\n", - " x_binned[x_binned.isnull()] = 'Missing'\n", - " info = (var.name+\": Only one resulting bin, keeping original values instead\")\n", - " return x_binned, cutpoints, info\n", - " # Store info on whether or not the number of resulting bins equals the desired number of bins\n", - " elif len(unique_cutpoints) < len(cutpoints):\n", - " info = (var.name+\": Resulting # bins < whished # bins\")\n", - " else:\n", - " info = (var.name+\": Resulting # bins as desired\")\n", - " # Finally, recode the cutpoints (which can have doubles) as the unique cutpoints\n", - " cutpoints = unique_cutpoints\n", - " \n", - " # Store missing values in the variable as a mask, and create a flag to test if there are any missing in the variable\n", - " na_mask = np.isnan(x)\n", - " has_nas = na_mask.any()\n", - " # Attribute to every x-value the index of the cutpoint (from the sorted cutpoint list) which is equal or higher than\n", - " # the x-value, effectively encompasing that x-value.\n", - " # e.g. for x=6 and for sorted_cutpoint_list=[0,3,5,8,...] the resulting_index=3 \n", - " ids = cutpoints.searchsorted(x, side='left')\n", - " # x-values equal to the lowest cutpoint will recieve a ids value of 0\n", - " # but our code to attribute bins to x-values based on ids (see end of this subfunction) requires a min. value of 1\n", - " ids[x == cutpoints[0]] = 1\n", - " # Idem as previous: x-values below the lowest cutpoint should recieve a min. 
value of 1\n", - " if under_lowestbin.any():\n", - " ids[under_lowestbin] = 1\n", - " # Similar as previous: x-values above the highest cutpoint should recieve the max. allowed ids\n", - " if above_highestbin.any():\n", - " max_ids_allowed = ids[(above_highestbin == False) & (na_mask==False)].max()\n", - " ids[above_highestbin] = max_ids_allowed\n", - " # Maximal ids can now be defined if we neglect ids of missing values\n", - " max_ids = ids[na_mask==False].max()\n", - " \n", - " # Based on the cutpoints create bin-labels\n", - " # Iteratively go through each precision (= number of decimals) until meaningful bins are formed\n", - " # If theoretical bin is ]5.51689,5.83654] we will prefer ]5.5,5.8] as output bin\n", - " increases = 0\n", - " original_precision = precision\n", - " while True:\n", - " try:\n", - " bins = _format_bins(cutpoints, precision)\n", - " except ValueError:\n", - " increases += 1\n", - " precision += 1\n", - " #if increases >= 5:\n", - " #warnings.warn(\"Modifying precision from \"+str(original_precision)+\" to \"+str(precision)+\" to achieve discretization\")\n", - " #print(\"Modifying precision from \"+str(original_precision)+\" to \"+str(precision)+\" to achieve discretization\")\n", - " else:\n", - " break\n", - " \n", - " # Make array of bins to allow vector-like attribution\n", - " bins = np.asarray(bins, dtype=object)\n", - " # If x has nas: for each na-value, set the ids-value to max_ids+1\n", - " # this will allow na-values to be attributed the highest bin which we define right below\n", - " if has_nas:\n", - " np.putmask(ids, na_mask, max_ids+1)\n", - " # The highest bin is defined as 'Missing'\n", - " bins = np.append(bins,'Missing')\n", - " # ids-1 is used as index in the bin-labels list to attribute a bin-label to each x. Example:\n", - " # x=6 sorted_cutpoint_list=[0,3,5,8,...] 
ids=3 levels=[[0,3],(3,5],(5,8],...]\n", - " # The correct bin level for x is (5,8] which has index 2 which is equal to the ids-1\n", - " x_binned = bins[ids-1]\n", - " return x_binned, cutpoints, info\n", - " \n", - "\n", - " def _format_bins(cutpoints, prec):\n", - " # Based on the quantile list create bins. Raise error if values are similar within one bin.\n", - " # On error _binnedx_from_cutpoints will increase precision\n", - " \n", - " fmt = lambda v: _format_label(v, precision=prec)\n", - " bins = []\n", - " for a, b in zip(cutpoints, cutpoints[1:]):\n", - " fa, fb = fmt(a), fmt(b)\n", - " \n", - " if a != b and fa == fb:\n", - " raise ValueError('precision too low')\n", - " \n", - " formatted = '(%s, %s]' % (fa, fb)\n", - " bins.append(formatted)\n", - " \n", - " bins[0] = '[...,' + bins[0].split(\",\")[-1]\n", - " bins[-1] = bins[-1].split(\",\")[0] + ',...]'\n", - " return bins\n", - "\n", - "\n", - " def _format_label(x, precision):\n", - " # For a specific precision, returns the value formatted with the appropriate amount of numbers after comma and correct brackets\n", - " \n", - " if isinstance(x,float):\n", - " frac, whole = np.modf(x)\n", - " sgn = '-' if x < 0 else ''\n", - " whole = abs(whole)\n", - " if frac != 0.0:\n", - " val = '{0:.{1}f}'.format(frac, precision)\n", - " val = _trim_zeros(val)\n", - " if '.' 
in val:\n", - " return sgn + '.'.join(('%d' % whole, val.split('.')[1]))\n", - " else: \n", - " if '0' in val:\n", - " return sgn + '%0.f' % whole\n", - " else:\n", - " return sgn + '%0.f' % (whole+1)\n", - " else:\n", - " return sgn + '%0.f' % whole\n", - " else:\n", - " return str(x)\n", - "\n", - "\n", - " def _trim_zeros(x):\n", - " # Removes unnecessary zeros and commas\n", - " while len(x) > 1 and x[-1] == '0':\n", - " x = x[:-1]\n", - " if len(x) > 1 and x[-1] == '.':\n", - " x = x[:-1]\n", - " return x\n", - "\n", - "\n", - " x_binned, cutpoints, info = _binnedx_from_cutpoints(x, cutpoints, precision=precision, under_lowestbin=under_lowestbin, above_highestbin=above_highestbin)\n", - " x_binned = pd.Series(x_binned, index=series_index, name=\"B_\"+name)\n", - " return x_binned, cutpoints, info" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# WIP" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "for n in numeric_headers:\n", - " result = eqfreq(var=df_in[n]\n", - " ,train=df_in[\"PARTITION\"]==\"train\"\n", - " ,autobins=True\n", - " ,nbins=int(df_settings.discretization_nbins)\n", - " ,precision=0\n", - " ,twobins=True\n", - " ,catchLarge=True)\n", - " print(n)\n", - " print(result[0].unique())\n", - " print('\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# /WIP" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Apply function to continuous variables" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "tic = time.time()\n", - "# We loop only through the numeric variables\n", - "for n in numeric_headers:\n", - " # Perform the equifrequency function\n", - " result = eqfreq(var=df_in[n]\n", - " ,train=df_in[\"PARTITION\"]==\"train\"\n", - " ,autobins=True\n", - " ,nbins=int(df_settings.discretization_nbins)\n", - " ,precision=0\n", - " ,twobins=True\n", - " 
,catchLarge=False) # TRUE OPTION STILL PRODUCES ERROR IN SORTNUMERIC function AND SCORING procedure !!!!!!!!!\n", - " df_out = pd.concat([df_out,result[0]], axis=1)\n", - " log.append(result[2])\n", - "toc = time.time()\n", - "log.append(\"Discretisation: \"+str(toc-tic)+\" sec\"+\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Preprocessing of categorical variables" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Function for labeling missing/empty values" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Check which values of a var are empty strings or null values\n", - "def maskmissing(var):\n", - " # Check if values are null\n", - " crit1 = var.isnull()\n", - " # Check if values are empty strings\n", - " modvar = pd.Series([str(value).strip() for value in var])\n", - " crit2 = modvar==pd.Series(['']*len(var))\n", - " #crit2 = var==pd.Series(['']*len(var))\n", - " #crit3 = var==pd.Series([' ']*len(var))\n", - " return crit1 | crit2 " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Regrouping Function for nominal/ordinal variables" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "# Regrouping function for categorical variables\n", - "# Each group is tested with a chi² for relevant incidence differences in comparison to a rest-group\n", - "# The rest group has the size of the remaining groups and an 'overall average incidence' (if dummy=True) or \n", - "# 'remaining groups average incidence' (if dummy=False)\n", - "# Groups with a pvalue above the threshold are relabled to a single group\n", - "\n", - "def regroup(var,target,train,pval_thresh=0.01,dummy=True,keep='Missing',rename='Other'):\n", - " \n", - " # Define the chi² test condition\n", - 
" # Groups that do not meet the condition are not analyzed and will be unconditionally relabled\n", - " def _chi2cond_(var=var,target=target,train=train):\n", - " varcounts = var[train].groupby(by=var).count()\n", - " train_inc = target[train].sum()/len(target[train])\n", - " factor = max(train_inc, 1-train_inc)\n", - " analyze_mask = (varcounts*factor)>5\n", - " analyze_groups = analyze_mask.index[analyze_mask].values\n", - " return analyze_groups\n", - " \n", - " # Compute overal incidence mean\n", - " incidence_mean = target[train].mean()\n", - " # Create container of which groups will be kept, compared to the groups which will be relabled\n", - " keepgroups = []\n", - " # Cycle and test each group that meets the chi² condition\n", - " for group in _chi2cond_():\n", - " # Container for target 0/1 observations of the group under scrutiny\n", - " obs_group = []\n", - " # Counts of the target 0/1 occurences for the group under scrutiny\n", - " obs_group.append(((target[train]==0)&(var[train]==group)).sum())\n", - " obs_group.append(((target[train]==1)&(var[train]==group)).sum())\n", - " obs_group = np.array(obs_group)\n", - " # Container for target 0/1 observations of the remaining groups together\n", - " obs_other = []\n", - " # Counts of the target 0/1 occurences for the remaining groups together\n", - " obs_other.append(((target[train]==0)&(var[train]!=group)).sum())\n", - " obs_other.append(((target[train]==1)&(var[train]!=group)).sum())\n", - " obs_other = np.array(obs_other)\n", - " # If dummy=True, we scale the two groups of target 0/1 occurences such that the incidence is equal to the overall incidence\n", - " # The size of the two groups of target 0/1 occurences is still equal to the size of the remaining groups\n", - " if dummy:\n", - " obs_other_size = obs_other.sum()\n", - " obs_other[0]=(1-incidence_mean)*obs_other_size # 0(1) index coincides with target = 0(1)\n", - " obs_other[1]=( incidence_mean)*obs_other_size\n", - " obs = 
np.array([obs_group,obs_other])\n", - " # Place at least 1 observation to avoid error in chi2 test\n", - " obs[obs==0] = 1\n", - " # Perform chi² test\n", - " pval = stats.chi2_contingency(obs, correction=False)[1]\n", - " # If pval outperforms threshold, append the group in the keepgroups list\n", - " if pval<=pval_thresh:\n", - " keepgroups.append(group)\n", - " #elif group==keep:\n", - " # keepgroups.append(group)\n", - " # If the specific group to be kept (e.g. 'Missing') didn't pass the test, append it to the keepgroups list\n", - " if keep not in keepgroups:\n", - " keepgroups.append(keep)\n", - " # Makes a list of all groups not in the keepgroups list\n", - " regroup_mask = [val not in keepgroups for val in var.values]\n", - " var_regroup = var.copy()\n", - " # Rename those groups\n", - " var_regroup[regroup_mask] = rename\n", - " var_regroup.name = \"B_\"+var.name\n", - " info = (var.name+\": from \"+str(len(var.unique()))+\" to \"+str(len(var_regroup.unique())))\n", - " return var_regroup, info" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "Apply function to nominal/ordinal variables" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "tic = time.time()\n", - "# We loop only through the categorical variables\n", - "for h in object_headers:\n", - " # We label missing and empty values for categorical variables as 'Missing'\n", - " # Note the interaction with the 'keep' parameter of the regroup function.\n", - " mask = maskmissing(df_in[h])\n", - " df_in.loc[mask,h]='Missing'\n", - " # Perform regrouping function\n", - " result = regroup(var=df_in[h]\n", - " ,target=df_in.loc[:,'TARGET']\n", - " ,train=df_in.PARTITION=='train'\n", - " ,pval_thresh=float(df_settings.regrouping_signif)\n", - " ,dummy=True\n", - " ,keep='Missing'\n", - " ,rename='Non-significants')\n", - " df_out = pd.concat([df_out,result[0]],axis=1)\n", - " log.append(result[1])\n", 
- "toc = time.time()\n", - "log.append(\"Regrouping: \"+str(toc-tic)+\" sec\"+\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Preprocessing of boolean variables" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Defining Function to pass variables as is" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# We could just rename them or put them with the regoup function, but for now let's keep consistent with the other functions\n", - "def passvar(var):\n", - " var_pass = var.copy()\n", - " var_pass.name = \"B_\"+var.name\n", - " info = (\"Passing \"+var.name)\n", - " return var_pass, info" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Executing function" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "tic = time.time()\n", - "# We loop only through the boolean variables\n", - "for b in bool_headers:\n", - " # We label missing and empty values for boolean variables as 'Missing'\n", - " mask = maskmissing(df_in[b])\n", - " df_in.loc[mask,b]='Missing'\n", - " # Perform the passvar function\n", - " result = passvar(var=df_in[b])\n", - " df_out = pd.concat([df_out,result[0]],axis=1)\n", - " log.append(result[1])\n", - "toc = time.time()\n", - "log.append(\"Passing: \"+str(toc-tic)+\" sec\"+\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "### Incidence Replacement" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "Function for incidence replacement" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": true - 
}, - "outputs": [], - "source": [ - "def increp(b_var, target, train): \n", - " #get variable name\n", - " name = b_var.name\n", - " #get overall incidence \n", - " incidence_mean = target[train].mean()\n", - " #get incidence per group\n", - " incidences = target[train].groupby(b_var).mean()\n", - " #construct dataframe with incidences\n", - " idf = pd.DataFrame(incidences).reset_index()\n", - " #get values that are in the data but not in the labels\n", - " bin_labels = incidences.index\n", - " newgroups = list(set(b_var.unique()) ^ set(bin_labels))\n", - " #if newgroups, add mean incidence to incidence dataframe for each new group\n", - " if len(newgroups)>0:\n", - " #make dataframe:\n", - " ngdf = pd.DataFrame(newgroups)\n", - " ngdf.columns = [name]\n", - " ngdf[\"TARGET\"] = incidence_mean\n", - " #dataframe with incidences: \n", - " idf = idf.append(ngdf)\n", - " #dataframe with the variable\n", - " vdf = pd.DataFrame(b_var)\n", - " #discretized variable by merge\n", - " d_var = pd.merge(vdf,idf,how='left',on=name)[\"TARGET\"]\n", - " return pd.Series(d_var, name=\"D_\"+name[2:]) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Apply function for incidence replacement" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "# We define the columns destined for incidence replacement\n", - "headers_for_incidrep = [h for h in df_out.columns if ((h not in ['ID','TARGET','PARTITION']) & (h[:2]==\"B_\"))]" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "tic = time.time()\n", - "# We loop only through the columns destined for incidence replacement\n", - "for n in headers_for_incidrep:\n", - " # Perform increp function\n", - " result = increp(b_var=df_out[n]\n", - " ,target=df_out.TARGET\n", - " ,train=df_out.PARTITION==\"train\")\n", - " df_out = pd.concat([df_out,result], axis=1)\n", - " 
log.append(n+ \" processed\")\n", - "toc = time.time()\n", - "log.append(\"Incidence replacement: \"+str(toc-tic)+\" sec\"+\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Calculate AUCS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Function for auc calculation" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def getauc(var, target, partition): \n", - " \n", - " y = np.array(target[partition])\n", - " pred = np.array(var[partition])\n", - " pred = pred.astype(np.float64)\n", - " fpr, tpr, thresholds = metrics.roc_curve(y,pred, pos_label=1)\n", - " \n", - " return metrics.auc(fpr, tpr)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Applying function for auc calculation" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# We define the columns for which an AUC score should be computed\n", - "headers_for_auc = [h for h in df_out.columns if ((h not in ['ID','TARGET','PARTITION']) & (h[:2]==\"D_\"))]" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "auc_list_all = []\n", - "parts = [\"train\",\"selection\"]\n", - "tic = time.time()\n", - "# We loop only through those columns for which an AUC score should be computed\n", - "for header in headers_for_auc:\n", - " auc_list_var = [header[2:]]\n", - " # We loop through the two sets ('train' and 'selection') for which an AUC score is needed\n", - " for part in parts:\n", - " # Perform getauc function\n", - " auc_value = getauc(var=df_out[header]\n", - " ,target=df_out.TARGET\n", - " ,partition=df_out.PARTITION==part)\n", - " auc_list_var.append(auc_value.round(2)) #We round auc values to 2 decimals\n", - " 
auc_list_all.append(auc_list_var)\n", - " log.append(header + \" processed\")\n", - "# We create a supplementary dataframe destined for Cobra input \n", - "df_auc = pd.DataFrame(auc_list_all,columns=['variable','AUC train','AUC test'])\n", - "toc = time.time()\n", - "log.append(\"Auc: \"+str(toc-tic)+\" sec\"+\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Preselection" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "tic = time.time()\n", - "# We identify those variables for which the AUC score is above the user-defined threshold\n", - "auc_thresh = df_auc.loc[:,'AUC test'] > float(df_settings.preselection_auc)\n", - "# We identify those variables for which the AUC score difference between 'train' and 'selection' is within the user-defined ratio\n", - "auc_overtrain = (df_auc.loc[:,'AUC train']*100 - df_auc.loc[:,'AUC test']*100) < float(df_settings.preselection_overtrain)\n", - "# Only those variables passing the 2 criteria above are preselected\n", - "preselect = auc_thresh & auc_overtrain" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "# We create a supplementary dataframe destined for Cobra input \n", - "df_variable_selections = pd.DataFrame({'variable':df_auc.variable\n", - " ,'preselect':preselect.astype(int)\n", - " ,'Default':np.zeros(len(preselect)).astype(int)\n", - " ,'Alternative 1':np.zeros(len(preselect)).astype(int)\n", - " ,'Alternative 2':np.zeros(len(preselect)).astype(int)\n", - " ,'Alternative 3':np.zeros(len(preselect)).astype(int)\n", - " ,'Alternative 4':np.zeros(len(preselect)).astype(int)\n", - " ,'Alternative 5':np.zeros(len(preselect)).astype(int)}\n", - " ,columns=['variable'\n", - " ,'preselect'\n", - " ,'Default'\n", - " ,'Alternative 1'\n", - " ,'Alternative 2'\n", 
- " ,'Alternative 3'\n", - " ,'Alternative 4'\n", - " ,'Alternative 5'])" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "for i,var in enumerate(df_variable_selections.variable):\n", - " log.append(var+\" \"+np.array(['passed','filtered'])[df_variable_selections.preselect][i])\n", - "toc = time.time()\n", - "log.append(\"Preselection: \"+str(toc-tic)+\" sec\"+\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Calculate Correlations" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "# We define the columns for which a correlation score should be computed\n", - "headers_for_corr = [h for h in df_out.columns if ((h not in ['ID','TARGET','PARTITION']) & (h[:2]==\"D_\"))]" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "train = df_out.PARTITION==\"train\"\n", - "tic = time.time()\n", - "dataforcorr = np.transpose(np.matrix(df_out.loc[train,headers_for_corr],dtype=float))\n", - "mat_corr = np.corrcoef(dataforcorr)\n", - "toc = time.time()\n", - "log.append(\"Correlations: \"+str(toc-tic)+\" sec\"+\"\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "df_corr = pd.DataFrame(mat_corr)\n", - "df_corr.columns = headers_for_corr\n", - "df_corr.index = headers_for_corr" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Export files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Table of all Auc values" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "auc_path = 
root+'/data/univariate/aucs.csv'\n", - "df_auc = df_auc.sort_values(by=['AUC test','AUC train'], ascending=False).reset_index(drop=True)\n", - "df_auc.to_csv(path_or_buf=auc_path\n", - " ,sep=';'\n", - " ,index=False\n", - " ,encoding='utf-8'\n", - " ,line_terminator='\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Tables of Incidences & Correlations per variable" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Function for sorting cont.variables, whether or not they have undergone discritization\n", - "def sortnumeric(dataframe):\n", - " \n", - " lowestnumber = 0\n", - " # If the variable was discretisized\n", - " if '[...' in [str(l)[:4] for l in dataframe.group.values]:\n", - " unsorted_labels = dataframe.group.values\n", - " label_items=[]\n", - " for label in unsorted_labels:\n", - " # For each bin label, retain the first value\n", - " label_items.append(label.split(\",\")[0].strip(\"[\").strip(\"(\"))\n", - " label_items=np.asarray(label_items)\n", - " # Special cases that are not numeric are given numbers\n", - " lowestnumber = label_items[(label_items!=\"...\")&(label_items!=\"Missing\")].astype('float64').min()\n", - " label_items[label_items=='...']= lowestnumber-1\n", - " label_items[label_items=='Missing']= lowestnumber-2\n", - " # argsort based on the numbers\n", - " rank = label_items.astype('float64').argsort()\n", - " return rank\n", - " \n", - " # If the variable wasn't discretisized, simply argsort on the numbers\n", - " else:\n", - " label_items = dataframe.group.values\n", - " if len(label_items)>1:\n", - " lowestnumber = label_items[label_items.astype('O')!=\"Missing\"].astype('float64').min()\n", - " label_items[label_items.astype('O')=='Missing']= lowestnumber-2\n", - " rank = label_items.astype('float64').argsort()\n", - " return rank" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - 
"outputs": [], - "source": [ - "# Function for sorting cont.variables, whether or not they have undergone discritization\n", - "def sortnumeric_old(dataframe):\n", - " \n", - " # If the variable was discretisized\n", - " if dataframe.group.dtype=='object': #or# if np.array([str(unsorted_labels[i])[0] in [\"[\",\"(\",\"M\"] for i in range(0,len(unsorted_labels))]).all():\n", - " unsorted_labels = dataframe.group.values\n", - " label_items=[]\n", - " for label in unsorted_labels:\n", - " # For each bin label, retain the first value\n", - " label_items.append(label.split(\",\")[0].strip(\"[\").strip(\"(\"))\n", - " label_items=np.asarray(label_items)\n", - " # Special cases that are not numeric are given numbers\n", - " lowestnumber = label_items[(label_items!=\"...\")&(label_items!=\"Missing\")].astype('float64').min()\n", - " label_items[label_items=='...']= lowestnumber-1\n", - " label_items[label_items=='Missing']= lowestnumber-2\n", - " # argsort based on the numbers\n", - " rank = label_items.astype('float64').argsort()\n", - " return rank\n", - " \n", - " # If the variable wasn't discretisized, simply argsort on the numbers\n", - " else:\n", - " rank = dataframe.group.values.argsort()\n", - " return rank" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Function for sorting cat. 
variables\n", - "def sortobject(dataframe):\n", - " # Sort dataframe on increasing incidence values\n", - " unsorted_incidences = dataframe.incidence.values\n", - " rank = unsorted_incidences.argsort()\n", - " return rank" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "n_decimals = 2\n", - "average = round(df_out.TARGET[df_out.PARTITION==\"train\"].mean(),n_decimals)\n", - "\n", - "headers_to_output = list(df_auc['variable'])\n", - "for i,varname in enumerate(headers_to_output):\n", - " b_varname = 'B_'+varname\n", - " d_varname ='D_'+varname\n", - " #INCIDENCE CSV's\n", - " incidence_path = root+\"/data/univariate/incidence_\"+str(varname)+\".csv\"\n", - " groups_and_incidences = df_out.TARGET[df_out.PARTITION=='train'].groupby(df_out[b_varname]).mean()\n", - " n_groups= len(groups_and_incidences)\n", - " group = groups_and_incidences.index\n", - " incidence = groups_and_incidences.values.round(n_decimals)\n", - " size = df_out.TARGET[df_out.PARTITION=='train'].groupby(df_out[b_varname]).size().astype(float).values\n", - " df_incidence = pd.DataFrame( {'group':group\n", - " ,'incidence':incidence\n", - " ,'size':size\n", - " ,'average':average}\n", - " ,columns=['group','incidence','size','average'])\n", - " if varname in numeric_headers:\n", - " df_incidence = df_incidence.iloc[sortnumeric(df_incidence),:]\n", - " elif varname in object_headers:\n", - " df_incidence = df_incidence.iloc[sortobject(df_incidence),:]\n", - " else:\n", - " a=1\n", - " #df_incidence = df_incidence.iloc[sortother(df_incidence),:]\n", - " df_incidence.to_csv(path_or_buf=incidence_path\n", - " ,sep=';'\n", - " ,index=False\n", - " ,encoding='utf-8'\n", - " ,line_terminator='\\n') #quoting=csv.QUOTE_NONNUMERIC\n", - " \n", - " #CORRELATION CSV's\n", - " correlation_path = root+\"/data/univariate/correlations_\"+str(varname)+\".csv\"\n", - " Variable = [v.strip(\"D_\") for v in 
df_corr[d_varname].index]\n", - " Correlation = abs(df_corr[d_varname].values).round(n_decimals)\n", - " Sign = np.array([\"+\",\"-\"])[(df_corr[d_varname].values<0).astype(int)]\n", - " AUC = np.array([df_auc.loc[df_auc['variable']== v,'AUC test'].values[0] for v in Variable]).round(n_decimals)\n", - " df_correlation = pd.DataFrame({\"Variable\":Variable\n", - " ,\"Correlation\":Correlation\n", - " ,\"Sign\":Sign\n", - " ,\"AUC\": AUC}\n", - " ,columns=[\"Variable\",\"Correlation\",\"Sign\",\"AUC\"]) \n", - " df_correlation.sort_values(by='Correlation', ascending=False, inplace=True)\n", - " df_correlation = df_correlation.loc[df_correlation.Variable!=varname,:]\n", - " df_correlation.to_csv(path_or_buf=correlation_path\n", - " ,sep=';'\n", - " ,index=False\n", - " ,encoding='utf-8'\n", - " ,line_terminator='\\n') # quoting=csv.QUOTE_NONNUMERIC" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Variable Preselections" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "selections_path = root+'/data/univariate/variable_selections.csv'\n", - "df_variable_selections.to_csv(path_or_buf=selections_path\n", - " ,sep=';'\n", - " ,index=False\n", - " ,encoding='utf-8'\n", - " ,line_terminator='\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Result dataframe for Modeling input" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "out_path = root+\"/data/univariate/df_univariate.csv\"\n", - "df_out.to_csv(path_or_buf=out_path, sep=';', index=False, encoding='utf-8', line_terminator='\\n', quoting=csv.QUOTE_NONNUMERIC)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Modeltab reset" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Generate modeltab info\n", - 
"filename = root+\"/data/univariate/modeltab_info.csv\"\n", - "with open(filename, 'w') as csvfile:\n", - " write=csv.writer(csvfile, delimiter =';')\n", - " write.writerow([\"key\",\"value\"])\n", - " write.writerow([\"run\",\"Default\"])\n", - " write.writerow([\"new\",\"Alternative 1\"])\n", - " write.writerow([\"new_template\",\"Default\"])\n", - " write.writerow([\"champ\",\"Default\"])\n", - " write.writerow([\"score\",\"Default\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Log messages" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "log.append(\"-- Univariate analysis completed --\"+\"\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [], - "source": [ - "log_file = open(root+'/python/univariate.log','w')\n", - "log_file.write('\\n'.join(log))\n", - "log_file.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Stop script" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ok\n" - ] - } - ], - "source": [ - "print(\"ok\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} From d09bcdd7113317f4825b7745e70438df51340656 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 24 Dec 2019 11:06:46 +0100 Subject: [PATCH 27/98] 
Update README.md --- README.md | 64 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 2c025a6..50cb9ef 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,61 @@ # COBRA :snake: -**Cobra** here on GitHub is refactored web-based cobra originally developed by Guillaume. The goal is to wrap the back-end into easy to use Python package. +**Cobra** is a Python package that implements the Python Predictions methodology for predictive analytics. It consists of a main script/notebook that can be used to build and save a predictive model only by setting several parameters. The main script itself consists of several modules that can be used independently of one another to build custom scripts. -If you wish to modify the code, the best is to fork the repository or create another branch! +Note that this package is a refactored version of the back-end of the original web-based cobra, developed by _Guillaume Marion_ (former Python Predictions employee). -:heavy_exclamation_mark: Still lots of :bug: and under construction, keep that in mind:heavy_exclamation_mark: +:heavy_exclamation_mark: Be aware that there could still be :bug: in the code :heavy_exclamation_mark: -## What can Cobra 1.0 do: - * Transform given .csv to be ready to use for prediction modelling - * _Clense the headers, partition into train/selection/validation sets, sample, bins and regroups variables and add columns with incidence rate per categories._ +## Getting started + +These instructions will get you a copy of the project up and running on your local machine for usage, development and testing purposes. + +### What can cobra do? + + * Transform given pandas DataFrame to be ready to use for prediction modelling: partition into train/selection/validation sets, create bins from continuous variables, regroup categorical variables and add columns with incidence rate per category/bin. 
* Perform univariate selection based on AUC * Find best model by forward selection * Visualize the results * Allow iteration among each step for the analyst + +### Requirements + +This package requires the usual Python packages for data science: + +* numpy +* scipy +* matplotlib +* seaborn +* pandas +* scikit-learn + +These packages, along with their versions, are listed in `requirements.txt` and `conda_env.txt`. To install these packages using pip, run + +``` +pip install -r requirements.txt +``` + +or using conda + +``` +conda install --file requirements.txt +``` + +### Installation + +As this package is an internal package that is not open-sourced, it is not available through `pip` or `conda`. As a result, the package has to be installed manually using the following steps: + + * Clone this repository. + * Open a shell that can execute python code and navigate to the folder where this repo was cloned in. + * Once you are in the folder, execute `python setup.py install`. + +### Usage + +TO DO + +## Development + +We'd love you to contribute to the development of Cobra! To do so, clone the repo and create a _feature branch_ to do your development. Once you are finished, you can create a _pull request_ to merge it back into the main branch. + + -## Installation - * Clone this repository to your local PC (use GitHub Desktop). This assumes that the cloned repository will be in this directory `C:\Local\pers\Documents\GitHub\cobra` - * Open Powershell and navigate to that folder - * Once you are in the folder, execute `python setup.py install`. 
This is how the line should look like: - `PS C:\Local\pers\Documents\GitHub\cobra> python setup.py install` - * Restart kernel and you are ready to go - * For example of use, see the Jupyter Notebook in `examples` folder From 75eb75c3a8d64cf935ddd9445f57e24320010b36 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 24 Dec 2019 15:45:41 +0100 Subject: [PATCH 28/98] Add (de)serializers to CategoricalDataProcessor --- .../categorical_data_processor.py | 57 +++++++++++++++++ .../test_categorical_data_processor.py | 61 +++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index a77851f..26d9964 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -51,6 +51,10 @@ class CategoricalDataProcessor(BaseEstimator): Whether contingency table should be scaled before chi^2.' """ + valid_keys = ["regroup", "regroup_name", "keep_missing", + "category_size_threshold", "p_value_threshold", + "scale_contingency_table", "forced_categories"] + def __init__(self, regroup: bool=True, regroup_name: str="Other", keep_missing: bool=True, category_size_threshold: int=5, @@ -69,6 +73,59 @@ def __init__(self, regroup: bool=True, regroup_name: str="Other", # dict to store fitted output in self._combined_categories_by_column = {} + def attributes_to_dict(self) -> dict: + """Return the attributes of CategoricalDataProcessor as a dictionary + + Returns + ------- + dict + Contains the attributes of CategoricalDataProcessor instance with + the attribute name as key + """ + params = self.get_params() + + params["_combined_categories_by_column"] = { + key: list(value) + for key, value in self._combined_categories_by_column.items() + } + + return params + + def set_attributes_from_dict(self, params: dict): + """Set instance attributes from a dictionary of values with key the + name of the attribute. 
+ + Parameters + ---------- + params : dict + Contains the attributes of CategoricalDataProcessor with their + names as key. + + Raises + ------ + ValueError + In case _combined_categories_by_column is not of type dict + """ + _fitted_output = params.pop("_combined_categories_by_column", {}) + + if type(_fitted_output) != dict: + raise ValueError("_combined_categories_by_column is expected to " + "be a dict but is of type {} instead" + .format(type(_fitted_output))) + + # Clean out params dictionary to remove unknown keys (for safety!) + params = {key: params[key] for key in params if key in self.valid_keys} + + # We cannot turn this method into a classmethod as we want to make use + # of the following method from BaseEstimator: + self.set_params(**params) + + self._combined_categories_by_column = { + key: set(value) for key, value in _fitted_output.items() + } + + return self + def fit(self, data: pd.DataFrame, column_names: list, target_column: str): """Fit the CategoricalDataProcessor diff --git a/tests/preprocessing/test_categorical_data_processor.py b/tests/preprocessing/test_categorical_data_processor.py index 5155127..a918651 100644 --- a/tests/preprocessing/test_categorical_data_processor.py +++ b/tests/preprocessing/test_categorical_data_processor.py @@ -8,6 +8,67 @@ class TestCategoricalDataProcessor: + def test_attributes_to_dict(self): + + processor = CategoricalDataProcessor() + + combined_categories = ["a", "b", "c"] + processor._combined_categories_by_column = { + "variable": set(combined_categories) + } + + actual = processor.attributes_to_dict() + + expected = { + "regroup": True, + "regroup_name": "Other", + "keep_missing": True, + "category_size_threshold": 5, + "p_value_threshold": 0.001, + "scale_contingency_table": True, + "forced_categories": {}, + "_combined_categories_by_column": { + "variable": combined_categories + } + } + + assert actual == expected + + @pytest.mark.parametrize("attribute", + ["regroup", "regroup_name", "keep_missing", + 
"category_size_threshold", "p_value_threshold", + "scale_contingency_table", "forced_categories", + "_combined_categories_by_column"]) + def test_set_attributes_from_dict(self, attribute): + + processor = CategoricalDataProcessor() + + combined_categories = ["a", "b", "c"] + params = { + "regroup": True, + "regroup_name": "Other", + "keep_missing": True, + "category_size_threshold": 5, + "p_value_threshold": 0.001, + "scale_contingency_table": True, + "forced_categories": {}, + "_combined_categories_by_column": { + "variable": combined_categories + } + } + + expected = params[attribute] + + if attribute == "_combined_categories_by_column": + # list is transformed to a set in CategoricalDataProcessor + expected = {"variable": set(combined_categories)} + + processor.set_attributes_from_dict(params) + + actual = getattr(processor, attribute) + + assert actual == expected + @pytest.mark.parametrize("scale_contingency_table, expected", [(False, 0.013288667), (True, 0.434373)]) From 08eda64beefffd1a457ade8966687f215d03a5c3 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 24 Dec 2019 16:10:32 +0100 Subject: [PATCH 29/98] Fix in unittest for CategoricalDataProcessor --- tests/preprocessing/test_categorical_data_processor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/preprocessing/test_categorical_data_processor.py b/tests/preprocessing/test_categorical_data_processor.py index a918651..a1ea39a 100644 --- a/tests/preprocessing/test_categorical_data_processor.py +++ b/tests/preprocessing/test_categorical_data_processor.py @@ -28,7 +28,7 @@ def test_attributes_to_dict(self): "scale_contingency_table": True, "forced_categories": {}, "_combined_categories_by_column": { - "variable": combined_categories + "variable": list(set(combined_categories)) } } @@ -70,8 +70,8 @@ def test_set_attributes_from_dict(self, attribute): assert actual == expected @pytest.mark.parametrize("scale_contingency_table, expected", - [(False, 0.013288667), - 
(True, 0.434373)]) + [(False, 0.01329), + (True, 0.43437)]) def test_compute_p_value(self, scale_contingency_table, expected): X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) @@ -81,7 +81,7 @@ def test_compute_p_value(self, scale_contingency_table, expected): actual = (CategoricalDataProcessor ._compute_p_value(X, y, category, scale_contingency_table)) - assert pytest.approx(actual) == expected + assert pytest.approx(actual, abs=1e-5) == expected def test_get_small_categories(self): From 10cb556757c9770dc877296884a4cdf480004374 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 8 Jan 2020 13:59:15 +0100 Subject: [PATCH 30/98] Update .gitignore --- .gitignore | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 9ef222c..37c9f41 100644 --- a/.gitignore +++ b/.gitignore @@ -103,5 +103,9 @@ ENV/ # mypy .mypy_cache/ -#VScode settins +# VScode settins .vscode + +# Other ignore files +*.pptx +*.ppt From cff611e74b4a761a3ebfb38fcf56b4e54d3c1ad7 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 8 Jan 2020 13:59:56 +0100 Subject: [PATCH 31/98] Add fit_transform method to TargetEncoder --- cobra/preprocessing/kbins_discretizer.py | 2 +- cobra/preprocessing/target_encoder.py | 26 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 603bac5..5d126c9 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -307,7 +307,7 @@ def _transform_column(self, data: pd.DataFrame, data[column_name_bin] = pd.cut(x=data[column_name], bins=interval_idx) - # Rename bins so that the output has a proper format + # Rename bins so that the output has a proper format bin_labels = self._create_bin_labels(bins) data[column_name_bin] = (data[column_name_bin] diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 
328d431..ea63e13 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -217,6 +217,28 @@ def transform(self, data: pd.DataFrame, return data + def fit_transform(self, data: pd.DataFrame, + column_names: list, + target_column: str) -> pd.DataFrame: + """Summary + + Parameters + ---------- + data : pd.DataFrame + Data to be encoded + column_names : list + Columns of data to be encoded + target_column : str + Column name of the target + + Returns + ------- + pd.DataFrame + data with additional discretized variables + """ + self.fit(data, column_names, target_column) + return self.transform(data, column_names) + @staticmethod def _clean_column_name(column_name: str) -> str: """Clean column name string by removing "_bin" and adding "_enc" @@ -233,5 +255,9 @@ def _clean_column_name(column_name: str) -> str: """ if "_bin" in column_name: return column_name.replace("_bin", "") + "_enc" + elif "_processed" in column_name: + return column_name.replace("_processed", "") + "_enc" + elif "_cleaned" in column_name: + return column_name.replace("_cleaned", "") + "_enc" else: return column_name + "_enc" From b01f78d64c72ca1474aa3c11ba13000d8a7bfdbe Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 8 Jan 2020 14:00:47 +0100 Subject: [PATCH 32/98] Update utils.py --- cobra/utils.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/cobra/utils.py b/cobra/utils.py index ea09c64..e7057da 100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -26,11 +26,6 @@ def get_column_datatypes(data: pd.DataFrame, dict Description """ - column_names = list(data.columns) - - # dummies variables: case they have only 2 values - vars_dummy = set([col for col in column_names - if len(data[col].unique()) == 2]) # categorical vars vars_cat = (set(data.dtypes[data.dtypes == object].index) @@ -41,9 +36,6 @@ def get_column_datatypes(data: pd.DataFrame, bool_arr_is_numeric = is_number(data.dtypes) vars_numeric = 
set(data.columns[bool_arr_is_numeric]) - # remove dummy variables from set - vars_numeric = vars_numeric.difference(vars_dummy) - # Remark: numeric variables can still be "categorical" # i.e. when they only contain some distinct values! # We only consider a variable continuous if they have more distinct values @@ -62,8 +54,7 @@ def get_column_datatypes(data: pd.DataFrame, if id_column_name: vars_cat = vars_cat.difference(set([id_column_name])) if target_column_name: - vars_dummy = vars_dummy.difference(set([target_column_name])) + vars_cat = vars_cat.difference(set([target_column_name])) - return {"numeric_variables": vars_numeric, - "categorical_variables": vars_cat, - "dummy_variables": vars_dummy} + return {"numeric_variables": list(vars_numeric), + "categorical_variables": list(vars_cat)} From 6cc18a9d53a62e94463786d64d990724ad8f7817 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 8 Jan 2020 14:02:47 +0100 Subject: [PATCH 33/98] Change logic of CategoricalDataProcessor Instead of keeping a list per column of categories to merge, we now keep a list of categories to keep. This has the advantage that if a new category arises in a test set (or a future dataset to score), this will automatically be renamed to "Other" so that TargetEncoder can handle this. 
Unit tests were also updated for this change --- .../categorical_data_processor.py | 44 +++++++++++-------- .../test_categorical_data_processor.py | 33 +++++++------- 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 26d9964..0ce21e9 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -71,7 +71,7 @@ def __init__(self, regroup: bool=True, regroup_name: str="Other", self.forced_categories = forced_categories # dict to store fitted output in - self._combined_categories_by_column = {} + self._cleaned_categories_by_column = {} def attributes_to_dict(self) -> dict: """Return the attributes of CategoricalDataProcessor as a dictionary @@ -84,9 +84,9 @@ def attributes_to_dict(self) -> dict: """ params = self.get_params() - params["_combined_categories_by_column"] = { + params["_cleaned_categories_by_column"] = { key: list(value) - for key, value in self._combined_categories_by_column.items() + for key, value in self._cleaned_categories_by_column.items() } return params @@ -104,12 +104,12 @@ def set_attributes_from_dict(self, params: dict): Raises ------ ValueError - In case _combined_categories_by_column is not of type dict + In case _cleaned_categories_by_column is not of type dict """ - _fitted_output = params.pop("_combined_categories_by_column", {}) + _fitted_output = params.pop("_cleaned_categories_by_column", {}) if type(_fitted_output) != dict: - raise ValueError("_combined_categories_by_column is expected to " + raise ValueError("_cleaned_categories_by_column is expected to " "be a dict but is of type {} instead" .format(type(_fitted_output))) @@ -120,7 +120,7 @@ def set_attributes_from_dict(self, params: dict): # of the following method from BaseEstimator: self.set_params(**params) - self._combined_categories_by_column = { + self._cleaned_categories_by_column = { key: 
set(value) for key, value in _fitted_output.items() } @@ -153,14 +153,14 @@ def fit(self, data: pd.DataFrame, column_names: list, "skipped in fitting" .format(column_name)) continue - combined_cats = self._fit_column(data, column_name, target_column) + cleaned_cats = self._fit_column(data, column_name, target_column) # Remove forced categories forced_cats = self.forced_categories.get(column_name, set()) - combined_cats = combined_cats.difference(forced_cats) + cleaned_cats = cleaned_cats.union(forced_cats) - # Add to _combined_categories_by_column for later use - self._combined_categories_by_column[column_name] = combined_cats + # Add to _cleaned_categories_by_column for later use + self._cleaned_categories_by_column[column_name] = cleaned_cats def _fit_column(self, data: pd.DataFrame, column_name: str, target_column) -> set: @@ -185,7 +185,10 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, combined_categories = set() # replace missings and get unique categories as a list - X = CategoricalDataProcessor._replace_missings(data[column_name]) + X = (CategoricalDataProcessor + ._replace_missings(data[column_name]) + .astype(object)) + unique_categories = list(X.unique()) # get small categories and add them to the merged category list @@ -212,7 +215,7 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, if self.keep_missing: combined_categories.discard("Missing") - return combined_categories + return set(unique_categories).difference(combined_categories) def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: @@ -231,7 +234,7 @@ def transform(self, data: pd.DataFrame, data with additional discretized variables """ - if self.regroup and len(self._combined_categories_by_column) == 0: + if self.regroup and len(self._cleaned_categories_by_column) == 0: msg = ("{} instance is not fitted yet. 
Call 'fit' with " "appropriate arguments before using this method.") @@ -268,7 +271,7 @@ def _transform_column(self, data: pd.DataFrame, """ column_name_clean = column_name + "_processed" - data[column_name_clean] = data[column_name] + data[column_name_clean] = data[column_name].astype(object) # Fill missings first data[column_name_clean] = (CategoricalDataProcessor @@ -276,11 +279,14 @@ def _transform_column(self, data: pd.DataFrame, column_name_clean)) if self.regroup: - categories = self._combined_categories_by_column.get(column_name) + categories = self._cleaned_categories_by_column.get(column_name) if not categories: - log.warning("Column '{}' is not in fitted output " - "and will be skipped".format(column_name)) + # Log warning if categories is None, which indicates it is + # not in fitted output + if categories is None: + log.warning("Column '{}' is not in fitted output " + "and will be skipped".format(column_name)) return data data[column_name_clean] = (CategoricalDataProcessor @@ -425,4 +431,4 @@ def _replace_categories(data: pd.Series, categories: set) -> pd.Series: pd.Series Description """ - return data.apply(lambda x: x if x not in categories else "Other") + return data.apply(lambda x: x if x in categories else "Other") diff --git a/tests/preprocessing/test_categorical_data_processor.py b/tests/preprocessing/test_categorical_data_processor.py index a1ea39a..95ebc56 100644 --- a/tests/preprocessing/test_categorical_data_processor.py +++ b/tests/preprocessing/test_categorical_data_processor.py @@ -12,9 +12,9 @@ def test_attributes_to_dict(self): processor = CategoricalDataProcessor() - combined_categories = ["a", "b", "c"] - processor._combined_categories_by_column = { - "variable": set(combined_categories) + cleaned_categories = ["a", "b", "c"] + processor._cleaned_categories_by_column = { + "variable": set(cleaned_categories) } actual = processor.attributes_to_dict() @@ -27,8 +27,8 @@ def test_attributes_to_dict(self): "p_value_threshold": 0.001, 
"scale_contingency_table": True, "forced_categories": {}, - "_combined_categories_by_column": { - "variable": list(set(combined_categories)) + "_cleaned_categories_by_column": { + "variable": list(set(cleaned_categories)) } } @@ -38,12 +38,12 @@ def test_attributes_to_dict(self): ["regroup", "regroup_name", "keep_missing", "category_size_threshold", "p_value_threshold", "scale_contingency_table", "forced_categories", - "_combined_categories_by_column"]) + "_cleaned_categories_by_column"]) def test_set_attributes_from_dict(self, attribute): processor = CategoricalDataProcessor() - combined_categories = ["a", "b", "c"] + cleaned_categories = ["a", "b", "c"] params = { "regroup": True, "regroup_name": "Other", @@ -52,16 +52,16 @@ def test_set_attributes_from_dict(self, attribute): "p_value_threshold": 0.001, "scale_contingency_table": True, "forced_categories": {}, - "_combined_categories_by_column": { - "variable": combined_categories + "_cleaned_categories_by_column": { + "variable": cleaned_categories } } expected = params[attribute] - if attribute == "_combined_categories_by_column": + if attribute == "_cleaned_categories_by_column": # list is transformed to a set in CategoricalDataProcessor - expected = {"variable": set(combined_categories)} + expected = {"variable": set(cleaned_categories)} processor.set_attributes_from_dict(params) @@ -106,15 +106,16 @@ def test_replace_missings(self): pd.testing.assert_frame_equal(actual, expected) - @pytest.mark.parametrize("combined_categories, expected", - [({"c3", "c4"}, + @pytest.mark.parametrize("cleaned_categories, expected", + [({"c1", "c2"}, pd.Series(data=["c1", "c2", "Other", "Other"])), - ({}, pd.Series(data=["c1", "c2", "c3", "c4"]))]) - def test_replace_categories(self, combined_categories, expected): + ({"c1", "c2", "c3", "c4"}, + pd.Series(data=["c1", "c2", "c3", "c4"]))]) + def test_replace_categories(self, cleaned_categories, expected): data = pd.Series(data=["c1", "c2", "c3", "c4"]) actual = 
(CategoricalDataProcessor - ._replace_categories(data, combined_categories)) + ._replace_categories(data, cleaned_categories)) pd.testing.assert_series_equal(actual, expected) From 5e4a06dbccbdf89253666469d20a64c977daf577 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 8 Jan 2020 14:05:46 +0100 Subject: [PATCH 34/98] Add PreProcessor class as facade for preprocessing The PreProcessor class unifies all the logic for preprocessing and combines all preprocessing steps (which can be found in separate classes). This means that the PreProcessor class is the main class to use from on. Furthermore, it also has functionality to store the preprocessing pipeline as a json file so that it can be easily reused and methods for train-test split. --- cobra/preprocessing/preprocessor.py | 508 +++++++++++++++++++++++ tests/preprocessing/test_preprocessor.py | 36 ++ 2 files changed, 544 insertions(+) create mode 100644 cobra/preprocessing/preprocessor.py create mode 100644 tests/preprocessing/test_preprocessor.py diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py new file mode 100644 index 0000000..7cf1959 --- /dev/null +++ b/cobra/preprocessing/preprocessor.py @@ -0,0 +1,508 @@ +""" +This module is a rework of the old cobra data_preparation.py. Here we will make +use of the classes for discretization, preprocessing of categorical variables +and incidence replacement. All of which will be employed to create a +preprocessing pipeline, which can be stored as a JSON file so that it can +easily be re-used for scoring. 
+ +Authors: +- Geert Verstraeten (methodology) +- Matthias Roels (implementation) +""" +# std lib imports +import json +from typing import Optional +import inspect +from datetime import datetime + +import logging +log = logging.getLogger(__name__) +# third party imports +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError +# custom imports +from cobra.preprocessing import KBinsDiscretizer +from cobra.preprocessing import TargetEncoder +from cobra.preprocessing import CategoricalDataProcessor + +import cobra.utils as utils + + +class PreProcessor(BaseEstimator): + + """Summary + + Attributes + ---------- + categorical_data_processor : CategoricalDataProcessor + Instance of CategoricalDataProcessor to do the prepocessing of + categorical variables + discretizer : KBinsDiscretizer + Instance of KBinsDiscretizer to do the prepocessing of continuous + variables by means of discretization + numeric_threshold : int + Threshold to decide whether a numeric variable is in fact a categorical + one based on the number of unique values of that variable + selection_pct : float + Percentage of data to add to selection dataset + serialization_path : str + path to save the pipeline to + stratify_split : bool + Whether or not to stratify the train-test split + target_encoder : TargetEncoder + Instance of TargetEncoder to do the incidence replacement + train_pct : float + Percentage of data to add to training dataset + validation_pct : float + Percentage of data to add to validation dataset + """ + + def __init__(self, + train_pct: float, + selection_pct: float, + validation_pct: float, + stratify_split: bool, + categorical_data_processor: CategoricalDataProcessor, + discretizer: KBinsDiscretizer, + target_encoder: TargetEncoder, + threshold_numeric_is_categorical: int=None, + serialization_path: str=None, + continuous_vars: list=[], + discrete_vars: list=[]): + + 
self.train_pct = train_pct + self.selection_pct = selection_pct + self.validation_pct = validation_pct + self.stratify_split = stratify_split + + self.numeric_threshold = threshold_numeric_is_categorical + + self.serialization_path = serialization_path + + self.categorical_data_processor = categorical_data_processor + self.discretizer = discretizer + self.target_encoder = target_encoder + + # placeholders for columns by datatype + # Included as constructor argument to get them from pipeline + self._is_fitted = False + self._continuous_vars = continuous_vars + self._discrete_vars = discrete_vars + + @classmethod + def from_params(cls, + train_pct: float, + selection_pct: float, + validation_pct: float, + stratify_split: bool=True, + threshold_numeric_is_categorical: int=None, + n_bins: int=10, + strategy: str="quantile", + closed: str="right", + auto_adapt_bins: bool=False, + starting_precision: int=0, + label_format: str="{} - {}", + change_endpoint_format: bool=False, + regroup: bool=True, + regroup_name: str="Other", + keep_missing: bool=True, + category_size_threshold: int=5, + p_value_threshold: float=0.001, + scale_contingency_table: bool=True, + forced_categories: dict={}, + weight: float=0.0, + serialization_path: Optional[str]=None): + """Constructor to instantiate PreProcessor from all the parameters + that can be set in all its required classes. + + Parameters + ---------- + train_pct : float + Percentage of data to add to training dataset + selection_pct : float + Percentage of data to add to selection dataset + validation_pct : float + Percentage of data to add to validation dataset + stratify_split : bool, optional + Whether or not to stratify the train-test split + threshold_numeric_is_categorical : int, optional + Threshold to decide whether a numeric variable is in fact a + categorical one based on the number of unique values of + that variable + n_bins : int, optional + Number of bins to produce. Raises ValueError if ``n_bins < 2``. 
+ strategy : str, optional + Binning strategy. Currently only "uniform" and "quantile" + e.g. equifrequency is supported + closed : str, optional + Whether to close the bins (intervals) from the left or right + auto_adapt_bins : bool, optional + reduces the number of bins (starting from n_bins) as a function of + the number of missings + starting_precision : int, optional + Initial precision for the bin edges to start from, + can also be negative. Given a list of bin edges, the class will + automatically choose the minimal precision required to have proper + bins e.g. [5.5555, 5.5744, ...] will be rounded + to [5.56, 5.57, ...]. In case of a negative number, an attempt will + be made to round up the numbers of the bin edges + e.g. 5.55 -> 10, 146 -> 100, ... + label_format : str, optional + format string to display the bin labels + e.g. min - max, (min, max], ... + change_endpoint_format : bool, optional + Whether or not to change the format of the lower and upper bins + into "< x" and "> y" resp. + regroup : bool + Whether or not to regroup categories + regroup_name : str + New name of the non-significant regrouped variables + keep_missing : bool + Whether or not to keep missing as a separate category + category_size_threshold : int + minimal size of a category to keep it as a separate category + p_value_threshold : float + Significance threshold for regroupping. + forced_categories : dict + Map to prevent certain categories from being group into "Other" + for each colum - dict of the form {col:[forced vars]}. + scale_contingency_table : bool + Whether contingency table should be scaled before chi^2.' + weight : float, optional + Smoothing parameters (non-negative). The higher the value of the + parameter, the bigger the contribution of the overall mean. + When set to zero, there is no smoothing + (e.g. the pure target incidence is used). 
+ serialization_path : str, optional + path to save the pipeline to + + Returns + ------- + PreProcessor + Description + """ + categorical_data_processor = CategoricalDataProcessor( + regroup, + regroup_name, + keep_missing, + category_size_threshold, + p_value_threshold, + scale_contingency_table, + forced_categories) + discretizer = KBinsDiscretizer(n_bins, strategy, closed, + auto_adapt_bins, + starting_precision, + label_format, + change_endpoint_format) + + target_encoder = TargetEncoder(weight) + + if not threshold_numeric_is_categorical: + threshold_numeric_is_categorical = n_bins + + return cls(train_pct, selection_pct, validation_pct, stratify_split, + categorical_data_processor, discretizer, target_encoder, + threshold_numeric_is_categorical, serialization_path) + + @classmethod + def from_pipeline(cls, pipeline_path: str): + """Summary + + Parameters + ---------- + pipeline_path : str + Description + + Returns + ------- + PreProcessor + Instance of PreProcessor instantiated from a stored pipeline + + Raises + ------ + ValueError + Description + """ + with open(pipeline_path, "r") as file: + pipeline = json.load(file) + + if not PreProcessor._is_valid_pipeline(pipeline): + raise ValueError("Invalid pipeline") # To do: specify error + + categorical_data_processor = CategoricalDataProcessor() + categorical_data_processor.set_attributes_from_dict( + pipeline["categorical_data_processor"] + ) + + discretizer = KBinsDiscretizer() + discretizer.set_attributes_from_dict(pipeline["discretizer"]) + + target_encoder = TargetEncoder() + target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) + + return cls(pipeline["train_pct"], pipeline["selection_pct"], + pipeline["validation_pct"], pipeline["stratify_split"], + categorical_data_processor, + discretizer, target_encoder, + pipeline["threshold_numeric_is_categorical"], + continuous_vars=pipeline["continuous_vars"], + discrete_vars=pipeline["discrete_vars"]) + + def fit(self, data: pd.DataFrame, 
target_column_name: str, + id_column_name: str=None): + """Fit the data to the preprocessing pipeline + + Parameters + ---------- + data : pd.DataFrame + Data to be preprocessed + target_column_name : str + Name of the target column + id_column_name : str, optional + Name of the id column + """ + # get list of all variables + var_list = self._get_variable_list(data, target_column_name, + id_column_name) + + data = (PreProcessor + .train_selection_validation_split(data, target_column_name, + self.train_pct, + self.selection_pct, + self.validation_pct, + self.stratify_split)) + + # Select train data to fit preprocessing pipeline + train_data = data[data["split"] == "train"] + + # Fit discretizer, categorical preprocessor & target encoder + # Note that in order to fit target_encoder, we first have to transform + # the data using the fitted discretizer & categorical_data_processor + if self._continuous_vars: + self.discretizer.fit(train_data, self._continuous_vars) + train_data = self.discretizer.transform(train_data, + self._continuous_vars) + + if self._discrete_vars: + self.categorical_data_processor.fit(train_data, + self._discrete_vars, + target_column_name) + train_data = (self.categorical_data_processor + .transform(train_data, self._discrete_vars)) + + self.target_encoder.fit(train_data, var_list, target_column_name) + + self._is_fitted = True # set fitted boolean to True + # serialize the pipeline to store the fitted output along with the + # various parameters that were used + self._serialize() + + def transform(self, data: pd.DataFrame, target_column_name: str, + id_column_name: str=None) -> pd.DataFrame: + """Summary + + Parameters + ---------- + data : pd.DataFrame + Description + target_column_name : str + Description + id_column_name : str, optional + Description + + Returns + ------- + pd.DataFrame + Description + """ + + if not self._is_fitted: + msg = ("This {} instance is not fitted yet. 
Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + var_lists = ([col + "_processed" for col in self._discrete_vars] + + [col + "_bin" for col in self._continuous_vars]) + + data = self.discretizer.transform(data, self._continuous_vars) + data = self.categorical_data_processor.transform(data, + self._discrete_vars) + + data = self.target_encoder.transform(data, var_lists) + + return data + + @staticmethod + def train_selection_validation_split(data: pd.DataFrame, + target_column_name: str, + train_pct: float=0.6, + selection_pct: float=0.2, + validation_pct: float=0.2, + stratify_split=True)->pd.DataFrame: + """Split dataset into train-selection-validation datasets and merge + them into one big DataFrame with an additional column "split" + indicating to which dataset the corresponding row belongs to. + + Parameters + ---------- + data : pd.DataFrame + Input dataset to split into train-selection and validation sets + target_column_name : str + Name of the target column + train_pct : float, optional + Percentage data to put in train set + selection_pct : float, optional + Percentage data to put in selection set + validation_pct : float, optional + Percentage data to put in validation set + stratify_split : bool, optional + Whether or not to stratify the train-test split + + Returns + ------- + pd.DataFrame + Description + """ + column_names = list(data.columns) + + predictors = [col for col in column_names if col != target_column_name] + + test_pct = selection_pct + validation_pct + + X = data[predictors] + y = data[target_column_name] + + stratify = None + if stratify_split: + stratify = y + + X_train, X_test, y_train, y_test = train_test_split(X, y, + test_size=test_pct, + random_state=42, + stratify=stratify) + + if stratify_split: + stratify = y_test + + X_sel, X_val, y_sel, y_val = train_test_split(X_test, y_test, + test_size=validation_pct, + random_state=42, + 
stratify=stratify) + + df_train = pd.DataFrame(X_train, columns=predictors) + df_train[target_column_name] = y_train + df_train["split"] = "train" + + df_selection = pd.DataFrame(X_sel, columns=predictors) + df_selection[target_column_name] = y_sel + df_selection["split"] = "selection" + + df_validation = pd.DataFrame(X_val, columns=predictors) + df_validation[target_column_name] = y_val + df_validation["split"] = "validation" + + return (pd.concat([df_train, df_selection, df_validation]) + .reset_index(drop=True)) + + def _get_variable_list(self, data: pd.DataFrame, + target_column_name: str, + id_column_name: str=None): + """Get list of variables and split into numeric and categorical + + Parameters + ---------- + data : pd.DataFrame + Data to be preprocessed + target_column_name : str + Name of the target column + id_column_name : str, optional + Name of the id column + """ + + columns_by_datatype = utils.get_column_datatypes(data, + target_column_name, + id_column_name, + self.numeric_threshold + ) + + self._continuous_vars = columns_by_datatype["numeric_variables"] + self._discrete_vars = columns_by_datatype["categorical_variables"] + + log.info("Numeric variables: {}".format(self._continuous_vars)) + log.info("Categorical variables:".format(self._discrete_vars)) + + var_list = ([col + "_processed" for col in self._discrete_vars] + + [col + "_bin" for col in self._continuous_vars]) + + if not var_list: + raise ValueError("Variable var_list is None or empty list") + + return var_list + + def _serialize(self) -> dict: + """Serialize the preprocessing pipeline by writing all its required + parameters to a JSON file. 
+ + Returns + ------- + dict + Return the pipeline as a dictionary + """ + pipeline = { + "metadata": { + "timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S") + } + } + + pipeline["train_pct"] = self.train_pct + pipeline["selection_pct"] = self.selection_pct + pipeline["validation_pct"] = self.validation_pct + pipeline["stratify_split"] = self.stratify_split + pipeline["threshold_numeric_is_categorical"] = self.numeric_threshold + + pipeline["categorical_data_processor"] = (self + .categorical_data_processor + .attributes_to_dict()) + + pipeline["discretizer"] = self.discretizer.attributes_to_dict() + pipeline["target_encoder"] = (self.target_encoder.attributes_to_dict()) + + pipeline["continuous_vars"] = self._continuous_vars + pipeline["discrete_vars"] = self._discrete_vars + + if self.serialization_path: + path = self.serialization_path + else: + path = "./pipeline_tmp.json" + + with open(path, "w") as file: + json.dump(pipeline, file) + + return pipeline + + @staticmethod + def _is_valid_pipeline(pipeline: dict) -> bool: + """Validate the loaded pipeline by checking if all required parameters + are present (and no others!). 
+ + Parameters + ---------- + pipeline : dict + Loaded pipeline from json file + """ + keys = inspect.getfullargspec(PreProcessor.from_params).args + valid_keys = set([key for key in keys + if key not in ["cls", "serialization_path"]]) + + input_keys = set() + for key in pipeline: + if key in ["categorical_data_processor", "discretizer", + "target_encoder"]: + input_keys = input_keys.union(set(pipeline[key].keys())) + elif key != "metadata": + input_keys.add(key) + + return valid_keys == input_keys diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py new file mode 100644 index 0000000..1548b9e --- /dev/null +++ b/tests/preprocessing/test_preprocessor.py @@ -0,0 +1,36 @@ +from contextlib import contextmanager +import pytest + +import numpy as np +import pandas as pd + +#from cobra.preprocessing.preprocessor import PreProcessor + + +@contextmanager +def does_not_raise(): + yield + + +class TestPreProcessor: + + def test_from_params(self): + pass + + def test_from_pipeline(self): + pass + + def test_fit(self): + pass + + def test_transform(self): + pass + + def test_train_selection_validation_split(self): + pass + + def test_serialize(self): + pass + + def test_is_valid_pipeline(self): + pass From c8f6c5c2ae356b46b0d3616dc94ed05c67c6df14 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 8 Jan 2020 14:21:10 +0100 Subject: [PATCH 35/98] Bug fix in PreProcessor._is_valid_pipeline --- cobra/preprocessing/preprocessor.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 7cf1959..2cc841d 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -247,8 +247,8 @@ def from_pipeline(cls, pipeline_path: str): categorical_data_processor, discretizer, target_encoder, pipeline["threshold_numeric_is_categorical"], - continuous_vars=pipeline["continuous_vars"], - 
discrete_vars=pipeline["discrete_vars"]) + continuous_vars=pipeline["_continuous_vars"], + discrete_vars=pipeline["_discrete_vars"]) def fit(self, data: pd.DataFrame, target_column_name: str, id_column_name: str=None): @@ -470,8 +470,8 @@ def _serialize(self) -> dict: pipeline["discretizer"] = self.discretizer.attributes_to_dict() pipeline["target_encoder"] = (self.target_encoder.attributes_to_dict()) - pipeline["continuous_vars"] = self._continuous_vars - pipeline["discrete_vars"] = self._discrete_vars + pipeline["_continuous_vars"] = self._continuous_vars + pipeline["_discrete_vars"] = self._discrete_vars if self.serialization_path: path = self.serialization_path @@ -505,4 +505,7 @@ def _is_valid_pipeline(pipeline: dict) -> bool: elif key != "metadata": input_keys.add(key) - return valid_keys == input_keys + input_keys = sorted(list(input_keys)) + input_keys = [key for key in input_keys if not key.startswith("_")] + + return sorted(list(valid_keys)) == sorted(list(input_keys)) From 9986e050c4f9bf32787a4117c99704bf5529f19f Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 8 Jan 2020 16:53:32 +0100 Subject: [PATCH 36/98] Bug fix in PreProcessor This bug caused an error on transform when PreProcessor is initialized from a pipeline (in which case _is_fitted was False, but should have been True). 
--- cobra/preprocessing/__init__.py | 4 +++- cobra/preprocessing/preprocessor.py | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cobra/preprocessing/__init__.py b/cobra/preprocessing/__init__.py index 2008235..e02ad4c 100644 --- a/cobra/preprocessing/__init__.py +++ b/cobra/preprocessing/__init__.py @@ -1,7 +1,9 @@ from .kbins_discretizer import KBinsDiscretizer from .target_encoder import TargetEncoder from .categorical_data_processor import CategoricalDataProcessor +from .preprocessor import PreProcessor __all__ = ['KBinsDiscretizer', 'TargetEncoder', - 'CategoricalDataProcessor'] + 'CategoricalDataProcessor', + 'PreProcessor'] diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 2cc841d..c0eb3ee 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -90,6 +90,8 @@ def __init__(self, self._is_fitted = False self._continuous_vars = continuous_vars self._discrete_vars = discrete_vars + if continuous_vars or discrete_vars: + self._is_fitted = True @classmethod def from_params(cls, @@ -244,8 +246,7 @@ def from_pipeline(cls, pipeline_path: str): return cls(pipeline["train_pct"], pipeline["selection_pct"], pipeline["validation_pct"], pipeline["stratify_split"], - categorical_data_processor, - discretizer, target_encoder, + categorical_data_processor, discretizer, target_encoder, pipeline["threshold_numeric_is_categorical"], continuous_vars=pipeline["_continuous_vars"], discrete_vars=pipeline["_discrete_vars"]) From f57804c68a7c0f37d54cbd99c14075203f48b9f9 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 8 Jan 2020 17:15:35 +0100 Subject: [PATCH 37/98] Update setup.py --- setup.py | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/setup.py b/setup.py index ceb8112..e04e832 100644 --- a/setup.py +++ b/setup.py @@ -1,24 +1,19 @@ from distutils.core import setup + setup( - name = 'cobra', - packages = 
['cobra'], - version = '1.0', - description = 'Library for fast model building', - author='Jan Benisek', - author_email = "jan.benisek@pythonpredictions.com", - url = "https://github.com/JanBenisek/COBRA", - download_url = "https://github.com/JanBenisek/COBRA", - keywords = ["Python", "cobra"], - classifiers = [ - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Development Status :: 1 - Beta", - "Environment :: Other Environment", - "Intended Audience :: Developers", - "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", - "Operating System :: OS Independent", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Text Processing :: Linguistic", - ], - long_description = 'Refactored Cobra project into a Python library.' - ) \ No newline at end of file + name="cobra", + version="0.1.0", + description="Python Prediction's methodology for predictive analytics", + packages=["cobra"], + url="https://github.com/PythonPredictions", + #long_description=long_description, # TO DO + #long_description_content_type="text/markdown", + install_requires=[ + "pandas>=0.25.1", + "numpy>=1.17.2", + "scipy>=1.2.0", + "scikit_learn>=0.22.1", + "matplotlib>=3.0.2", + "seaborn>=0.9.0"], + python_requires=">=3.5", +) From 3e8823775a7ef678809a78474934015af388eca8 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Fri, 10 Jan 2020 21:18:50 +0100 Subject: [PATCH 38/98] Change API of PreProcessor for extra flexibility --- cobra/preprocessing/preprocessor.py | 271 ++++++++++++++++------------ 1 file changed, 153 insertions(+), 118 deletions(-) diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index c0eb3ee..f17b267 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -14,6 +14,7 @@ from typing import Optional import inspect from datetime import datetime +import time import logging log = logging.getLogger(__name__) @@ -45,61 +46,32 @@ 
class PreProcessor(BaseEstimator): numeric_threshold : int Threshold to decide whether a numeric variable is in fact a categorical one based on the number of unique values of that variable - selection_pct : float - Percentage of data to add to selection dataset serialization_path : str path to save the pipeline to stratify_split : bool Whether or not to stratify the train-test split target_encoder : TargetEncoder Instance of TargetEncoder to do the incidence replacement - train_pct : float - Percentage of data to add to training dataset - validation_pct : float - Percentage of data to add to validation dataset """ - def __init__(self, - train_pct: float, - selection_pct: float, - validation_pct: float, - stratify_split: bool, - categorical_data_processor: CategoricalDataProcessor, + def __init__(self, categorical_data_processor: CategoricalDataProcessor, discretizer: KBinsDiscretizer, target_encoder: TargetEncoder, threshold_numeric_is_categorical: int=None, serialization_path: str=None, - continuous_vars: list=[], - discrete_vars: list=[]): - - self.train_pct = train_pct - self.selection_pct = selection_pct - self.validation_pct = validation_pct - self.stratify_split = stratify_split + is_fitted: bool=False): self.numeric_threshold = threshold_numeric_is_categorical - self.serialization_path = serialization_path - self.categorical_data_processor = categorical_data_processor - self.discretizer = discretizer - self.target_encoder = target_encoder + self._categorical_data_processor = categorical_data_processor + self._discretizer = discretizer + self._target_encoder = target_encoder - # placeholders for columns by datatype - # Included as constructor argument to get them from pipeline - self._is_fitted = False - self._continuous_vars = continuous_vars - self._discrete_vars = discrete_vars - if continuous_vars or discrete_vars: - self._is_fitted = True + self._is_fitted = is_fitted @classmethod def from_params(cls, - train_pct: float, - selection_pct: float, - 
validation_pct: float, - stratify_split: bool=True, - threshold_numeric_is_categorical: int=None, n_bins: int=10, strategy: str="quantile", closed: str="right", @@ -115,24 +87,13 @@ def from_params(cls, scale_contingency_table: bool=True, forced_categories: dict={}, weight: float=0.0, + threshold_numeric_is_categorical: int=None, serialization_path: Optional[str]=None): """Constructor to instantiate PreProcessor from all the parameters that can be set in all its required classes. Parameters ---------- - train_pct : float - Percentage of data to add to training dataset - selection_pct : float - Percentage of data to add to selection dataset - validation_pct : float - Percentage of data to add to validation dataset - stratify_split : bool, optional - Whether or not to stratify the train-test split - threshold_numeric_is_categorical : int, optional - Threshold to decide whether a numeric variable is in fact a - categorical one based on the number of unique values of - that variable n_bins : int, optional Number of bins to produce. Raises ValueError if ``n_bins < 2``. strategy : str, optional @@ -179,6 +140,10 @@ def from_params(cls, (e.g. the pure target incidence is used). 
serialization_path : str, optional path to save the pipeline to + threshold_numeric_is_categorical : int, optional + Threshold to decide whether a numeric variable is in fact a + categorical one based on the number of unique values of + that variable Returns ------- @@ -201,11 +166,7 @@ def from_params(cls, target_encoder = TargetEncoder(weight) - if not threshold_numeric_is_categorical: - threshold_numeric_is_categorical = n_bins - - return cls(train_pct, selection_pct, validation_pct, stratify_split, - categorical_data_processor, discretizer, target_encoder, + return cls(categorical_data_processor, discretizer, target_encoder, threshold_numeric_is_categorical, serialization_path) @classmethod @@ -244,64 +205,81 @@ def from_pipeline(cls, pipeline_path: str): target_encoder = TargetEncoder() target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) - return cls(pipeline["train_pct"], pipeline["selection_pct"], - pipeline["validation_pct"], pipeline["stratify_split"], - categorical_data_processor, discretizer, target_encoder, + return cls(categorical_data_processor, discretizer, target_encoder, pipeline["threshold_numeric_is_categorical"], - continuous_vars=pipeline["_continuous_vars"], - discrete_vars=pipeline["_discrete_vars"]) + pipeline["_is_fitted"]) - def fit(self, data: pd.DataFrame, target_column_name: str, - id_column_name: str=None): + def fit(self, train_data: pd.DataFrame, target_column_name: str, + id_column_name: str=None, + continuous_vars: list=[], discrete_vars: list=[]): """Fit the data to the preprocessing pipeline Parameters ---------- - data : pd.DataFrame + train_data : pd.DataFrame Data to be preprocessed target_column_name : str Name of the target column id_column_name : str, optional Name of the id column + continuous_vars : list, optional + list of continuous variables + discrete_vars : list, optional + list of discrete variables """ - # get list of all variables - var_list = self._get_variable_list(data, target_column_name, - 
id_column_name) - data = (PreProcessor - .train_selection_validation_split(data, target_column_name, - self.train_pct, - self.selection_pct, - self.validation_pct, - self.stratify_split)) + if not (continuous_vars or discrete_vars): + continuous_vars, discrete_vars = self._get_variable_list_by_type( + train_data, + target_column_name, + id_column_name) + + # get list of all variables + var_list = PreProcessor._get_variable_list(continuous_vars, + discrete_vars) - # Select train data to fit preprocessing pipeline - train_data = data[data["split"] == "train"] + log.info("Starting to fit pipeline") + start = time.time() # Fit discretizer, categorical preprocessor & target encoder # Note that in order to fit target_encoder, we first have to transform # the data using the fitted discretizer & categorical_data_processor - if self._continuous_vars: - self.discretizer.fit(train_data, self._continuous_vars) - train_data = self.discretizer.transform(train_data, - self._continuous_vars) - - if self._discrete_vars: - self.categorical_data_processor.fit(train_data, - self._discrete_vars, - target_column_name) - train_data = (self.categorical_data_processor - .transform(train_data, self._discrete_vars)) - - self.target_encoder.fit(train_data, var_list, target_column_name) + if continuous_vars: + begin = time.time() + self._discretizer.fit(train_data, continuous_vars) + log.info("Fitting KBinsDiscretizer took {} seconds" + .format(time.time() - begin)) + + train_data = self._discretizer.transform(train_data, + continuous_vars) + + if discrete_vars: + begin = time.time() + self._categorical_data_processor.fit(train_data, + discrete_vars, + target_column_name) + log.info("Fitting categorical_data_processor class took {} seconds" + .format(time.time() - begin)) + + train_data = (self._categorical_data_processor + .transform(train_data, discrete_vars)) + + begin = time.time() + self._target_encoder.fit(train_data, var_list, target_column_name) + log.info("Fitting TargetEncoder took 
{} seconds" + .format(time.time() - begin)) self._is_fitted = True # set fitted boolean to True # serialize the pipeline to store the fitted output along with the # various parameters that were used self._serialize() + log.info("Fitting and serializing pipeline took {} seconds" + .format(time.time() - start)) + def transform(self, data: pd.DataFrame, target_column_name: str, - id_column_name: str=None) -> pd.DataFrame: + id_column_name: str=None, continuous_vars: list=[], + discrete_vars: list=[]) -> pd.DataFrame: """Summary Parameters @@ -312,27 +290,56 @@ def transform(self, data: pd.DataFrame, target_column_name: str, Description id_column_name : str, optional Description + continuous_vars : list, optional + list of continuous variables + discrete_vars : list, optional + list of discrete variables Returns ------- pd.DataFrame Description + + Raises + ------ + NotFittedError + Description """ + start = time.time() + if not self._is_fitted: msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") raise NotFittedError(msg.format(self.__class__.__name__)) - var_lists = ([col + "_processed" for col in self._discrete_vars] - + [col + "_bin" for col in self._continuous_vars]) + if not (continuous_vars or discrete_vars): + continuous_vars, discrete_vars = self._get_variable_list_by_type( + data, + target_column_name, + id_column_name) + + # remove "split" column as this is the column + # making the train-selection-validation split + if "split" in discrete_vars: + discrete_vars.remove("split") + + # get list of all variables + var_list = PreProcessor._get_variable_list(continuous_vars, + discrete_vars) + + if continuous_vars: + data = self._discretizer.transform(data, continuous_vars) + + if discrete_vars: + data = self._categorical_data_processor.transform(data, + discrete_vars) - data = self.discretizer.transform(data, self._continuous_vars) - data = self.categorical_data_processor.transform(data, - 
self._discrete_vars) + data = self._target_encoder.transform(data, var_list) - data = self.target_encoder.transform(data, var_lists) + log.info("Transforming data took {} seconds" + .format(time.time() - start)) return data @@ -408,10 +415,39 @@ def train_selection_validation_split(data: pd.DataFrame, return (pd.concat([df_train, df_selection, df_validation]) .reset_index(drop=True)) - def _get_variable_list(self, data: pd.DataFrame, - target_column_name: str, - id_column_name: str=None): - """Get list of variables and split into numeric and categorical + @staticmethod + def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: + """Summary + + Parameters + ---------- + continuous_vars : list + Description + discrete_vars : list + Description + + Returns + ------- + list + Description + + Raises + ------ + ValueError + Description + """ + var_list = ([col + "_processed" for col in discrete_vars] + + [col + "_bin" for col in continuous_vars]) + + if not var_list: + raise ValueError("Variable var_list is None or empty list") + + return var_list + + def _get_variable_list_by_type(self, data: pd.DataFrame, + target_column_name: str, + id_column_name: str=None): + """Get two lists of variables (numeric and categorical) Parameters ---------- @@ -423,25 +459,29 @@ def _get_variable_list(self, data: pd.DataFrame, Name of the id column """ - columns_by_datatype = utils.get_column_datatypes(data, - target_column_name, - id_column_name, - self.numeric_threshold - ) + if not self.numeric_threshold: + raise ValueError("threshold_numeric_is_categorical is not allowed " + "to be None") - self._continuous_vars = columns_by_datatype["numeric_variables"] - self._discrete_vars = columns_by_datatype["categorical_variables"] + columns_by_datatype = utils.get_column_datatypes( + data, + target_column_name, + id_column_name, + self.numeric_threshold) - log.info("Numeric variables: {}".format(self._continuous_vars)) - log.info("Categorical 
variables:".format(self._discrete_vars)) + continuous_vars = columns_by_datatype["numeric_variables"] + discrete_vars = columns_by_datatype["categorical_variables"] - var_list = ([col + "_processed" for col in self._discrete_vars] - + [col + "_bin" for col in self._continuous_vars]) + log.info("Numeric variables: {}".format(continuous_vars)) + log.info("Categorical variables:".format(discrete_vars)) + + var_list = ([col + "_processed" for col in discrete_vars] + + [col + "_bin" for col in continuous_vars]) if not var_list: raise ValueError("Variable var_list is None or empty list") - return var_list + return continuous_vars, discrete_vars def _serialize(self) -> dict: """Serialize the preprocessing pipeline by writing all its required @@ -458,21 +498,16 @@ def _serialize(self) -> dict: } } - pipeline["train_pct"] = self.train_pct - pipeline["selection_pct"] = self.selection_pct - pipeline["validation_pct"] = self.validation_pct - pipeline["stratify_split"] = self.stratify_split - pipeline["threshold_numeric_is_categorical"] = self.numeric_threshold - pipeline["categorical_data_processor"] = (self - .categorical_data_processor + ._categorical_data_processor .attributes_to_dict()) - pipeline["discretizer"] = self.discretizer.attributes_to_dict() - pipeline["target_encoder"] = (self.target_encoder.attributes_to_dict()) + pipeline["discretizer"] = self._discretizer.attributes_to_dict() + pipeline["target_encoder"] = (self._target_encoder + .attributes_to_dict()) - pipeline["_continuous_vars"] = self._continuous_vars - pipeline["_discrete_vars"] = self._discrete_vars + pipeline["_is_fitted"] = True + pipeline["threshold_numeric_is_categorical"] = self.numeric_threshold if self.serialization_path: path = self.serialization_path From 32603a430af89483ce99e57b57049cf832be0416 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Thu, 16 Jan 2020 09:13:05 +0100 Subject: [PATCH 39/98] Clean up API of PreProcessor --- README.md | 9 +- cobra/preprocessing/preprocessor.py | 172 
+++++++---------------- requirements.txt | 6 + tests/preprocessing/test_preprocessor.py | 12 +- 4 files changed, 67 insertions(+), 132 deletions(-) create mode 100644 requirements.txt diff --git a/README.md b/README.md index 50cb9ef..130a724 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ These instructions will get you a copy of the project up and running on your loc ### What can cobra do? - * Transform given pandas DataFrame to be ready to use for prediction modelling: partition into train/selection/validation sets, create bins from continuous variables, regroup categorical variables and add columns with incidence rate per category/bin. + * Transform given pandas DataFrame to be ready to use for prediction modelling: partition into train/selection/validation sets, create bins from continuous variables, regroup categorical variables, replace missing values and add columns with incidence rate per category/bin. * Perform univariate selection based on AUC * Find best model by forward selection * Visualize the results @@ -47,7 +47,7 @@ As this package is an internal package that is not open-sourced, it is not avail * Clone this repository. * Open a shell that can execute python code and navigate to the folder where this repo was cloned in. - * Once you are in the folder, execute `python setup.py install`. + * Once you are in the folder, execute `python setup.py install` or `pip install .`. ### Usage @@ -55,7 +55,4 @@ TO DO ## Development -We'd love you to contribute to the development of Cobra! To do so, clone the repo and create a _feature branch_ to do your development. Once your are finished, you can create a _pull request_ to merge it back into the main branch. - - - +We'd love you to contribute to the development of Cobra! To do so, clone the repo and create a _feature branch_ to do your development. Once your are finished, you can create a _pull request_ to merge it back into the main branch. Make sure to write or modify unit test for your changes! 
diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index f17b267..48d8948 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -43,9 +43,6 @@ class PreProcessor(BaseEstimator): discretizer : KBinsDiscretizer Instance of KBinsDiscretizer to do the prepocessing of continuous variables by means of discretization - numeric_threshold : int - Threshold to decide whether a numeric variable is in fact a categorical - one based on the number of unique values of that variable serialization_path : str path to save the pipeline to stratify_split : bool @@ -57,11 +54,9 @@ class PreProcessor(BaseEstimator): def __init__(self, categorical_data_processor: CategoricalDataProcessor, discretizer: KBinsDiscretizer, target_encoder: TargetEncoder, - threshold_numeric_is_categorical: int=None, serialization_path: str=None, is_fitted: bool=False): - self.numeric_threshold = threshold_numeric_is_categorical self.serialization_path = serialization_path self._categorical_data_processor = categorical_data_processor @@ -87,10 +82,9 @@ def from_params(cls, scale_contingency_table: bool=True, forced_categories: dict={}, weight: float=0.0, - threshold_numeric_is_categorical: int=None, serialization_path: Optional[str]=None): """Constructor to instantiate PreProcessor from all the parameters - that can be set in all its required classes. + that can be set in all its required (attribute) classes. Parameters ---------- @@ -140,10 +134,6 @@ def from_params(cls, (e.g. the pure target incidence is used). 
serialization_path : str, optional path to save the pipeline to - threshold_numeric_is_categorical : int, optional - Threshold to decide whether a numeric variable is in fact a - categorical one based on the number of unique values of - that variable Returns ------- @@ -167,16 +157,17 @@ def from_params(cls, target_encoder = TargetEncoder(weight) return cls(categorical_data_processor, discretizer, target_encoder, - threshold_numeric_is_categorical, serialization_path) + serialization_path) @classmethod def from_pipeline(cls, pipeline_path: str): - """Summary + """Constructor to instantiate PreProcessor from a (fitted) pipeline, + stored as a JSON file. Parameters ---------- pipeline_path : str - Description + path to the (fitted) pipeline Returns ------- @@ -206,37 +197,28 @@ def from_pipeline(cls, pipeline_path: str): target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) return cls(categorical_data_processor, discretizer, target_encoder, - pipeline["threshold_numeric_is_categorical"], pipeline["_is_fitted"]) - def fit(self, train_data: pd.DataFrame, target_column_name: str, - id_column_name: str=None, - continuous_vars: list=[], discrete_vars: list=[]): + def fit(self, train_data: pd.DataFrame, continuous_vars: list, + discrete_vars: list, target_column_name: str): """Fit the data to the preprocessing pipeline Parameters ---------- train_data : pd.DataFrame Data to be preprocessed - target_column_name : str - Name of the target column - id_column_name : str, optional - Name of the id column continuous_vars : list, optional list of continuous variables discrete_vars : list, optional list of discrete variables + target_column_name : str + Name of the target column """ - if not (continuous_vars or discrete_vars): - continuous_vars, discrete_vars = self._get_variable_list_by_type( - train_data, - target_column_name, - id_column_name) - # get list of all variables - var_list = PreProcessor._get_variable_list(continuous_vars, - discrete_vars) + 
preprocessed_variable_names = (PreProcessor + ._get_variable_list(continuous_vars, + discrete_vars)) log.info("Starting to fit pipeline") start = time.time() @@ -265,7 +247,8 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str, .transform(train_data, discrete_vars)) begin = time.time() - self._target_encoder.fit(train_data, var_list, target_column_name) + self._target_encoder.fit(train_data, preprocessed_variable_names, + target_column_name) log.info("Fitting TargetEncoder took {} seconds" .format(time.time() - begin)) @@ -277,19 +260,14 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str, log.info("Fitting and serializing pipeline took {} seconds" .format(time.time() - start)) - def transform(self, data: pd.DataFrame, target_column_name: str, - id_column_name: str=None, continuous_vars: list=[], - discrete_vars: list=[]) -> pd.DataFrame: + def transform(self, data: pd.DataFrame, continuous_vars: list, + discrete_vars: list) -> pd.DataFrame: """Summary Parameters ---------- data : pd.DataFrame Description - target_column_name : str - Description - id_column_name : str, optional - Description continuous_vars : list, optional list of continuous variables discrete_vars : list, optional @@ -314,20 +292,9 @@ def transform(self, data: pd.DataFrame, target_column_name: str, raise NotFittedError(msg.format(self.__class__.__name__)) - if not (continuous_vars or discrete_vars): - continuous_vars, discrete_vars = self._get_variable_list_by_type( - data, - target_column_name, - id_column_name) - - # remove "split" column as this is the column - # making the train-selection-validation split - if "split" in discrete_vars: - discrete_vars.remove("split") - - # get list of all variables - var_list = PreProcessor._get_variable_list(continuous_vars, - discrete_vars) + preprocessed_variable_names = (PreProcessor + ._get_variable_list(continuous_vars, + discrete_vars)) if continuous_vars: data = self._discretizer.transform(data, continuous_vars) @@ -336,7 
+303,8 @@ def transform(self, data: pd.DataFrame, target_column_name: str, data = self._categorical_data_processor.transform(data, discrete_vars) - data = self._target_encoder.transform(data, var_list) + data = self._target_encoder.transform(data, + preprocessed_variable_names) log.info("Transforming data took {} seconds" .format(time.time() - start)) @@ -415,74 +383,6 @@ def train_selection_validation_split(data: pd.DataFrame, return (pd.concat([df_train, df_selection, df_validation]) .reset_index(drop=True)) - @staticmethod - def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: - """Summary - - Parameters - ---------- - continuous_vars : list - Description - discrete_vars : list - Description - - Returns - ------- - list - Description - - Raises - ------ - ValueError - Description - """ - var_list = ([col + "_processed" for col in discrete_vars] - + [col + "_bin" for col in continuous_vars]) - - if not var_list: - raise ValueError("Variable var_list is None or empty list") - - return var_list - - def _get_variable_list_by_type(self, data: pd.DataFrame, - target_column_name: str, - id_column_name: str=None): - """Get two lists of variables (numeric and categorical) - - Parameters - ---------- - data : pd.DataFrame - Data to be preprocessed - target_column_name : str - Name of the target column - id_column_name : str, optional - Name of the id column - """ - - if not self.numeric_threshold: - raise ValueError("threshold_numeric_is_categorical is not allowed " - "to be None") - - columns_by_datatype = utils.get_column_datatypes( - data, - target_column_name, - id_column_name, - self.numeric_threshold) - - continuous_vars = columns_by_datatype["numeric_variables"] - discrete_vars = columns_by_datatype["categorical_variables"] - - log.info("Numeric variables: {}".format(continuous_vars)) - log.info("Categorical variables:".format(discrete_vars)) - - var_list = ([col + "_processed" for col in discrete_vars] - + [col + "_bin" for col in 
continuous_vars]) - - if not var_list: - raise ValueError("Variable var_list is None or empty list") - - return continuous_vars, discrete_vars - def _serialize(self) -> dict: """Serialize the preprocessing pipeline by writing all its required parameters to a JSON file. @@ -507,7 +407,6 @@ def _serialize(self) -> dict: .attributes_to_dict()) pipeline["_is_fitted"] = True - pipeline["threshold_numeric_is_categorical"] = self.numeric_threshold if self.serialization_path: path = self.serialization_path @@ -545,3 +444,32 @@ def _is_valid_pipeline(pipeline: dict) -> bool: input_keys = [key for key in input_keys if not key.startswith("_")] return sorted(list(valid_keys)) == sorted(list(input_keys)) + + @staticmethod + def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: + """Summary + + Parameters + ---------- + continuous_vars : list + Description + discrete_vars : list + Description + + Returns + ------- + list + Description + + Raises + ------ + ValueError + Description + """ + var_list = ([col + "_processed" for col in discrete_vars] + + [col + "_bin" for col in continuous_vars]) + + if not var_list: + raise ValueError("Variable var_list is None or empty list") + + return var_list diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..784dea5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +pandas==0.25.1 +matplotlib==3.0.2 +scipy==1.2.0 +seaborn==0.9.0 +numpy==1.17.2 +scikit_learn==0.22.1 diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py index 1548b9e..28432ae 100644 --- a/tests/preprocessing/test_preprocessor.py +++ b/tests/preprocessing/test_preprocessor.py @@ -1,10 +1,14 @@ from contextlib import contextmanager import pytest +from pytest_mock import mocker import numpy as np import pandas as pd -#from cobra.preprocessing.preprocessor import PreProcessor +from cobra.preprocessing import PreProcessor +from cobra.preprocessing import KBinsDiscretizer +from 
cobra.preprocessing import TargetEncoder +from cobra.preprocessing import CategoricalDataProcessor @contextmanager @@ -14,9 +18,6 @@ def does_not_raise(): class TestPreProcessor: - def test_from_params(self): - pass - def test_from_pipeline(self): pass @@ -29,6 +30,9 @@ def test_transform(self): def test_train_selection_validation_split(self): pass + def test_get_variable_list(self): + pass + def test_serialize(self): pass From b6060525ce48d3d9e29ebebe6bdeca2f941d8008 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Fri, 31 Jan 2020 16:35:53 +0100 Subject: [PATCH 40/98] Bug fix in preprocessor.train_selection_validation_split ratios --- cobra/preprocessing/preprocessor.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 48d8948..49fbb13 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -346,7 +346,13 @@ def train_selection_validation_split(data: pd.DataFrame, predictors = [col for col in column_names if col != target_column_name] + # for the first split, take sum of selection & validation pct as + # test pct test_pct = selection_pct + validation_pct + # To further split our test set into selection + validation set, + # we have to modify validation pct because we only have test_pct of + # the data available anymore for further splitting! 
+ validation_pct_modif = validation_pct / test_pct X = data[predictors] y = data[target_column_name] @@ -363,10 +369,12 @@ def train_selection_validation_split(data: pd.DataFrame, if stratify_split: stratify = y_test - X_sel, X_val, y_sel, y_val = train_test_split(X_test, y_test, - test_size=validation_pct, - random_state=42, - stratify=stratify) + X_sel, X_val, y_sel, y_val = train_test_split( + X_test, y_test, + test_size=validation_pct_modif, + random_state=42, + stratify=stratify + ) df_train = pd.DataFrame(X_train, columns=predictors) df_train[target_column_name] = y_train From 24c38cbdd2f9dcb799e8d19087d051ac70a6aaca Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 25 Feb 2020 13:29:25 +0100 Subject: [PATCH 41/98] Rework README --- README.md | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 130a724..e61bd6b 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,30 @@ # COBRA :snake: -**Cobra** is a Python package that implements the Python Predictions methodology for predictive analytics. It consists of a main script/notebook that can be used to build and save a predictive model only by setting several parameters. The main scripts itself consists of several modules that can be used independently of one another to build custom scripts. +**Cobra** is a Python package that implements the Python Predictions methodology for predictive analytics. It consists of a main script/notebook that can be used to build and save a predictive model only by setting several parameters. The main scripts itself consists of several modules that can be used independently of one another to build custom scripts. -Note that this package is a refactored version of the back-end of the original web-based cobra, developed by _Guillaume Marion_ (former Python Predictions employee). +Note that this package is a refactored version of the back-end of the original web-based cobra. 
:heavy_exclamation_mark: Be aware that there could still be :bug: in the code :heavy_exclamation_mark: -## Getting started +## What can cobra do? -These instructions will get you a copy of the project up and running on your local machine for usage, development and testing purposes. - -### What can cobra do? - - * Transform given pandas DataFrame to be ready to use for prediction modelling: partition into train/selection/validation sets, create bins from continuous variables, regroup categorical variables, replace missing values and add columns with incidence rate per category/bin. + * Prepare a given pandas DataFrame for prediction modelling: + - partition into train/selection/validation sets + - create bins from continuous variables + - regroup categorical variables + - replace missing values and + - add columns with incidence rate per category/bin. * Perform univariate selection based on AUC + * Compute correlation matrix of predictors * Find best model by forward selection * Visualize the results * Allow iteration among each step for the analyst -### Requirements +## Getting started + +These instructions will get you a copy of the project up and running on your local machine for usage, development and testing purposes. Furthermore, this section includes some brief examples on how to use it. + +### Requirements This package requires the usual Python packages for data science: @@ -29,7 +35,7 @@ This package requires the usual Python packages for data science: * pandas * scikit-learn -These packages, along with their versions are listed in `requirements.txt` and `conda_env.txt`. To install these packages using pip, run +These packages, along with their versions are listed in `requirements.txt` and `conda_env.txt`. 
To install these packages using pip, run ``` pip install requirements.txt @@ -40,19 +46,19 @@ or using conda ``` conda install requirements.txt ``` - + ### Installation -As this package is an internal package that is not open-sourced, it is not available through `pip` or `conda`. As a result, the package has to be installed manually using the following steps: +As this package is an internal package that is not open-sourced, it is not available through `pip` or `conda`. As a result, the package has to be installed manually using the following steps: - * Clone this repository. + * Clone this repository. * Open a shell that can execute python code and navigate to the folder where this repo was cloned in. - * Once you are in the folder, execute `python setup.py install` or `pip install .`. + * Once you are in the folder, execute `python setup.py install` or `pip install .`. ### Usage TO DO -## Development +## Development We'd love you to contribute to the development of Cobra! To do so, clone the repo and create a _feature branch_ to do your development. Once your are finished, you can create a _pull request_ to merge it back into the main branch. Make sure to write or modify unit test for your changes! 
From ea4f3eff4044acb1d9fb18c1cc917f2c3f571c0d Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Fri, 13 Mar 2020 16:10:46 +0100 Subject: [PATCH 42/98] Add model_building module --- cobra/model_building/univariate_selection.py | 124 +++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 cobra/model_building/univariate_selection.py diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py new file mode 100644 index 0000000..19a492f --- /dev/null +++ b/cobra/model_building/univariate_selection.py @@ -0,0 +1,124 @@ +""" +Module to perform univariate preselection and compute correlation amongst +predictors + +Authors: +- Geert Verstraeten (methodology) +- Matthias Roels (current implementation) +- Jan Benisek (initial implementation) +""" +import pandas as pd +from sklearn.metrics import roc_auc_score + + +def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, + target_enc_selection_data: pd.DataFrame, + predictors: list, + target_column: str, + preselect_auc_threshold: float, + preselect_overtrain_threshold: float + ) -> pd.DataFrame: + """ Perform a preselection of predictors based on an AUC threshold of + a univariate model on a train and selection dataset and return a datframe + containing for each variable the train and selection AUC along with a + boolean "preselection" column. + + As the AUC just calculates the quality of a ranking, all monotonous + transformations of a given ranking (i.e. transformations that do not alter + the ranking itself) will lead to the same AUC. + Hence, pushing a categorical variable (incl. a binned continuous variable) + through a logistic regression will produce exactly the same ranking as + pushing it through incidence replacement (i.e. target encoding), + as it will produce the exact same output: a ranking of the categories on + the training set. 
+ + Therefore, no univariate model is trained here as the target encoded train + and selection data is/must be used as inputs for this function. These will + be used as predicted scores to compute the AUC with against the target + + Args: + target_enc_train_data (pd.DataFrame): Train data + target_enc_selection_data (pd.DataFrame): Selection data + predictors (list): list of predictors (e.g. column names in the train + and selection data sets) + target_column (str): name of the target column + preselect_auc_threshold (float): Description + preselect_overtrain_threshold (float): Description + + Returns: + pd.DataFrame: DataFrame containing for each variable the train auc and + test auc allong with a boolean indicating whether or not it is selected + based on the criteria + """ + result = [] + + for predictor in predictors: + + cleaned_predictor = _clean_predictor_name(predictor) + + auc_train = roc_auc_score(target_enc_train_data[predictor], + target_enc_train_data[target_column]) + + auc_selection = roc_auc_score( + target_enc_selection_data[predictor], + target_enc_selection_data[target_column] + ) + + result.append({"predictor": cleaned_predictor, + "AUC train": auc_train, + "AUC selection": auc_selection}) + + df_auc = pd.DataFrame(result) + + # Filter based on min AUC + auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold + + # Identify those variables for which the AUC difference between train + # and selection is within a user-defined ratio + auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"]) * 100 + < preselect_overtrain_threshold) + + df_auc["preselection"] = auc_thresh & auc_overtrain + + return df_auc + + +def compute_correlations(target_enc_train_data: pd.DataFrame, + predictors: list) -> pd.DataFrame: + """Given a DataFrame and a list of predictors, compute the correlations + amongst the predictors in the DataFrame + + Args: + target_enc_train_data (pd.DataFrame): data to compute correlation + matrix from + predictors 
(list): List of column names of the DataFrame between which + to compute correlations + + Returns: + pd.DataFrame: The correlation matrix of the training set + """ + + correlations = target_enc_train_data[predictors].corr() + + predictors_cleaned = [_clean_predictor_name(predictor) + for predictor in predictors] + + # Change index and columns with the cleaned version of the predictors + # e.g. change "var1_enc" with "var1" + correlations.columns = predictors_cleaned + correlations.index = predictors_cleaned + + return correlations + + +def _clean_predictor_name(predictor: str) -> str: + """Strip-off redundant suffix (e.g. "_enc" or "_bin") from the predictor + name to return a clean version of the predictor + + Args: + predictor (str): Description + + Returns: + str: Description + """ + return predictor.replace("_enc", "").replace("_bin", "") From 31165d71dfa153c75b53e3397efe5c5cab067d5d Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 17 Mar 2020 15:25:18 +0100 Subject: [PATCH 43/98] Fix a bug in KBinsDiscretizer.fit method Bug caused None types to appear in the fitted output --- cobra/preprocessing/kbins_discretizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 5d126c9..9ba9449 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -193,8 +193,9 @@ def fit(self, data: pd.DataFrame, column_names: list): bins = self._fit_column(data, column_name) - # Add to bins_by_column for later use - self._bins_by_column[column_name] = bins + if bins is not None: + # Add to bins_by_column for later use + self._bins_by_column[column_name] = bins def _fit_column(self, data: pd.DataFrame, column_name: str) -> List[tuple]: From 198563016cae60486af59d189ca9557ef190bc8f Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 18 Mar 2020 09:13:08 +0100 Subject: [PATCH 44/98] Bug fix in PreProcessor.from_pipeline 
--- cobra/preprocessing/preprocessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 49fbb13..6890a39 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -197,7 +197,7 @@ def from_pipeline(cls, pipeline_path: str): target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) return cls(categorical_data_processor, discretizer, target_encoder, - pipeline["_is_fitted"]) + is_fitted=pipeline["_is_fitted"]) def fit(self, train_data: pd.DataFrame, continuous_vars: list, discrete_vars: list, target_column_name: str): From 9ebb1e02d6c8a2157895f58a0abe79c338c52bd7 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 18 Mar 2020 09:17:47 +0100 Subject: [PATCH 45/98] Bug fix compute_univariate_preselection --- cobra/model_building/univariate_selection.py | 249 ++--- .../categorical_data_processor.py | 868 ++++++++-------- cobra/preprocessing/preprocessor.py | 966 +++++++++--------- 3 files changed, 1042 insertions(+), 1041 deletions(-) diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index 19a492f..de8ef3f 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -1,124 +1,125 @@ -""" -Module to perform univariate preselection and compute correlation amongst -predictors - -Authors: -- Geert Verstraeten (methodology) -- Matthias Roels (current implementation) -- Jan Benisek (initial implementation) -""" -import pandas as pd -from sklearn.metrics import roc_auc_score - - -def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, - target_enc_selection_data: pd.DataFrame, - predictors: list, - target_column: str, - preselect_auc_threshold: float, - preselect_overtrain_threshold: float - ) -> pd.DataFrame: - """ Perform a preselection of predictors based on an AUC threshold of - a univariate model on a 
train and selection dataset and return a datframe - containing for each variable the train and selection AUC along with a - boolean "preselection" column. - - As the AUC just calculates the quality of a ranking, all monotonous - transformations of a given ranking (i.e. transformations that do not alter - the ranking itself) will lead to the same AUC. - Hence, pushing a categorical variable (incl. a binned continuous variable) - through a logistic regression will produce exactly the same ranking as - pushing it through incidence replacement (i.e. target encoding), - as it will produce the exact same output: a ranking of the categories on - the training set. - - Therefore, no univariate model is trained here as the target encoded train - and selection data is/must be used as inputs for this function. These will - be used as predicted scores to compute the AUC with against the target - - Args: - target_enc_train_data (pd.DataFrame): Train data - target_enc_selection_data (pd.DataFrame): Selection data - predictors (list): list of predictors (e.g. 
column names in the train - and selection data sets) - target_column (str): name of the target column - preselect_auc_threshold (float): Description - preselect_overtrain_threshold (float): Description - - Returns: - pd.DataFrame: DataFrame containing for each variable the train auc and - test auc allong with a boolean indicating whether or not it is selected - based on the criteria - """ - result = [] - - for predictor in predictors: - - cleaned_predictor = _clean_predictor_name(predictor) - - auc_train = roc_auc_score(target_enc_train_data[predictor], - target_enc_train_data[target_column]) - - auc_selection = roc_auc_score( - target_enc_selection_data[predictor], - target_enc_selection_data[target_column] - ) - - result.append({"predictor": cleaned_predictor, - "AUC train": auc_train, - "AUC selection": auc_selection}) - - df_auc = pd.DataFrame(result) - - # Filter based on min AUC - auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold - - # Identify those variables for which the AUC difference between train - # and selection is within a user-defined ratio - auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"]) * 100 - < preselect_overtrain_threshold) - - df_auc["preselection"] = auc_thresh & auc_overtrain - - return df_auc - - -def compute_correlations(target_enc_train_data: pd.DataFrame, - predictors: list) -> pd.DataFrame: - """Given a DataFrame and a list of predictors, compute the correlations - amongst the predictors in the DataFrame - - Args: - target_enc_train_data (pd.DataFrame): data to compute correlation - matrix from - predictors (list): List of column names of the DataFrame between which - to compute correlations - - Returns: - pd.DataFrame: The correlation matrix of the training set - """ - - correlations = target_enc_train_data[predictors].corr() - - predictors_cleaned = [_clean_predictor_name(predictor) - for predictor in predictors] - - # Change index and columns with the cleaned version of the predictors - # e.g. 
change "var1_enc" with "var1" - correlations.columns = predictors_cleaned - correlations.index = predictors_cleaned - - return correlations - - -def _clean_predictor_name(predictor: str) -> str: - """Strip-off redundant suffix (e.g. "_enc" or "_bin") from the predictor - name to return a clean version of the predictor - - Args: - predictor (str): Description - - Returns: - str: Description - """ - return predictor.replace("_enc", "").replace("_bin", "") +""" +Module to perform univariate preselection and compute correlation amongst +predictors + +Authors: +- Geert Verstraeten (methodology) +- Matthias Roels (current implementation) +- Jan Benisek (initial implementation) +""" +import pandas as pd +from sklearn.metrics import roc_auc_score + + +def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, + target_enc_selection_data: pd.DataFrame, + predictors: list, + target_column: str, + preselect_auc_threshold: float, + preselect_overtrain_threshold: float + ) -> pd.DataFrame: + """ Perform a preselection of predictors based on an AUC threshold of + a univariate model on a train and selection dataset and return a datframe + containing for each variable the train and selection AUC along with a + boolean "preselection" column. + + As the AUC just calculates the quality of a ranking, all monotonous + transformations of a given ranking (i.e. transformations that do not alter + the ranking itself) will lead to the same AUC. + Hence, pushing a categorical variable (incl. a binned continuous variable) + through a logistic regression will produce exactly the same ranking as + pushing it through incidence replacement (i.e. target encoding), + as it will produce the exact same output: a ranking of the categories on + the training set. + + Therefore, no univariate model is trained here as the target encoded train + and selection data is/must be used as inputs for this function. 
These will + be used as predicted scores to compute the AUC with against the target + + Args: + target_enc_train_data (pd.DataFrame): Train data + target_enc_selection_data (pd.DataFrame): Selection data + predictors (list): list of predictors (e.g. column names in the train + and selection data sets) + target_column (str): name of the target column + preselect_auc_threshold (float): Description + preselect_overtrain_threshold (float): Description + + Returns: + pd.DataFrame: DataFrame containing for each variable the train auc and + test auc allong with a boolean indicating whether or not it is selected + based on the criteria + """ + result = [] + + for predictor in predictors: + + cleaned_predictor = _clean_predictor_name(predictor) + + auc_train = roc_auc_score( + y_true=target_enc_train_data[target_column], + y_score=target_enc_train_data[predictor]) + + auc_selection = roc_auc_score( + y_true=target_enc_selection_data[target_column], + y_score=target_enc_selection_data[predictor] + ) + + result.append({"predictor": cleaned_predictor, + "AUC train": auc_train, + "AUC selection": auc_selection}) + + df_auc = pd.DataFrame(result) + + # Filter based on min AUC + auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold + + # Identify those variables for which the AUC difference between train + # and selection is within a user-defined ratio + auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"]) * 100 + < preselect_overtrain_threshold) + + df_auc["preselection"] = auc_thresh & auc_overtrain + + return df_auc + + +def compute_correlations(target_enc_train_data: pd.DataFrame, + predictors: list) -> pd.DataFrame: + """Given a DataFrame and a list of predictors, compute the correlations + amongst the predictors in the DataFrame + + Args: + target_enc_train_data (pd.DataFrame): data to compute correlation + matrix from + predictors (list): List of column names of the DataFrame between which + to compute correlations + + Returns: + pd.DataFrame: 
The correlation matrix of the training set + """ + + correlations = target_enc_train_data[predictors].corr() + + predictors_cleaned = [_clean_predictor_name(predictor) + for predictor in predictors] + + # Change index and columns with the cleaned version of the predictors + # e.g. change "var1_enc" with "var1" + correlations.columns = predictors_cleaned + correlations.index = predictors_cleaned + + return correlations + + +def _clean_predictor_name(predictor: str) -> str: + """Strip-off redundant suffix (e.g. "_enc" or "_bin") from the predictor + name to return a clean version of the predictor + + Args: + predictor (str): Description + + Returns: + str: Description + """ + return predictor.replace("_enc", "").replace("_bin", "") diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 0ce21e9..dd94a8b 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -1,434 +1,434 @@ -""" -This class implements the Python Prediction's way of dealing with -categorical data preprocessing. There are three steps involved here: -- An optional regrouping of the different categories based on category size - and significance of the category w.r.t. 
the target -- Missing value replacement with the additional category "Missing" -- Change of dtype to "category" (could potentially lead to memory optimization) - -Authors: -- Geert Verstraeten (methodology) -- Jan Benisek (implementation) -- Matthias Roels (implementation) -""" -# standard lib imports -import re -from typing import Optional - -import logging -log = logging.getLogger(__name__) - -# third party imports -import numpy as np -import pandas as pd -from scipy import stats - -from sklearn.base import BaseEstimator -from sklearn.exceptions import NotFittedError - - -class CategoricalDataProcessor(BaseEstimator): - """ - Regroups categories in categorical variables based on significance - with target variable. - - Attributes - ---------- - category_size_threshold : int - minimal size of a category to keep it as a separate category - forced_categories : dict - Map to prevent certain categories from being group into "Other" - for each colum - dict of the form {col:[forced vars]}. - keep_missing : bool - Whether or not to keep missing as a separate category - p_value_threshold : float - Significance threshold for regroupping. - regroup : bool - Whether or not to regroup categories - regroup_name : str - New name of the non-significant regrouped variables - scale_contingency_table : bool - Whether contingency table should be scaled before chi^2.' 
- """ - - valid_keys = ["regroup", "regroup_name", "keep_missing", - "category_size_threshold", "p_value_threshold", - "scale_contingency_table", "forced_categories"] - - def __init__(self, regroup: bool=True, regroup_name: str="Other", - keep_missing: bool=True, - category_size_threshold: int=5, - p_value_threshold: float=0.001, - scale_contingency_table: bool=True, - forced_categories: dict={}): - - self.regroup = regroup - self.regroup_name = regroup_name - self.keep_missing = keep_missing - self.category_size_threshold = category_size_threshold - self.p_value_threshold = p_value_threshold - self.scale_contingency_table = scale_contingency_table - self.forced_categories = forced_categories - - # dict to store fitted output in - self._cleaned_categories_by_column = {} - - def attributes_to_dict(self) -> dict: - """Return the attributes of CategoricalDataProcessor as a dictionary - - Returns - ------- - dict - Contains the attributes of CategoricalDataProcessor instance with - the attribute name as key - """ - params = self.get_params() - - params["_cleaned_categories_by_column"] = { - key: list(value) - for key, value in self._cleaned_categories_by_column.items() - } - - return params - - def set_attributes_from_dict(self, params: dict): - """Set instance attributes from a dictionary of values with key the - name of the attribute. - - Parameters - ---------- - params : dict - Contains the attributes of CategoricalDataProcessor with their - names as key. - - Raises - ------ - ValueError - In case _cleaned_categories_by_column is not of type dict - """ - _fitted_output = params.pop("_cleaned_categories_by_column", {}) - - if type(_fitted_output) != dict: - raise ValueError("_cleaned_categories_by_column is expected to " - "be a dict but is of type {} instead" - .format(type(_fitted_output))) - - # Clean out params dictionary to remove unknown keys (for safety!) 
- params = {key: params[key] for key in params if key in self.valid_keys} - - # We cannot turn this method into a classmethod as we want to make use - # of the following method from BaseEstimator: - self.set_params(**params) - - self._cleaned_categories_by_column = { - key: set(value) for key, value in _fitted_output.items() - } - - return self - - def fit(self, data: pd.DataFrame, column_names: list, - target_column: str): - """Fit the CategoricalDataProcessor - - Parameters - ---------- - data : pd.DataFrame - data used to compute the mapping to encode the categorical - variables with. - column_names : list - Columns of data to be processed - target_column : str - Column name of the target - """ - - if not self.regroup: - # We do not need to fit anything if regroup is set to False! - log.info("regroup was set to False, so no fitting is required") - return None - - for column_name in column_names: - - if column_name not in data.columns: - log.warning("DataFrame has no column '{}', so it will be " - "skipped in fitting" .format(column_name)) - continue - - cleaned_cats = self._fit_column(data, column_name, target_column) - - # Remove forced categories - forced_cats = self.forced_categories.get(column_name, set()) - cleaned_cats = cleaned_cats.union(forced_cats) - - # Add to _cleaned_categories_by_column for later use - self._cleaned_categories_by_column[column_name] = cleaned_cats - - def _fit_column(self, data: pd.DataFrame, column_name: str, - target_column) -> set: - """Compute which categories to regroup into "Other" for a particular - column - - Parameters - ---------- - data : pd.DataFrame - Description - column_name : str - Description - - Returns - ------- - list - list of categories to combine into a category "Other" - """ - y = data[target_column] - incidence = y.mean() - - combined_categories = set() - - # replace missings and get unique categories as a list - X = (CategoricalDataProcessor - ._replace_missings(data[column_name]) - .astype(object)) - - 
unique_categories = list(X.unique()) - - # get small categories and add them to the merged category list - small_categories = (CategoricalDataProcessor - ._get_small_categories( - X, - incidence, - self.category_size_threshold)) - combined_categories = combined_categories.union(small_categories) - - for category in unique_categories: - if category in small_categories: - continue - - pval = (CategoricalDataProcessor - ._compute_p_value(X, y, category, - self.scale_contingency_table)) - - # if not significant, add it to the list - if pval > self.p_value_threshold: - combined_categories.add(category) - - # Remove missing category from combined_categories if required - if self.keep_missing: - combined_categories.discard("Missing") - - return set(unique_categories).difference(combined_categories) - - def transform(self, data: pd.DataFrame, - column_names: list) -> pd.DataFrame: - """Summary - - Parameters - ---------- - data : pd.DataFrame - Data to be discretized - column_names : list - Columns of data to be discretized - - Returns - ------- - pd.DataFrame - data with additional discretized variables - """ - - if self.regroup and len(self._cleaned_categories_by_column) == 0: - msg = ("{} instance is not fitted yet. 
Call 'fit' with " - "appropriate arguments before using this method.") - - raise NotFittedError(msg.format(self.__class__.__name__)) - - for column_name in column_names: - - if column_name not in data.columns: - log.warning("Unknown column '{}' will be skipped" - .format(column_name)) - continue - - data = self._transform_column(data, column_name) - - return data - - def _transform_column(self, data: pd.DataFrame, - column_name: str) -> pd.DataFrame: - """Given a DataFrame, a column name and a list of categories to - combine, create an additional column which combines these categories - into "Other" - - Parameters - ---------- - data : pd.DataFrame - Original data to be tranformed - column_name : str - name of the column to transform - - Returns - ------- - pd.DataFrame - original DataFrame with an added processed column - """ - - column_name_clean = column_name + "_processed" - data[column_name_clean] = data[column_name].astype(object) - - # Fill missings first - data[column_name_clean] = (CategoricalDataProcessor - ._replace_missings(data, - column_name_clean)) - - if self.regroup: - categories = self._cleaned_categories_by_column.get(column_name) - - if not categories: - # Log warning if categories is None, which indicates it is - # not in fitted output - if categories is None: - log.warning("Column '{}' is not in fitted output " - "and will be skipped".format(column_name)) - return data - - data[column_name_clean] = (CategoricalDataProcessor - ._replace_categories( - data[column_name_clean], - categories)) - - # change data to categorical - data[column_name_clean] = data[column_name_clean].astype("category") - - return data - - def fit_transform(self, data: pd.DataFrame, - column_names: list) -> pd.DataFrame: - """Summary - - Parameters - ---------- - data : pd.DataFrame - Data to be discretized - column_names : list - Columns of data to be discretized - - Returns - ------- - pd.DataFrame - data with additional discretized variables - """ - self.fit(data, 
column_names) - return self.transform(data, column_names) - - @staticmethod - def _get_small_categories(predictor_series: pd.Series, - incidence: float, - category_size_threshold: int) -> set: - """Fetch categories with a size below a certain threshold. - Note that we use an additional weighting with the overall incidence - - Parameters - ---------- - predictor_series : pd.Series - Description - incidence : float - global train incidence - category_size_threshold : int - minimal size of a category to keep it as a separate category - - Returns - ------- - set - List a categories with a count below a certain threshold - """ - category_counts = predictor_series.groupby(predictor_series).size() - factor = max(incidence, 1 - incidence) - - # Get all categories with a count below a threshold - bool_mask = (category_counts*factor) <= category_size_threshold - return set(category_counts[bool_mask].index.tolist()) - - @staticmethod - def _replace_missings(data: pd.DataFrame, - column_names: Optional[list]=None) -> pd.DataFrame: - """Replace missing values (incl empty strings) - - Parameters - ---------- - data : pd.DataFrame - data to replace missings in - column_names: list, optional - list of predictors to replace missings in - - Returns - ------- - list - list of unique values in the data - """ - # replace missings (incl. 
empty string) - regex = re.compile("^\\s+|\\s+$") - - temp = None - if column_names: - temp = data[column_names] - else: - temp = data.copy() - temp = temp.fillna("Missing") - temp = temp.replace(regex, "") - temp = temp.replace("", "Missing") - - return temp - - @staticmethod - def _compute_p_value(X: pd.Series, y: pd.Series, category: str, - scale_contingency_table: bool) -> float: - """Summary - - Parameters - ---------- - X : pd.Series - Description - y : pd.Series - Description - category : str - Description - scale_contingency_table : bool - Description - - Returns - ------- - float - Description - """ - df = pd.concat([X, y], axis=1) - df["other_categories"] = np.where(X == category, 0, 1) - - contigency_table = pd.crosstab(index=df['other_categories'], columns=y, - margins=False) - - # if true, we scale the "other" categories - if scale_contingency_table: - size_other_cats = contigency_table.iloc[1].sum() - incidence_mean = y.mean() - - contigency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats - contigency_table.iloc[1, 1] = incidence_mean * size_other_cats - contigency_table = contigency_table.values.astype(np.int64) - - return stats.chi2_contingency(contigency_table, correction=False)[1] - - @staticmethod - def _replace_categories(data: pd.Series, categories: set) -> pd.Series: - """replace categories in set with "Other" - - Parameters - ---------- - data : pd.Series - Description - categories : set - Description - - Returns - ------- - pd.Series - Description - """ - return data.apply(lambda x: x if x in categories else "Other") +""" +This class implements the Python Prediction's way of dealing with +categorical data preprocessing. There are three steps involved here: +- An optional regrouping of the different categories based on category size + and significance of the category w.r.t. 
the target +- Missing value replacement with the additional category "Missing" +- Change of dtype to "category" (could potentially lead to memory optimization) + +Authors: +- Geert Verstraeten (methodology) +- Jan Benisek (implementation) +- Matthias Roels (implementation) +""" +# standard lib imports +import re +from typing import Optional + +import logging +log = logging.getLogger(__name__) + +# third party imports +import numpy as np +import pandas as pd +from scipy import stats + +from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError + + +class CategoricalDataProcessor(BaseEstimator): + """ + Regroups categories in categorical variables based on significance + with target variable. + + Attributes + ---------- + category_size_threshold : int + minimal size of a category to keep it as a separate category + forced_categories : dict + Map to prevent certain categories from being group into "Other" + for each colum - dict of the form {col:[forced vars]}. + keep_missing : bool + Whether or not to keep missing as a separate category + p_value_threshold : float + Significance threshold for regroupping. + regroup : bool + Whether or not to regroup categories + regroup_name : str + New name of the non-significant regrouped variables + scale_contingency_table : bool + Whether contingency table should be scaled before chi^2.' 
+ """ + + valid_keys = ["regroup", "regroup_name", "keep_missing", + "category_size_threshold", "p_value_threshold", + "scale_contingency_table", "forced_categories"] + + def __init__(self, regroup: bool=True, regroup_name: str="Other", + keep_missing: bool=True, + category_size_threshold: int=5, + p_value_threshold: float=0.001, + scale_contingency_table: bool=True, + forced_categories: dict={}): + + self.regroup = regroup + self.regroup_name = regroup_name + self.keep_missing = keep_missing + self.category_size_threshold = category_size_threshold + self.p_value_threshold = p_value_threshold + self.scale_contingency_table = scale_contingency_table + self.forced_categories = forced_categories + + # dict to store fitted output in + self._cleaned_categories_by_column = {} + + def attributes_to_dict(self) -> dict: + """Return the attributes of CategoricalDataProcessor as a dictionary + + Returns + ------- + dict + Contains the attributes of CategoricalDataProcessor instance with + the attribute name as key + """ + params = self.get_params() + + params["_cleaned_categories_by_column"] = { + key: list(value) + for key, value in self._cleaned_categories_by_column.items() + } + + return params + + def set_attributes_from_dict(self, params: dict): + """Set instance attributes from a dictionary of values with key the + name of the attribute. + + Parameters + ---------- + params : dict + Contains the attributes of CategoricalDataProcessor with their + names as key. + + Raises + ------ + ValueError + In case _cleaned_categories_by_column is not of type dict + """ + _fitted_output = params.pop("_cleaned_categories_by_column", {}) + + if type(_fitted_output) != dict: + raise ValueError("_cleaned_categories_by_column is expected to " + "be a dict but is of type {} instead" + .format(type(_fitted_output))) + + # Clean out params dictionary to remove unknown keys (for safety!) 
+ params = {key: params[key] for key in params if key in self.valid_keys} + + # We cannot turn this method into a classmethod as we want to make use + # of the following method from BaseEstimator: + self.set_params(**params) + + self._cleaned_categories_by_column = { + key: set(value) for key, value in _fitted_output.items() + } + + return self + + def fit(self, data: pd.DataFrame, column_names: list, + target_column: str): + """Fit the CategoricalDataProcessor + + Parameters + ---------- + data : pd.DataFrame + data used to compute the mapping to encode the categorical + variables with. + column_names : list + Columns of data to be processed + target_column : str + Column name of the target + """ + + if not self.regroup: + # We do not need to fit anything if regroup is set to False! + log.info("regroup was set to False, so no fitting is required") + return None + + for column_name in column_names: + + if column_name not in data.columns: + log.warning("DataFrame has no column '{}', so it will be " + "skipped in fitting" .format(column_name)) + continue + + cleaned_cats = self._fit_column(data, column_name, target_column) + + # Remove forced categories + forced_cats = self.forced_categories.get(column_name, set()) + cleaned_cats = cleaned_cats.union(forced_cats) + + # Add to _cleaned_categories_by_column for later use + self._cleaned_categories_by_column[column_name] = cleaned_cats + + def _fit_column(self, data: pd.DataFrame, column_name: str, + target_column) -> set: + """Compute which categories to regroup into "Other" for a particular + column + + Parameters + ---------- + data : pd.DataFrame + Description + column_name : str + Description + + Returns + ------- + list + list of categories to combine into a category "Other" + """ + y = data[target_column] + incidence = y.mean() + + combined_categories = set() + + # replace missings and get unique categories as a list + X = (CategoricalDataProcessor + ._replace_missings(data[column_name]) + .astype(object)) + + 
unique_categories = list(X.unique()) + + # get small categories and add them to the merged category list + small_categories = (CategoricalDataProcessor + ._get_small_categories( + X, + incidence, + self.category_size_threshold)) + combined_categories = combined_categories.union(small_categories) + + for category in unique_categories: + if category in small_categories: + continue + + pval = (CategoricalDataProcessor + ._compute_p_value(X, y, category, + self.scale_contingency_table)) + + # if not significant, add it to the list + if pval > self.p_value_threshold: + combined_categories.add(category) + + # Remove missing category from combined_categories if required + if self.keep_missing: + combined_categories.discard("Missing") + + return set(unique_categories).difference(combined_categories) + + def transform(self, data: pd.DataFrame, + column_names: list) -> pd.DataFrame: + """Summary + + Parameters + ---------- + data : pd.DataFrame + Data to be discretized + column_names : list + Columns of data to be discretized + + Returns + ------- + pd.DataFrame + data with additional discretized variables + """ + + if self.regroup and len(self._cleaned_categories_by_column) == 0: + msg = ("{} instance is not fitted yet. 
Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + for column_name in column_names: + + if column_name not in data.columns: + log.warning("Unknown column '{}' will be skipped" + .format(column_name)) + continue + + data = self._transform_column(data, column_name) + + return data + + def _transform_column(self, data: pd.DataFrame, + column_name: str) -> pd.DataFrame: + """Given a DataFrame, a column name and a list of categories to + combine, create an additional column which combines these categories + into "Other" + + Parameters + ---------- + data : pd.DataFrame + Original data to be tranformed + column_name : str + name of the column to transform + + Returns + ------- + pd.DataFrame + original DataFrame with an added processed column + """ + + column_name_clean = column_name + "_processed" + data[column_name_clean] = data[column_name].astype(object) + + # Fill missings first + data[column_name_clean] = (CategoricalDataProcessor + ._replace_missings(data, + column_name_clean)) + + if self.regroup: + categories = self._cleaned_categories_by_column.get(column_name) + + if not categories: + # Log warning if categories is None, which indicates it is + # not in fitted output + if categories is None: + log.warning("Column '{}' is not in fitted output " + "and will be skipped".format(column_name)) + return data + + data[column_name_clean] = (CategoricalDataProcessor + ._replace_categories( + data[column_name_clean], + categories)) + + # change data to categorical + data[column_name_clean] = data[column_name_clean].astype("category") + + return data + + def fit_transform(self, data: pd.DataFrame, + column_names: list) -> pd.DataFrame: + """Summary + + Parameters + ---------- + data : pd.DataFrame + Data to be discretized + column_names : list + Columns of data to be discretized + + Returns + ------- + pd.DataFrame + data with additional discretized variables + """ + self.fit(data, 
column_names) + return self.transform(data, column_names) + + @staticmethod + def _get_small_categories(predictor_series: pd.Series, + incidence: float, + category_size_threshold: int) -> set: + """Fetch categories with a size below a certain threshold. + Note that we use an additional weighting with the overall incidence + + Parameters + ---------- + predictor_series : pd.Series + Description + incidence : float + global train incidence + category_size_threshold : int + minimal size of a category to keep it as a separate category + + Returns + ------- + set + List a categories with a count below a certain threshold + """ + category_counts = predictor_series.groupby(predictor_series).size() + factor = max(incidence, 1 - incidence) + + # Get all categories with a count below a threshold + bool_mask = (category_counts*factor) <= category_size_threshold + return set(category_counts[bool_mask].index.tolist()) + + @staticmethod + def _replace_missings(data: pd.DataFrame, + column_names: Optional[list]=None) -> pd.DataFrame: + """Replace missing values (incl empty strings) + + Parameters + ---------- + data : pd.DataFrame + data to replace missings in + column_names: list, optional + list of predictors to replace missings in + + Returns + ------- + list + list of unique values in the data + """ + # replace missings (incl. 
empty string) + regex = re.compile("^\\s+|\\s+$") + + temp = None + if column_names: + temp = data[column_names] + else: + temp = data.copy() + temp = temp.fillna("Missing") + temp = temp.replace(regex, "") + temp = temp.replace("", "Missing") + + return temp + + @staticmethod + def _compute_p_value(X: pd.Series, y: pd.Series, category: str, + scale_contingency_table: bool) -> float: + """Summary + + Parameters + ---------- + X : pd.Series + Description + y : pd.Series + Description + category : str + Description + scale_contingency_table : bool + Description + + Returns + ------- + float + Description + """ + df = pd.concat([X, y], axis=1) + df["other_categories"] = np.where(X == category, 0, 1) + + contigency_table = pd.crosstab(index=df['other_categories'], columns=y, + margins=False) + + # if true, we scale the "other" categories + if scale_contingency_table: + size_other_cats = contigency_table.iloc[1].sum() + incidence_mean = y.mean() + + contigency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats + contigency_table.iloc[1, 1] = incidence_mean * size_other_cats + contigency_table = contigency_table.values.astype(np.int64) + + return stats.chi2_contingency(contigency_table, correction=False)[1] + + @staticmethod + def _replace_categories(data: pd.Series, categories: set) -> pd.Series: + """replace categories in set with "Other" + + Parameters + ---------- + data : pd.Series + Description + categories : set + Description + + Returns + ------- + pd.Series + Description + """ + return data.apply(lambda x: x if x in categories else "Other") diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 6890a39..417ff02 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -1,483 +1,483 @@ -""" -This module is a rework of the old cobra data_preparation.py. Here we will make -use of the classes for discretization, preprocessing of categorical variables -and incidence replacement. 
All of which will be employed to create a -preprocessing pipeline, which can be stored as a JSON file so that it can -easily be re-used for scoring. - -Authors: -- Geert Verstraeten (methodology) -- Matthias Roels (implementation) -""" -# std lib imports -import json -from typing import Optional -import inspect -from datetime import datetime -import time - -import logging -log = logging.getLogger(__name__) -# third party imports -import pandas as pd -from sklearn.model_selection import train_test_split -from sklearn.base import BaseEstimator -from sklearn.exceptions import NotFittedError -# custom imports -from cobra.preprocessing import KBinsDiscretizer -from cobra.preprocessing import TargetEncoder -from cobra.preprocessing import CategoricalDataProcessor - -import cobra.utils as utils - - -class PreProcessor(BaseEstimator): - - """Summary - - Attributes - ---------- - categorical_data_processor : CategoricalDataProcessor - Instance of CategoricalDataProcessor to do the prepocessing of - categorical variables - discretizer : KBinsDiscretizer - Instance of KBinsDiscretizer to do the prepocessing of continuous - variables by means of discretization - serialization_path : str - path to save the pipeline to - stratify_split : bool - Whether or not to stratify the train-test split - target_encoder : TargetEncoder - Instance of TargetEncoder to do the incidence replacement - """ - - def __init__(self, categorical_data_processor: CategoricalDataProcessor, - discretizer: KBinsDiscretizer, - target_encoder: TargetEncoder, - serialization_path: str=None, - is_fitted: bool=False): - - self.serialization_path = serialization_path - - self._categorical_data_processor = categorical_data_processor - self._discretizer = discretizer - self._target_encoder = target_encoder - - self._is_fitted = is_fitted - - @classmethod - def from_params(cls, - n_bins: int=10, - strategy: str="quantile", - closed: str="right", - auto_adapt_bins: bool=False, - starting_precision: int=0, - 
label_format: str="{} - {}", - change_endpoint_format: bool=False, - regroup: bool=True, - regroup_name: str="Other", - keep_missing: bool=True, - category_size_threshold: int=5, - p_value_threshold: float=0.001, - scale_contingency_table: bool=True, - forced_categories: dict={}, - weight: float=0.0, - serialization_path: Optional[str]=None): - """Constructor to instantiate PreProcessor from all the parameters - that can be set in all its required (attribute) classes. - - Parameters - ---------- - n_bins : int, optional - Number of bins to produce. Raises ValueError if ``n_bins < 2``. - strategy : str, optional - Binning strategy. Currently only "uniform" and "quantile" - e.g. equifrequency is supported - closed : str, optional - Whether to close the bins (intervals) from the left or right - auto_adapt_bins : bool, optional - reduces the number of bins (starting from n_bins) as a function of - the number of missings - starting_precision : int, optional - Initial precision for the bin edges to start from, - can also be negative. Given a list of bin edges, the class will - automatically choose the minimal precision required to have proper - bins e.g. [5.5555, 5.5744, ...] will be rounded - to [5.56, 5.57, ...]. In case of a negative number, an attempt will - be made to round up the numbers of the bin edges - e.g. 5.55 -> 10, 146 -> 100, ... - label_format : str, optional - format string to display the bin labels - e.g. min - max, (min, max], ... - change_endpoint_format : bool, optional - Whether or not to change the format of the lower and upper bins - into "< x" and "> y" resp. - regroup : bool - Whether or not to regroup categories - regroup_name : str - New name of the non-significant regrouped variables - keep_missing : bool - Whether or not to keep missing as a separate category - category_size_threshold : int - minimal size of a category to keep it as a separate category - p_value_threshold : float - Significance threshold for regroupping. 
- forced_categories : dict - Map to prevent certain categories from being group into "Other" - for each colum - dict of the form {col:[forced vars]}. - scale_contingency_table : bool - Whether contingency table should be scaled before chi^2.' - weight : float, optional - Smoothing parameters (non-negative). The higher the value of the - parameter, the bigger the contribution of the overall mean. - When set to zero, there is no smoothing - (e.g. the pure target incidence is used). - serialization_path : str, optional - path to save the pipeline to - - Returns - ------- - PreProcessor - Description - """ - categorical_data_processor = CategoricalDataProcessor( - regroup, - regroup_name, - keep_missing, - category_size_threshold, - p_value_threshold, - scale_contingency_table, - forced_categories) - discretizer = KBinsDiscretizer(n_bins, strategy, closed, - auto_adapt_bins, - starting_precision, - label_format, - change_endpoint_format) - - target_encoder = TargetEncoder(weight) - - return cls(categorical_data_processor, discretizer, target_encoder, - serialization_path) - - @classmethod - def from_pipeline(cls, pipeline_path: str): - """Constructor to instantiate PreProcessor from a (fitted) pipeline, - stored as a JSON file. 
- - Parameters - ---------- - pipeline_path : str - path to the (fitted) pipeline - - Returns - ------- - PreProcessor - Instance of PreProcessor instantiated from a stored pipeline - - Raises - ------ - ValueError - Description - """ - with open(pipeline_path, "r") as file: - pipeline = json.load(file) - - if not PreProcessor._is_valid_pipeline(pipeline): - raise ValueError("Invalid pipeline") # To do: specify error - - categorical_data_processor = CategoricalDataProcessor() - categorical_data_processor.set_attributes_from_dict( - pipeline["categorical_data_processor"] - ) - - discretizer = KBinsDiscretizer() - discretizer.set_attributes_from_dict(pipeline["discretizer"]) - - target_encoder = TargetEncoder() - target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) - - return cls(categorical_data_processor, discretizer, target_encoder, - is_fitted=pipeline["_is_fitted"]) - - def fit(self, train_data: pd.DataFrame, continuous_vars: list, - discrete_vars: list, target_column_name: str): - """Fit the data to the preprocessing pipeline - - Parameters - ---------- - train_data : pd.DataFrame - Data to be preprocessed - continuous_vars : list, optional - list of continuous variables - discrete_vars : list, optional - list of discrete variables - target_column_name : str - Name of the target column - """ - - # get list of all variables - preprocessed_variable_names = (PreProcessor - ._get_variable_list(continuous_vars, - discrete_vars)) - - log.info("Starting to fit pipeline") - start = time.time() - - # Fit discretizer, categorical preprocessor & target encoder - # Note that in order to fit target_encoder, we first have to transform - # the data using the fitted discretizer & categorical_data_processor - if continuous_vars: - begin = time.time() - self._discretizer.fit(train_data, continuous_vars) - log.info("Fitting KBinsDiscretizer took {} seconds" - .format(time.time() - begin)) - - train_data = self._discretizer.transform(train_data, - continuous_vars) - 
- if discrete_vars: - begin = time.time() - self._categorical_data_processor.fit(train_data, - discrete_vars, - target_column_name) - log.info("Fitting categorical_data_processor class took {} seconds" - .format(time.time() - begin)) - - train_data = (self._categorical_data_processor - .transform(train_data, discrete_vars)) - - begin = time.time() - self._target_encoder.fit(train_data, preprocessed_variable_names, - target_column_name) - log.info("Fitting TargetEncoder took {} seconds" - .format(time.time() - begin)) - - self._is_fitted = True # set fitted boolean to True - # serialize the pipeline to store the fitted output along with the - # various parameters that were used - self._serialize() - - log.info("Fitting and serializing pipeline took {} seconds" - .format(time.time() - start)) - - def transform(self, data: pd.DataFrame, continuous_vars: list, - discrete_vars: list) -> pd.DataFrame: - """Summary - - Parameters - ---------- - data : pd.DataFrame - Description - continuous_vars : list, optional - list of continuous variables - discrete_vars : list, optional - list of discrete variables - - Returns - ------- - pd.DataFrame - Description - - Raises - ------ - NotFittedError - Description - """ - - start = time.time() - - if not self._is_fitted: - msg = ("This {} instance is not fitted yet. 
Call 'fit' with " - "appropriate arguments before using this method.") - - raise NotFittedError(msg.format(self.__class__.__name__)) - - preprocessed_variable_names = (PreProcessor - ._get_variable_list(continuous_vars, - discrete_vars)) - - if continuous_vars: - data = self._discretizer.transform(data, continuous_vars) - - if discrete_vars: - data = self._categorical_data_processor.transform(data, - discrete_vars) - - data = self._target_encoder.transform(data, - preprocessed_variable_names) - - log.info("Transforming data took {} seconds" - .format(time.time() - start)) - - return data - - @staticmethod - def train_selection_validation_split(data: pd.DataFrame, - target_column_name: str, - train_pct: float=0.6, - selection_pct: float=0.2, - validation_pct: float=0.2, - stratify_split=True)->pd.DataFrame: - """Split dataset into train-selection-validation datasets and merge - them into one big DataFrame with an additional column "split" - indicating to which dataset the corresponding row belongs to. - - Parameters - ---------- - data : pd.DataFrame - Input dataset to split into train-selection and validation sets - target_column_name : str - Name of the target column - train_pct : float, optional - Percentage data to put in train set - selection_pct : float, optional - Percentage data to put in selection set - validation_pct : float, optional - Percentage data to put in validation set - stratify_split : bool, optional - Whether or not to stratify the train-test split - - Returns - ------- - pd.DataFrame - Description - """ - column_names = list(data.columns) - - predictors = [col for col in column_names if col != target_column_name] - - # for the first split, take sum of selection & validation pct as - # test pct - test_pct = selection_pct + validation_pct - # To further split our test set into selection + validation set, - # we have to modify validation pct because we only have test_pct of - # the data available anymore for further splitting! 
- validation_pct_modif = validation_pct / test_pct - - X = data[predictors] - y = data[target_column_name] - - stratify = None - if stratify_split: - stratify = y - - X_train, X_test, y_train, y_test = train_test_split(X, y, - test_size=test_pct, - random_state=42, - stratify=stratify) - - if stratify_split: - stratify = y_test - - X_sel, X_val, y_sel, y_val = train_test_split( - X_test, y_test, - test_size=validation_pct_modif, - random_state=42, - stratify=stratify - ) - - df_train = pd.DataFrame(X_train, columns=predictors) - df_train[target_column_name] = y_train - df_train["split"] = "train" - - df_selection = pd.DataFrame(X_sel, columns=predictors) - df_selection[target_column_name] = y_sel - df_selection["split"] = "selection" - - df_validation = pd.DataFrame(X_val, columns=predictors) - df_validation[target_column_name] = y_val - df_validation["split"] = "validation" - - return (pd.concat([df_train, df_selection, df_validation]) - .reset_index(drop=True)) - - def _serialize(self) -> dict: - """Serialize the preprocessing pipeline by writing all its required - parameters to a JSON file. - - Returns - ------- - dict - Return the pipeline as a dictionary - """ - pipeline = { - "metadata": { - "timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S") - } - } - - pipeline["categorical_data_processor"] = (self - ._categorical_data_processor - .attributes_to_dict()) - - pipeline["discretizer"] = self._discretizer.attributes_to_dict() - pipeline["target_encoder"] = (self._target_encoder - .attributes_to_dict()) - - pipeline["_is_fitted"] = True - - if self.serialization_path: - path = self.serialization_path - else: - path = "./pipeline_tmp.json" - - with open(path, "w") as file: - json.dump(pipeline, file) - - return pipeline - - @staticmethod - def _is_valid_pipeline(pipeline: dict) -> bool: - """Validate the loaded pipeline by checking if all required parameters - are present (and no others!). 
- - Parameters - ---------- - pipeline : dict - Loaded pipeline from json file - """ - keys = inspect.getfullargspec(PreProcessor.from_params).args - valid_keys = set([key for key in keys - if key not in ["cls", "serialization_path"]]) - - input_keys = set() - for key in pipeline: - if key in ["categorical_data_processor", "discretizer", - "target_encoder"]: - input_keys = input_keys.union(set(pipeline[key].keys())) - elif key != "metadata": - input_keys.add(key) - - input_keys = sorted(list(input_keys)) - input_keys = [key for key in input_keys if not key.startswith("_")] - - return sorted(list(valid_keys)) == sorted(list(input_keys)) - - @staticmethod - def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: - """Summary - - Parameters - ---------- - continuous_vars : list - Description - discrete_vars : list - Description - - Returns - ------- - list - Description - - Raises - ------ - ValueError - Description - """ - var_list = ([col + "_processed" for col in discrete_vars] - + [col + "_bin" for col in continuous_vars]) - - if not var_list: - raise ValueError("Variable var_list is None or empty list") - - return var_list +""" +This module is a rework of the old cobra data_preparation.py. Here we will make +use of the classes for discretization, preprocessing of categorical variables +and incidence replacement. All of which will be employed to create a +preprocessing pipeline, which can be stored as a JSON file so that it can +easily be re-used for scoring. 
+ +Authors: +- Geert Verstraeten (methodology) +- Matthias Roels (implementation) +""" +# std lib imports +import json +from typing import Optional +import inspect +from datetime import datetime +import time + +import logging +log = logging.getLogger(__name__) +# third party imports +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError +# custom imports +from cobra.preprocessing import KBinsDiscretizer +from cobra.preprocessing import TargetEncoder +from cobra.preprocessing import CategoricalDataProcessor + +import cobra.utils as utils + + +class PreProcessor(BaseEstimator): + + """Summary + + Attributes + ---------- + categorical_data_processor : CategoricalDataProcessor + Instance of CategoricalDataProcessor to do the prepocessing of + categorical variables + discretizer : KBinsDiscretizer + Instance of KBinsDiscretizer to do the prepocessing of continuous + variables by means of discretization + serialization_path : str + path to save the pipeline to + stratify_split : bool + Whether or not to stratify the train-test split + target_encoder : TargetEncoder + Instance of TargetEncoder to do the incidence replacement + """ + + def __init__(self, categorical_data_processor: CategoricalDataProcessor, + discretizer: KBinsDiscretizer, + target_encoder: TargetEncoder, + serialization_path: str=None, + is_fitted: bool=False): + + self.serialization_path = serialization_path + + self._categorical_data_processor = categorical_data_processor + self._discretizer = discretizer + self._target_encoder = target_encoder + + self._is_fitted = is_fitted + + @classmethod + def from_params(cls, + n_bins: int=10, + strategy: str="quantile", + closed: str="right", + auto_adapt_bins: bool=False, + starting_precision: int=0, + label_format: str="{} - {}", + change_endpoint_format: bool=False, + regroup: bool=True, + regroup_name: str="Other", + keep_missing: bool=True, + 
category_size_threshold: int=5, + p_value_threshold: float=0.001, + scale_contingency_table: bool=True, + forced_categories: dict={}, + weight: float=0.0, + serialization_path: Optional[str]=None): + """Constructor to instantiate PreProcessor from all the parameters + that can be set in all its required (attribute) classes. + + Parameters + ---------- + n_bins : int, optional + Number of bins to produce. Raises ValueError if ``n_bins < 2``. + strategy : str, optional + Binning strategy. Currently only "uniform" and "quantile" + e.g. equifrequency is supported + closed : str, optional + Whether to close the bins (intervals) from the left or right + auto_adapt_bins : bool, optional + reduces the number of bins (starting from n_bins) as a function of + the number of missings + starting_precision : int, optional + Initial precision for the bin edges to start from, + can also be negative. Given a list of bin edges, the class will + automatically choose the minimal precision required to have proper + bins e.g. [5.5555, 5.5744, ...] will be rounded + to [5.56, 5.57, ...]. In case of a negative number, an attempt will + be made to round up the numbers of the bin edges + e.g. 5.55 -> 10, 146 -> 100, ... + label_format : str, optional + format string to display the bin labels + e.g. min - max, (min, max], ... + change_endpoint_format : bool, optional + Whether or not to change the format of the lower and upper bins + into "< x" and "> y" resp. + regroup : bool + Whether or not to regroup categories + regroup_name : str + New name of the non-significant regrouped variables + keep_missing : bool + Whether or not to keep missing as a separate category + category_size_threshold : int + minimal size of a category to keep it as a separate category + p_value_threshold : float + Significance threshold for regroupping. + forced_categories : dict + Map to prevent certain categories from being group into "Other" + for each colum - dict of the form {col:[forced vars]}. 
+ scale_contingency_table : bool + Whether contingency table should be scaled before chi^2.' + weight : float, optional + Smoothing parameters (non-negative). The higher the value of the + parameter, the bigger the contribution of the overall mean. + When set to zero, there is no smoothing + (e.g. the pure target incidence is used). + serialization_path : str, optional + path to save the pipeline to + + Returns + ------- + PreProcessor + Description + """ + categorical_data_processor = CategoricalDataProcessor( + regroup, + regroup_name, + keep_missing, + category_size_threshold, + p_value_threshold, + scale_contingency_table, + forced_categories) + discretizer = KBinsDiscretizer(n_bins, strategy, closed, + auto_adapt_bins, + starting_precision, + label_format, + change_endpoint_format) + + target_encoder = TargetEncoder(weight) + + return cls(categorical_data_processor, discretizer, target_encoder, + serialization_path) + + @classmethod + def from_pipeline(cls, pipeline_path: str): + """Constructor to instantiate PreProcessor from a (fitted) pipeline, + stored as a JSON file. 
+ + Parameters + ---------- + pipeline_path : str + path to the (fitted) pipeline + + Returns + ------- + PreProcessor + Instance of PreProcessor instantiated from a stored pipeline + + Raises + ------ + ValueError + Description + """ + with open(pipeline_path, "r") as file: + pipeline = json.load(file) + + if not PreProcessor._is_valid_pipeline(pipeline): + raise ValueError("Invalid pipeline") # To do: specify error + + categorical_data_processor = CategoricalDataProcessor() + categorical_data_processor.set_attributes_from_dict( + pipeline["categorical_data_processor"] + ) + + discretizer = KBinsDiscretizer() + discretizer.set_attributes_from_dict(pipeline["discretizer"]) + + target_encoder = TargetEncoder() + target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) + + return cls(categorical_data_processor, discretizer, target_encoder, + is_fitted=pipeline["_is_fitted"]) + + def fit(self, train_data: pd.DataFrame, continuous_vars: list, + discrete_vars: list, target_column_name: str): + """Fit the data to the preprocessing pipeline + + Parameters + ---------- + train_data : pd.DataFrame + Data to be preprocessed + continuous_vars : list, optional + list of continuous variables + discrete_vars : list, optional + list of discrete variables + target_column_name : str + Name of the target column + """ + + # get list of all variables + preprocessed_variable_names = (PreProcessor + ._get_variable_list(continuous_vars, + discrete_vars)) + + log.info("Starting to fit pipeline") + start = time.time() + + # Fit discretizer, categorical preprocessor & target encoder + # Note that in order to fit target_encoder, we first have to transform + # the data using the fitted discretizer & categorical_data_processor + if continuous_vars: + begin = time.time() + self._discretizer.fit(train_data, continuous_vars) + log.info("Fitting KBinsDiscretizer took {} seconds" + .format(time.time() - begin)) + + train_data = self._discretizer.transform(train_data, + continuous_vars) + 
+ if discrete_vars: + begin = time.time() + self._categorical_data_processor.fit(train_data, + discrete_vars, + target_column_name) + log.info("Fitting categorical_data_processor class took {} seconds" + .format(time.time() - begin)) + + train_data = (self._categorical_data_processor + .transform(train_data, discrete_vars)) + + begin = time.time() + self._target_encoder.fit(train_data, preprocessed_variable_names, + target_column_name) + log.info("Fitting TargetEncoder took {} seconds" + .format(time.time() - begin)) + + self._is_fitted = True # set fitted boolean to True + # serialize the pipeline to store the fitted output along with the + # various parameters that were used + self._serialize() + + log.info("Fitting and serializing pipeline took {} seconds" + .format(time.time() - start)) + + def transform(self, data: pd.DataFrame, continuous_vars: list, + discrete_vars: list) -> pd.DataFrame: + """Summary + + Parameters + ---------- + data : pd.DataFrame + Description + continuous_vars : list, optional + list of continuous variables + discrete_vars : list, optional + list of discrete variables + + Returns + ------- + pd.DataFrame + Description + + Raises + ------ + NotFittedError + Description + """ + + start = time.time() + + if not self._is_fitted: + msg = ("This {} instance is not fitted yet. 
Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + preprocessed_variable_names = (PreProcessor + ._get_variable_list(continuous_vars, + discrete_vars)) + + if continuous_vars: + data = self._discretizer.transform(data, continuous_vars) + + if discrete_vars: + data = self._categorical_data_processor.transform(data, + discrete_vars) + + data = self._target_encoder.transform(data, + preprocessed_variable_names) + + log.info("Transforming data took {} seconds" + .format(time.time() - start)) + + return data + + @staticmethod + def train_selection_validation_split(data: pd.DataFrame, + target_column_name: str, + train_pct: float=0.6, + selection_pct: float=0.2, + validation_pct: float=0.2, + stratify_split=True)->pd.DataFrame: + """Split dataset into train-selection-validation datasets and merge + them into one big DataFrame with an additional column "split" + indicating to which dataset the corresponding row belongs to. + + Parameters + ---------- + data : pd.DataFrame + Input dataset to split into train-selection and validation sets + target_column_name : str + Name of the target column + train_pct : float, optional + Percentage data to put in train set + selection_pct : float, optional + Percentage data to put in selection set + validation_pct : float, optional + Percentage data to put in validation set + stratify_split : bool, optional + Whether or not to stratify the train-test split + + Returns + ------- + pd.DataFrame + Description + """ + column_names = list(data.columns) + + predictors = [col for col in column_names if col != target_column_name] + + # for the first split, take sum of selection & validation pct as + # test pct + test_pct = selection_pct + validation_pct + # To further split our test set into selection + validation set, + # we have to modify validation pct because we only have test_pct of + # the data available anymore for further splitting! 
+ validation_pct_modif = validation_pct / test_pct + + X = data[predictors] + y = data[target_column_name] + + stratify = None + if stratify_split: + stratify = y + + X_train, X_test, y_train, y_test = train_test_split(X, y, + test_size=test_pct, + random_state=42, + stratify=stratify) + + if stratify_split: + stratify = y_test + + X_sel, X_val, y_sel, y_val = train_test_split( + X_test, y_test, + test_size=validation_pct_modif, + random_state=42, + stratify=stratify + ) + + df_train = pd.DataFrame(X_train, columns=predictors) + df_train[target_column_name] = y_train + df_train["split"] = "train" + + df_selection = pd.DataFrame(X_sel, columns=predictors) + df_selection[target_column_name] = y_sel + df_selection["split"] = "selection" + + df_validation = pd.DataFrame(X_val, columns=predictors) + df_validation[target_column_name] = y_val + df_validation["split"] = "validation" + + return (pd.concat([df_train, df_selection, df_validation]) + .reset_index(drop=True)) + + def _serialize(self) -> dict: + """Serialize the preprocessing pipeline by writing all its required + parameters to a JSON file. + + Returns + ------- + dict + Return the pipeline as a dictionary + """ + pipeline = { + "metadata": { + "timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S") + } + } + + pipeline["categorical_data_processor"] = (self + ._categorical_data_processor + .attributes_to_dict()) + + pipeline["discretizer"] = self._discretizer.attributes_to_dict() + pipeline["target_encoder"] = (self._target_encoder + .attributes_to_dict()) + + pipeline["_is_fitted"] = True + + if self.serialization_path: + path = self.serialization_path + else: + path = "./pipeline_tmp.json" + + with open(path, "w") as file: + json.dump(pipeline, file) + + return pipeline + + @staticmethod + def _is_valid_pipeline(pipeline: dict) -> bool: + """Validate the loaded pipeline by checking if all required parameters + are present (and no others!). 
+ + Parameters + ---------- + pipeline : dict + Loaded pipeline from json file + """ + keys = inspect.getfullargspec(PreProcessor.from_params).args + valid_keys = set([key for key in keys + if key not in ["cls", "serialization_path"]]) + + input_keys = set() + for key in pipeline: + if key in ["categorical_data_processor", "discretizer", + "target_encoder"]: + input_keys = input_keys.union(set(pipeline[key].keys())) + elif key != "metadata": + input_keys.add(key) + + input_keys = sorted(list(input_keys)) + input_keys = [key for key in input_keys if not key.startswith("_")] + + return sorted(list(valid_keys)) == sorted(list(input_keys)) + + @staticmethod + def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: + """Summary + + Parameters + ---------- + continuous_vars : list + Description + discrete_vars : list + Description + + Returns + ------- + list + Description + + Raises + ------ + ValueError + Description + """ + var_list = ([col + "_processed" for col in discrete_vars] + + [col + "_bin" for col in continuous_vars]) + + if not var_list: + raise ValueError("Variable var_list is None or empty list") + + return var_list From dc2b1eaefd2f5f97629c1dd445e422ac2211051c Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 18 Mar 2020 10:57:10 +0100 Subject: [PATCH 46/98] Add forward_selection submodule to model_building module --- cobra/model_building/__init__.py | 12 ++ cobra/model_building/forward_selection.py | 190 ++++++++++++++++++++++ cobra/model_building/models.py | 110 +++++++++++++ 3 files changed, 312 insertions(+) create mode 100644 cobra/model_building/__init__.py create mode 100644 cobra/model_building/forward_selection.py create mode 100644 cobra/model_building/models.py diff --git a/cobra/model_building/__init__.py b/cobra/model_building/__init__.py new file mode 100644 index 0000000..63d074b --- /dev/null +++ b/cobra/model_building/__init__.py @@ -0,0 +1,12 @@ +from .univariate_selection import compute_univariate_preselection 
+from .univariate_selection import get_preselected_predictors +from .univariate_selection import compute_correlations + +from .models import LogisticRegressionModel +from .forward_selection import ForwardFeatureSelection + +__all__ = ['compute_univariate_preselection', + 'get_preselected_predictors', + 'compute_correlations', + 'LogisticRegressionModel', + 'ForwardFeatureSelection'] diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py new file mode 100644 index 0000000..fec7a8f --- /dev/null +++ b/cobra/model_building/forward_selection.py @@ -0,0 +1,190 @@ +import logging +log = logging.getLogger(__name__) + +import pandas as pd + +from cobra.model_building import LogisticRegressionModel as MLModel + + +class ForwardFeatureSelection: + + """Summary + + Attributes + ---------- + max_predictors : int + Description + model_name : str + Description + pos_only : bool + Description + """ + + def __init__(self, max_predictors: int=50, + model_name: str="logistic-regression", pos_only: bool=True): + + self.pos_only = pos_only + self.max_predictors = max_predictors + self.model_name = model_name + + self._fitted_models = [] + + def fit(self, data: pd.DataFrame, target_column_name: str, + predictors: list, forced_predictors: list=[], + excluded_predictors: list=[]): + """Summary + + Parameters + ---------- + data : pd.DataFrame + Description + target_column_name : str + Description + predictors : list + Description + forced_predictors : list, optional + Description + excluded_predictors : list, optional + Description + + Raises + ------ + ValueError + In case the number of forced predictors is larger than the maximum + number of allowed predictors in the model + """ + # prep predictor lists + filterd_predictors = [var for var in predictors + if (var not in excluded_predictors + and var not in forced_predictors)] + + # checks on predictor lists and self.max_predictors attr + if len(forced_predictors) > self.max_predictors: + raise 
ValueError("Size or forced_predictors cannot be bigger than " + "max_predictors") + elif len(forced_predictors) == self.max_predictors: + log.info("Size of forced_predictors equals max_predictors " + "only one model will be trained...") + # train model with all forced_predictors (only) + (self._fitted_models + .append(self._train_model(data[data["split"] == "train"], + target_column_name, + forced_predictors))) + else: + self._forward_selection(data, target_column_name, + filterd_predictors, + forced_predictors) + + def _forward_selection(self, data: pd.DataFrame, target_column_name: str, + predictors: list, forced_predictors: list=[]): + """Summary + + Parameters + ---------- + data : pd.DataFrame + Description + target_column_name : str + Description + predictors : list + Description + forced_predictors : list, optional + Description + """ + current_predictors = [] + + for step in range(1, self.max_predictors + 1): + if step <= len(forced_predictors): + # first, we go through forced predictors + candidate_predictors = list(set(forced_predictors) + .difference( + set(current_predictors))) + else: + candidate_predictors = [var for var in predictors + if var not in current_predictors] + + model = self._find_next_best_model(data[data["split"] == "train"], + target_column_name, + candidate_predictors, + current_predictors) + # if no new model was found, e.g. because there was no model with + # only positive coefficients, and all forced predictors were + # already tested (i.e. we are now looping through the other + # predictors) break out of the loop! 
+ if (model is None) and (step > len(forced_predictors)): + break + + if model is not None: + self._fitted_models.append(model) + + if not self._fitted_models: + log.error("No models found in forward selection") + + def _find_next_best_model(self, data: pd.DataFrame, + target_column_name: str, + candidate_predictors: list, + current_predictors: list) -> MLModel: + """Summary + + Parameters + ---------- + data : pd.DataFrame + Description + target_column_name : str + Description + candidate_predictors : list + Description + current_predictors : list + Description + + Returns + ------- + MLModel + Description + """ + # placeholders + best_model = None + best_auc = -1 + + for pred in candidate_predictors: + + # train model with additional predictor + model = self._train_model(data, target_column_name, + (current_predictors + [pred])) + # Evaluate model + auc_pred = model.evaluate(data[current_predictors + [pred]], + data[target_column_name]) + + if (self.pos_only and (not (model.get_coef() >= 0).all())): + continue + + # check if model is better than current best model + # and if yes, replace current best! 
+ if (auc_pred >= best_auc): + best_auc = auc_pred + best_model = model + + return best_model + + def _train_model(self, data: pd.DataFrame, target_column_name: str, + predictors: list) -> MLModel: + """Summary + + Parameters + ---------- + data : pd.DataFrame + Description + target_column_name : str + Description + predictors : list + Description + + Returns + ------- + MLModel + Description + """ + model = MLModel() + + model.fit(data[predictors], data[target_column_name]) + + return model diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py new file mode 100644 index 0000000..0ba0398 --- /dev/null +++ b/cobra/model_building/models.py @@ -0,0 +1,110 @@ +import numpy as np +import pandas as pd +from sklearn.metrics import roc_auc_score +from sklearn.linear_model import LogisticRegression + + +class LogisticRegressionModel: + + """Wrapper around the LogisticRegression class, with additional methods + implemented such as evaluation (using auc), getting a list of coefficients, + a ditionary of coefficients per predictor, ... 
for convenience + + Attributes + ---------- + logit : LogisticRegression + scikit-learn logistic regression model + predictors : list + List of predictors used in the model + """ + + def __init__(self): + self.logit = LogisticRegression(fit_intercept=True, C=1e9, + solver='liblinear') + # placeholder to keep track of a list of predictors + self.predictors = [] + self._eval_metrics_by_split = {} + + def get_coef(self) -> np.array: + """Returns the model coefficients + + Returns + ------- + np.array + array of model coefficients + """ + return self.logit.coef_[0] + + def get_intercepts(self) -> float: + """Returns the intercept of the model + + Returns + ------- + float + intercept of the model + """ + return self.logit.intercept_[0] + + def get_coef_by_predictor(self) -> dict: + """Returns a map predictor -> coefficient + + Returns + ------- + dict + map predictor -> coefficient + """ + return dict(zip(self.predictors, self.logit.coef_[0])) + + def fit(self, X_train: pd.DataFrame, y_train: pd.Series): + """Fit the model + + Parameters + ---------- + X_train : pd.DataFrame + predictors of train data + y_train : pd.Series + target of train data + """ + self.predictors = X_train.columns + self.logit.fit(X_train, y_train) + + def score_model(self, X: pd.DataFrame) -> np.ndarray: + """Score a model on a (new) dataset + + Parameters + ---------- + X : pd.DataFrame + dataset of predictors to score the model + + Returns + ------- + np.ndarray + score of the model for each observation + """ + return self.logit.predict_proba(X)[:, 1] + + def evaluate(self, X: pd.DataFrame, y: pd.Series, + split: str="train") -> float: + """Evaluate the model on a given split (train, selection, validation) + of a data set (X, y) + + Parameters + ---------- + X : pd.DataFrame + dataset containing the predictor values for each observation + y : pd.Series + dataset containig the target of each observation + split : str, optional + split of the dataset (e.g. 
train-selection-validation) + + Returns + ------- + float + the performance score of the model (e.g. AUC) + """ + if self._eval_metrics_by_split.get(split) is None: + y_pred = self.score_model(X) + + self._eval_metrics_by_split[split] = roc_auc_score(y_true=y, + y_score=y_pred) + return self._eval_metrics_by_split[split] From 4a5195cc6028ab6a52f023fe34fa4e12358f6165 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 18 Mar 2020 15:20:36 +0100 Subject: [PATCH 47/98] Add convenience function to univariate selection This function returns a list of preselection predictors --- cobra/model_building/univariate_selection.py | 23 ++++++++++++++++++++ cobra/preprocessing/preprocessor.py | 2 -- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index de8ef3f..1979ac8 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -84,6 +84,29 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, return df_auc +def get_preselected_predictors(df_auc: pd.DataFrame) -> list: + """Wrapper function to extract a list of predictors + from df_auc + + Parameters + ---------- + df_auc : pd.DataFrame + DataFrame containing for each variable the train auc and + test auc allong with a boolean indicating whether or not it is selected + based on the criteria + + Returns + ------- + list + list of preselected predictors + """ + predictor_list = (df_auc[df_auc["preselection"]] + .sort_values(by='AUC selection', ascending=False) + .predictor.tolist()) + + return [col + "_enc" for col in predictor_list] + + def compute_correlations(target_enc_train_data: pd.DataFrame, predictors: list) -> pd.DataFrame: """Given a DataFrame and a list of predictors, compute the correlations diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 417ff02..12ebcab 100644 --- 
a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -28,8 +28,6 @@ from cobra.preprocessing import TargetEncoder from cobra.preprocessing import CategoricalDataProcessor -import cobra.utils as utils - class PreProcessor(BaseEstimator): From 3d94f63afb4dec94365a522ee026bd682a6c9224 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 18 Mar 2020 15:31:56 +0100 Subject: [PATCH 48/98] Clean up models.py --- cobra/model_building/models.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 0ba0398..39c7a97 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -35,7 +35,7 @@ def get_coef(self) -> np.array: """ return self.logit.coef_[0] - def get_intercepts(self) -> float: + def get_intercept(self) -> float: """Returns the intercept of the model Returns @@ -65,7 +65,7 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series): y_train : pd.Series target of train data """ - self.predictors = X_train.columns + self.predictors = list(X_train.columns) self.logit.fit(X_train, y_train) def score_model(self, X: pd.DataFrame) -> np.ndarray: @@ -81,7 +81,9 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: np.ndarray score of the model for each observation """ - return self.logit.predict_proba(X)[:, 1] + # We select predictor columns (self.predictors) here to + # ensure we have the proper predictors and the proper order!!! + return self.logit.predict_proba(X[self.predictors])[:, 1] def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str="train") -> float: @@ -103,6 +105,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, the performance score of the model (e.g. 
AUC) """ if self._eval_metrics_by_split.get(split) is None: + y_pred = self.score_model(X) self._eval_metrics_by_split[split] = roc_auc_score(y_true=y, From f05f15479941b4f881d5b6fda81bbeb474b8b102 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 18 Mar 2020 15:32:20 +0100 Subject: [PATCH 49/98] Add functions to explore result of forward_selection One function is to obtain the model of a particular step. The other function is to compute the performance (e.g. AUC) of the models in the different steps for train-selection-validation sets. The result of the later function is cached in each of the models to optimize recomputation. --- cobra/model_building/forward_selection.py | 140 ++++++++++++++++++---- 1 file changed, 117 insertions(+), 23 deletions(-) diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index fec7a8f..7845d88 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -8,16 +8,19 @@ class ForwardFeatureSelection: - """Summary + """Perform forward feature selection for a given dataset using a given + model. Attributes ---------- max_predictors : int - Description + maximum number of predictors allowed in any model. 
This corresponds + more or less with the maximum number of steps in the forward feature + selection model_name : str - Description + name of the model to use for forward feature selection pos_only : bool - Description + whether or not the model coefficients should all be positive """ def __init__(self, max_predictors: int=50, @@ -29,7 +32,90 @@ def __init__(self, max_predictors: int=50, self._fitted_models = [] - def fit(self, data: pd.DataFrame, target_column_name: str, + def get_model_from_step(self, step: int) -> MLModel: + """Get fitted model from a particular step + + Parameters + ---------- + step : int + Particular step in the forward selection + + Returns + ------- + MLModel + Fitted model from the given step + + Raises + ------ + ValueError + in case step is larger than the number of available models + """ + if len(self._fitted_models) < step: + raise ValueError(f"No model available for step {step}") + + return self._fitted_models[step] + + def compute_model_performances(self, data: pd.DataFrame, + target_column_name: str) -> list: + """Compute for each model the performance for + train-selection-validation sets and return them along with a list + of predictors used in the model. + Note that the computation of the performance for each split is + cached inside the model itself, so it is inexpensive to perform + it multiple times! 
+ + Parameters + ---------- + data : pd.DataFrame + dataset for which to compute performance of each model + target_column_name : str + name of the target column + + Returns + ------- + list + A list containing for each model the performance for + train-selection-validation sets as well as the set of predictors + used in this model + """ + results = [] + predictor_set = set([]) + for model in self._fitted_models: + # Evaluate model + performance_train = model.evaluate( + data[data["split"] == "train"], + data[data["split"] == "train"][target_column_name], + split="train" + ) + + performance_selection = model.evaluate( + data[data["split"] == "selection"], + data[data["split"] == "selection"][target_column_name], + split="selection" + ) + + performance_validation = model.evaluate( + data[data["split"] == "validation"], + data[data["split"] == "validation"][target_column_name], + split="validation" + ) + + last_added_predictor = (set(model.predictors) + .difference(predictor_set)) + + results.append({ + "predictors": model.predictors, + "last_added_predictor": list(last_added_predictor)[0], + "train_performance": performance_train, + "selection_performance": performance_selection, + "validation_performance": performance_validation + }) + + predictor_set = predictor_set.union(set(model.predictors)) + + return results + + def fit(self, train_data: pd.DataFrame, target_column_name: str, predictors: list, forced_predictors: list=[], excluded_predictors: list=[]): """Summary @@ -67,21 +153,22 @@ def fit(self, data: pd.DataFrame, target_column_name: str, "only one model will be trained...") # train model with all forced_predictors (only) (self._fitted_models - .append(self._train_model(data[data["split"] == "train"], + .append(self._train_model(train_data, target_column_name, forced_predictors))) else: - self._forward_selection(data, target_column_name, + self._forward_selection(train_data, target_column_name, filterd_predictors, forced_predictors) - def 
_forward_selection(self, data: pd.DataFrame, target_column_name: str, - predictors: list, forced_predictors: list=[]): + def _forward_selection(self, train_data: pd.DataFrame, + target_column_name: str, predictors: list, + forced_predictors: list=[]): """Summary Parameters ---------- - data : pd.DataFrame + train_data : pd.DataFrame Description target_column_name : str Description @@ -92,7 +179,9 @@ def _forward_selection(self, data: pd.DataFrame, target_column_name: str, """ current_predictors = [] - for step in range(1, self.max_predictors + 1): + max_steps = 1 + min(self.max_predictors, + len(predictors) + len(forced_predictors)) + for step in range(1, max_steps): if step <= len(forced_predictors): # first, we go through forced predictors candidate_predictors = list(set(forced_predictors) @@ -102,7 +191,7 @@ def _forward_selection(self, data: pd.DataFrame, target_column_name: str, candidate_predictors = [var for var in predictors if var not in current_predictors] - model = self._find_next_best_model(data[data["split"] == "train"], + model = self._find_next_best_model(train_data, target_column_name, candidate_predictors, current_predictors) @@ -114,12 +203,16 @@ def _forward_selection(self, data: pd.DataFrame, target_column_name: str, break if model is not None: + # Add new model predictors to the list of current predictors + current_predictors = list(set(current_predictors) + .union(set(model.predictors))) + self._fitted_models.append(model) if not self._fitted_models: log.error("No models found in forward selection") - def _find_next_best_model(self, data: pd.DataFrame, + def _find_next_best_model(self, train_data: pd.DataFrame, target_column_name: str, candidate_predictors: list, current_predictors: list) -> MLModel: @@ -127,7 +220,7 @@ def _find_next_best_model(self, data: pd.DataFrame, Parameters ---------- - data : pd.DataFrame + train_data : pd.DataFrame Description target_column_name : str Description @@ -143,35 +236,36 @@ def _find_next_best_model(self, 
data: pd.DataFrame, """ # placeholders best_model = None - best_auc = -1 + best_performance = -1 for pred in candidate_predictors: # train model with additional predictor - model = self._train_model(data, target_column_name, + model = self._train_model(train_data, target_column_name, (current_predictors + [pred])) # Evaluate model - auc_pred = model.evaluate(data[current_predictors + [pred]], - data[target_column_name]) + performance = (model + .evaluate(train_data[current_predictors + [pred]], + train_data[target_column_name])) if (self.pos_only and (not (model.get_coef() >= 0).all())): continue # check if model is better than current best model # and if yes, replace current best! - if (auc_pred >= best_auc): - best_auc = auc_pred + if (performance >= best_performance): + best_performance = performance best_model = model return best_model - def _train_model(self, data: pd.DataFrame, target_column_name: str, + def _train_model(self, train_data: pd.DataFrame, target_column_name: str, predictors: list) -> MLModel: """Summary Parameters ---------- - data : pd.DataFrame + train_data : pd.DataFrame Description target_column_name : str Description @@ -185,6 +279,6 @@ def _train_model(self, data: pd.DataFrame, target_column_name: str, """ model = MLModel() - model.fit(data[predictors], data[target_column_name]) + model.fit(train_data[predictors], train_data[target_column_name]) return model From ca199cc9038745c6d6c3d3e246a0df77b58fd907 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 18 Mar 2020 16:42:51 +0100 Subject: [PATCH 50/98] Modify forward_selection algorithm logic Change was made to make sure forced predictors could be included in models with only positive coefficients as much as possible. In the new implemtation, only when there is no possible combination of forced predictors (and other predictors) that lead to a model with only positive coefficients will some of the forced predictors be skipped. 
--- cobra/model_building/forward_selection.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index 7845d88..fc236ea 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -139,10 +139,9 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str, In case the number of forced predictors is larger than the maximum number of allowed predictors in the model """ - # prep predictor lists + # remove excluded predictors from predictor lists filterd_predictors = [var for var in predictors - if (var not in excluded_predictors - and var not in forced_predictors)] + if var not in excluded_predictors] # checks on predictor lists and self.max_predictors attr if len(forced_predictors) > self.max_predictors: @@ -188,19 +187,14 @@ def _forward_selection(self, train_data: pd.DataFrame, .difference( set(current_predictors))) else: - candidate_predictors = [var for var in predictors + candidate_predictors = [var for var in (predictors + + forced_predictors) if var not in current_predictors] model = self._find_next_best_model(train_data, target_column_name, candidate_predictors, current_predictors) - # if no new model was found, e.g. because there was no model with - # only positive coefficients, and all forced predictors were - # already tested (i.e. we are now looping through the other - # predictors) break out of the loop! 
- if (model is None) and (step > len(forced_predictors)): - break if model is not None: # Add new model predictors to the list of current predictors From 8d87d32f668c3c15064207ed519d9a0a0b3c1dd9 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Thu, 19 Mar 2020 14:14:25 +0100 Subject: [PATCH 51/98] Remove old refactored modules and extend utils.py Removed old data_preparation.py and univariate_selection.py Add clean_predictor_name utility function to utils.py --- cobra/data_preparation.py | 749 ---------------------------------- cobra/univariate_selection.py | 131 ------ cobra/utils.py | 13 + 3 files changed, 13 insertions(+), 880 deletions(-) delete mode 100644 cobra/data_preparation.py delete mode 100644 cobra/univariate_selection.py diff --git a/cobra/data_preparation.py b/cobra/data_preparation.py deleted file mode 100644 index 4445bc7..0000000 --- a/cobra/data_preparation.py +++ /dev/null @@ -1,749 +0,0 @@ -''' -====================================================================================================================== ----------------------------------------------------- DATA PREPARATION ---------------------------------------------- -====================================================================================================================== -''' -import math -import numpy as np -import pandas as pd -from scipy import stats - -# To allow pandas dataframes to display more columns -pd.set_option("display.max_columns",50) - -class DataPreparation(object): - ''' - Class for DataPreparation - Loads, clean, partition, binn, regroup,replace (incidence) - ---------------------------------------------------- - Author: Jan Benisek, Python Predictions - Date: 14/02/2018 - ---------------------------------------------------- - ***PARAMETERS*** - :partition_train: Size of training set as int <0;1> - :partition_select: Size of selection set as int <0;1> - :partition_valid: Size of validation set as int <0;1> - :sampling_1: Size of sampling of target 
class - :sampling_0: Size of sampling of non-target class - :discret_nbins: ??? - :regroup_sign: Significance level for regrouping categorical variables - :rseed: Random seed for reproducibility (partitioning). None or a number - - ***ATTRIBUTES*** - :_headers_dict: Dict of 4 lists with header names (object, numeric, bool, other) - :_partitioning_settings: Dict with train/sel/valid sets with their size - :_sampling_settings: Dict with sampling settings (how many 1's and 0's we will take) - ---------------------------------------------------- - ''' - def __init__(self, data_path, data_types_path, partition_train, partition_select, partition_valid, - sampling_1, sampling_0, discret_nbins, regroup_sign, rseed): - ''' ***PARAMETERS*** ''' - self.data_path = data_path - self.data_types_path = data_types_path - self.partition_train = partition_train - self.partition_select = partition_select - self.partition_valid = partition_valid - self.sampling_1 = sampling_1 - self.sampling_0 = sampling_0 - self.discret_nbins = discret_nbins - self.regroup_sign = regroup_sign - self.rseed = rseed - ''' ***ATTRIBUTES*** ''' - # Instance attributes = Each instance has its own version self.XY - # Not everyone is initialized here - - # Class attributes = Shared accross all instances DataPreparation.XY - DataPreparation._partitioning_settings = {'train':self.partition_train, - 'selection':self.partition_select, - 'validation':self.partition_valid} - DataPreparation._sampling_settings = {1:self.sampling_1, - 0:self.sampling_0} - - #Set seed for testing - #partitioning will be affected - if rseed: - np.random.seed(rseed) - - - - def transform(self): - ''' - Method transforms given csv - Returns DF - ---------------------------------------------------- - data_path: path to the csv with data file to be transformed - data_types_path: path to the csv with data types of the above csv dataset - ---------------------------------------------------- - ''' - key_clmns = ["ID","TARGET","PARTITION"] - 
- ##Load csv - df_transformed, df_types = self._loadCSVs(self.data_path, self.data_types_path) - - ##Clean headers - df_transformed = self._cleanHeaders(df_transformed) - self._getHeaderNames(df_transformed, df_types, key_clmns) - - ##Partitioning - df_transformed = self._addPartitionColumn(df_transformed) - - ##Sample - df_transformed = self._sampling(df_transformed) - - ##Preprocessing - #Continuous 2 bins - df_cont = self._prepNumVars(df_transformed, key_clmns) - #Regrouping categoricals - df_cat = self._prepCatVars(df_transformed, key_clmns) - #Rename booleans - df_bool = self._prepBoolVars(df_transformed, key_clmns) - - ##Merge together Preprocessing DFs - df_prep = pd.concat([ - df_cont, - df_cat[list(set(df_cat.columns) - set(key_clmns))], - df_bool[list(set(df_bool.columns) - set(key_clmns))] - ], - axis=1) - - ##Replace groups by incidence rate - df_inc = self._replaceByIncidenceRate(df_prep, key_clmns) - - ##Merge Preprocessing and Incidence DFs - df_out = pd.concat([ - df_prep, - df_inc[list(set(df_inc.columns) - set(key_clmns))] - ], - axis=1) - - ##Cleaning - del df_transformed, df_cont, df_cat, df_bool, df_prep, df_inc - - return df_out - - def _loadCSVs(self, data_path, data_types_path): - ''' - Function loads csv and if no datatype csv is present, - guesses the datatypes. - Returns raw DataFrame - ---------------------------------------------------- - data_path: path to the data csv file - data_types_path: path to the datatypes csv file - ---------------------------------------------------- - Return also data types? watch out if it is not given! 
- ''' - #Loads Data types - types_exist = True - - #load data_types - try: - df_types = pd.read_csv(data_types_path, header=None) - df_types.columns = ['variable','data_type'] - except FileNotFoundError: - types_exist = False - df_types = pd.DataFrame() - - #load data - df = pd.read_csv(data_path, header=0, sep=None, engine='python') - - #change datatypes - if types_exist: - for row in df_types.itertuples(): #0:index, 1:variable, 2:data_type - if row[2] == 'int': - #Nan is stored as float, hence the dtype. - #Won't work when converting to int with nans - df[row[1]] = df[row[1]].astype(np.float64) - if row[2] in ['str', 'bool']: - df[row[1]] = df[row[1]].apply(str) - - return df, df_types - - def _cleanHeaders(self, df): - ''' - Method cleans headers in given DataFrame. - Returns cleaned DF - ---------------------------------------------------- - df: input dataframe to be modified - ---------------------------------------------------- - ''' - #Define functions - def strip_quot(x_in): - '''Function to remove quotes from variable names and/or variable values''' - try: - x_out = x_in.strip().strip('"').strip("'") - except: - x_out=x_in - return x_out - - def lower_upper(x_in): - '''Function to put 'id' and 'target' variable names in uppercase, - all other variable names are put in lowercase''' - if ((x_in.lower() == 'id')|(x_in.lower() == 'target')): - x_out = x_in.upper() - else: - x_out = x_in.lower() - return x_out - - #Apply functions - df = df.rename(columns=strip_quot) - df = df.rename(columns=lower_upper) - df = df.applymap(strip_quot) - - return df - - def _getHeaderNames(self, df, _df_types, key_clmns): - ''' - Method returns lists with header names (int and obj). 
- Does not return anything, only initialize the self._headers_dict variable for later use - ---------------------------------------------------- - df: input dataframe from which the headers are retrieved - _df_types: dataframe with types - key_columns: list with colum names of keys - ---------------------------------------------------- - ''' - #Define function - def get_headers(dataframe,type): - '''Function to group variable names based on the data type of the variable''' - return dataframe.select_dtypes(include=[type]).columns.values - - #Get header names into a list - other_headers = key_clmns[:2] - - if len(_df_types) != 0: - bool_mask = _df_types[_df_types['data_type'] != 'bool'] - try: - bool_headers = [n for n in _df_types.loc[bool_mask==False,0].values if n not in other_headers] - except: - bool_headers = [] - else: - bool_headers = [] - - object_headers = [n for n in get_headers(df,'object') if n not in other_headers + bool_headers] - numeric_headers = [n for n in get_headers(df,'number') if n not in other_headers + bool_headers] - - self._headers_dict = {'string':object_headers, 'numeric':numeric_headers, 'bool':bool_headers, 'other':other_headers} - - def _addPartitionColumn(self, df): - ''' - Method shuffle DF and create a column PARTITIONING with train/selection/validation categories - Returns DF with new column PARTITIONING - ---------------------------------------------------- - df: input dataframe which will be parittioned - ---------------------------------------------------- - ''' - #Shuffle and sort target - df = df.iloc[np.random.permutation(len(df))].sort_values(by='TARGET', ascending=False).reset_index(drop=True) - - partition = [] - sorted_target=df['TARGET'] #Just the target since it is allready sorted (see above) - for target in [sorted_target.iloc[0],sorted_target.iloc[-1]]: - target_length = (sorted_target==target).sum() - - for part, size in DataPreparation._partitioning_settings.items(): - 
partition.extend([part]*math.ceil(target_length*size)) - - df["PARTITION"] = partition[:len(df)] - - return df - - def _sampling(self, df): - ''' - Method takes sample for the dataframe. If no sampling is specified, all data are taken. - Returns sampled DF. - ---------------------------------------------------- - df: input dataframe which will be sampled - ---------------------------------------------------- - ''' - drop_index = [] - for target, size in DataPreparation._sampling_settings.items(): - if size < 1: - sample_length = int(round((df['TARGET']==target).sum() * size)) - - for part, size in DataPreparation._partitioning_settings.items(): - part_length = int(round(sample_length * size)) - drop_index_part = df[(df['TARGET']==target) & (df['PARTITION']==part)].index[part_length:] - drop_index.extend(drop_index_part) - - df.drop(drop_index,inplace=True) - df.reset_index(drop=True, inplace=True) - - return df - - def _prepNumVars(self, df, key_columns): - ''' - Method converts numerical variables into bins. 10 bins is always used - Returns DF with binned columns - ---------------------------------------------------- - df: input dataframe (with all variables!) - key_columns: list with colum names of keys - ---------------------------------------------------- - ''' - - df_out = df.loc[:,key_columns].copy() - - for clmn in self._headers_dict['numeric']: - result = DataPreparation.__eqfreq(var=df[clmn], - train=df["PARTITION"]=="train", - autobins=True, - nbins=self.discret_nbins, - precision=0, - twobins=True, - # TRUE OPTION STILL PRODUCES ERROR IN SORTNUMERIC function AND SCORING procedure !!!!!!!!! - catchLarge=False) - df_out = pd.concat([df_out,result[0]], axis=1) - - return df_out - - def _prepCatVars(self, df, key_columns): - ''' - Method regroup categorical variables based on significance. 
- If the incidence rate in particular group is not signicantly different - from the average incidence rate, then the variable is regrouped - (will be pushed to 'Non-significants' category) - Returns DF with regrouped categorical columns - ---------------------------------------------------- - df: input dataframe (with all variables!) - key_columns: list with colum names of keys - ---------------------------------------------------- - ''' - - df_out = df.loc[:,key_columns].copy() - - for clmn in self._headers_dict['string']: - # We label missing and empty values for categorical variables as 'Missing' - # Note the interaction with the 'keep' parameter of the regroup function. - mask = DataPreparation.__maskmissing(df[clmn]) - df.loc[mask,clmn]='Missing' - # Perform regrouping function - result = DataPreparation.__regroup(var=df[clmn], - target=df.loc[:,'TARGET'], - train=df['PARTITION']=='train', - pval_thresh=self.regroup_sign, - dummy=True, - keep='Missing', - rename='Non-significants') - df_out = pd.concat([df_out,result[0]],axis=1) - - return df_out - - def _prepBoolVars(self, df, key_columns): - ''' - Method just passes the variables. - In order to be consistent, there is this special method. Otherwise they could be renamed whenever - Returns DF with renamed bool variables - ---------------------------------------------------- - df: input dataframe (with all variables!) 
- key_columns: list with colum names of keys - ---------------------------------------------------- - ''' - - df_out = df.loc[:,key_columns].copy() - - def passvar(var): - var_pass = var.copy() - var_pass.name = "B_"+var.name - info = ("Passing "+var.name) - return var_pass, info - - for clmn in self._headers_dict['bool']: - # We label missing and empty values for boolean variables as 'Missing' - mask = DataPreparation.__maskmissing(df[clmn]) - df.loc[mask,clmn]='Missing' - # Perform the passvar function - result = passvar(var=df[clmn]) - df_out = pd.concat([df_out,result[0]],axis=1) - - return df_out - - def _replaceByIncidenceRate(self, df, key_columns): - ''' - Method to replace the groups with average incidence rate (the "secret sauce"). - The variables will start with "D_" - ---------------------------------------------------- - df: input dataframe (with all variables!) - key_columns: list with colum names of keys - ---------------------------------------------------- - ''' - df_out = df.loc[:,key_columns].copy() - - headers_for_incidrep = [h for h in df.columns if ((h not in key_columns) & (h[:2]=="B_"))] - - for clmn in headers_for_incidrep: - # Perform increp function - result = DataPreparation.__increp(b_var=df[clmn], - target=df['TARGET'], - train=df['PARTITION']=="train") - df_out = pd.concat([df_out,result], axis=1) - - return df_out - - ''' - ==================================================================== - ==================== AUXILIARY STATIC METHODS ==================== - ==================================================================== - ''' - - @staticmethod - def __eqfreq(var, train, autobins=True, nbins=10, precision=0, twobins=True, catchLarge=True): - ''' - Special method for binning continuous variables into bins - ---------------------------------------------------- - var: input pd.Serie with continuous columns - train: mask with rows which belongs to train - autobins: adapts number of bins - nbins: number of bins - precision: 
precision to form meaningful bins - twobins: if only two bins are found, iterate to find more - catchLarge: check when groups are too big - ---------------------------------------------------- - - This function is a reworked version of pd.qcut to satisfy our particular needs - - Takes for var a continuous pd.Series as input and returns a pd.Series with bin-labels (e.g. [4,6[ ) - - Train takes a series/list of booleans (note: we define bins based on the training set) - - Autobins reduces the number of bins (starting from nbins) as a function of the number of missings - - Nbins is the wished number of bins - - Precision=0 results in integer bin-labels if possible - - twobins=True forces the function to output at least two bins - - catchLarge tests if some groups (or missing group) are very large, and if so catches and outputs two groups - - note: catchLarge makes twobins irrelevant - ''' - - # Test for large groups and if one exists pass them with two bins: Large_group,Other - if catchLarge: - catchPercentage=1-(1/nbins) - groupCount = var[train].groupby(by=var[train]).count() - maxGroupPerc = groupCount.max()/len(var[train]) - missingPerc = sum(var[train].isnull())/len(var[train]) - if maxGroupPerc>=catchPercentage: - largeGroup = groupCount.sort_values(ascending=False).index[0] - x_binned = var.copy() - x_binned.name = 'B_'+var.name - x_binned[x_binned!=largeGroup]='Other' - cutpoints=None - info = (var.name+": One large group, outputting 2 groups") - return x_binned, cutpoints, info - elif missingPerc>=catchPercentage: - x_binned = var.copy() - x_binned.name = 'B_'+var.name - x_binned[x_binned.isnull()]='Missing' - x_binned[x_binned!='Missing']='Other' - cutpoints=None - info = (var.name+": One large missing group, outputting 2 groups") - return x_binned, cutpoints, info - # Adapt number of bins as a function of number of missings - if autobins: - length = len(var[train]) - missing_total = var[train].isnull().sum() - missing_perten = missing_total/length*10 - 
nbins = max(round(10-missing_perten)*nbins/10 ,1) - # Store the name and index of the variable - name = var.name - series_index = var.index - # Transform var and train to a np.array and list respectively, which is needed for some particular function&methods - x = np.asarray(var) - train = list(train) - # First step in finding the bins is determining what the quantiles are (named as cutpoints) - # If the quantile lies between 2 points we use lin interpolation to determine it - cutpoints = var[train].quantile(np.linspace(0,1,nbins+1),interpolation = 'linear') - # If the variable results only in 2 unique quantiles (due to skewness) increase number of quantiles until more than 2 bins can be formed - if twobins: - extrasteps = 1 - # Include a max. extrasteps to avoid infinite loop - while (len(cutpoints.unique())<=2) & (extrasteps<20): - cutpoints = var[train].quantile(np.linspace(0,1,nbins+1+extrasteps),interpolation = 'linear') - extrasteps+=1 - # We store which rows of the variable x lies under/above the lowest/highest cutpoint - # Without np.errstate(): xcutpoints.max() can give if x contains nan values (missings) - # However the function will result in False in both >&< cases, which is a correct result, so the warning can be ignored - with np.errstate(invalid='ignore'): - under_lowestbin = x < cutpoints.min() - above_highestbin= x > cutpoints.max() - - - def _binnedx_from_cutpoints(x, cutpoints, precision, under_lowestbin, above_highestbin): - ### Attributes the correct bin ........................ - ### Function that, based on the cutpoints, seeks the lowest precision necessary to have meaningful bins - ### e.g. (5.5,5.5] ==> (5.51,5.54] - ### Attributes those bins to each value of x, to achieve a binned version of x - - # Store unique cutpoints (e.g. from 1,3,3,5 to 1,3,5) to avoid inconsistensies when bin-label making - # Indeed, bins [...,1], (1,3], (3,3], (3,5], (5,...] do not make much sense - # While, bins [...,1], (1,3], (3,5], (5,...] 
do make sense - unique_cutpoints = cutpoints.unique() - # If there are only 2 unique cutpoints (and thus only one bin will be returned), - # keep original values and code missings as 'Missing' - if len(unique_cutpoints) <= 2: - cutpoints = None - x_binned = pd.Series(x) - x_binned[x_binned.isnull()] = 'Missing' - info = (var.name+": Only one resulting bin, keeping original values instead") - return x_binned, cutpoints, info - # Store info on whether or not the number of resulting bins equals the desired number of bins - elif len(unique_cutpoints) < len(cutpoints): - info = (var.name+": Resulting # bins < whished # bins") - else: - info = (var.name+": Resulting # bins as desired") - # Finally, recode the cutpoints (which can have doubles) as the unique cutpoints - cutpoints = unique_cutpoints - - # Store missing values in the variable as a mask, and create a flag to test if there are any missing in the variable - na_mask = np.isnan(x) - has_nas = na_mask.any() - # Attribute to every x-value the index of the cutpoint (from the sorted cutpoint list) which is equal or higher than - # the x-value, effectively encompasing that x-value. - # e.g. for x=6 and for sorted_cutpoint_list=[0,3,5,8,...] the resulting_index=3 - ids = cutpoints.searchsorted(x, side='left') - # x-values equal to the lowest cutpoint will recieve a ids value of 0 - # but our code to attribute bins to x-values based on ids (see end of this subfunction) requires a min. value of 1 - ids[x == cutpoints[0]] = 1 - # Idem as previous: x-values below the lowest cutpoint should recieve a min. value of 1 - if under_lowestbin.any(): - ids[under_lowestbin] = 1 - # Similar as previous: x-values above the highest cutpoint should recieve the max. 
allowed ids - if above_highestbin.any(): - max_ids_allowed = ids[(above_highestbin == False) & (na_mask==False)].max() - ids[above_highestbin] = max_ids_allowed - # Maximal ids can now be defined if we neglect ids of missing values - max_ids = ids[na_mask==False].max() - - # Based on the cutpoints create bin-labels - # Iteratively go through each precision (= number of decimals) until meaningful bins are formed - # If theoretical bin is ]5.51689,5.83654] we will prefer ]5.5,5.8] as output bin - increases = 0 - original_precision = precision - while True: - try: - bins = _format_bins(cutpoints, precision) - except ValueError: - increases += 1 - precision += 1 - #if increases >= 5: - #warnings.warn("Modifying precision from "+str(original_precision)+" to "+str(precision)+" to achieve discretization") - #print("Modifying precision from "+str(original_precision)+" to "+str(precision)+" to achieve discretization") - else: - break - - # Make array of bins to allow vector-like attribution - bins = np.asarray(bins, dtype=object) - # If x has nas: for each na-value, set the ids-value to max_ids+1 - # this will allow na-values to be attributed the highest bin which we define right below - if has_nas: - np.putmask(ids, na_mask, max_ids+1) - # The highest bin is defined as 'Missing' - bins = np.append(bins,'Missing') - # ids-1 is used as index in the bin-labels list to attribute a bin-label to each x. Example: - # x=6 sorted_cutpoint_list=[0,3,5,8,...] ids=3 levels=[[0,3],(3,5],(5,8],...] - # The correct bin level for x is (5,8] which has index 2 which is equal to the ids-1 - x_binned = bins[ids-1] - return x_binned, cutpoints, info - - - def _format_bins(cutpoints, prec): - # Based on the quantile list create bins. Raise error if values are similar within one bin. 
- # On error _binnedx_from_cutpoints will increase precision - - fmt = lambda v: _format_label(v, precision=prec) - bins = [] - for a, b in zip(cutpoints, cutpoints[1:]): - fa, fb = fmt(a), fmt(b) - - if a != b and fa == fb: - raise ValueError('precision too low') - - formatted = '(%s, %s]' % (fa, fb) - bins.append(formatted) - - bins[0] = '[...,' + bins[0].split(",")[-1] - bins[-1] = bins[-1].split(",")[0] + ',...]' - return bins - - - def _format_label(x, precision): - # For a specific precision, returns the value formatted with the appropriate amount of numbers after comma and correct brackets - - if isinstance(x,float): - frac, whole = np.modf(x) - sgn = '-' if x < 0 else '' - whole = abs(whole) - if frac != 0.0: - val = '{0:.{1}f}'.format(frac, precision) - val = _trim_zeros(val) - if '.' in val: - return sgn + '.'.join(('%d' % whole, val.split('.')[1])) - else: - if '0' in val: - return sgn + '%0.f' % whole - else: - return sgn + '%0.f' % (whole+1) - else: - return sgn + '%0.f' % whole - else: - return str(x) - - - def _trim_zeros(x): - # Removes unnecessary zeros and commas - while len(x) > 1 and x[-1] == '0': - x = x[:-1] - if len(x) > 1 and x[-1] == '.': - x = x[:-1] - return x - - x_binned, cutpoints, info = _binnedx_from_cutpoints(x, cutpoints, precision=precision, under_lowestbin=under_lowestbin, above_highestbin=above_highestbin) - x_binned = pd.Series(x_binned, index=series_index, name="B_"+name) - return x_binned, cutpoints, info - - @staticmethod - def __maskmissing(df): - ''' - Method checks which values of a var are empty strings or null values - Returns DF mask - ---------------------------------------------------- - df: input dataframe - ---------------------------------------------------- - ''' - # Check if values are null - crit1 = df.isnull() - # Check if values are empty strings - modvar = pd.Series([str(value).strip() for value in df]) - crit2 = modvar==pd.Series(['']*len(df)) - return crit1 | crit2 - - @staticmethod - def 
__regroup(var,target,train,pval_thresh=0.01,dummy=True,keep='Missing',rename='Other'): - ''' - Method regroups categorical variables - Returns DF mask - ---------------------------------------------------- - var: input pd.Serie with cat column - target: pd.Serie with target variable - train: pd.Serie with parition variable - pval_thresh: threshold for regrouping - dummy: scale of booleans (?) - keep: keep specific groups (?) - rename: rename the insignificant category - ---------------------------------------------------- - - Each group is tested with a chi² for relevant incidence differences in comparison to a rest-group - - The rest group has the size of the remaining groups and an 'overall average incidence' (if dummy=True) or - - remaining groups average incidence' (if dummy=False) - - Groups with a pvalue above the threshold are relabled to a single group - ''' - - # Define the chi² test condition - # Groups that do not meet the condition are not analyzed and will be unconditionally relabled - def _chi2cond_(var=var,target=target,train=train): - varcounts = var[train].groupby(by=var).count() - train_inc = target[train].sum()/len(target[train]) - factor = max(train_inc, 1-train_inc) - analyze_mask = (varcounts*factor)>5 - analyze_groups = analyze_mask.index[analyze_mask].values - return analyze_groups - - # Compute overal incidence mean - incidence_mean = target[train].mean() - # Create container of which groups will be kept, compared to the groups which will be relabled - keepgroups = [] - # Cycle and test each group that meets the chi² condition - for group in _chi2cond_(): - # Container for target 0/1 observations of the group under scrutiny - obs_group = [] - # Counts of the target 0/1 occurences for the group under scrutiny - obs_group.append(((target[train]==0)&(var[train]==group)).sum()) - obs_group.append(((target[train]==1)&(var[train]==group)).sum()) - obs_group = np.array(obs_group) - # Container for target 0/1 observations of the remaining groups 
together - obs_other = [] - # Counts of the target 0/1 occurences for the remaining groups together - obs_other.append(((target[train]==0)&(var[train]!=group)).sum()) - obs_other.append(((target[train]==1)&(var[train]!=group)).sum()) - obs_other = np.array(obs_other) - # If dummy=True, we scale the two groups of target 0/1 occurences such that the incidence is equal to the overall incidence - # The size of the two groups of target 0/1 occurences is still equal to the size of the remaining groups - if dummy: - obs_other_size = obs_other.sum() - obs_other[0]=(1-incidence_mean)*obs_other_size # 0(1) index coincides with target = 0(1) - obs_other[1]=( incidence_mean)*obs_other_size - obs = np.array([obs_group,obs_other]) - # Place at least 1 observation to avoid error in chi2 test - obs[obs==0] = 1 - # Perform chi² test - pval = stats.chi2_contingency(obs, correction=False)[1] - # If pval outperforms threshold, append the group in the keepgroups list - if pval<=pval_thresh: - keepgroups.append(group) - #elif group==keep: - # keepgroups.append(group) - # If the specific group to be kept (e.g. 
'Missing') didn't pass the test, append it to the keepgroups list - if keep not in keepgroups: - keepgroups.append(keep) - # Makes a list of all groups not in the keepgroups list - regroup_mask = [val not in keepgroups for val in var.values] - var_regroup = var.copy() - # Rename those groups - var_regroup[regroup_mask] = rename - var_regroup.name = "B_"+var.name - info = (var.name+": from "+str(len(var.unique()))+" to "+str(len(var_regroup.unique()))) - return var_regroup, info - - - @staticmethod - def __increp(b_var, target, train): - ''' - Method for incidence replacement - Returns replaced pd.Serie - ---------------------------------------------------- - b_var: input pd.Serie to be replaced - target: pd.Serie with target variable - train: pd.Serie with parition variable - ---------------------------------------------------- - ''' - - #get variable name - name = b_var.name - #get overall incidence - incidence_mean = target[train].mean() - #get incidence per group - incidences = target[train].groupby(b_var).mean() - #construct dataframe with incidences - idf = pd.DataFrame(incidences).reset_index() - #get values that are in the data but not in the labels - bin_labels = incidences.index - newgroups = list(set(b_var.unique()) ^ set(bin_labels)) - #if newgroups, add mean incidence to incidence dataframe for each new group - if len(newgroups)>0: - #make dataframe: - ngdf = pd.DataFrame(newgroups) - ngdf.columns = [name] - ngdf["TARGET"] = incidence_mean - #dataframe with incidences: - idf = idf.append(ngdf) - #dataframe with the variable - vdf = pd.DataFrame(b_var) - #discretized variable by merge - d_var = pd.merge(vdf,idf,how='left',on=name)["TARGET"] - return pd.Series(d_var, name="D_"+name[2:]) - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/cobra/univariate_selection.py b/cobra/univariate_selection.py deleted file mode 100644 index a69ab74..0000000 --- a/cobra/univariate_selection.py +++ /dev/null @@ -1,131 +0,0 @@ -''' 
-====================================================================================================================== --------------------------------------------------- UNIVARIATE SELECTION -------------------------------------------- -====================================================================================================================== -''' -import numpy as np -import pandas as pd -from sklearn import metrics - -class UnivariateSelection(object): - ''' - Class for Univariate Selection. - Calculates AUC and correlation matrix - ---------------------------------------------------- - Author: Jan Benisek, Python Predictions - Date: 19/02/2018 - ---------------------------------------------------- - ***PARAMETERS*** - :preselect_auc: Minimal treshold for AUC selection - :preselect_overtrain: Threshold for difference between train and test performance - ---------------------------------------------------- - ''' - - def __init__(self, preselect_auc, preselect_overtrain): - ''' ***PARAMETERS*** ''' - self.preselect_auc = preselect_auc - self.preselect_overtrain = preselect_overtrain - - def fit(self, df): - ''' - Method fits (=performs) Univariate selection - Returns auc, correlation and list with filtered variables - ---------------------------------------------------- - df: transformed dataset - ---------------------------------------------------- - ''' - key_clmns = ["ID","TARGET","PARTITION"] - - ##AUC selection - df_auc = self._calcFilterAUC(df, key_clmns) - - ##Correlation - df_corr = self._calcCorr(df, key_clmns) - - return df_auc, df_corr - - - def _calcFilterAUC(self, df, key_clmns): - ''' - Method calculates AUC for train/test - Returns DF with AUC higher than given threshold, drops overfitted variables - and creates column signalizing if a variable has been preselected. 
- ---------------------------------------------------- - df: transformed dataset - key_clmns: list with key columns names - ---------------------------------------------------- - ''' - headers_for_auc = [h for h in df.columns if ((h not in key_clmns) & (h[:2]=="D_"))] - - def getauc(var, target, partition): - y = np.array(target[partition]) - pred = np.array(var[partition]) - pred = pred.astype(np.float64) - fpr, tpr, thresholds = metrics.roc_curve(y,pred, pos_label=1) - return metrics.auc(fpr, tpr) - - auc_list_all = [] - parts = ["train","selection"] - - for header in headers_for_auc: - auc_list_var = [header[2:]] - # We loop through the two sets ('train' and 'selection') for which an AUC score is needed - for part in parts: - auc_value = getauc(var=df[header] - ,target=df['TARGET'] - ,partition=df['PARTITION']==part) - auc_list_var.append(auc_value.round(3)) - - auc_list_all.append(auc_list_var) - - df_auc = pd.DataFrame(auc_list_all,columns=['variable','AUC train','AUC selection']) - - #Filter based on min AUC - auc_thresh = df_auc.loc[:,'AUC selection'] > self.preselect_auc - # We identify those variables for which the AUC score difference between 'train' and 'selection' is within the user-defined ratio - auc_overtrain = (df_auc.loc[:,'AUC train']*100 - df_auc.loc[:,'AUC selection']*100) < self.preselect_overtrain - - # List of variables which passed the two criteria - df_auc['preselection'] = auc_thresh & auc_overtrain - - return df_auc - - def _calcCorr(self, df, key_clmns): - ''' - Method calculates correlation on train set amongst the "D_" variables - Returns DF with correlations - ---------------------------------------------------- - df: transformed dataset - key_clmns: list with key columns names - ---------------------------------------------------- - ''' - headers_for_corr = [h for h in df.columns if ((h not in key_clmns) & (h[:2]=="D_"))] - - train = df['PARTITION']=="train" - dataforcorr = 
np.transpose(np.matrix(df.loc[train,headers_for_corr],dtype=float)) - with np.errstate(invalid='ignore', divide='ignore'): - mat_corr = np.corrcoef(dataforcorr) - - #Convert numpy to pandas - df_corr = pd.DataFrame(mat_corr) - df_corr.columns = headers_for_corr - df_corr.index = headers_for_corr - df_corr.fillna(0, inplace=True) - - return df_corr - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/cobra/utils.py b/cobra/utils.py index e7057da..8c55b7d 100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -58,3 +58,16 @@ def get_column_datatypes(data: pd.DataFrame, return {"numeric_variables": list(vars_numeric), "categorical_variables": list(vars_cat)} + + +def clean_predictor_name(predictor: str) -> str: + """Strip-off redundant suffix (e.g. "_enc" or "_bin") from the predictor + name to return a clean version of the predictor + + Args: + predictor (str): Description + + Returns: + str: Description + """ + return predictor.replace("_enc", "").replace("_bin", "") From defa4bdd97bbbab7d24baa3d4c55c65d98f712d7 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Thu, 19 Mar 2020 14:48:40 +0100 Subject: [PATCH 52/98] Move private function to utils --- cobra/model_building/univariate_selection.py | 24 +++++--------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index 1979ac8..60838e8 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -9,6 +9,7 @@ """ import pandas as pd from sklearn.metrics import roc_auc_score +import cobra.utils as utils def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, @@ -47,14 +48,14 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, Returns: pd.DataFrame: DataFrame containing for each variable the train auc and - test auc allong with a boolean indicating whether or not it is selected - based on the criteria + 
selection auc allong with a boolean indicating whether or not it is + selected based on the criteria """ result = [] for predictor in predictors: - cleaned_predictor = _clean_predictor_name(predictor) + cleaned_predictor = utils.clean_predictor_name(predictor) auc_train = roc_auc_score( y_true=target_enc_train_data[target_column], @@ -81,7 +82,7 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, df_auc["preselection"] = auc_thresh & auc_overtrain - return df_auc + return df_auc.sort_values(by='AUC selection', ascending=False) def get_preselected_predictors(df_auc: pd.DataFrame) -> list: @@ -124,7 +125,7 @@ def compute_correlations(target_enc_train_data: pd.DataFrame, correlations = target_enc_train_data[predictors].corr() - predictors_cleaned = [_clean_predictor_name(predictor) + predictors_cleaned = [utils.clean_predictor_name(predictor) for predictor in predictors] # Change index and columns with the cleaned version of the predictors @@ -133,16 +134,3 @@ def compute_correlations(target_enc_train_data: pd.DataFrame, correlations.index = predictors_cleaned return correlations - - -def _clean_predictor_name(predictor: str) -> str: - """Strip-off redundant suffix (e.g. "_enc" or "_bin") from the predictor - name to return a clean version of the predictor - - Args: - predictor (str): Description - - Returns: - str: Description - """ - return predictor.replace("_enc", "").replace("_bin", "") From aaff470f24fc16b854b75d5659366e04012f409b Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Thu, 19 Mar 2020 15:55:57 +0100 Subject: [PATCH 53/98] Add variable importance computation to models.py Moreover, split param in evaluation was made optional so that you can still evaluate the model on future datasets (for monitoring) without caching the result. 
--- cobra/model_building/forward_selection.py | 9 +++-- cobra/model_building/models.py | 49 ++++++++++++++++++++--- 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index fc236ea..674d3a4 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -85,19 +85,19 @@ def compute_model_performances(self, data: pd.DataFrame, performance_train = model.evaluate( data[data["split"] == "train"], data[data["split"] == "train"][target_column_name], - split="train" + split="train" # used for caching ) performance_selection = model.evaluate( data[data["split"] == "selection"], data[data["split"] == "selection"][target_column_name], - split="selection" + split="selection" # used for caching ) performance_validation = model.evaluate( data[data["split"] == "validation"], data[data["split"] == "validation"][target_column_name], - split="validation" + split="validation" # used for caching ) last_added_predictor = (set(model.predictors) @@ -240,7 +240,8 @@ def _find_next_best_model(self, train_data: pd.DataFrame, # Evaluate model performance = (model .evaluate(train_data[current_predictors + [pred]], - train_data[target_column_name])) + train_data[target_column_name], + split="train")) if (self.pos_only and (not (model.get_coef() >= 0).all())): continue diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 39c7a97..de09015 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -1,7 +1,11 @@ +# third party imports import numpy as np import pandas as pd +from scipy import stats from sklearn.metrics import roc_auc_score from sklearn.linear_model import LogisticRegression +# custom imports +import cobra.utils as utils class LogisticRegressionModel: @@ -86,9 +90,11 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: return self.logit.predict_proba(X[self.predictors])[:, 1] def 
evaluate(self, X: pd.DataFrame, y: pd.Series, - split: str="train") -> float: - """Evaluate the model on a given split (train, selection, validation) - of a data set (X, y) + split: str=None) -> float: + """Evaluate the model on a given data set (X, y). The optional split + parameter is to indicate that the data set belongs to + (train, selection, validation), so that the computation on these sets + can be cached! Parameters ---------- @@ -104,10 +110,41 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, float the performance score of the model (e.g. AUC) """ - if self._eval_metrics_by_split.get(split) is None: + + if (split is None) or (split not in self._eval_metrics_by_split): y_pred = self.score_model(X) - self._eval_metrics_by_split[split] = roc_auc_score(y_true=y, - y_score=y_pred) + performance = roc_auc_score(y_true=y, y_score=y_pred) + + if split is None: + return performance + else: + self._eval_metrics_by_split[split] = performance + return self._eval_metrics_by_split[split] + + def compute_variable_importance(self, data: pd.DataFrame) -> dict: + """Compute the importance of each predictor in the model and return + it as a dictionary + + Parameters + ---------- + data : pd.DataFrame + data to score the model + + Returns + ------- + dict + Map of predictor -> importance + """ + + y_pred = self.score_model(data) + + return { + utils.clean_predictor_name(predictor): stats.pearsonr( + data[predictor], + y_pred + ) + for predictor in self.predictors + } From b2e4e93a8c4c2868f9dfa6ec9131dd697e379a7e Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Fri, 20 Mar 2020 09:28:26 +0100 Subject: [PATCH 54/98] Delete old cobra files --- cobra/cobra.py | 501 --------------------------------------- cobra/model_selection.py | 302 ----------------------- 2 files changed, 803 deletions(-) delete mode 100644 cobra/cobra.py delete mode 100644 cobra/model_selection.py diff --git a/cobra/cobra.py b/cobra/cobra.py deleted file mode 100644 index cb5e6d0..0000000 --- 
a/cobra/cobra.py +++ /dev/null @@ -1,501 +0,0 @@ -''' -====================================================================================================================== ---------------------------------------------------------- COBRA ---------------------------------------------------- -====================================================================================================================== -''' -# -import cobra.data_preparation as dpc -import cobra.univariate_selection as us -import cobra.model_selection as ms -# -import seaborn as sns -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from string import ascii_lowercase - -class COBRA(object): - ''' - Wrapper class for all the child classes for easier usage - ---------------------------------------------------- - Author: Jan Benisek, Python Predictions - Date: 21/02/2018 - ---------------------------------------------------- - ***PARAMETERS*** - :data_path: Path to .csv file which contains the data - :data_types_path: Path to .csv files which contains the metadata - :partition_train: Size of training set as int <0;1> - :partition_select: Size of selection set as int <0;1> - :partition_valid: Size of validation set as int <0;1> - :sampling_1: Size of sampling of target class - :sampling_0: Size of sampling of non-target class - :discret_nbins: ??? - :regroup_sign: Significance level for regrouping categorical variables - :rseed: Random seed for reproducibility (partitioning). None or a number - - ***ATTRIBUTES*** - :_partition_dict: Dict with partitioned DFs X/Y train/selection/validation - :_headers_dict: Dict of 4 lists with header names (object, numeric, bool, other) - :_partitioning_settings: Dict with train/sel/valid sets with their size - ---------------------------------------------------- - __init__: contains variables which are established with the object. 
- If some of them is changed, then the whole process must be redone(call the class again), - because the model comparison wont make sense - transform: For the reasons before, transform has no parameters - fit_univariate: there I can change stuff when trying different modelling ideas - i.e. what variables will I get if AUC threshold is changed - fit_model: Here I want try many things, so the parametes are changeble in the method. - ''' - - def __init__(self, - data_path, - data_types_path, - partition_train=0.5, - partition_select=0.3, - partition_valid=0.2, - sampling_1=1, - sampling_0=1, - discret_nbins=5, - regroup_sign=0.001, - rseed=None): - - ''' ***PARAMETERS*** ''' - self.data_path = data_path - self.data_types_path = data_types_path - self.partition_train = partition_train - self.partition_select = partition_select - self.partition_valid = partition_valid - self.sampling_1 = sampling_1 - self.sampling_0 = sampling_0 - self.discret_nbins = discret_nbins - self.regroup_sign = regroup_sign - self.rseed = rseed - - - def transform(self): - ''' - Method transforms given csv - ---------------------------------------------------- - only self - ---------------------------------------------------- - ''' - dtrans = dpc.DataPreparation(self.data_path, - self.data_types_path, - self.partition_train, - self.partition_select, - self.partition_valid, - self.sampling_1, - self.sampling_0, - self.discret_nbins, - self.regroup_sign, - self.rseed) - - df_trans = dtrans.transform() - - self._headers_dict = dtrans._headers_dict - self._partitioning_settings = dtrans._partitioning_settings - - return df_trans - - - def fit_univariate(self, df_t, preselect_auc=0.53, preselect_overtrain=5): - ''' - Method transforms given csv - Returns univariate selection and correlation matrix - ---------------------------------------------------- - df_t: dataframe with transformed data - ---------------------------------------------------- - ''' - - unisel = 
us.UnivariateSelection(preselect_auc, - preselect_overtrain) - df_sel, df_corr = unisel.fit(df_t) - - return df_sel, df_corr - - def fit_model(self, df_t, df_us, modeling_nsteps=30, forced_vars=None, excluded_vars=None, name=None, verbose=False, positive_only=True): - ''' - Method fits and finds best model - Returns dataframe with all the info - forward selection, AUC, importance... - ---------------------------------------------------- - df_t: dataframe with transformed data - df_us: dataframe with univariate selection - modeling_nsteps: how many steps in modelling - forced_vars: list with variables to be forced in the model - excluded_vars: list with variables to be excluded in the model - name: name of the model - verbose: whether immediate steps of the procedure should be printed to the console - positive_only: whether only positive coeficients should be considered (recommended to stay so) - ---------------------------------------------------- - ''' - modsel = ms.ModelSelection(verbose=verbose, positive_only=positive_only) - - df_models = modsel.fit(df_t, - df_us, - modeling_nsteps=modeling_nsteps, - forced_vars=forced_vars, - excluded_vars=excluded_vars, - name=name) - - self._partition_dict = modsel._partition_dict - - return df_models - - def summary(self, df_t): - ''' - Method describes dataset (=prints summary). - Now just few simple things, make printing fancy and add more. 
- ------------------------------------------------------------ - df_t: dataframe with transformed data - ------------------------------------------------------------ - ''' - print('----------------- SUMMARY -----------------'.format()) - print('Dataset has {} rows and {} columns.'.format(len(df_t), len(df_t.columns))) - print('Train set has {} rows'.format(len(df_t[df_t['PARTITION'] == 'train']))) - print('Selection set has {} rows'.format(len(df_t[df_t['PARTITION'] == 'selection']))) - print('Validation set has {} rows'.format(len(df_t[df_t['PARTITION'] == 'validation']))) - print('Overall incidence rate is {0:0.2f}%'.format(df_t['TARGET'].mean()*100)) - - num_of_missings = df_t[df_t == 'Missing'].count().sum() - all_elements = len(df_t) * len(df_t.columns) - - print('{0:0.2f}% records in the dataset are missing.'.format((num_of_missings/all_elements)*100)) - print('-------------------------------------------'.format()) - - - ''' - ==================================================================== - ================ STATIC METHODS FOR VISUALIZATION ================ - ==================================================================== - ''' - @staticmethod - def plotPredictorQuality(df, dim=(12,8)): - ''' - Method plots Univarite quality of predictors - Returns plot - ---------------------------------------------------- - df: dataframe with univariate selection - dim: tuple with width and lentgh of the plot - ---------------------------------------------------- - ''' - plt.style.use('seaborn-darkgrid') - - #---------------------------------- - #------ Prepare the data -------- - #---------------------------------- - df_uq = df[['variable','AUC train','AUC selection']][df['preselection'] == True].sort_values(by='AUC train', ascending=False) - df_uq.columns = ['variable name','AUC train','AUC selection'] - df_uq = pd.melt(df_uq, id_vars=['variable name'], value_vars=['AUC train', 'AUC selection'], var_name='partition', value_name='AUC') - - 
#---------------------------------- - #------- Plot the bars ---------- - #---------------------------------- - fig, ax = plt.subplots(figsize=dim) - - ax = sns.barplot(x="AUC", y="variable name", hue="partition", data=df_uq) - ax.set_title('Univariate Quality of Predictors') - plt.show() - - @staticmethod - def plotCorrMatrix(df, dim=(12,8)): - ''' - Method plots Correlation matrix among predictors - Returns plot - ---------------------------------------------------- - df: dataframe with correlation data - dim: tuple with width and lentgh of the plot - ---------------------------------------------------- - ''' - fig, ax = plt.subplots(figsize=dim) - ax = sns.heatmap(df, cmap='Blues') - ax.set_title('Correlation Matrix') - plt.show() - - @staticmethod - def plotIncidence(df, variable, dim=(12,8)): - ''' - Method plots Incidence plot on train partition - Returns plot - ---------------------------------------------------- - df: dataframe with cleaned, binned, partitioned and prepared data - variable: variable for which the incidence plot will be shown` - dim: tuple with width and lentgh of the plot - ---------------------------------------------------- - ''' - def masterOfOrder(x): - ''' - Function converts interval or string (category) to a number, so the incidence plot can be orderd. - In case of interval -> '(151, 361]' to integer 151. 
- In case of string -> order is alphabetical - Missings and Non-significants are always put at the end - - Parameters - ---------- - x: value to be converted - - Output - ------ - Order of given value - ''' - x_split = x.split(',')[0] - replace_strings = (('...', '0'),('Missing','999999999999'), ('Non-significants','999999999999')) - for repl_str in replace_strings: - x_split = x_split.replace(repl_str[0], repl_str[1]) - x_split = x_split.strip("()[]") - - try: - order = float(x_split) - except: - LETTERS = {letter: index for index, letter in enumerate(ascii_lowercase, start=1)} - order = LETTERS[x[0].lower()] - - return order - - plt.style.use('seaborn-darkgrid') - - #---------------------------------- - #------ Prepare the data -------- - #---------------------------------- - #Set up the variable and dataframe - var_prefix = 'B_' + variable - df_plt = df[['TARGET', var_prefix]][df['PARTITION'] == 'train'].copy() - - #Aggregate the data - avg_inc_rate = df_plt['TARGET'].mean() - - aggregations = { - 'bin_inc_rate': 'mean', - 'bin_size': 'count' - } - df_plt = df_plt.groupby(var_prefix, as_index=False)['TARGET'].agg(aggregations) - df_plt['avg_inc_rate'] = avg_inc_rate - - #create a sort column and sort by it - df_plt['sort_by'] = df_plt[var_prefix].apply(lambda x: masterOfOrder(x)) - df_plt.sort_values(by='sort_by', ascending=True, inplace=True) - df_plt.reset_index(inplace=True) - - #---------------------------------- - #----- Plot the incidence ------- - #---------------------------------- - fig, ax = plt.subplots(figsize=dim) - ##First Axis - #Bin size - y_pos = np.arange(len(df_plt[var_prefix])) - plt.bar(y_pos, df_plt['bin_size'].values.tolist(), align='center', color="cornflowerblue") - plt.xticks(y_pos, df_plt[var_prefix]) - plt.ylabel('Bin Size') - plt.xlabel(variable + ' Bins') - - max_inc = max(df_plt['bin_inc_rate']) - - ##Second Axis - ax2 = ax.twinx() - #incidence rate per bin - plt.plot(df_plt['bin_inc_rate'], color="darkorange", marker=".", 
markersize=20, linewidth=3, label='incidence rate per bin') - plt.plot(df_plt['avg_inc_rate'], color="dimgrey", linewidth=4, label='average incidence rate') - ax2.plot(np.nan, "cornflowerblue", linewidth=6, label = 'bin size') #dummy line to have label on second axis from first - ax2.set_yticks(np.arange(0, max_inc+0.05, 0.05)) - ax2.set_yticklabels(['{:3.1f}%'.format(x*100) for x in ax2.get_yticks()]) - plt.ylabel('Incidence') - - ##Set Axis - sns.despine(ax=ax, right=True, left=True) - sns.despine(ax=ax2, left=True, right=False) - ax2.spines['right'].set_color('white') - - #remove white line from second grid axes - #the white lines are reguler, Spyder sometimes fails to visualize it (try to export the pic!) - ax2.grid(False) - - ##Description - fig.suptitle('Incidence Plot - ' + variable, fontsize=20, y=1.02) - ax2.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,ncol=1, mode="expand", borderaxespad=0.) - plt.show() - - @staticmethod - def plotAUC(df, dim=(12,8)): - ''' - Method plots AUC for train/selection/validation and number of selected variables - Returns plot - ---------------------------------------------------- - df: dataframe with models performance - dim: tuple with width and lentgh of the plot - ---------------------------------------------------- - AUC on optimal number of vars - ''' - plt.style.use('seaborn-darkgrid') - - #---------------------------------- - #------ Prepare the data -------- - #---------------------------------- - df_plt = df[['last_var_added','auc_train','auc_selection','auc_validation']] - df_plt.columns = ['variable name', 'AUC train','AUC selection','AUC validation'] - - highest_auc = np.round(max(max(df_plt['AUC train']), - max(df_plt['AUC selection']), - max(df_plt['AUC validation'])), 1) - - #---------------------------------- - #-------- Plot the AUC ---------- - #---------------------------------- - fig, ax = plt.subplots(figsize=dim) - - plt.plot(df_plt['AUC train'], marker=".", markersize=20, linewidth=3, label='AUC 
train') - plt.plot(df_plt['AUC selection'], marker=".", markersize=20, linewidth=3, label='AUC selection') - plt.plot(df_plt['AUC validation'], marker=".", markersize=20, linewidth=3, label='AUC validation') - #Set x/yticks - ax.set_xticks(np.arange(len(df_plt['variable name'])+1)) - ax.set_xticklabels(df_plt['variable name'].tolist(), rotation = 40, ha='right') - ax.set_yticks(np.arange(0.5, highest_auc+0.02, 0.05)) - #Make Pretty - ax.legend(loc='lower right') - fig.suptitle('Multivariate Model AUC - ' + df.name, fontsize=20) - plt.ylabel('AUC') - plt.show() - - @staticmethod - def plotVariableImportance(df, step=None, dim=(12,8)): - ''' - Method plots variable importance for given model - Returns plot - ---------------------------------------------------- - df: dataframe with models performance - step: for which model the importance will be shown - (the best if not specified) - dim: tuple with width and lentgh of the plot - ---------------------------------------------------- - Importance on optimal number of vars - ''' - plt.style.use('seaborn-darkgrid') - - #If step not specified, give model with highes performance - if step == None: - step = df['step'].iloc[df['auc_validation'].idxmax()] - - #---------------------------------- - #------ Prepare the data -------- - #---------------------------------- - #dict_plt = df['importance'].iloc[model_row] - dict_plt = df['importance'][df['step'] == step] - df_plt = pd.DataFrame.from_dict(dict_plt.iloc[0], orient='index') - df_plt.reset_index(level=0, inplace=True) - df_plt.columns = ['variable name','importance'] - df_plt.sort_values(by='importance', ascending=False, inplace=True) - - - #---------------------------------- - #------- Plot the bars ---------- - #---------------------------------- - fig, ax = plt.subplots(figsize=dim) - ax = sns.barplot(x="importance", y="variable name", data=df_plt) - ax.set_title('Variable Importance in model ' + df.name) - ax.set_title('Variable importance in model {}, step 
{}.'.format(df.name, step)) - plt.show() - - @staticmethod - def plotCumulatives(model_list, df_trans, dim=(12,8)): - ''' - Method plots cumulative response and gains in one plot for multiple models - Returns plot - ---------------------------------------------------- - model_list: list of tuples with model DF and step number (which model to take from the DF) - df_trans: dataframe with cleaned, binned, partitioned and prepared data - dim: tuple with width and lentgh of the plot - ---------------------------------------------------- - Max 5 models is allowed, on train partition - ''' - plt.style.use('seaborn-darkgrid') - - if len(model_list) >5: - raise ValueError('The maximum number of input models is 5') - - colors = ['cornflowerblue','forestgreen','firebrick','darkmagneta','orange'] - - avg_incidence = df_trans['TARGET'][df_trans['PARTITION'] == 'train'].mean() - - #---------------------------------- - #------- Plot the data ---------- - #---------------------------------- - fig, (ax_cresp, ax_cgains) = plt.subplots(1, 2, sharey=False, figsize=dim) - # - #Cumulative Response - # - for i, model in enumerate(model_list): - #------ Prepare the data -------- - cum_resp = model[0]['cum_response'][model[0]['step'] == model[1]].tolist()[0] - #------ Plot line for each model -------- - ax_cresp.plot(cum_resp, color=colors[i], linewidth=3, label='cumulative response - ' + model[0].name) - - ax_cresp.axhline(y=np.round(avg_incidence*100), color="darkorange", linewidth=3, ls="--", label='average incidence rate') - ax_cresp.set_title('Cumulative Response', fontsize=20) - #Format axes - ax_cgains.set_xlim([0,100]) - ax_cgains.set_ylim([0,100]) - #Format ticks - ax_cresp.set_yticklabels(['{:3.0f}%'.format(x) for x in ax_cresp.get_yticks()]) - ax_cresp.set_xticklabels(['{:3.0f}%'.format(x) for x in ax_cresp.get_xticks()]) - #Legend - ax_cresp.legend(loc='upper right') - # - #Cumulative Gains - # - for i, model in enumerate(model_list): - #------ Prepare the data -------- - 
cum_gains = model[0]['cum_gains'][model[0]['step'] == model[1]].tolist()[0] - #------ Plot line for each model -------- - ax_cgains.plot(cum_gains, color=colors[i], linewidth=3, label='cumulative gains - ' + model[0].name) - - ax_cgains.plot(ax_cgains.get_xlim(), ax_cgains.get_ylim(), linewidth=3, ls="--", color="darkorange", label='random selection') - ax_cgains.set_title('Cumulative Gains', fontsize=20) - #Format axes - ax_cgains.set_xlim([0,100]) - ax_cgains.set_ylim([0,100]) - #Format ticks - ax_cgains.set_yticklabels(['{:3.0f}%'.format(x) for x in ax_cgains.get_yticks()]) - ax_cgains.set_xticklabels(['{:3.0f}%'.format(x) for x in ax_cgains.get_xticks()]) - #Legend - ax_cgains.legend(loc='lower right') - - #Make pretty - plt.tight_layout() - - plt.show() - - @staticmethod - def plotAUCComparison(model_list, dim=(12,8)): - ''' - Method plots AUC comarison on train/selection/validation - Returns plot - ---------------------------------------------------- - model_list: list of tuples with model DF and step number (which model to take from the DF) - dim: tuple with width and lentgh of the plot - ---------------------------------------------------- - ''' - plt.style.use('seaborn-darkgrid') - - #---------------------------------- - #------ Prepare the data -------- - #---------------------------------- - df_plt = pd.DataFrame() - for model, step in model_list: - df_aux = pd.DataFrame(model[['auc_train','auc_selection','auc_validation']][model['step'] == step]) - df_aux['model'] = model.name - df_plt = pd.concat([df_plt, df_aux]) - - df_plt.reset_index(inplace=True, drop=True) - - df_plt.columns = ['AUC train','AUC selection','AUC validation','model'] - df_plt = pd.melt(df_plt, id_vars=['model'], value_vars=['AUC train','AUC selection','AUC validation'], - var_name='partition', value_name='AUC') - - #---------------------------------- - #------- Plot the bars ---------- - #---------------------------------- - fig, ax = plt.subplots(figsize=(12,8)) - - ax = 
sns.barplot(x="AUC", y="partition", hue="model", data=df_plt) - - ax.set_xlim([0,1.2]) - ax.set_title('AUC comparison') - plt.show() \ No newline at end of file diff --git a/cobra/model_selection.py b/cobra/model_selection.py deleted file mode 100644 index f986051..0000000 --- a/cobra/model_selection.py +++ /dev/null @@ -1,302 +0,0 @@ -''' -====================================================================================================================== --------------------------------------------------- MODEL SELECTION -------------------------------------------- -====================================================================================================================== -''' -import pandas as pd -import numpy as np -from scipy import stats -from sklearn import metrics -from sklearn.linear_model import LogisticRegression - -class ModelSelection(object): - ''' - Class for Model Selection - Finds best model using forward selection - ---------------------------------------------------- - Author: Jan Benisek, Python Predictions - Date: 19/02/2018 - ---------------------------------------------------- - ***PARAMETERS*** - :modeling_nsteps: how many variables will be used for modelling - :forced_vars: Force variables to be used in forward selection - :excluded_vars: List with variables to be excluded - :verbose: Whether more info about the ouput should be printed - - ***ATTRIBUTES*** - :_partition_dict: Dict with partitioned DFs X/Y train/selection/validation - :_optimal_nvars: Optimal number of variables - :positive_only: Whether all coeficients should be positivr - ---------------------------------------------------- - ''' - - def __init__(self, verbose, positive_only): - self.verbose = verbose - self.positive_only = positive_only - - - def fit(self, df_trans, df_unisel, modeling_nsteps, forced_vars, excluded_vars, name): - ''' - Method fits (=performs) Model Selection - Returns DF with model performance and list - 
---------------------------------------------------- - df_trans: transformed dataset - df_unisel: dataframe with univariate selection - modeling_nsteps: how many variables will be used for modelling - forced_vars: variables forced to be used in the modelling, list - excluded_vars: variables to be excluded - ---------------------------------------------------- - ''' - - self.modeling_nsteps = modeling_nsteps - - ##Create partition - self._partition_dict = self._getTrainSelectValidXY(df_trans) - - ##Perform forward selection - df_fsel = self._forwardSelection(df_unisel, forced_vars, excluded_vars) - - ##Cumulative respone/gain and adds it into df_fsel - self._cumulatives(df_fsel) - - ##Calclates importance and adds it into df_fsel - self._calcImportance(df_fsel) - - ##Give name - df_fsel.name = name - - return df_fsel - - def _getTrainSelectValidXY(self, df): - ''' - Method split given DF into train/test/validation set in respect to X and Y. - Returns dictionary with DFs - ---------------------------------------------------- - df: transformed dataset - ---------------------------------------------------- - ''' - - dvars = [n for n in df.columns if n[:2] == 'D_'] - - mask_train = df['PARTITION']=="train" - mask_selection = df['PARTITION']=="selection" - mask_validation = df['PARTITION']=="validation" - - y_train = df.loc[mask_train,'TARGET'] - y_selection = df.loc[mask_selection,'TARGET'] - y_validation = df.loc[mask_validation,'TARGET'] - - x_train = df.loc[mask_train,dvars] - x_selection = df.loc[mask_selection,dvars] - x_validation = df.loc[mask_validation,dvars] - - dict_out = {'y_train':y_train, 'y_selection':y_selection, 'y_validation':y_validation, - 'x_train':x_train, 'x_selection':x_selection, 'x_validation':x_validation} - - return dict_out - - def _forwardSelection(self, df_sel, forced_vars, excluded_vars): - ''' - Method performs forward selection - Returns DF with performance - ---------------------------------------------------- - df_sel: DF with 
selection from Univariate Selection - forced_vars: list with varibels forced to be in the forward selection - excluded_vars: list with variables to be excluded - positive_only: whether or not all coefs in logit should be positive - ---------------------------------------------------- - ''' - if not excluded_vars: - excluded_vars = [] - - if not forced_vars: - forced_vars = [] - - #Sort - df_sel = df_sel.sort_values(by='AUC selection', ascending=False) - - #Build list of variables to be used for Forward selection - preselected_vars = df_sel['variable'][df_sel['preselection'] == True].tolist() - preselected_vars = [var for var in preselected_vars if var not in forced_vars+excluded_vars] - all_vars = ['D_' + var for var in forced_vars + preselected_vars] - - df_forward_selection = pd.DataFrame(None,columns=[ - 'step', - 'coef', - 'intercept', - 'all_coefs_positive', - 'auc_train', - 'auc_selection', - 'auc_validation', - 'predictors_subset', - 'last_var_added', - 'auc_train_rank', - 'selected_model', - 'pred_training', - 'pred_selection', - 'pred_validation', - 'first_rank' - ]) - - - f_position_forced = lambda i, forced, all_vars: len(forced) if i <= len(forced) else len(all_vars) - - n_steps = min(self.modeling_nsteps + 1,len(all_vars)) - predictors = [] - row = 0 - - #----------------------------------------------------------------------------------- - #------------------------------- ITERATE FOR EVERY STEP -------------------------- - #----------------------------------------------------------------------------------- - for step in range(1,n_steps): - - pos = f_position_forced(step, forced_vars, all_vars) - remaining_predictors = [var for var in all_vars[:pos] if var not in predictors] - - #----------------------------------------------------------------------------------- - #-------------------------------- FOR EVERY COMBINATION -------------------------- - #----------------------------------------------------------------------------------- - for predictor in 
remaining_predictors: - predictors_subset = predictors + [predictor] - #Train - train model - logit = LogisticRegression(fit_intercept=True, C=1e9, solver = 'liblinear') - logit.fit(y=self._partition_dict['y_train'], X=self._partition_dict['x_train'][predictors_subset]) - - #Train - predict and AUC - y_pred_train = logit.predict_proba(self._partition_dict['x_train'][predictors_subset]) - AUC_train = metrics.roc_auc_score(y_true=self._partition_dict['y_train'], y_score=y_pred_train[:,1]) - - #Selection - predict and AUC - y_pred_selection = logit.predict_proba(self._partition_dict['x_selection'][predictors_subset]) - AUC_selection = metrics.roc_auc_score(y_true=self._partition_dict['y_selection'], y_score=y_pred_selection[:,1]) - - #Validation - predict and AUC - y_pred_validation = logit.predict_proba(self._partition_dict['x_validation'][predictors_subset]) - AUC_validation = metrics.roc_auc_score(y_true=self._partition_dict['y_validation'], y_score=y_pred_validation[:,1]) - - #check if coefs are positive - all_coefs_positive = (logit.coef_[0] >= 0).all() - - #Update DF - df_forward_selection.loc[row] = [ - step, #Step - logit.coef_, #coef - logit.intercept_, #intercept - all_coefs_positive, #all_coefs_positive - AUC_train, #auc_train - AUC_selection, #auc_selection - AUC_validation, #auc_validation - predictors_subset, #predictors_subset - predictors_subset[-1], #last_var_added - 0, #auc_train_rank - False, #selected_model - y_pred_train, #pred_training - y_pred_selection, #pred_selection - y_pred_validation, #pred_validation - 0 #first_rank - ] - row +=1 - - #Only positive coefs - if self.positive_only: - all_coefs_negative = len(df_forward_selection[(df_forward_selection['all_coefs_positive'] == True) & (df_forward_selection['step'] == step)]) == 0 - - if all_coefs_negative: - if self.verbose: - - print('No models with only positive coefficients, following step skipped.') - print(df_forward_selection[(df_forward_selection['all_coefs_positive'] == True) & 
(df_forward_selection['step'] == step)]) - #Skip the next steps and go the next iteration - #The fitted models are not of interest if the user explicitly - # says positive_only=True - continue - - ''' - Previous solution, I kept it FYI - if len(df_forward_selection[(df_forward_selection['all_coefs_positive'] == True) & (df_forward_selection['step'] == step)]) == 0: - raise ValueError("No models with only positive coefficients","NormalStop") - ''' - - ##Find best model - #Sort AUC by size (if two with same performance, ranked by first appearence) - df_forward_selection['auc_train_rank'] = df_forward_selection.groupby('step')['auc_train'].rank(ascending=False, method='first') - - #Find model where AUC is highest AND all coefs are positive - convert to boolean flag - df_forward_selection['first_rank'] = df_forward_selection[df_forward_selection['all_coefs_positive'] == True].groupby(['step'])['auc_train_rank'].transform(min) - df_forward_selection['selected_model'] = (df_forward_selection['first_rank'] == df_forward_selection['auc_train_rank']) - else: - ##Highest AUC, regardless of coefs - df_forward_selection['auc_train_rank'] = df_forward_selection.groupby('step')['auc_train'].rank(ascending=False, method='first') - df_forward_selection['selected_model'] = (df_forward_selection.groupby(['step'])['auc_train_rank'].transform(min) == df_forward_selection['auc_train_rank']) - - ##Add next predictor - add_variable = df_forward_selection.loc[(df_forward_selection['selected_model'] == True) & (df_forward_selection['step'] == step), 'last_var_added'].iloc[0] - predictors.append(add_variable) - - #Return only DF with selected models - clmns_out = ['step', 'coef', 'intercept', 'auc_train', 'auc_selection', 'auc_validation', - 'predictors_subset', 'last_var_added','pred_training','pred_selection','pred_validation'] - - df_out = df_forward_selection[clmns_out][df_forward_selection['selected_model'] == True] - - - #Reset index - otherwise lots of nasty errors later - 
df_out.reset_index(inplace=True, drop=True) - - return df_out - - def _cumulatives(self, df): - ''' - Method calculates cumulative gains/response - Returns nothing, adds cgains/response into the dataframe - ---------------------------------------------------- - df: df with best models - ---------------------------------------------------- - ''' - - def cumulatives(y,yhat,perc_as_int=False,dec=2): - nrows = len(y) - npositives = y.sum() - y_yhat = pd.DataFrame({"y":y, "yhat":yhat}).sort_values(by='yhat', ascending=False).reset_index(drop=True) - cresp = [] - cgains = [0] - for stop in (np.linspace(0.01,1,100)*nrows).astype(int): - cresp.append(round(y_yhat.loc[:stop,'y'].mean()*max(100*int(perc_as_int),1),dec)) - cgains.append(round(y_yhat.loc[:stop,'y'].sum()/npositives*max(100*int(perc_as_int),1),dec)) - return cresp,cgains - - cresp_all = [] - cgains_all = [] - - for i in range(0,len(df)): - - out = cumulatives(y=self._partition_dict['y_selection'], - yhat=df.iloc[i]['pred_selection'][:,1], - perc_as_int=True, - dec=2) - cresp_all.append(out[0]) - cgains_all.append(out[1]) - - #Add it to the models dataframe - df['cum_response'] = cresp_all - df['cum_gains'] = cgains_all - - def _calcImportance(self, df): - ''' - Method calculates importance of each variable - Returns nothing, adds importnace into the dataframe - ---------------------------------------------------- - df: df with best models - ---------------------------------------------------- - ''' - importance_all = [] - for row in df.index: - importance_dict = {} - for pr in df.iloc[row,:]['predictors_subset']: - corr = stats.pearsonr(self._partition_dict['x_selection'].loc[:,pr].values, df.iloc[row,:]['pred_selection'][:,1]) - importance_dict[pr[2:]] = corr[0] - - importance_all.append(importance_dict) - - #Add it to the models dataframe - df['importance'] = importance_all From 4770ced89c9326d5e1ece382b61788962d3334a1 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Fri, 20 Mar 2020 10:31:14 +0100 
Subject: [PATCH 55/98] Improve model perf comp code of forward selection Add a splits argument to compute_model_performances so that we can compute the result on several splits, even if we do not know in advance which ones. --- cobra/model_building/forward_selection.py | 45 ++++++++++------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index 674d3a4..543334c 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -56,7 +56,9 @@ def get_model_from_step(self, step: int) -> MLModel: return self._fitted_models[step] def compute_model_performances(self, data: pd.DataFrame, - target_column_name: str) -> list: + target_column_name: str, + splits: list=["train", "selection", + "validation"]) -> list: """Compute for each model the performance for train-selection-validation sets and return them along with a list of predictors used in the model. 
@@ -70,6 +72,8 @@ def compute_model_performances(self, data: pd.DataFrame, dataset for which to compute performance of each model target_column_name : str name of the target column + splits : list, optional + list of splits to compute performance on Returns ------- @@ -81,36 +85,27 @@ def compute_model_performances(self, data: pd.DataFrame, results = [] predictor_set = set([]) for model in self._fitted_models: - # Evaluate model - performance_train = model.evaluate( - data[data["split"] == "train"], - data[data["split"] == "train"][target_column_name], - split="train" # used for caching - ) - - performance_selection = model.evaluate( - data[data["split"] == "selection"], - data[data["split"] == "selection"][target_column_name], - split="selection" # used for caching - ) - - performance_validation = model.evaluate( - data[data["split"] == "validation"], - data[data["split"] == "validation"][target_column_name], - split="validation" # used for caching - ) last_added_predictor = (set(model.predictors) .difference(predictor_set)) - - results.append({ + tmp = { "predictors": model.predictors, - "last_added_predictor": list(last_added_predictor)[0], - "train_performance": performance_train, - "selection_performance": performance_selection, - "validation_performance": performance_validation + "last_added_predictor": list(last_added_predictor)[0] + } + + # Evaluate model on each data set split, + # e.g. 
train-selection-validation + tmp.update({ + f"{split}_performance": model.evaluate( + data[data["split"] == split], + data[data["split"] == split][target_column_name], + split=split # parameter used for caching + ) + for split in splits }) + results.append(tmp) + predictor_set = predictor_set.union(set(model.predictors)) return results From 9d4f90a7ab5c98a7753fc342d8fbaeebe6c001da Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Fri, 20 Mar 2020 10:54:14 +0100 Subject: [PATCH 56/98] Modify processor.train_selection_validation_split Changed train_selection_validation_split to allow the validation_pct to be 0.0 (i.e. has no validation set) --- cobra/preprocessing/preprocessor.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 417ff02..cc33367 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -234,7 +234,6 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, train_data = self._discretizer.transform(train_data, continuous_vars) - if discrete_vars: begin = time.time() self._categorical_data_processor.fit(train_data, @@ -366,6 +365,20 @@ def train_selection_validation_split(data: pd.DataFrame, random_state=42, stratify=stratify) + df_train = pd.DataFrame(X_train, columns=predictors) + df_train[target_column_name] = y_train + df_train["split"] = "train" + + # If there is no validation percentage, return train-selection sets + # only + if validation_pct == 0.0: + df_selection = pd.DataFrame(X_test, columns=predictors) + df_selection[target_column_name] = y_test + df_selection["split"] = "selection" + + return (pd.concat([df_train, df_selection]) + .reset_index(drop=True)) + if stratify_split: stratify = y_test @@ -376,10 +389,6 @@ def train_selection_validation_split(data: pd.DataFrame, stratify=stratify ) - df_train = pd.DataFrame(X_train, columns=predictors) - df_train[target_column_name] = 
y_train - df_train["split"] = "train" - df_selection = pd.DataFrame(X_sel, columns=predictors) df_selection[target_column_name] = y_sel df_selection["split"] = "selection" From 1c49c9fd0a11d00e672436907c9f6cd083b1b13d Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Fri, 20 Mar 2020 11:37:37 +0100 Subject: [PATCH 57/98] Update preprocssing modules to fix pandas warnings Also fixed bug in KBinsDiscretizer serializer so that constant columns are now included as None in serialization. --- .../categorical_data_processor.py | 21 +++++----- cobra/preprocessing/kbins_discretizer.py | 17 ++++---- tests/preprocessing/test_preprocessor.py | 40 ------------------- 3 files changed, 20 insertions(+), 58 deletions(-) delete mode 100644 tests/preprocessing/test_preprocessor.py diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index dd94a8b..5b3a4fc 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -271,12 +271,14 @@ def _transform_column(self, data: pd.DataFrame, """ column_name_clean = column_name + "_processed" - data[column_name_clean] = data[column_name].astype(object) + data.loc[:, column_name_clean] = data[column_name].astype(object) # Fill missings first - data[column_name_clean] = (CategoricalDataProcessor - ._replace_missings(data, - column_name_clean)) + data.loc[:, column_name_clean] = (CategoricalDataProcessor + ._replace_missings( + data, + column_name_clean + )) if self.regroup: categories = self._cleaned_categories_by_column.get(column_name) @@ -289,13 +291,14 @@ def _transform_column(self, data: pd.DataFrame, "and will be skipped".format(column_name)) return data - data[column_name_clean] = (CategoricalDataProcessor - ._replace_categories( - data[column_name_clean], - categories)) + data.loc[:, column_name_clean] = (CategoricalDataProcessor + ._replace_categories( + data[column_name_clean], + categories)) # change data to 
categorical - data[column_name_clean] = data[column_name_clean].astype("category") + data.loc[:, column_name_clean] = (data[column_name_clean] + .astype("category")) return data diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 9ba9449..4ad153d 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -125,7 +125,7 @@ def attributes_to_dict(self) -> dict: params = self.get_params() params["_bins_by_column"] = { - key: [list(tup) for tup in value] + key: [list(tup) for tup in value] if value else None for key, value in self._bins_by_column.items() } @@ -193,9 +193,8 @@ def fit(self, data: pd.DataFrame, column_names: list): bins = self._fit_column(data, column_name) - if bins is not None: - # Add to bins_by_column for later use - self._bins_by_column[column_name] = bins + # Add to bins_by_column for later use + self._bins_by_column[column_name] = bins def _fit_column(self, data: pd.DataFrame, column_name: str) -> List[tuple]: @@ -273,7 +272,7 @@ def transform(self, data: pd.DataFrame, # can be None for a column with a constant value! 
bins = self._bins_by_column[column_name] - if bins: + if bins is not None: data = self._transform_column(data, column_name, bins) return data @@ -305,14 +304,14 @@ def _transform_column(self, data: pd.DataFrame, column_name_bin = column_name + "_bin" # use pd.cut to compute bins - data[column_name_bin] = pd.cut(x=data[column_name], - bins=interval_idx) + data.loc[:, column_name_bin] = pd.cut(x=data[column_name], + bins=interval_idx) # Rename bins so that the output has a proper format bin_labels = self._create_bin_labels(bins) - data[column_name_bin] = (data[column_name_bin] - .cat.rename_categories(bin_labels)) + data.loc[:, column_name_bin] = (data[column_name_bin] + .cat.rename_categories(bin_labels)) if data[column_name_bin].isnull().sum() > 0: diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py deleted file mode 100644 index 28432ae..0000000 --- a/tests/preprocessing/test_preprocessor.py +++ /dev/null @@ -1,40 +0,0 @@ -from contextlib import contextmanager -import pytest -from pytest_mock import mocker - -import numpy as np -import pandas as pd - -from cobra.preprocessing import PreProcessor -from cobra.preprocessing import KBinsDiscretizer -from cobra.preprocessing import TargetEncoder -from cobra.preprocessing import CategoricalDataProcessor - - -@contextmanager -def does_not_raise(): - yield - - -class TestPreProcessor: - - def test_from_pipeline(self): - pass - - def test_fit(self): - pass - - def test_transform(self): - pass - - def test_train_selection_validation_split(self): - pass - - def test_get_variable_list(self): - pass - - def test_serialize(self): - pass - - def test_is_valid_pipeline(self): - pass From 64bff7994c73dd6b0c17e7588d8a34db246b03de Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Fri, 20 Mar 2020 12:00:29 +0100 Subject: [PATCH 58/98] Bug fix in models.compute_variable_importance Changed output from tuple (Pearson's corr coeff, p-value) to only the Pearson's corr coef --- 
cobra/model_building/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index de09015..53f8c3c 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -145,6 +145,6 @@ def compute_variable_importance(self, data: pd.DataFrame) -> dict: utils.clean_predictor_name(predictor): stats.pearsonr( data[predictor], y_pred - ) + )[0] for predictor in self.predictors } From fd2d3bccbc97435136b83f9f71c44e942fce818e Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Fri, 20 Mar 2020 13:59:21 +0100 Subject: [PATCH 59/98] Add usage section to README --- README.md | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 96 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e61bd6b..cc7293b 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Note that this package is a refactored version of the back-end of the original w - add columns with incidence rate per category/bin. * Perform univariate selection based on AUC * Compute correlation matrix of predictors - * Find best model by forward selection + * Find the suitable variables using forward feature selection * Visualize the results * Allow iteration among each step for the analyst @@ -53,12 +53,104 @@ As this package is an internal package that is not open-sourced, it is not avail * Clone this repository. * Open a shell that can execute python code and navigate to the folder where this repo was cloned in. - * Once you are in the folder, execute `python setup.py install` or `pip install .`. + * Once you are in the folder, execute `python setup.py install` or `pip install .` (preferred). ### Usage -TO DO +This section contains detailed examples for each step. We assume the data for model building is available in a pandas DataFrame called `basetable`. 
+ +```python +from cobra.preprocessing import PreProcessor + +# Prepare data +# create instance of PreProcessor from parameters +# (many options possible, see source code for docs) +path = "path/to/store/preprocessing/pipeline/as/json/file/for/later/re-use/" +preprocessor = PreProcessor.from_params(serialization_path=path) + +# split data into train-selection-validation set +# in the result, an additional column "split" will be created +# containing each of those values +basetable = preprocessor.train_selection_validation_split( + basetable, + target_column_name=target_column_name, + train_pct=0.6, selection_pct=0.2, + validation_pct=0.2) + +# create list containing the column names of the discrete resp. +# continuous variables +continuous_vars = [] +discrete_vars = [] + +# fit the pipeline (will automatically be stored to "path" variable) +preprocessor.fit(basetable[basetable["split"]=="train"], + continuous_vars=continuous_vars, + discrete_vars=discrete_vars, + target_column_name=target_column_name) + +# When you want to reuse the pipeline the next time, simply run +# preprocessor = PreProcessor.from_pipeline(path) and you're good to go! + +# transform the data (e.g. perform discretisation, incidence replacement, ...) +basetable = preprocessor.transform(basetable, + continuous_vars=continuous_vars, + discrete_vars=discrete_vars) +``` + +Once the preprocessing pipeline is fitted and applied to your data, it is time for the actual modelling. 
In this part of the process, +we first start with the _univariate preselection_: + +```python +from cobra.model_building import univariate_selection + +# perform univariate selection on preprocessed predictors: +df_auc = univariate_selection.compute_univariate_preselection( + target_enc_train_data=basetable[basetable["split"] == "train"], + target_enc_selection_data=basetable[basetable["split"] == "selection"], + predictors=preprocessed_predictors, + target_column=target_column_name, + preselect_auc_threshold=0.5, + preselect_overtrain_threshold=5) + +# compute correlations between preprocessed predictors: +df_corr = (univariate_selection + .compute_correlations(basetable[basetable["split"] == "train"], + preprocessed_predictors)) + +# get a list of predictors selected by the univariate selection +preselected_predictors = (univariate_selection + .get_preselected_predictors(df_auc)) +``` + +After a preselection is done on the predictors, we can start the model building itself using _forward feature selection_ to choose the right set of predictors: + +```python +from cobra.model_building import ForwardFeatureSelection + +forward_selection = ForwardFeatureSelection(max_predictors=30, + pos_only=True) + +# fit the forward feature selection on the train data +# has optional parameters to force and/or exclude certain predictors +forward_selection.fit(basetable[basetable["split"] == "train"], + target_column_name, + preselected_predictors) + +# compute model performance (e.g. 
AUC for train-selection-validation) +performances = (forward_selection + .compute_model_performances(basetable, target_column_name)) + +# After plotting the performances and selecting the model, +# we can extract this model from the forward_selection class: +model = forward_selection.get_model_from_step(5) + +# Note that model has 6 variables (python lists start with index 0), +# which can be obtained as follows: +final_predictors = model.predictors +# We can also compute the importance of each predictor in the model (dict): +variable_importance = model.compute_variable_importance(transformed_data) +``` ## Development -We'd love you to contribute to the development of Cobra! To do so, clone the repo and create a _feature branch_ to do your development. Once your are finished, you can create a _pull request_ to merge it back into the main branch. Make sure to write or modify unit test for your changes! +We'd love you to contribute to the development of Cobra! To do so, clone the repo and create a _feature branch_ to do your development. Once you are finished, you can create a _pull request_ to merge it back into the main branch. Make sure to write or modify unit tests for your changes if they are related to preprocessing! 
From 26cadd5ef8369e195ea9f1ba3d7e4f23ec8e9858 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 23 Mar 2020 14:08:47 +0100 Subject: [PATCH 60/98] Clean up of repo --- LICENSE | 21 ------------------- datasets/analysis_settings.csv | 10 --------- datasets/data_types.csv | 36 --------------------------------- datasets/titanic.xls | Bin 296960 -> 0 bytes examples/testing.py | 19 ----------------- 5 files changed, 86 deletions(-) delete mode 100644 LICENSE delete mode 100644 datasets/analysis_settings.csv delete mode 100644 datasets/data_types.csv delete mode 100644 datasets/titanic.xls delete mode 100644 examples/testing.py diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 269bf5f..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2018 Jan Benisek - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
diff --git a/datasets/analysis_settings.csv b/datasets/analysis_settings.csv deleted file mode 100644 index f374672..0000000 --- a/datasets/analysis_settings.csv +++ /dev/null @@ -1,10 +0,0 @@ -partitioning_train,50 -partitioning_selec,30 -partitioning_valid,20 -sampling_1,100 -sampling_0,100 -discretization_nbins,5 -regrouping_signif,0.001 -preselection_auc,0.53 -preselection_overtrain,5 -modeling_nsteps,30 diff --git a/datasets/data_types.csv b/datasets/data_types.csv deleted file mode 100644 index 895323b..0000000 --- a/datasets/data_types.csv +++ /dev/null @@ -1,36 +0,0 @@ -age,int -workclass,str -fnlwgt,int -education,str -education-num,int -marital-status,str -occupation,str -relationship,str -race,str -sex,str -capital-gain,int -capital-loss,int -hours-per-week,int -native-country,str -TARGET,int -ID,int -scont_1,int -scont_2,int -scont_3,int -scont_4,int -scont_5,int -scont_6,int -scont_7,int -scont_8,int -scont_9,int -scont_10,int -scat_1,str -scat_2,str -scat_3,str -scat_4,str -scat_5,str -sflag_1,int -sflag_2,int -sflag_3,int -sflag_4,int -sflag_5,int diff --git a/datasets/titanic.xls b/datasets/titanic.xls deleted file mode 100644 index acbf5ece74e71f6f952f87cf933470858e181183..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 296960 zcmeFadu-%ccHdV$JG0aK)Q+^fdhhrt-~Df2`J=z~8$a;JrJ4VJ zwDf_}|Mm1kr4JU$U*hNG7ec`&O4L7ns^8+jU#0Fx`SS*UevCgY{?O$A|L_095ctq_ zHu8fnmG&3^>z9B3cdnH__;RWA$E6oaf9u8H*&NUv!QOsn)&e!5or1xjBjeZS4@yY27qvA=)a+Wj5-`>Unz=9^Nf_QU_{ zCl>9eYo#B5;m54qzhQsR+20@M?`x&MZQuNTD=C+5mHs!rc>2?SnGyBhcU{{rbxLQY ztTZeor7iw_!oRzvK4odCSNfOhix*2DEotRi{QFt?e@V-yKmB*KMSSx9zR02bzP@;^ zbV9p^&5AAhH$Qf*4cxoz$)nPMzmw7ie;=1lN+b5-Go{zBU8A+lh}JL9v{s83-?PO( zU@_0{;~%i#SMoVsHHun%@;zI8&eY{Zm-q@n_teIYqqdcQUsq~@u=%ZA+{>&Did9hl0iCJ8mEB$IR2T@B(pDVr2 zpU;&RN?$HbO21jUS*ZEUi(lmu{Gn%k_7hNptaMoV$>((NM(O3!4`1EEEyjM{=wAPs z7uQ+X-1ED5@0rBT#--Ovi=}Hn1q3Fmer*aKs{3;3a*Ju$Jge0l5S+8Gy^kNWpFi{B ztVTJpKqKlizXpGv;0pTBvt_}kpgIV=CMpGrRS;$LEie(d>E^;JD)E 
zO)bvBp_mWaDE$qCe;hP^pN|_qwxNN4{^j?UepMX7PkjCtZ!Rt@{iLmo#tRFddGVvA zU*l+&7VKAk|5)j7a?axK6nv!g>*9dD!5=}=_22*gG_imEuir^jel3>&S*&_~x$aEs zXQutL%GK*t?f+#izXlDO{O9oeehLf@zZGg;{CR`_&DM)|+?jPjp*M)|+{jPhT3 zM)|Kkqx@O(`FEbt{%<^^{8{~b`|rC1A8&uT+~;$-+~;$-{8|0~hRySG`~UbE<cKCcgGF4upTQC3Uq&(rr%jGlUN)7O5682}x5gvU$Iouqtz9*LdLFv!&;}67f zzblsK^LM5qIG`1^T$C6?2_!{7JiC0N0$;QXQ3erBFrz~253kRM9F zk;CUxv3#ce%amW!UHaqvVE;MXUgi?A@^8+T|N52X+$LA{976##Seb#`pwd}rpmuUix&{>KZ&xZWPij*a09>g zaaiSR-@4wK`2Y@xX3zTIh3lcG7is3)!Z+z)wiDQn_wMBB8_(~g`hGg8RHr+cU4M;E zp1uATE>C?SA5|AMsPwc5uoqrr{i^uAfBN4%efIk2r#ktC(g&~ZNSvf zQWm-C4;RX4ZvTDj`s}1&6sIQj#?_Ph(?9*wOOtxzjW;e$N@cT?QrYaJR2Jq0lx8P| z)KZw#|M{(FfcvUR{fmRt>_KWWDL;*Z{-sI9ZF&0bYdNGYuk5vaWugK4?@E;ZrC<7` zypxCn|Kh+sdoX{LY52j^%Gj4_9QfAtw`VYUiDoaA;`w{}^xgb}IN9I)&EI@l3af~a zv9hp=h_HDXND~n@F9T^VQ_}{dA&B;CV0+ z+xtFX^88LD%f4qP?`}N5lh00d!Y^0Dp^2w=Ao=H!h-w6W5`+>>x zJ4vQG;g_pn^88L-eLtN@Q~f-cNbGpeEzXXf-^mZYpHAN0cz!28GSvycTn&?ACqMhb zcb7iF^;N1Yd?C4+vzk$T2z24UYcGE0>BmqpE|k7dO3();H|1->H zGS&zH*DyG}{9<KYjP= zVLmxKO#SjOx1KXheKO4T?C9_P@t+UuUd`wFirtv;7ty2B@|Lh<6Fd}ID zm(#Q#o*m{2O#A$EhACoNrTS-o>n~nC%x7kYxdPMXpEFDm(<;^9|NY;;dYI474s!*j zz5bkGikMcZKK*+~R}YiS4s!*jEj(wKBBoWUKl97s+|2IHkIW8p1*X-WGfWZFD%D^9 zyFYW)YCeiy5xy(GH(N9J<_|x8`I&bmcC4TO@2>9jbCXWNsM$_`WTw+17L`hW@~!gK zoqm4ODflzn=@(`?E#gecgl)Ukr=k2&7=nEHV9RW$H)c95VhBg+cS={!^ox^D!HwBY zZ_adD#D`L8W$>S0-RY~7PQirPPRlc$7I(i?I{&r1S9N;5G|#nHT22qYbA4%g=dbV^8#Pn1?mZ{xqGz38c8v)M_d87Vmn+{_G1MN60`tiJe3HW;=i7 ziq4(>J)`rdZ~W@J{|gII=hb58(7@Twzx#^Loen;ub7u`+u+D46&Y^`f1mOeUb4BN( zhrVMk7r`Q|t$+LaWvqSSx^;2^5@c+>oacHRI{o*vFMJqo23^s5?|cmUjHkExg--&5 zby}Z;ze1D=$#{xMAa1nm2LBI9#wSWm(Xx$B?=(9MG~@ovWWEntCT~AYULIFPs_}sw zgLC>7DVDU~A7rH;#pQ`>XDb_z1}P+IhL~UbA&9vgJhjpRcqn4z;)&5<>?m=)zksnN z91_s*+a*9gH$7L}B0orHI%y&*>Po==P-#g=YO6oWzH|N2^d~QTm?Nc6M$AF#NJ*16 zGP>+Y^`wWM&|?2BQa%4IQa%6eXJ7aTemCg7w#Ose;3qSKeu}QQZONCnxOgjwh)!*> z3MMo0-{MUCw>T63P1yP6Z@&bnK0iH$mtF$yJBFE2H{ihfaOo?;%>B-&3$)79Uwz=a z*bTYnjsULUb@@PC6LNpmy6BIz;WO%q4}8CMA%Oc1y=XWzrGgOLBdPJ##@~cduath2 
zJt{p+2R%_Wr@(JaPxOV4K+GR9!T`LDgzYNIt9#xc{4>{oy!0~#J<4BZb+7RMKAk+W z)$TFn3;Su4wToVkL9+@_?35lG22ENN-D4Z2KX*;$l)uE?@u#1=#!uHvqs}Ppbq=aO zcm4ZHKUWx4wfav*zle+YFBa-l{;z|`vOE9HYx?)vi{DGj_|5NLyI%TTe);Xa7fS!U z^r@tb^6sC&5TNUAchkY?>&fQeR`M|Ij~gX85*&6w{vvV9~}B1ceDQBI7_}Z?(_~Wm`1zLXcu0$ zdiW$A940&cvux16(5L;g^HIMSd)!M0X|kI=$_@_Fq*%-3TixvKw0Fq-_tT4Re~{c8 zz)?;*7}$(v+RSgJT-2-G&O2$c-aF0)LuPXWa^CI_daO#*;=A4cqiirr;zANW>pt%E z(qs+)`(CnzPkPBR9}dzp>?Z9&fB1-nw$t;`tt2eb~-xA2Ay8!bA6PJPLj2>I~;L+-@u5Ge{6tXO9z8aHcEE6^xESyPQoxO z#HTb!4*_{He+MvSy@M=S9(0b6hspj)dXWum9h0x(I{c$gbL*$wuw6Tu&}Aj-b%qI> zwF25^2C^Jt4?(34&;dETRui!~YAa8yx5u4Bpr;AmVg|q+0DfNWClzc#tV!m#2c5LX z>Mw>jZqTfAVAyNBYrE7svVzN<)A&&`zdL4g9&`?M6mEdvEB*0tH?E1eVb|A_|oiW;oN$s`s5_-rpd~9#3F9AXgxkUO73z3 z1?G+P@Zwf7zcD@#Xc%bF8IFKxn(Q(0YykUucq5sg0}qo;j%wPQyYc!=AA22u(e4k= z`rz@|xOZsn^X4G?%2;ssO$|tpfTS8=2<%oN(8H7TNkGfx{&9@Voeuh49fFPixHHVKF|Cf7dyZ3fj_qu0j)%kS0_wKXvxq_OqfJ79V^<{#Mq?cbij#I z%|$ODvmLhe!IKUPn_C}1FplSLe3mAhy0bJiTF!cJrS!$VJVs}qcDpQ#7NcPCH*Ic1;5oc8axjGMtcIg#j@|{l~|k z@(p!8NP7qA2(sCUGv7$Z973A`1VRl?*+7BTPV`wiOm@=2NqWX!Q#%F%;I1FUwX6r~ z2jl^ziuL+_eVC>P$z4u)_Yl!xx|(TDeGduco)f|DfOu>#_@&(vLLGpVH}^)_iOt(q z{q~s=uHGA0H!&c@HUocLsr| z?lK=PL4gal7+?wwnAE-W;Lnet2nbc96AomtmibIobG$p<3kJAIcCxH{gKfQ=g??IK z{dkb9L6Yxg9j?4MtJYcC?dVY5?L2aVY`DGB*Rj<`^g22U7~Ap&!iJ3uTE?Y4zmtyb zaJG}>ey017Gj_Lg-0k#z0S`I{h{c@AyZ!EA)`I|tqqmoyjls|j7ByW(@3AhK=Su7x zr^($x`l$b8c$(^(QuE=#$yw)cE7e*h{`2hPQX z=cjj7^ua&%YQP?P^a~lDDF4*RzG2CkE~m3|maJKd#M~SAPQf!B_Pc{D>m6|~2KFZs z!@w74%~LHLqKVyp$^0_Z81w|02L16cyCLX}j|&v)77T_H5D5#-0JmGV`b>O;wRAX4AB92E zv$OONu4DDI%dHg)d&2}VcRZx^>gfBC+*=L%e@ls@0HG@KBuWra<4zk&QCZ^ zm&)Kk)#K#j{j}F(8zIrF-QlBgJn<1F?xknhP?sK;0|$cH*#XR1ZKLvBKC5S)u*qCu zx|~q!ourRI+*Mc(t|(uZkP`Z8y)qAL>m0491w*lpJDnZ`(A#f}JI5VePaTd_-1$|G zsWt=@&%_yI13*1<`>pkRTs)#-g}YA!0J>rkH>`&2=j^_NPIH@-1z_mYejps}lYYN< ztjpPjh(MOa3g~VLSMS_P+8JKViKrKv!MS^@4+l@vP^2?PHkmr7djX#z|ZiF*fr+_I7eWodbnZZz3o8WNk`)E@Zy-h4CAqwk3XyTb9~d`B2z)ae1VyXoMl12OlG#cVLwyC)K_PO_sTFW!?Z*;g+ez%@J#dks=% 
z@4&lTd5_*{ZD766<>G?I96xrlIZ>Rjt=53yFOt?V&C=Uci2$EL^7uaR$hIz*eImH# ztc=bg$qrvYUMHGG_mP#7yPZQgk?#?~H)3tT zYIXY^28BAWrJPF>fO16703XF7Yd+8Q-tlAc2;1GX!(S{AqBGUve!qtt46Y+NKwEdS z4inF`rnU6+6b2I!AuM_)qafKq4u{7>(wk`jQcbl0lR2R3xzjyFca{Y12hin)qbt>VA2?V(7Jno$I(ba(GRKpkh>{^eE`GVt+ zfX~G%$=O#r2#93;LwM}!4vSjj;vB-fCwG%A4jo*uX(4PhX3YhZ%x&@aoMw};*CX-M zm#aUf>IhI66D>MB%*C;7ILiU-B2$B!A}yT4$QfX*s|_|dwGrTk3*`^9-eK1+qmEa1 z(aks?;Zv|pg?4sQ>lTDR?LY1uI7rryVc`yhL(!!&G}OnJBJnC4
  • 1W2@SD5e;2=q5LxcG%88EQKNXdT32PIO6hWaFx%Ql(iiu!6VT5Z`D)3nYinoX;!9@>j)(<FKH87 z8l+vf$=0->_L9DCmn(wSN_in!DOUuy#Y(cg=#a3jH1M|<)%o(0%2y03E4Gg1hE1fb zZdVP??W&h(#Vc0cUaDegt{-X(m1aXfRGZDJ|FE#IxL{SaYGYA9 zlouP7CCK1j`}R(2cYQCZlp76f0qt9@TS;Z9x>Q%!i%>XOFVyS07m_Rg&xW?@UHH5O`d zUe#KoR-68~v8Z`As*Q!%sNAg7R9I#En$`tF&wS-mSt>6yimR(zUyBQc0qG+wvD(Oi zY!d_8HD6hi_4`y8%icp}pu~A=&)2l3%o8h}=wf0d&fLm?;Ps-JL&9`WA zwmbl++NdsRoZ7-tWl8)zVJ+>iUPqx?K z4be(t1Ul=Z7N0a?j%F|0y2m&+V#+O?AV{MIWlAgDZSWz3ZXagG7Y*OvIY^I@X4f~k zWZV6uH9F~XV^a{;R9v}E0)mPye7!Zt8B}g3R(XyJUo>j$)^2)^-G&vkUMrREcep(dumdsx%iHUAY_(S`j@~1pEOZ|Y_N1Qz7JMEcce*1ucJ&hny0jOw8HnTqlQD0A%+a`VS zvx8=*1pxrF&?|B(p8)TzH^BlV9F$SAzUAZTfE5S@pP2xw!@;&gwy-_fk=@*a2U|{u zQl;%Z>5Nc#h=70?`X1Ph;O2!Y&u%v1YAo6m7b}hW6>G6;p-S&Q z7q0N7MPD5tsaBi14!8ktw{F$A^s9Bdld5dQlJ1jgQ+wj$G|QC$TW&CM1+ClqWTC!j z4HlV@I5uuIabWxtezR#dny^P~^2Sb5t}iZL-b+@KzK-w_xn;{+O&1#bm}ZD*Pb)v&GgX49(JPv6)kfLefcUxBy0 zV{BnvTwc7GL|6Gvz)_8oEfcf<}>b^qTMGmY{9I zaEyA(Xro`>E$%g%VE{WtCyW`$y3_)7&9Rju6GQsFZK+Ew6mOy^(hFF|4u=ABcO6 zci!rDiob4TJ>a@wR*7TGK{PTLw8jIegIe6BofHB~3ot^n3)**h1h{fZJd^jJ=HhGKSN)?r|knGnl`oKwS336P4(Et zMM`KzRn(R8=|0X0c)96xNi@8`CFV3&tZDM6eLmVu5ALT@gz`nOS;RG~@(@pfd^C^| z?jPzux0%WcC@U0Aa&`F~nC#J8{UdI;o({=on)S}pT*Iz2&@$gbn9z=meKNsxO3S6+fBa zQH#Au=J?yKqBFI*;nYScpaH-IT%4NCgRRNV%7Tp0SttX-DukqT@U!S8+plh9kkK1Z zLR!HLNqlXtw4zLO3Fw9aT!$S-{%8e|LGYh1K>wk53mBAHu76wl^_z5Og8^j_z9 z%vH=$;>2rJ{k}d;b>V;nvfycOFV(d;zYMTbU56?RCoy_C1rx6I!@U{j019HOE|k)B zFqYoaYT-xX`AWInykYJK5lEMSh~}toUXi(f99xvivcK6M1{enCRWrWWH{(apH|=v+ z6!%f5KP>B;*(zu`W|6ynCvybmF3!AEPArABb3v zakhqK%F*1RovaFs>}}U>7P@wS5>044IG+%r={~bL9qp+G;EvxA_uv$T3{$nTD(Wa* zoItt4F79L}0|??H2rPzTsons7m#gp$ageuvUWEHNOYY+v zEw>bnt7&berycRk) z)!IU1Q7RzR`RMYcqn6yfwBQ=?TD9&9n$ zTOUlo)!0lV|3Wq-OeU>0Y$S;AKEdYVlGe7cFl{R_Z8*c!7Ld9tiwo6>9ixn5hMp#N z6`N!g%f-^vl&VoH-=tWeG&9U%Gq)yGt7TL52uyC+sbS%nvC=e}ANIYG%F1*psBV_Z zvLT@PU+~oeR@l#iIjFV-1w|Q9xLIJBKv+nCkYJ`}@D{vx2EEKHz))vPlZV+O5n7>O z_GCEW#RJjq_b$$}-i4~+gZo{$MW+(s^Ho@~;fQJ-vNX9!whvM{Kdj($3P-|ak3s?x 
zu?=^uOJ4)K!!~t2Btv~V97)UUhswsyAVSAID2jN7Sk0$M_*wUa``XkGaCBpN*la)M zaRr9n?PCl-9%t|PID6TEd+V@Ii`{H|fb%d@zlR08cuJg{!&#a-db+iS zCy+7fM>&%ywRP45o;&G9lw2QRh=$gn^9MwaE?Dvo+(Eby!qrZAij)N0ka-RA$ znfj2gdfAQF)TWmX5Be;ut!o1xv*=agjzP&GZl?sY9%j4GZcP8Pt<4=8!X;vSlu9bf z-m$*=BMtT-h2Fo_mtz}MgAAwPoLhr6k^k|Iz-iZa3TEY~bhm$q|M7(y;2s}2>Ja>{Ay zs}h9yJ+8Tot%ET_p%OlJVAQP?mLSX^I0x<>Ff9`qaY$7JkLC`yws`=91o&@!Uf*Sp z*v23TPBiq1yPe_Jv%by(cX93HFghkiw=&6TDQ-84-ALNttGM@UPN3d3y}pje8U&mw zc3!d$Kj4X-j-_BRg6>%2+#$=@ka!GSk8kQHA8u7HA+8nhp~CWLffk0V>Pw+tIrfAp zMlM1#3qOOGVB(zQP-YR}=kn(GxWA}U$^rK&tC=airYq>qYiv{8`6SsuppaYbXe3E~m!6#{1>VYLxGaLl zy$Np?78t0`T~6)f6NjidDB2fX*fW*bY;>xTzd*FPz41E=Fqxa`WO4w|LiMskB>hPz z!tWL!z}?4dS&5Cux;TS2s`h^`O}j_A54s#`&QCH}mA&*BsM=BCSV95uj|P5wDQc6P z6bFYbX;k-6R)7u>vE}lw5zkUs4Ri`z8<0hu#1F&`+VX?DB%s`rE-RX=WiI7Cx}e98p%AXf)(X+3U(qNF3qg~cf5-BsJ`6!R zt8#)wY0SxYHQ!$E0>W09;12Z19MT~zlS;a?>F2e{{1*3wq7h6p2kjI}oM(ryyYv=L zh)arG(H8b5Xyq-Ke!lbDDMH)@=e$tOh%@z*;F=MNI}iXO!}zETr8f;|Fuf6xQVKGT zCaN!NeQe-26`B`Qc4rDGLBVvwz2Ccu5{SqZR;7@@&Gc=nBiM3!D8E|;1TJiN2d|Cv zSSJxxJsx}2sTNG-auY?E$s1FiaFPN$!yBA*`sJYN&W~&G5Zz^t15u^60uexugR_%9 z(lHCz7bL}&4uN16Itdh*&7|W=AE^B%$I;{x7P7Sjhanlvv095P>pF zw z*<@16_6W{Quu#Sf4fyl>AhrQjT3+3Al14o~*eAH!MPx{=Sj%9=FC8(t800yCZdANNzzMh? zzc@7>p9Oo5;SFJa-3;zeL_NYQhV}TpK8IUz_d@!SVjGYQ#sPJIPu`@V2DpY3oZ$F@ zh|XS2s`CAt`XJzyE#;n!Lz54z%mXztFa(%g#iBv6 zx{ji|_d1U;UU*A4b^%(q(jpz z8q(^l@~cc4Gu0T22{q7GgwG5r7AdPuoh~HT0v5-}6BRyUiDAlO=DP4^MZMGLps&8V z{e%AFJoIPJl=^1;JLpIg3H#7}3pfUg&L+xQ!E3^%_${YV&n_&|$EGR_O_3w_`pE6! 
zzVUg2Z!K2GUfMV0s)h`D05q|ji%c%TDWzM0*`ne$0t&?QHVu2Y=CBL^C`p+>}%7& zW(y7x&|{FIW2bOdG7K0Q14yDZQY6GEh*siob*4FDfX)F8a@D5OOWR$kow?01Rz`bZ zfc`*m4%R*I)4tNIFPD-#NU`sN;#B2!w|~SJ`*2RiY3aP;S%E1huos*u`57o|*9URP zCqWm_u6hBLsh_zPYG|zPF%t-8a1UJ^4DdtrHL*_zsJ6Og8)~YO0b^|;R|nc`;$HFH z%oJd~Pk;(%pny(VoPr*```tc*jTR>wpHqC?EIKly?)T4f(7UP)4bxx*{{U9(%ljal zIx3NW!a21O21k)_3g+4>R`%yHDus1LnrRpVitqQwM|NYg5ooQu9aQR+omnrebVUwA zm?U)nW_SV1nehlc-8&sw7-Gw?&(25;V>4TccNM6pgx}DI7C#2Y0U&#$x3cq822M)f z(&$bQ@|zr-r{taXq5vKh8h7=7%ux-1KId@0Y5coAptmBm`-KUF zP-Lk(G^Nd~dy2m2G>F-9RKWpSY!{uP5PX0RA=KH4hgpCYD*&2J@+Pb^rYzV(cS|Ck z_66D!#gJJ8`~pVUHyu4W%HTNnWvr$3V0^*ehXzio@VT;Y2gwd(%x34i7Gg0?+u<;i z#x8Fp;20wX#AiF8PNS2*1kauEuz5ub0na{{jsvk!)K!>eU7cYZgmKQ4_~-!=SAi3; z$Tslk0a0O5C=|7Gw+6I)4**Poy1VmL3lhvlA>|A{#&RZj(0@E(wNCpC^bl8(+$W~x zvHc)8-gRMDat;uHQtDn&EH%=;fTbKrH1qlKHVjrj_}|3|Z*;`vj9`TxO6Oz!1CX4@ z*QO`)3b5+2ojB(_>fA)l^ymi2@(=@9@W|5GFiE%L1Z=n2s)wCI2>v~MJRV8CLckd( zbEshGMrRCb^sJ@E)D_hs;m)lUh4`p)gi|t zEDsa}Qu09IJfWE9>be8xKjOZK;h%7fgNw>+IdItMyz>r1E`SIDbQ0ybBY+i>Dq=K@ z*Bxf|NP!c6?)=O~tN`S9`XN9)ut8{xz>X1nN%oRR)8UcvZXp#2+GI?@%Bf-AG%7=%)s!RdjqcSwg5VI4-j2Wk`49x&FH#imS3z^Ox$cPOXAU2GiSQtaF^AOOs} z>F|{8^tCwJSZS^T+(@5*`Ug^R68Nr_c-w&f++Y+(pM=#ztiPO>LQ#HDfnLXbH+0#Rb8Uixd9yQRlRjkDRdy5r zgnB>px0{iB4Xy|TB0LyT89lq`Ww?)sqkcB)gTxzqwqgy|=1TwzqW zf-_!@;8zGdpW9;WIS5E>vztBUEV{FkHZx}Q@B?5IpA`Bn=tFBytbdX;@e`Ew%v*=e z!VmAQ6ebIM0{vPc*d9y4_aUYMf^|*z)bF2J9E5a6YuE)%D)iY)#jB2|O2tMM3psru z*G!?MfEG&UWur%=K@lw~vzFmF1hNl{K zx(?0tW+<7nUI{CnEoNlJ`lQbQ1WMkcGh^ zTAiuZiakq1L@vpr!4DUrEHCT}8B^}1NKOT2#sz_mDcVCxp5@F`*=oR2GWq z`oPQfxEp4{aYNsWY0B{(lWB#EWt^WodL-51|`}V zE-&;2ZRo%#Ps(hq6w-s8aP76U_&2~Su67tmeMzf(1^^O^5v4@Lfp$n41$3V8!{Fw3 zfZTDCmc|XG?6SLB0harqz3GtXFwbr{hl{tI_PE9GlV8XMVWV1+DG8?&+_~SATGtm7 z3>o*(E#FsIL9T-^*4L=H@*_wuDWzGz$C5VE6Zio;rI#pfvUkxvLIf6eP1@N+0`29P zZ(zsv;q6ZGE7Qt}hCIok;1*g=(FQC|q1^0O5Gpnbl5KoIOX`@6MNFCkvTzr>EBZN< zE#tn%(5|rerYvY&Q5)GsE^CLKiZ>W2!aP6`cv$>q z!>uWJYP>$)1rFi3{jI1l;GpK!%;r*kv=_Eu>*eG{q+|by&H(<%c8h8WfagM3i$fSp 
zOiqbpuLeexLO&QkT-LTb+z5F&d*fys1msB}9S36$mv_uWNGHcSN158+Nqt^s&5 zUmS+m6gdJ32d>nleN;mn-Y0=(L|_Rl+*a6~;5h=QJu<*qK-C?P*EB$}<<@4;CK>7? zpA#+)9|rQZiIaAOzN-dJGSXL^Q#7MK)8b47%PiMtyQPjzLn(=(B5*`TY6F=>`^z1N zV<=}JGGl)A1ovKn9P=yZ7kS=Um2v-_R2fO(^wTHC{>aGZG!mmB#kq9gT}3J_?}gC0 z{C3#MIrDBDA5JT%0^Vqy2=if;PMV|Ejp6aHbIUt}yZ2K%gmg}d+4EQq;>rJthn-Hc zGbu#2(#Hd711E^&f^1?6pO;FdAxZqs`S(19O~*&AjUV32{8J|bLR}dUR_aPcu-dTH z3=IW5lD9x1VQnjFH&xxPTaJRV@(-+7E`pV+K5kc)?qDTkZm^b%A!EWKqpVh!lGR!n zK0gFael4jIYiLafptGnqeqV~AiuE%4J`)GIxIjl1ch~UHM&cJO{86EewM$)FmLlL>q(lij!>C>(wjPVUuQNi{5#2p?-PxOP9ZEEX}Sl zkK=5}pRroU>(Ga-+7SBPhGtB! zgj{}@eO1N_LQ)$|wrY~303 zAHyGieV-fR>*q*x*mAiVW8~saXEc)AhkX2ZT3kYe+Z^IcBzYfSNHI!?kyiySBDDsK zU@785|$yulBk7M0rS=v41(v|dh;XP2vZp&3P}JMW1l3Chd^n2P3p9)cZMjO zNWufGV>Fc)nulk~;C7Uv022!eWWpiBXkgPi(J7McR=GRh<^!_Q_B!X8ID=)XW0C}F z;64=8M~p_cJx`y2BAd%d%cOrJ z7X((r5lj{-KZNECRaiAMFMzcbLzg*QmVd+1F)RnXnIOkrdVoWZ0*OJ#7Cztm{nHD6 zzL#RH5X~T4VG;7|6X=Fx$g1*6+cw3{o^fOg0omd=av5T#&|cf0BE*uRJKH1Tg$z5#TSpOKb427GE;_c)NaIbz|_sUr#)SxGy@&J!_kpbpdTX0>ZMsQ8gWBXQKVq*-hJJz7fQ)U z2tD7kwob~iAtH~1%xTz~LIX*?5oizg;Q-q3wZn{t4h1NK_M-pfCQFB#zv%~v;44!4 zb0i>*ZG00^-4no#_nIFwg!%$4)oKqUWb0HUpzv*sH2LW~>GatvlK?ns5YPL{!y)D4 z@QDyEhY7798;_Kt1ki5^XJJ?Q(G?jS7fYrGpM1tl0#rHK~HO5{jZ!60^vc0AwXT_8G(J~&f;%K0vri~CFh{tu(8K79-;ELPFuwdRw zvvUO#Z43_08lGMW?cx*$B_ZJ7JCHMqco@yvU?lODQ**v(>GY?DUB z`*F!t?Jo#Oi@hwTBtsr3Xl@y?wI9T!-Sp@Pe>0XxX)zhI*@0Zw-<+vrDTP|sRi&(s zB%R7?Fi96Gg#z6|mLw$GT7vT@=W3>6O_nyjxiBEQF6GLasTc5YOlJ@aauq$xdlqo_ zh?75)^=Sh+?I`8J6FTZCtdMu+fKxsJ#B*hM#B{D?1ypk8%y$XJ(ph$BWms~8cL>fD z78%XFVq!y?%+rK@>#$W}zOi?L!q8JV_oH>FYbIBCP{c%XS;hErl3UXJKMYzEcO6gI zDSd$!!Ba1`2zEXS@Gjz7C3JtX|E9_Ez8%7INi1i;;GO{mT81__xCoku94kY3$l)aznstSpl4TvThk->CAgZKbd_?faw*l(_ z`i~LEbs2jXMDZf`>w$DaFi&KE^^nHN1a zs9oD54u-R68^QMz8#lARfEe+S7t)d7>HUNv>jw+eWLss>U-pDtg2QC%66MKOl2kZw zy#m8pY|>MK>|U)D(tbubFv^~c{9q(QK*AX~ZsA5I&y8SO5Ht#P+vEge7i0i3V#Y%{ z(cWY`zdM$>Tlk3@6Nhk)$c#D4kGwt}t3f{DJgQT;kyqJTQTBGNbOJG=63#lQV&jp3 
zCgG+6E|(-%G6$f|g8@2aw&qPxK(uM5a6mdx@F13IGqU488K{nBurCyEj8DBJ_v^Zc zEXGDk!U<;?omeX8oHiTphbas_xUex|DWe2eOu#_nKvrF21<#Cq-Lko3Xpb~Mpp}}3 zvI4HzCQ77AhYAjreSUISop@uQI6mTPXRD{H^&8SpI4k@n&A(0C2&RQ#XT3RF)ue8c zoP;khq45oi@~t&=hScyZCGk-~dl`%&2Z}@20>Qm-K^Q6Br;HqJiM8ZZHbJWHWP{?{K8BG!HGLSit)-y!p?9P03dOA(Xu12M4M`8 zvC$bHW{*1~|0;k$2mOMZkZ>(D3(O+W@Kso&WFs5O7a!B4iDYwMyxcBcBqSy|Nn&vN z96H&&!zrVR34in`rV{&3vX2nVLZy&z_ur1tbK{WqN?xgARe@PJJEpO=s*dGVF{t^Y zH7%Kq3srn%_t;uGGDEWvlSFf8lQhm4Hss8?lEM-ADELGlb`H7JD3hv|K8)&_Q-Q)- zRnAe;6=p-jalx>By-SFln1L1I!2_*;s1laVd?wFt8S6rpZSykMlLDd2dg6@Z)j+@k zKcM#LT1>d5KY3LRdygy@H|$UFkj06HNmmmeQ(`oLC_$FbY1|VIr`N7!50@@jax;pJqgC5C+^)M;c6HlCqVoTd7 z*V;qZr)%W5WDCQx8d0Qp!0`Fh2ceFY6)WAkL)vUudE(vB9AE_6R0!Q!iq~A&_Pb^s zC4Ma2?ucKPo6+-ZJB-nU(#i0v=49lG9qxO&EqP^9vx7uf91gg^+Gtxr|Ap%cjF@mxBpGN$pn3(r#jZP|~|)voNfbrQ#$Jv(;A? zU!YY(e_1MI)4#V#;;Z&e6}4+-)UX`Q?Ikm9SRU(k)AB@DrLe~aVkwj@Z}y63fHqAk zJt>v*vS)Y>6=wOU)EKjExt$x<#I)VCmu8**+(=>>wAH?4D`?r$T9&D~RbGHZF4vnt zVu@7VUy&FsK(ySCT7QU(U5Iev9p{1!B#;sZZdaQKh2CEKyi~MuO@ld>Vy)+?b8nk4iPsxKi1Y-U#jCWY z2>oj*_pDt;+goadoN>m{fCo5(OKLu$pP@PDf0xj=8Nyn%1nS8j;nT=pzU~afvow2gY z*lX;yw|7Sg`(aa*hDH1i z*OzHf1CA6R!Cu0Nka=IRf7=4wGScnzT2Ue#oDe_5_0Hx@02^k{gd|3?aMA!5C-h(+ z48?N&ge1ji|3iXdOacf!jyr2}wh>~?ibG1@!T!Wlt(plSa~mMYTB?DhaC>@Uut{w1vOmj%3#3v!6{S^2_+nja-+DwjY5F<2hA&Z;Dwgr4+_j}_*Xb=>@@X^;h0TnAcf{?wQ zJkT=YKA=IeRA*anavT)iH<@hce}FB#w?{}ff8&>a3_jS$Oav0g1U26@msE& zMrEKQ37rGuVJUfylpTp9M}3<#y@`FB<42j&q57$|607Ah;u?4Z6 z=r2#6slbFYd_aaeT-|cGbMoPd%nf_pi0%Zzb=2K1RZx&?Z}NoQZdH*wlHyi(L@Wrx zeL?X)*|JGaY>+W@i^FfO5Z{zWKCY}-3^f2?L;R`qkX^n23P|ZV?_*YYjAUvrNg)}fQQF}KO{L~msI$a}L1ZFH7FTjCWTH|-yaS}~r(*G4g5zbj>~KM4 zBfNpJgL&}-n#j>gY1KdguS)V#^Asw$EQSr9-LpO>Ap}A+Nxyhg2VLL; zq%AyOxmi_KOpR=y@ubMyZW1ATVD4-y7Avngdv_>(ayoPlNHB1JbcYW}s#m1}PC$ns zK`&)k|8fR`WdTt?XulGht?IIT@C z@fIU))U6KWWMMia=o9i%bPV0ILU5nIl0|+>$qko3SB9W`#C*t*GDry@j3iBss6(3A zc<}dS-Y{8^cMloPs!b);-{C!(CT=@$;v~hd%ZJE{nss>t(cV3VXumM2tS>5Od}ybX zXC%~b)MHV-DgP&Zy5x!4m&^^xy0FG%rBhnEa)q?);bVE~@Wp~9U1uT<$$SlSRw`58 
zkY|kcD4xoo0Tg*SWcRBbhYZNSwzD1@8KRxTLtIA`8iy#f2Ya0vG0Vc}8)70@sg@EPvVtVyF4* zrD}pjSZsh0I>iBuAzAa~?t`eh3nv5vFDIAvEq7+L!5A_XC9f%;Aq3`j zs>fY`X=@oXJQC>n2{)Js%@%T2J3Hj!ojmyhBb>`A=0m}O-bC*2sE64tWJM_TW;)WnIm>XcdUTd-ncH8bs^S3xIc(So%;awCO9zNiLA_`xRaZ5qsQ$-k@kUBH4dE|vTEV+KP{-zL6PL>L+gzeBP-bG;UNgB>E(_uHn5}!PKA%Wp;^sEok9<3}?lA{MF!W3k zgTU9#>}{S43Qjuy3mwjU#Nmlj6e8P*TkI5rxj9{Zojgm(<#BfE6*NQ6M_H|Rb71o& z7i_&5w}%%!+}#s@UUvuP2Y=(EUhv+mD9>e?)RvsHc9K7RQsyxZ^mW`8=BX*yWA~~w z7gnA|slhG<2hiAsd1dk*S6!Mz-tN%w?WDHgKEF8IdSCj%XNG^WUIy_AxYulxarjJc zuNWQHNa2kU<{&C;KE^f_?zH8yM7hHIEar(Coqg@9v##9vbIF&#)VSiKg{3PBukJqd z5ip4@hW3nMJz3I^Rm!Xfy~k)T z64n!eX)&>9RqVyTa+Q~6l4Wl5cCw;-ubz~R1yDKD(2IRF%f3pATftlp_IVauyq4=6 zx4riE?T31<#(vqp?GNhNw>1_KzNuSVBVfu0YaaJ)<*4P^Bf3fsFauL<~`27EY>xqby{9*+P>NggW3kYFxV(Vg_JJVXbcYy>4hA< z_}8p0#t-#0P|PYDnl{Z>?4h9WVBtb@ zNex-)ttyW#8Q;Z&J28##fS;cd_jzieAkTq95YOx_aY zK_!1TQOSoJ+~!+K;~WjQZffqM9>e7ghtQjEO6w_?784z~LJ(k~f3vkgm0+S1%m;iw zswcZi2e84UzJOA5pCLsfVCF2eo3w1yC>Gmy@c4YUO^)17igrWnyMJS~C?ObS z(l7Vuq=iHyf9(XbwsIPou)f8+u`msikk;`_;t^a4P>LpMb>7m`rdu5z$_;UDh?irF zf!yl5iy*gzzKp@9NW&&2Y@zxLR#3=04;N?3I<<9yY*mzKQWz~BRpmOO>M@ET@6Ch* zv0Z2j*pBU2xh-PdKCff&@&Fgpbg@N4-kvH&oDE{>yPVXtHBs0#_dtYQ`K=1IISJF^`g4Y6ACb#xqB1^q&pn)PLG48%czgMusV z9pw`}lVNXQQyb70#RQ2)C8`NRdLNrQHad@qIwn^j>|>$Iw^QU{T2?QvhDAnnhP!q zreol!f^X<$JXk&^5CL3@1j$2czfQVdBN995W8$!57ki`ZsQ)Fkjd&sDsG-1oODVBr zDh0<=)9lKtP{DDN)&YKZ<9oz!KTYfT~PjlUhTEJx*&1^da*cnVCwm z9w{;;k&}t-EvdIdWK29Xo{!w~NN5A{2G;w&oKobCtmi-`by{X2M*WcVuUfX{d{TCA z`B@8@eG57EraYQLK!G?r9FrTwo4QLFgo=0PdH;c%Cv;(NRdH?LS1=-c{4|tD4X%?( zdy>hmO0hdV4W#U*!W_)U`SD{;*s9El#j2qGd=kJBoRtzO{VXPwDg%V{n?f{Y1JVcp zV4_E#I5u*tykn0SS;JgehaR*eS0^uxgh?nm9&(f^hE!w)?>@_<<{wP*Vr`*;Lxl^) z(Fgf$4^GSf6U8n;IQnT$xngO;RP$9kNm8(_fzs*N$#c{+?gPlyX1TDA`_KS$a4ZR zShU?NcURiyCa|U#A|To(kTsZtMOIG^uo z(z?ZGc@3Wxs*?G;cu^pIgG>nErck~RpJU;|VS6G-n5jIx$uW;3%GaLn z0hi}V(y~e8f#4_=#&kl&?T2fKK44P8TPz@e)+B5uVW?L-J?7XFFoUXS;srAK-z?ls1To3@AW zXVdoR!BstgZb8TBwHHl`pg?A;ZY(wzDgv16T|%U~t8`FinC_v9u)g#$+7x1Xqb|bC 
z2==OPSi(|xffvEvmyCkLwM|V|Q!IJZZip;ilIC1%t1d0LE~cs%fDJ%|Z2M%9!moOQ z*u&Hn`0Y!nEt=%ZbVG1;V-ZEW;10a(6`U&X^j2jh(1>NkVgTcB(KH&~8?WRvtI*s_ zdME`xw_9w7nI(8|$0M=PP%P%VX!`ydmK1azyz}E49=+?^yxe1@an&#$v#k(zGkP$X z^lvCUY$Ft{F1=E-Ud`jMo62ioTDK5B4!q-&yVVAIg4Wk=SBdXmXjIjuUNzSA^slM? zECE1;2GSh*i@b_A_jqfMg9pMDgRp+E#p(xXE^^q}h$ui7O@p>nE7uGignSMi)PvTB z4X9WU_|&+I6;YX~q9rXHm1RU~S2Lo?49^bJVZQPuH8O=~{Zc=kcR&+q3K*m^V697K zKOD-BAmo%@k$*TdaW5+imwIYmQDob~A^2oxRW?o_z|_fJxJS}2H_8{HlAWmn@4_#5*nvjL@`d?YZM}&ec3JOxw=0#TZM2`9L>U|SsDF6DQ9nrGmt=60 zj3{TzM=f=aBKx%H28?UgU)D-R09Lk(m{~ssF%Gu z%s=1Idv=C7RGS_ev_Zmyp_%pt=F&E=M?L8c67ECOa1K$JFMOaI?VwUQUgdGrOah%u z46KEX@(c9RV2oRrzwL~PE5YIQOztPl#Z(RG`SPh3+p_n0iU&^P zbX+*F_31E!TtIUv zKXVXpN9={B{aY!=#^H1NbH57G&{&Jck}fpXwNRIR2Y$`{8yZUuxA(vjohcTA;qw4F+5nSbT2YDrG1}rzDKTbAh!jS~)={6oe#z~Zqu97vn zC}mE8oV|ygK@hGy+#K!0L!Lbf%2y&N2?obbey$*zhg_Z6V{VVhn>{Ito37q-dovwQ zVpK7DpejQKDwImHwqmj$OT8YShsG63xuBJHJlDBc1pQdqd_y*J#nd94hH@AQm3Knv z!}Oi55?FpC)ukS#H(WJ&u8uCIi@CaGJ7P88NMRHR#8eVwY?mgNC@ z_YI!Yh}lq?4T{m*aE5>Xm<&rmm#S@-oru3$Bu8jpSHA;uZkBXZ#MpIIrNl?f?6=N2$B715j@!_$`*+)r`#s=yK zV+{Gh?bj+X?p;4bS<4*&Bx?!FdH@LBv@)`;RDRb3`KilFK z*>ic=RV#~N?*rwq8e1^(a+@m_w3A`_;HBR`RlXeYj4G4ZLGQaKUB^Iz&FKOb-`66ho17>I^v@&%u{(6f;u6(c-Rkg!V`3 zCelan{52|f=v0AqQQw#HvTTw-TMr3a#1>}^D+RGGaN*l>5S#Mmmy(v)6vdq?&dbzM zk_bezq7xCtlk_>-8!#gGelaIRokq{l#)f-(I@)+%vgSgNEbZpxLqCT&9taD-EmsbML__uZL7IzjV45M*oA=OYRw{ICdhmhKKis_5j8&LuXtK4uJeo@(>7El9#O5e+;<)v z=)2_Jo)88e1rAWD=2v_TwTMAOWI|~fB}YLgE>FyL9U``tEb(jRAyjn@*B+i4{kL-}PdB@3sV z2y3R={HR3K0u5oLXtHlb5Er%3f?F1Z)h~(~-y6lY?2qZewH~ zv^Jutv5liQC7x@MJ_DIUH)Cx?$j#xZhKR_GW(Np5??2!+6Jdvci#D$b5<1#*+qvp7 zXdoRP+?>XsNkeDtL}KNg<98S;LxU*r3v?g6G>9_C1;2|dVd(T%u!#Y4%I?19eM{kJ z$5c!96IL&SL#A%?YY6wO00T~wgNG^FDnZDyX`LMW!8xy4xr};?l8YhtQf@6S`YgSB zx(eqDPaeO5*i>60uy&1~*b3ixf{JbA26vdL8gV_;x`dg=(xaiWL#n;^TZsN0U_7a9 zolsSl%bcGR(jF(L*S{Se8P^D&GS zuX%ou$5~Y&(=E%{x?IzJw_3HhYKw$kwzRM9#U>W2ebS~x<3o(x$9uTj!Go44C@d{s 
zACN40jN1nxrTv~97<%MMPgdbNxjqwk%;cn>MfUaLKps*^zwLJ!h1Ik&p0$N9_V+k_S`Hcjk8ow_q2i_>#N0*SBr&hWYnegXi3V0SyHE zDgI%}!_GT>8RpUPwz`Lb=@P<5Ft{V#c*s_P?S#PlbnJZ*Bq#5VX%#CSq206x&q)+- zeB~oQIJH7Cv&zd0kAlq!eGst1L>$!vS1*)b@jRo`sCn7?&nv+C;08I^gx3+au>uc z0k?PPmFy-aZ>f*VDLA$1F_1-M!DCE+B8s*gj10plt67stQcjvc2UNCGm zS>~Zxd}$^8?AIce~(^<@_?;0b_*}bSlRBf9l>IP%#AqCKlgM+`7~T28foY-dCOwaQ9ID zf2jN?_5OtHD)^K2v_+wgae`qw&NB@A)j}I!b#BsBX;hifb&kVkY;wtAf)S-khV6Qdn`}EaU4Em#K(JA;Fj!B8hZ-72nf= z{D_!pah|u`&^1ha*Pf*7qW{2I&XSqoxayC1*@?i@u`hQ;orgA>k|$sT-3R=GV1Oxi z8C!226zUn?^;NFF?hLEb)rMvAj*0bXoyY>rg}0gVrnj;aa@=(u+H9*CL;}@@tjo_4 zPcLwg%5x~5o!^{kl+l3 z6Ss?=O_|g>71M>{faMZby-^p(ia1L=96$?z1ggPOK-oMqM-g#CTP+l)U6_i9TKFoR zk_H853-Lv&%pD=a6EN+_PLGs~f!*iDt1nHJ0wyme z8K7=w$@Y!~WKHDFNSmTR14dJ6Fy)iuZa{jI1BV`t=ieZrjLmbn^ixPHHWD_xfo0yO? zbaNc!VZ{xAPXZt7>3#{c1fNk_$KLL8DoFGPd&9l4^(JfFlxrxNM&LGrLB24exELb~ zgR!_b%Tr_*Q+$<-SwMi!!o3}g zS@ByHYj#NaBRZg*eAFl(%_QlWxMkWU4p6yn!K7`vI)X3kin12d-JtjfTw>fTQ=oR5 zPSubOij(~?<^!`+XpaeDk(h*;fX^W||NMxMv4-PeI8oS2j+dE}$Jj=%5TrDV;s)(z=t zA+mPLw(f!JLP8iou4OrNsddgO2PyRZ!Xx`1eb&jdGoZnrMXc^TP& z?2qTTFMDq|{rT3oi~PjU9{W{id^D42h$-*rMen|J;VrOa`z$??51}$!&Bg(Yel^jwA|P24$Z20XsUCZHd*k*eV8x z)jl;x0(!u~CK>>?WK7?{Oo--wyNA}@WFL(8M@K^z(jrs8at>{+K+%gBNiXX#^?K4` z>rDgLRMvjhm0R_5Ljd0d#ATY`3H+RsT3PD-0bGr>+~!v~7kTQbt$R=<6F`8V@e7D4 zA>)m%ZF{wp`e4%w#5tJtKs55MFq=sO>%!&i4dQAb-P{h7YF#kFB-kIGjMIc{n(iOL znL>svc*5It4tBa^IbpnGsD`Hl8g^2yhDjR+2%Pr4!A&a8&taAP#^HM4ae*W$*;|}& z;A~2PyFry8y$>6P*HwCq!_@`k3#icLZVHZR91y*z65_tW*S(xq7@uqRzrQX8D+9eJ z=Leozxppqf& zw4gm`b&oT`Y``%EQ-}v=HAcR8(mLj{wW+O`0RMJi%x{rW9BCc-m?a)^nJlNVD!)(N z=UNIUQi#A4;FxSq;OFC!Nv7#L?+~~WXR0h)J*78B{1(IMAk?lfBi1e17awLEX$-Wu z)Nr*L0Qfh^#<89$M43Cp-H2y50U26)!4m}+E*qbil!535PlP&6Vv}TM;s@-KFc9LK zmcqyDn(bK3#00PcRA#d$m}@Bnwq~>04Xnr^j-| zGmgBRa{!r$e1k4v%<-T`{JXhFh|*Cpd5T2@j^2S};1|acTmHXkdk^p~iYM@UlTbp3 zPz)^u2oQQNDo7v@dJzy*6bKM{4M-prHZXJm zME5;g>QIl%v7^DM~3*186%V=U=>z9d;wp8N(82I?p4EmYKJknes~1? 
zVpy~iv=|MZftS;&UHg_$1<*rk8q&~c8(b9wB7Vg@jsyEf!Z%}jw4+zg;5IIK;dFXP z5J9`pxFS)3)Fd$+>PW~A8wqxR^L2vu1HGYM93nsD9GLmW^y5g#!LP8Pl(# z_>wPdz*R#|3FpNN#NgJ?9dA)55P|7PST2Us_@2(w75Z4HiFIuQ4gs5BF?fB!z62?> z4g_TP78*UlvuZuMLd^#@=m5t(mN5?S_rT-`>d4?azHd02&ZzdJUR91YIkefu6h)&>K+I^kj?0Pu!z7{f!> z8(IfKT$?t~;OIr=;kls00lneMr(iQ$%c^zHTsR!B+t)ahYq$$tCNvFv5kWm1tn-<4 zum{c8>w|BX;b5x=x7xLUt$sAv5Z|E=&CR?$m&TF!3{8E=DY&uIBazUdz{8Zl4Lg9Q zd%{V1OpM{7y?&Mz1{6#XiR*&k#7XW}Ab8KDyZX&k6|m^cA(A*Obxb$ z9u0B_SlSsIG_)<4{H0$Oi%x4w*6b37X|O^AD1IO>ETRJss?uXTROw>tKr3k< z4Lv*0G4rK7mE83uJ?@E)^nMLXlFsXD!{IfP@Jeu$>7ulmx<-$V^J^#Z`t*c04kuMH zo$bjX47zYw(-+Y-ocI~tsaLmd{7|1(H7vjgR{*aufTty6V2T{rzJ4!Q z@5S_~sMCJX(jnoX$zIgxOq~bb915lvn_#HX9^W(^+8AC!3jK)Lh{4=ZwJG4}S8KCb zV{!F%c)}1$99s2gvDOeuzyw`;@tsC`%2Rij1>IZV@UBf%rlgv>c^fi5w%H)2FTOh+ zT@Aio73LAc;&rWHby2uzX3!@_)!@j~fPs`l5_f~#P8}g0p>Ev|>G^8alx3r?dbM?mI(WbWFo=k04P^x~ z6*#Zav!nSY(XDy;Q}CmPkj3#m6l~7$SURl4Mm=r=h$yh!6cq1FrO#Yfcnt_)uqbaR zH!>O@VQSp4eJ8zal#a`NVLc>z*9nPo#*gb40tf-dO`kSn#u@-&D$!b8b1zB~l# z*0=&h!wxWO+4lq@1aT-sSHd@j;I>YXcsOY`%!`71QsC%)X&b!60S|_4m-AG*CfwFp zr?3nFe%}Q;d#Hv%D%r4AyUy*+1CRQ49@w`br(STWtGo9MZvb4dFH06o5@~h1U2chF%X0;Ou5TvjJZtC@2vM!B@rGPY}i% z_2}BG8x+WVj|s!e-QW&SB?|j`GsQ7*J!3$q=<<7*YvYX;JOZUJ`navX!AIWVg&gf+ zH3+I3#N>jO(5P3BZWzN+tL8wIXzG5bockNDbGg0}8V`FAVmP%UciklAXbJ_9wipP^*Im~F znLNA$8CslPV^v>iRY<~crBbNU!h-{JdPh~5;kJY4f=kdoM$6hgTzzUTJ%u(G-nYWq zc%Ale0~=ow-6W=^Yp4jzk4Ix1g82)vJee5=Pw+TFYY3t~AzU9zvEoiIx#F7`Fl55l z1GJ9GUmU;BNd$bK(`!A#49`^wnm@cRqY5NSp;!utj_nA&B6y$KhXY-MXn0vAoDAAv z5AavBm8=2P?k>=B@zD-rAjmUdU<}#-pLEN86A0kC;eq_l&TimHkjjEL&{zV-X1p6l zN!JElk;b--OA+^Mha<-(m@b>-iSuE6vn0r26vs&e$l=vRkP~qCo(p%>&mVJ}6Nebx62{EA(9)G!!r&Ss zHC!{yABQ)^^o4i{Qt*F}vc~zH9y+f8gZQ??UN8!d#doEf?BBf=^u<9aG#p3EI1=cI z7OjNy9N;d70f%4=gHOcAgvPZMP5{f6^ooVHgLfL>0c6N!E48F`L!2y?1!m14XlGEO zfO}{B)%KJZ0t_cc;WRl1?^D8|d*Z5cy=eM`vm}lgEz_DFfCY^VfXb~_lALM;rgB&JJY-vY4I&>~@Y1;JR~1%Y8KEE(0Cs2AmlOv@rOM z#uq2Kzez5Q@ECZbyL^~nE0|1wTW^Ne3-r`ZCh00PCw&J3tDX?R!KNQ9QEd5I)s`)z;l2liU}mtL5BgR70xtM~4F#9f 
z@|+KHk!UzZOrL0&>4OWGc)n2sE>NcsbQoG>A{x3W_ZLxddiAZUvq_D4q+p&(gr1D^jKDWdAf$zC;8?Wk6HF$8?9I1~NgYVCx?;h^ZaB zCAQZ#(8+kWabs?S?TywY%aG{X5MWf@y zcH`Iov4f$5Lren&;S4O#Rv4P#8xQy{Is~@1UD0R>zeg7ht++!6oJH2Y4@7j&ccLi& zqUE8Wr*Jlt2f>^M8Z5wlgq}TmwT3j>`2r}ODD}!xyr4W2;pL+xd_X=caUX)s!PyA< zbR7zBz(cXg_z4ZipaV6_G2^XQUXvOTAXCTbc6DOlE<7}SNIJ|$JZgfiP_hBl!NGJD zDHf)_#k6gUU1xP)p-D#-m_QlR11tk6G-$vqO2hY)^)QRJcBAQp?zqQY5$L))JX#fv z&sgeQjh4hl3;coJt6+&}{se6dpXuGpaIEma)zyWq1()GEpA zzllW$gbA2%OUdW|zVIAMYgCs^@C7(`f^YDo1qjDj>W7*moE_SGb-b;mLnOpK$UCq@ zrx|ZByrJ4j^{#y>-E)s<>qPgVd2kS#oy{gP!?nSsp)Pn&fbk_J5NPzS(9Ez_371JX z99s6p+@VG*Z58s6fT&@Ajj`>Ql_|7Ycu)SL0ZtyYy zGaxZ81NU;!ive3#Yxu(Hegxe0Xc-d=Nhz!#2iZSoFqBxqY4n|V>;Yc=KS=ANgLW#GGHF+zkM3F>9Aa3HunC--+@MGLbRLKnI48FYJdPhr%c8+uL7(|Ky@SSw_nIL) zt_@f2x>15%H7-mKchMlG7)|LM1k>Y6rsD*cs}tR}S9>v_)2;-8s!lYxgYWi~9%bVF zDbN9|$&W9*N6)JdceQ%pgODHw^=ct5$OhdI4Dz5wf8yQ-iV_<+0>FI{urAu6GZqYB zUod!gnCcA31 z0h0^x)a|y%Magh89EM$k*RTx5b(rh4s)l!g^_A$sq`r9EE5G*tnvU!Lq{6qK^_2?3VOmDFD5!^o#VOT3QPrpS#{MEe8;z^_I!AgY;51q zsu0qg4d{pRm`)k4DoyF>7K1kp>gXyEnvbd?-JlhBhV}a(K-KB$%*@uo7YqyQpwh~u z@NRYEE7-kJWiz%5;zp1^=7%=C)&v_39AfdNmo=%*Y}LJK0mLbUW4M+sP< zw;VmhR2)|3H%BU24AzuM4Ewhv@M1kTyJZhL5DfB0ThvT!~U8;MsI2wL0@7!OTXxjeWy&$m5|}M3%rr z9`ck<5Hctva_llU)p0q+n4S;^-QPjqafL4G10e*ohEiTn47>H=9_zKfLiPs{7Q-^+ ze|p^H3_pxZ&<-c?%qzrgf4NL-7Dcpm0ZsURs_XpdxX+-x6NL zhgSz+G^{pdw3;y4zbZd9Lr=K3huak}JtVd@)ZF#_Z+vYF-+Tfy!rKQrL0^P@Lv-gd zcy!lxIJ*1-q<}D!B-*(n)DRZ)iRpxcL7;^aC9B*d=ut5BKc+`nv6YZ%0%WM(OgF=E zGO<&PgJN=9c#9q0L5JhQ8y58|f?-#Ej}iJpJ!zblro=KQJUr93t)3|D+QIXE4cZG9 zKLjHgU*d^OUFe0xkO)(^s56aD_<~Y!lE`+jC<9*{4P+ZYMFqOoytt`-OSo-AJw5=2 z>xU7b=^>Vx#6a_x+kx8Qp)Bo{4Z4D>fKgyv2$u&_*kR-kN$h*&pD2^+2p0}vQFSY> zR63Zh_-!ervO!F34ysfTJKuh&)O3ib@bVIM9(s`XkVeb|uOD7Xsc>kM*Wgy(U}!WQ ztH5!fRs9D)C&AC}t16Wbdg=^OO09vP&*A3;xV%f%l==>O*e%uJ9MF1~*MyJ;xo(a+ zO2vaKm#wSRSr~ids;ATNWhV*h;BA&_vs{QR)eBRco)*5#WgKq|`l# zYpc5`1!9$|yHefZe9^H=?Sk`N>jmdbs?^VYlzIy1%kiyJLqKtj1}L>IHSi6Dh4SI} zYX>Wp1&&{Im{Q|F50k%F>Kz=g|8S+Q!u3c00LO#t&p!f=1?NpWQYoB+vf@Y36`cR> 
z&yYvL^*{avbOG0$e5_Ipz%D(~()G^k~3UXaO7(c+UI|*8=Cyyad?b{6m(4PQec4 zmnk(AIHHy-)dTcVZY9jj27TmQ4c!&!;~D%60iLseD3u88&}WTObK&PJ_?Z{<^mwgO zS%I^~I;DOBeQntQ=LY+v-3-SD&Ouw1S_*u(w<#3_T$OjiOxlb}b=U=d0({kWgUvv1 z#rMGV0B7dCpnuR?vVEXu&|97TO7#WqWd}eP;6GCjDzz8*a~+242e>mFQ7SiZpM#%w z;peoYpbOAv^W$(H(C4fZpmVs6J}1GK;W|D$1=k1s>rcaRfWN~TI6m-KI1Baw{#@sP z1@!$4e)_?6Y&fsfKKMEI0vrqUU-u$#g5OQLtkl9ZN`+m6M^~Yry>dgTzrY{P-UeI2 z^^Cm--v|Bz55PZx`^-bo0dV_2hV~b#)WN4p{Rw*B`w#dcaJ75^*9sg3Un!Ld&bRK3 zQgh%uJ>SE%fd2CN`;q=~CH7OPKyQ@;{ZwJl)86EMYBlJmV=6z@80=XhwV%okdbys) zPyG)1xDw>2=7Szu1!jVOAmBf62>Ro%CIm*LoZ#B>ePe%Bzg9q^3@V92aZYvba+ zt@ic2D`o4QTiSTUpe(`VqvxfW_|Jwdo5ITukE=0aa)Wu_-tN}6_Noe_M)$}udhYR8 z<@^tn?wvR8ZCvU@y*|Ad{9weGayRxxp1BpUc18BjEBtlq<;1Y-qyN}Yu}s}1H6sIu ztd1Xh^5MdlQTsAgF0y}pzUM{e7926_&==bqz0ADgZnK!u$4W1}9QrK9PeD6Yq#V)w zNZZiNg>P^Dc>0Kk7fyfCY15@qt48%N)9hf~ZSOO*cwX^t`V$8SHXR5WdH+5jfxhOQ zUT`NK^n@?i08^^w&?CkcErS0K{pCo99mP@(e>`mSkYfWrJ-#D2Pw%4hMn@K%GeBJn zAJ{He)(cr5T&TAvN%D<>L+3tz@q2}L%YJ-$ZTy|kOp_NS*>OFx;(>P8dRN#waPYoi za~I?ZY}%*W_jA6zu%`K@`p4#!KU6&Hyy|O<{Th_!hjaBdHCR@;^|5_vkJqevp;XJT zfeXfTZ+<4n#Oax@mZ^Q>d8P(u%C=bc%ffx{SMMlxaC*IM1D6-NRW(tUvg2#Fzu9uu z!lE}CZXUfbX3@!zqifc^JpC3f#e1P0`%cI-1TGL_n*twKFih+mQ`Gf8%#DeLlo&lE z<-t11cZ8;mxHe$cH>1y$ZM8AHYRgpzmTml3MNZ99Y0u2#GcKIDbz?}`!&fq#no=v( z7lo!KO*Jy+P>~=0eA2(rf~*VASI@SqP^W8qGTy$|CwSi?f4%C1f_ansm$SqFIJnE_>mU-0f>w&H!G*0a zp*Uhc{Nl)u7osw3`QTN>z)RWRkA2f>MA@^ergdIDW$4)&`?{YFX*w=@`=8ou%9`Fk z&xe_k&uf2g-qEIM=5-Do)a>}yHW$X18TKZ{fRbZxW&ZZ!r9vOh&C~JoxyN2y?*I4Y z6EC8U-zxYb_x)@)hNjQ-=f!99#y+^(XXoy)>1!`u*zw)0Yae|zv(HcAKXyOxb&J26 zoqPLa$boj%hL+o&`s|+mN!MJg|HHYuEB96__S1iP%Iqlq)sz)smHq3EYu|n9%=AyP z9&0(T(V~pGtF$~b;?qQ3y4KFydg!!SJ@5ThWZH}Hsk;Xs7?iF~lH;Yz6w1)Mf1?S3 zTdw-mncAvK(;^2;w#gUSzxle!?PZR-StJ>VDDH2U6~? 
zma%l(5)}^SdtUy^;x)1FlSk~zyR6CMQ~lOloZ&z7SYYGHJ&p}{HLcBp2W>x3H9q9a z{@*;f)Ztuw$9IdfPQ9?@P3As{Yrg$9>wn)>j0?{KzdwDKDL39kc$| z@Apg3?re`9eS)n!Auz`Ir6m5`{iGIuAVv-y1ewnqe6yyy>>LYGVtk(KWBV2u)^!VPK{gsu+Op^1K;0UedW8i$5Rj8wQTm2 zXAc$(EI;tovnfa3zqvLd@#VTlXQj?DHZbvNG`W{Mju^>>0jq$`>gktE5bk zVQ|+UA50HSb@h4T*=4>;I%V6&adGiyX`;LPy>yW0)Vb6R~h zF1CHTki$2AAMmX8vKw7;_Uts}V<3@J+ODxPW7rro28 ztzGh0k(^OqrP&)EJ+;oz*x$-!ud*a(=CP4;MmLW7Hf->wLY)%jJQdX`^WI$Nutoyd*%6xBK$OohE<)44>-p|GF zl$nw>*^(x;UX@P!!MEqr+|5;D%!trxWoEqop?LOlG1V&lFndy}k3+jJY*q1kwfeL7 zuHQE_)pzj+Kdv%8GNk^d(wDvoEOUIo+O!vwe0?ha(&SZ|P0Bant7f|nF1)gBaNV?x z3k0m{y0CAR>m}z6sB`k|i`mnA&+GMR*L*wsb$qn6#>+An2iA|@*e>MC&TdCKm09uP z)!SxUFTLOVUDu7jUD?uM`Oqw-zUq?k#H9`AH&$!<=U?e7--{>|e)&#^7l&t6`1y6Q z@JAf{T-ir~Y_g z=}ylJKkXTkeMR?IJHIS)V8Mw-<;w*vemZu>FFQ_rU;WscSNHQ5`eymrx~EP&8yok> zfb>D@utew4 zNB18Jo0V%$zu+{-g4>?TU+qZbjXJTHzM6M(X1>T-w}%}6^4P+%?|$A@H8xf62j?SN zeVxDE!>NB<{PNWBv6*LmabQ~fxQf}LN-oS^Rx+(u6o$LR0Gq+Ov>|B&va={ z#1%YN<5Xxa|9q+Qe0t#514m9ow3?T5WYhKjgP$B8+i~!cO&xEB-fJ`VY3sq&D%aas zCRgrh(+6)~@MZMoMF)$fo-m

    U$wWZ=}C7`tZ1>fi({<|1C!C8M-6I%pO?|6(3Y~ z^7_$B?{{5VJZ9D6tqmHL9zQDgM=3r%{b|qQ^{&<2AN$e$Lca{^Jz;5P|3$;XIu0G) z@LY@MGqYSO^*BrZ;Y+`M(J6Q459^d_QsG4XjOR)}D$wTOicSF?S8n=z+?me{N2Z)S z|C|3BZ8($X&-EuatQk=Ga-Ptuecu%N`bD->mFBFSd~jyo%w6}dIQg*C$%okwB%WUB zQTaNR8@@e%{Lz;epM^wEOP!?X$(P%@Z0-=ymi>fcKm*dr$WWD>O**Pc0<3V>C zb^EpEfB%jSJ)1Vw$?0Lgrdn6^uOAhc(#oIOC>8O$wBH z(|gU)f8Takf9JPam+IciSt?VysF;dxA08k6;?tgE2K4{*M%ElHX5T9KXRX|sGERCG z-=NXXQ%NWGh6$&geJbKHd_8Whjwl96F5|@j&Ta+d6qkF%0pS-Q} zx`(488f2Q~dhx3ftT=?YE~?<_YyDzhC{fT$T}Si{{Am^}A0i#hx3M zIDGc*gI9A;SyVgG{-zr??7ZT;bU**X>i?X+!q;4XEXi#BTJiu1^Eqd4e4UyryGyuZ z!u-hDF*78No`l1t{)9(Ln5;NQI&C`=r7j7Nk?fBDx8RqPfm|g?s~VL%lyqObpGpFwuoV6JAZY%zrR{G#$RQd5AqHE>hfWKmFb4R8vn1qic21#E`|iCMzG@R^r``> z=jQ<`Z^r<&VnBd8|4V?%HXCGr1gH{w0@T8DAb${`{(2jr_NGas66H#y#>2W!$*Uz& z9X?N_QYn806!3p*m4<##2miz97e-ERet&wrPRsfgo0Gf%$qUl&gX#Y`8eQp&&f^b5 zR2Aqq2)1m4kr#~b;olJ2XDI!j*Z1wa74x}%5BL54q+)*8?{(A`FhZn?R$WwA_!kO) zE329yDXdDU(yEv~@&E1rM9S|ZjH}b6j`PDSy`Q{mU~Bm4=hu!tw?jk!|MUkB?FawS zPuDr1NWg!bll!S)s59boWucX8Miz$JzWI*xQ-8Z_AYOz1huh52{8Xu&P}M4+RD=pq z5jiG=<5qoc_jy{Q*z*T960Lfw_M~gi=KoKESt=KtpR`RJ?Lg=HKvyd|Kc@!fRZ|nR zGWY4wuk@eKz>fme0IC8)oh5N3jPIf52A_dm#U*h*`@=UUm32M`r~vVOXa}l|Yk!Od zZ4x=(PXb3??fRTlB~>NUMETAa2pn-qs`@@B!vbVVXM1vp6O)oUpHsjndQxKNb4p;o z9pHRU1?_NBnDZIRPJSv`BwhAs1;rVa1k|7(&@=HT9Y~cxIP-5}OQ1=PHhbK%ZLpvmtK~VI_(98+H2b%N9Fv=$U z9@Zfe%AlO(kwMAHBSZZp+!sn;9vNyd;rE~<>5-vD5`GWsN(p5c2NTLLE+&-W=slqf zN^l+-w)KSHgA$rY1_c3+3`%Ss8J0W}9uG=<9vPJ7JTlDF-21+zz7guy>N_y0{^SRr z`ujfl!>571PXX|0i0@M(_!Q^+lo&p31c%f|NdliX`939u_;wz=S#JwemDDhBHT@|W zY#Z+TlpH<{@*Oh;d>ZWgloCFD@B3yd_%r~vZHBgn4Fy}?lZyI(KRUJ_M3hLiocH&G zcoM0;r#-@8k3_Ubg$Ue3?+rJ~l$v-ZqN?gX=VU}6A0wHHQbYKxexR=?)f1Y;cmB!Y z7%AWr{E5ITfeI;n8pcHad`wA%F)mWAU?P7a`T;%#KxFVIqFQ^CR+#eHql%lMkMM^G z=UkPFh*I_UypBl9-wsq&)#OR==V?nizdyvYC^eKx{e7g!=1-)Rpvmh`NIINJ0|Y4s zG5k&dk*;J?jKxNGA@IA_og&@oE0lBr5Njn}fS?NybOEF}!-Mh5b=Az1hb@ zHbcLOJp=CX$^Vfjk%uRd;DOu!E9P5{wlI=<=@JuNM(_*NU1FiF z#7y_$|B)`Shc2<8OYEc5M|bmJ6nA)@RMgf>LOiQkTS;8n3WNSnYpXi(7|)06?SBG1 
z-*3xY_WUHoupEw$XH6m)l8`@DW`9aT8XCd=1P*8+N=ih3FcJ7Ld`Gv%5a_3gD6Sq6 zu|J^&n(t?CPf8l<`efL?Mgtm>Rgx0j3Z_dc=#mm0u4RtoNr`SG(_wGs=*xI}AknR1 zxty zsTr6gO2xg|%^FHZ8p@gDpH7W8*B}jHeiNluF;6nVlZ-U9o+ENHqQm=W7)!Aq_i6}Z zNTm9c=`cP-DvVqhr;=;B_)nP&_lE3;^nm)0Qa^xGV2n#nl#LqXn>ixBI#FWQ6{RYI ztuUe{*DX(#;ce*YKQkBR5?-zp#I=sOQV6aTthH*yW&BEWr68^mz=h{cp|w};G@UmE zX>K)~5B(>FAWXr8&H0;1XgnV?8;8)?6~}3jYCRLCblEi+j-}gT2@7FLL6}kyrWAxJ zeT2xHl5AUry+5UBo0w;L32{6bsWvcSDnXcv3Cj_o-rLwV6%meP9i}2ezo3nznN&oG z<7;F}B?wayA-*=#uAT{ws0T22==9pdr9A`(WO-zu+nVNNPDFvd;Q4v!fFA; z`$}`kUTFnQTH!UAO?%IeHHS#Gh0mYPb^c(L&f+ysppm1t(-Fl_91+qHMY;dBw_fz5 z3z4KFEn(eIf7`?d{Rw?#H0uR3dWXxjw;bSN>0Oi|aJO4iMz|o&6ifHx7^SrVPZhjd?P-tQi7DP0dp|0goE6k9jf>&({~D z0{R?jKvv8^JnNVzgW$%8 z-==TONOT+5{hZ`UR86AGNObF&E+f%l_NR}IS$$2W%P3j`j`_TFnTT!&(`6F2%*1r= zKIlnBH(e$VT_!=7Nzi2?-5I(}f(}QYUb+v6ZYR@y;L=?f%tg|cO&B{z7t^_oiCiBL z(FWGn2f~^k2%--J5srPmL^!gERJ)ic*rl&v$R;#V)d<{A?``^Wpa zvk<2}1``wZff&(owC$zHOccABBD0{#%oK8TsN+OtqWGEPS7xGUIqP(($Ll(hZ)GN3 zZG>~8H)R&q%S>FRCE|$R;W8~T3vumXt}Mb{S(rb>PG0o7IyK#{ z#&piCq?1joldL{E)KFH^NiA+~S&42O(_yW{OP7u4_Ay;Hm*qlWLWrg-8R6_lvC~JF z%|n+>&}9>J*#uoSL5Ecr=PX8d*@fI~wD`!j? 
zLJV7&Aw)2Q5Q7<0gpj7jvmLSS(i>$asR>C4HX_1w($VyveWh}C?7Fjuq>_dy}Ba7@~i>$mwR>30s z#3I96E^c4Lh#d+tr!ASN6H0CA5VxgJ(Uw9vCRRz%mg=|Hy?jM(OQF=3;^FuhmqV#7 z;mBNn!mP+d=}>A}6X0w}gx>EZ%1K0rnJA|#4u-%UI!fz`o!%RxvvJU1?3OTRiv>%-*p`A8qTSReTWQ9fdr#thh-cp1Wp;S@83 zyBLC1INLoso^9T{qOMtn6W4C$3MVd88w}^lTL<#0aH5>f zl=-cc`HAv0Q|5P3hN%1=JFVzSc6zdmeI~!}?C2Ny>3lQzeAst7S~0B$XIn(7Gkm@R zbUt0<4ps$NE4J2CfV47(wNikz^7NmYUwpsrW4hJ?q?J9al>&mRfbiY|L^+cwu{ZZp z;%tyeb(Sd$x+p_bLG~Yxd=a=eWzW7kb3vkW*O?0vovAYyBs!Cq79={86BZ=8Sxkp{ zg_o`n(Vb(uLV~Uk>##0!`RIIgkwQf0u8R~RI#U-ZBy?AZ=x{WK7A_<#j2VuXt}xM^ zXS%{Jx-gjGVsCqKV~%3Rdb9=mRbe993&+Q^78XQ>iKsFVVZVwQmBV1#e-UE1zzju* zL096#U=Ph8y(~)2r1tN`pd!Suj~R*(gRxl=(!*>xJ~Chi=Vd5L3>TT9sEZ*4_K>y5 zDAi&wMIUAIjH15t;TbR=bk1PfQZYKiB|bwj*BOFg54uVvdqW*zicwn{3vCH|qGIF^ zgGS7yc2tb6dOu%vF&_omp%`i2)K!X+4oqDivr{i&aU#6TgvDLH5Uh%`9r%6??QNrOn+INxJ>Ulon5fYIg{Yh$Dkq4_38Hd>2uGb>q7RAaHWPg)bo3z;RsO#aeJF@N6ht2q zk%>Pa3L+fydWk+FqB~6Vk$Zi&T723Je~DA+(1XqEAP@nFnmhbE5?0! zEJEL(EKgC)o$-~Ya~m6!r|UO;Q+XmWeG^s^91?T=6^P^plT>hBe~7AJvk0_a=YDAg z;&Ibn8#jN<|_(%=)V+2rClPLeLmiDVxFVxlaCAnH&ps$MNw0ckpw* z>IVO?3ON-1wuEXWRw?Jh@4eygSpM4^9*M;7U_~+l&gHHgp{$ZcR@ov$oh011ibaO4 zCE@p=9eHHcEVAkrSq+P#qV*2p6J%pz-S zku|Z%KDWreu*kl&$eLPYUs+^dTV%~FvgQe77$GA-=Cm!-@*}8i-R8Cx;cEH8um`oR zI9uC_5N#_$w5C&tv?L*(5?UD1hMHVjc@$6 z{1KtEtx8061dfjpsFEP6Bw~3bBKn=9CALK`QDq{!!$g%`ItznmE40pZ4r*pNMX7{D zl?72{K~z}~RTe}@<0XnDqI*mf=^}#XAy`M1Nk@8b6CoqXil#~!Nmevf!bl=K$~ucA zLeu|6`mBg{!${<1szOZnnW>74DGc_|c1u`oj=PD~Q#JE5wp$fqIK~WBgqErhgQ*o% zAuTOpyI~yk5>+Ll2TWAe&rQf(8sY+V%E46_7k*Y*^oC&K6Emb8# zQ=zX)gl1M1MrAKy6cIjT!YG%{LZHWW=PS0FY80`Unrf8LRur+AUN?$ZOidMg1Bb)y4U$v%e&4=c=Wd zk6N9qLU;XxqscPWNgF48$8*m{s7^els*Amx!($?24dU_R{c8wY)LKDdHf^of7G&7%?KTlYmqii`u0Zam@#-Qy8b2XEtp4m&t99( zo{0Ca?K*pys?BX#_gi{zynfYuEku_=Z|5*WZDKg(JBFK~HZd$^2FzO=2IC=hh#@iW zU&m#GU{!}1^c-a~2TIp<>QFrS72*lbz^y~JD8ISEgq-^uk>}PSt~0*lx;?iJS*A8v z20f(?Q7&Uj%%mJjW2d@A2|LgpbzPbYR&|-Oa)h&=sq54glyyCnb&2w<@3?Ntx`GR{ zIrs5(R$q^}KncWEPiU_mdvJZ?(tE22t-X4JtDc9e9&w%a9oNlOPiU{6pv3&rOIe>N 
z1Nmt6U6f&}K5MU*res#|v$wm!&-^l&dHeN=$kgNOipQM3`x#MQ@EzCP`aUBrtWwhK{TuvQY*=AE463$mFzC;39~d_g>BrY4S}9RlOk zUlKtO6MX5?La_RhwNNp_y`Sz=z9jFN%(3cA9SI;;!M61!+3Aw+m~K0LNm?0@;a4uL zg{iNsT7$WzX^%l4r{Zcy=zQ)gBD(B5hFeQt38Jrv=oWi3Ry4dsUlS1&-l)&{+NHBF z^)(aedmXGZ@aBX>UkjqIg^s=!L|+RctfzR1nh{YZCTix=QHW}$iQ@FVMJIyCce)y( z>&eZ?ddWuSX!Tcbo#Qnlx+}ipx%JhI=<0IIYesZtb}&|UymYv>L?o?ai5}Zr&^2d$ zrH#P-^xpVvs6*G>L)To;H5a;TPIRXB-P}iqwbGklvQGZ9srreP(%I@KsFUK{&cR$Y z#VYAokm3B!u~aj~s_I~td00waB_zWZf;Y9u`@wMb^_I>t&Jkw#dG*$og1heJ!$n7TLEJ zS$~UcfJOG5MK&;j3^U;tAoI4e7SzguVH>ux7Otoi0-tn6OOFL?F`$K~m9-G9tc7T0 zEkrA8;cI0`hb_UOGksq)(Pd`3XjcpfhCMW$&Phzn)O}wx^)pkr?~5kNO8a(F-xp1U zSAECCD1})Q=IPNq`>P&*6YnzJ=BSSC(jhcGVM`*+!bfZA(qAy#LD7Wn3Xd6ywj@IH zd|FE)bU&Zgk|@{uj%z5wZh7_Cnh3KoVQUv*7_1Yc^(YCUcYID`Qr}#<)A-t&N~;!<6!wa!uKWD5r4?X+xBz{@aEKZ}^UB z2(g8<(Jf@Aep;jr5tuC#OV^I*LiuRzTy(*z9n;zL*N*gOo)T+El;$b1 zc0_o~cT6{7J0djWvUWshMvvIrdI{STVNO0;dlzAdYR`o32#uql7Rk*F_x40aPf3l@ z&v3RUy4$|vx#`*qy7okOkM)QBzC&mHuLIHL;-htN(FLmxOjp@uT{GtHK>C}`t)v4{ z_U@iF)zzgHsg-mf$~(T}x+yydtzqurHV zX{~e;{b?uR?VW^HFw^o9bta-bOw?Hrb+!_9CZd^4)R~CpBkpd*G>4hGX{K(v4%v;E9{7&w z)>=2xnpxeX8xcNWtzp*fCG1Xw`T1zw1z~q4)H4VrVRu2;T@ZE`gx!VrcNc`+1tE?B zyo5c7umBVG5QIILQ1?WVu!kV*Aqaa2!XARKhal`B2ysN>5E{>mCBlMC7%Mz4mI?JU zr;;#M5XK6^SV0&o2xA2kj)=TWJ&CChGxZcqJ()@0)6ltNTK2r2f~luq>M59d3Z|Zd z2}fiOlW7gTh^a6$^%9os#Y}qszGUhpn0g7OUV^EYVCp3-*-H@O_|Qw(n+S_AVQ)d$ zn+a_(q&E?o?6Efyy0gdLMETHnTz4GlO_Z49V)Vr^s>5YG>>J`L%KLxgvL^0f>yy4A zqPZMFz9Ax<`u&-e1tC`~TGXZj} z)R&m%0~6i>?MqDV*_M4tSAY4A>$Y59LD^SOV!gmi*^eko@X`9Y{3}THV>?Q|enMyc zgwFa2o%Itm{R9nGD!eq`5>08Q`PM}ftiEMCiaYllBfcf31#HJ}iAlxlHJiRAeLeOa z)2**>1>v_uxRgB&>n2{p{zO=kkJevUwLcTOBZe8L_b0-IOxT|Y{hn+gPwP*FPkhI8 z6ZRK`{RJUbalC{Bh_DnNZGh0>04A&+;oeVY9Rmd606{oF5Dp+Kn=$$TK{!AVVok|8 zlZhSQ5n&l7{LZDrAgF}eV#jyH_B(6uJ8fUz{9NQ>-8i|i+h>}QK?ltuQ7MK;XamWG7b8XQ6seFhQXhfFv~wEjU% zC}&6M_&A6N7jx?$q+?J-B(?rQMEJMwn5OmP2y>9GbQSB)-^5t3oC&dQI)p}tgNg7X zKH6ZR!@)v_dT$e-1{2|Q7+v9c2W!G^9q7D+iSQraG2MO4VA3Jhl2B_H8@xU_9u1qL>8~n0tOt6!d&@lwKL> 
zdm{PQcU(h)`v0DEWisUNNmnb_&xcznhZAK*KH6}X{eskRYwkH*@D2Cy4HtaFg)N5* zzTtuo`zl9^#+E-2Uj+00Ahh^{)}nP4(I1F%Hf!+*qWodlmnSwgtVC_`2cmr8JFZ)c zKL{Vg{?g$x)*V4ym3aRVf@=g@*Y+&$2;!QUYrSF(XXlBKZ zB*MxN!O#;&65&erJLSx$>i4f(u$B#mXKQf`aUT^aFABk`t6aGkqIbR+k!XJt7 zmG78_5To&rx)*5ClD~<)fT@FFX5$bVEB{12_62#gtljne-ea03BsR<@U`!l zh7fh|lhEN$LWh`Vc?o|e!m50>i7+Tb#oZT|>ieZX z6XhG`AmN`4_q({KDjlxAG{3lYBc9n-DHUj!58!VZ)1z|q81 zo%bItm_{>`Tvt!qcQi3AVEc|HCR#;#T`ltd(Zu!6cU(8uXwsbNMMe{)sj_2s?xh?< zlr{KhV}yOj{13`8f^v+9a*UuHBkVgyP>vClI9~8ljwQ;POgUEQax7EYdgQT0xsY`U zYq~*ROg(ZDeHU&lQNH&b*R9L3L}{YpSYdG-!FVah5oIkt+Bl)hahfvDMmbJUj`L8C z6O`kGF2@PVae@-ZR1T$SE8~f>HdBsw=`sk$$+r5zc;fpVw1}~IJn{LBUP0bCo@mS+ zI2^Hg&-g2yu@0Z{SJxSX)vv4<9Yf@(=2trJB0lf0blz=s;_1A z3Ht|={o;txVKS|A0x{KPrU}AU6PU?1(wZQcCJ3epf@y+anm~3kBdrO-PB?yb2#uY7 zBf@%2_?yt$Z|w7O?I#^Sej~!gthL{W5Dr3m`i%(vSZlxe2+^*;={`2DX#{4*IDU4R zjMgR+Q++<#M8PzXndDqx%`}miOgo%NOzw6#k+}SsYa(%(3fV-Tha(q`;k{gwi0c=) zX0-n#7Z>hf^XW-Mw1l-ZiM6D2rAb5+z%-M5TGGeHwed_;m_&A5#kmsJ61;4aiET8q zO%|RunRR9x(@ZA9Ww0Oi^OK3tJp*GhQQl-*OcpwuOgc0D{AA&6Slw_ajkir9%1+6( z2B)|*7_6pndvMQhFd5wxqFl;)oTByEq6xLpDMXlv^*F_+NA$KSJg$%BZ(@rzv!}4u z;t(3&n@WV8nQ*Gm<5bq8oF}V&Z>k`iDhQ_v!l^z&wDeS;-|0_oLah8aghq$ch_DM2 zP7^wu#)P(YjHVIca@OHAO&C#?G&79|;g#Iv+0z8!G_vbz_CBmUd6}jYQ&(o1?qUjp znyIa2OedNZOarrKq0g*TS=W-L6HOAPnJ#Fi6U`q?gEcQN%?zUH#xyfrG{I_y@G`x( z8QIJrrj^VzLu*Lsb>wCcQ&MJ{A(&`AdiK$6=uSw7F%RXEV88**)of4xka|ZB3o&Zt+L2gTV#J&WNR$4wHDc*7TG$BY`sOc!6Mse zk!`ZbHd|y{EV8W$WY`DJ1ew#eOnjV4ZL2%Ct(hV|&gAG*ECTm4d+Yc(liJp*#JZoE zNo|YP{ClfsO3tMAh3}`sI5?Bqm+6~lQjDy^m4lho+SYOez!u@9oJEv9m~xh&oW+#3 z@#HK)Im<&iOHj@dl(PioEJ2BF)Jr*=C}Wv&wxFENamqI1c{Wk5W?jxEO81+LW)o#H z*5zywt7a3Wnd>u~D9xLU<_JpD-^?M(o=iDMMAJE}OWQs2IYjvfP-0I!hoh-}CTWh3 zNpGXKV<}yypF>P0ug0k5)!bZS>cvcRh34k6=4^TOT){L~FwONbp$E<-J(<-R<_e~H zR;GEx)SH>+2|djdO!|sUuQQLB*07%D5tEs7F^`x`JD5jI^|>9)Bc?yO9bnY<>S;bP zeZx%i1=D=Co$YDF`NXuAndWOf#p(L;eDbd3>|OJP*5(tT8Gp`7L}Z>fUm)y=J(xpeBHThE>dQn6 zg>Nksc9i4Dg+gZwh0YcVoh=kPTPSq4Q0NT%Q!nA~MA(lBe;0(m3#;nAji>!igzMO< 
zziUFJpXd6W2>(u`k)T+_6!IBT-4iS#iuFvfh$sei znNNA>BBFT46pMr<7m;Sxvo~P|;nmDyV(QOKi(TFntQNEFbZtaxW-&3DxeJR~Gg=pm ziRlS5Ef%(0OnNfyb1@NaU_#7i973bDB}6!Y36}`MC9E}D4RHw(ZeXn~VZx6{YfF4g z7)_VxF%R2y2{CPCCd{Y2OiPLBJ7!ucn3nzzOD-j*jm)%EFfApmJ!P#e^$}uhT}n)w zm8v28EzE@3wZmlM z=}KZ6%uFkZNw0Sqq*ii-mozH{%}PPDQqZgvG%Kk!nm&7_V8blm%eIQxhA`VIp|4eJ zQCn`aiU_xYzOWBkMTG9RL9HUne_3a%1m!BCG}ZG}M7fnUh~o|~E3R{1)mMFHdf2}17dSd3$heWcLwDXd+vsQ4e z6|G<`>B?l&I1cqv{z;TSFy)_u@=tCBw)x9{5~X?a@=u~PPhS2>Or~f2Q|JmuzFvxT zL@|OX*17yASgm7S>6~8gZ7jBqD9kF;>xja=%Je$Y)hpK3I>EJ$xUkNOHpJ1l!)5$> zJ#md>uJtZ1+{0GeT2DmV*?Q}V$UQrFy z&2Iyd>|k%+KqRJxZ4h3wfhcN0R*#y%`hr7Y+SEp(_>ncSQTWM5wh*r+M0*YYaI+xX%$jrO%BJqUSrBd(gqsE7W9^i)A;Q}1eOQfh2#xn`CBk2raI4VaRwgVRf&1ybjrVOOLQ^f-N`&re$yTC#%amJv zlz0VONsp#lvXv;YT7q@nnN;W1Ra~3{tXF`T4fS4K(bbaiRM*9M!QVrDSHJrU|JT)E ztnKy%yO%i)_C|cEBP#XptSV$PQa%M=Y|V z7TGb2?6^gC!Xi6qk)5*0PFrMWEV8o}**S~syhV1wBD-jjU9!k7TVz)f$S~H&gUo4d zCLY97Ya7k2EndWfcy4XBoFJZB+a8Vw@j4#FM^QY87i}w^T2mcv^XTyog=zEKh++&= zY!hvM8&lXuT-%6ZFHm59zKtko#fADUiDVnK`FGsrw-FcSDyR=^XI_%+L^761whNN& zOj5)ZeM|)3P9!Fu+)gAWpWIFqLm-dEShHPdXFEk6v#P>&($z4CJV=PG-XSy=+d+im zm~e-%*pB~Uu^mLXk1e)CTMX8$N2VP__@4E&LlEvDrn+oDj5-dJvENQ&8qZ8Sh1Pa5 zle=d&F?%O5?PsQ)nh93bMW&s^1c?`EZKq(`Nm?^&Z0)i#?INaMnQ50xPeE#zHJ{r> zGzXYw7txsU`!1sKW13w=W3t{|L}Tuc?;?GfJOQJxqc3B_-NZJ5*>=0wg4J%;mwXSY zu6pby!h=k>TNCok{M|(8&xE^$zIGFn869BscbJR~_Yl)>%(O??a1S%t-ZsBSFzpdc zdj!)S!LxqAd5_C;R8y+k;X3HJ)Zy-aAE`>>Y?4}oSdO6(;ga??gas=+D_ck%*01+MmJ>q!} z5TSXG(*Yv92Ya9n5BLc6C--|Q4-lbQ1qE{%htOF1AQ4Vw!h?eFAQS4R<7M6JAQ2vA zOCKacT61lruF)U#*%rAD5{YTMm_a!tMpuW3WEztkA`(-3JH)Y0G8_^Nhdd021j8ZG z3JwV(%-g6yIVucdz ze3)yfU8w~eCc-b-zc8Eh5*{JK=}dS;5FTOc@x9{++}rrq5h6U!ghz-_@jbI6M3|Tf zj}T!mCOo1E<2w=I5hDDG2{E^I2#r;b65&iHJnFJ)usW&<<8*CY+V-d*JSqr}3c{m4 zLe%I{va-2{c9aO4G9hN#UczHUIEx973BqGcSedjZ36BZFV}kIQAUq}rj|swKLWh|5 zdkK#d;cO;6F06cEJNPL0_Pcq>N)}bDepCG~{?1v|a(A4rz5R;i5hoc}b(@A2Q z%S&}*Qk)`+c}#IiP@EFpW%kxt(kY_Y z2KhEdhEtk?--B?9G?SDyb4u9m6fv0@?l|sqm`s~IO-uv1LU)>&bfgVdr`exuE4Q8| 
zrtQphnwYlb*iEtEG%+P(rqhDyw1~W?iSTP~lQ;_X5}qN#I3_$J2+uH~eiB4_{24)b zMi8D6gl7cd89{hP5aO8FAv7L;mIwzi;aOqVvrH&Pd%Uv1SwVPK5S}H%K-S?|A0c|c zSsi)fnvu583PK#AJA}r{=ZJ7H6P^<~JjaCMnJFebM}#}T%9v@MBSNYOSJ&gLb3~b( zDbEq5X_x0nkLG^SIifU?18W8jrP1Sgq8!4M=Y<~6Go`JMJx`Q7*%!|fCBD53Bi4Bz z6Kd|f)*P=bbe@=+vG-xk!eKI+yFg4sndySi+y%`PXS*MBftYqN(*T`XNNZ*!h&2;0(?wz$#!MGodJ0k(ts}vUM6;Xic#&x6zRYHQU*@7Arh&g(y=q&17eUi>{g`KZ^7m`ZhGtRG`7cqFYhY~~2#tYNdh1kM)5wTSVXSTDW`SDUb2 ze9a=eZjs%v$ZlF>w=A;T7TFz(?5;(IBddhW_bsvq7TH6K?2$$Gmqqs2B70(yJ+;W5 zS!91(WdB%X&n>clEwUFD*?$(;{Rcc4k9Fx%3z30D1=l_wMA(uEvE_LQ zuM^=~_!QHR%w$>j4ltV}nE>1SrT zA((D3lfLUGbHW?MbO5x5`OFPsqPKg_uTT28L0oB>>xSUEL0qj^R~UU9U72j*CUK2o zuA9VVvgw=5rTan2byINN^l;r2TsH;RO<%@<+Pf(zF&;XUrXRdTl)o_LEup;TSREakhh4bHERyzvBP9Ecbk|-Gt+IMx!cSn-#nmuz1xE6 zwqUv~m~IQE+d^x%1>qel;T<9z!-RJP;T00g) zVH+mIp4%Zbo_C)J$1~x5Vb}Yt!`czJpV?b`-hCo8PaxhWLiZDh_lYtiQ{E>^(<1JZ z9!-XQpD6FbH?g0>T*9F=dVD~XBbf4m(BlK9w2ea^5anUk;{(>C?yVm9G>3M7Koo6R zSD3vx6h>DMiQ-qLc<7=CQV*@Y&RhK^@^D9-XG&+ z)j0yKgV~ywDP-EaER>Fl%%FMGR(b?!SEJ$1`Fs=$z45;xV0ZBA@Xw zoze6(kGWmBXI7c9%wsyQxv%?}&TH=LJ|>UN#2))tSm!ZW$K2O_Ox7{?bulw_2#s}~ z5aA>yeB#ncuzJGQ(Um2=x3SI>B0S30c|wFsmY!|iyP|%g@d;6Wz?4q}oM@MsrVzYcg{^6;XXGC+HX`T^{;%7dek%od-^67i;@_k*Q)m90C{4`)M=B1b(b+#lIh`s0 zap^2r{lorc>skIG%9E_Qf3zKWHLQPpOxW`O;g(;FqSZgbjyO{CGCe1z8O-!tFg<6c z$`QDq-rM;6b7DHhOwYA7(^jGLJ}0eZVXZy)5n=>=PE2O49~`MUOval364Oj(`d3)< z-~VCFe~HP=W%yS!@mz*~i3#taAjQALWJW#zk|oV-_J4^mmi-OKjSiu)NH1m>}<}%Yup_!Ne!*(x;=?rV;rDo#wkYD<&g+BgL`*?(2 zDgPxgnI~Lur0y^oJ-s5PIn4A*Fuh_u6^Ow76#uXDrB}pc-g*5>`%@eJjL0kUr>yMb zuY}fKk=CkkG<`*cJ=veIZr~++O@#B9@U^hyYbMm=8%g+D5WW_KuLa?2LHJq_z7~X7 z#qbioA;S4g_(l-EVM3l0Ds=cp5WW$FZv^2RLHI@xz7d32Q*j85=e;Gu1x)x>Sotk` zo_j6gK)z$}RuH}wgl`2QMh^W}BmC{HV8WV?!(=?~9WgCrrguVX?^tW@6@(1aJHhl$ zFufB@?*!93p|y8{5Gzbx!uLe@I}^TlX)Q>-xAx=jiR~=6)c2Z=M`G`Z&CFtZPi!X2 zz9%*u9isN$6I(B66nH1zBY2EwT(2Sw@R2lSTG{MHXz4Wwyw&SY%l( zvTPPvc8e^;BFm9Lh9e*JKW`gTe*Pd`#BEIZ1;8gW@=<}2IY08gQ z0S?+oIhOLnb-w?AHjS-G`MD#L^22#ln9U;-`oEXSkC+xSlOHjeH~IL%Bum^w?`Lus 
zKcYFuG=4;5?pgZL`Atjrqw|}$g8BKmSMS1>=cVx{nk7u*FEr$@H56xCWx$`9%p4Ja z&BSv={7FOExQF%^8uBMXle_y1LW~1m!T=&%%7g)eFo11ntEB}H;d$0t0BcRZcPKzG z1rQTv4QMrtJYI@KM6rx161l7v40~u#vOT$#h$t>FMI!A4$B_X;H`Nmrq)tCYf;DT(VTY{STb zS(BG56>YglbUG$W*W>F9U2oQ(h$vhrb$CI zCbvn$8nWG)OCvOtMrbIF&`=t|l!iRqR3y>}ePQ;KAP6%E!VH2C$6^kl=@T*%;Z`Qh zD0G;Sb*TGQ8TB#}p?MZCBN4iv17lHffy-mgxOoVrsFqjC< zouptQOvZ%4L}*5W!Nl}0w^$shJ50u^nTcr!Gi4S`nVHE}#m-DjcR4<1CMNf~>Y0ga zbAaw=GYidSCe4`+xj(G1Z5TvWfno1MNnoT zN;8hgA}Fz%;ib$njeW@xp9Gxtl4oxhx)}vRN~jY{Yk;`LYq8`#xPZL6nUcUNQsLa~uZa zb=iqw4>M#Jn#%q^T0(XrF)O=eClbXgfMq8wnU&qLlU_{x$WBbJmnrSir2HrWt0mcCi9Nv5Mpw_V>yJlwghPFg^;eWh2q_m5I?;l#_dE0o&Ter{i(A_|0Z`_lKZ^xh_hhf@1`z%eqE+E>+T zJzl2CA3<$9l-k!;jsl^+_JvU(l%qg3{w7vzUvsR&7U58u_?eR^4=`m;5e0H`{IspN zm6IqRGG$Jpbg#FSlPJx4TRDA{7zJ`tL^b#6b5cw&&x&Ci^-|^{%7aXqOHk%wO1`g3 zdzD06!#a}(t@rpzrUa|%Kj)1QT_!s#uku=C>77w%tM6RS#x;=VICrE z!1pop5aAncD;U|mgn5bZC==#&=`l#WP-4I9rOZ#1Czvw7 z%d){L|Nm$q`2}Tu4`qI#z5IeQKT(=|JHMdBOu$Q7fGAHgWdRpukSbu!w+j%TnQ>Nt z_{@8>3J974M6({wixIj2(U|-Za}Y00L83{{t!xSOPJxp1| zM~Plmgt*?b<}fpLxJ>*gN?fU#t0-}qx^z+IvfU{yN?c}*%c8{PUgNT;uwha1Ev!#s z?~eJfLt?zC7?Gr5l43$b#n_u{qq<^5@;BSCn6}~324tCHq@BI2onnHi7-=US_CqGj z#~mi4o#MolmYIqR?G$Gw_gpEHV-^=o#RXGw!Bku@6(>EJN=9)(h@%6C&}gj$5vF6p z5<+Vw1fkyB5SAdqe^_fJh|pB3N(e71K@^H@hvN}1MM9RTg+{kOATCWW>JEBZ- z8vGPagWcJ#JUe`2^d4aO-|{A4dE0}W|E|T8!g4n3{(Ebqf9KyCEdO3kgVWKcI`%ZU ztr5#HV(pArdn4Ath;=k#os3v#Bi6-;bv0t$j97Og*29SPG-ADsSZ^cN$B6YcV*QL* zeV%{Af>ORF&bVN~^gr z&C#w_C((jdRv2lFE&s$t+KnwfagoN@dWmBzt;T&Cx*a!mC^0j5vG4wnT|A7M`b#yUepnaE|+UE&OgCrBwVs0R z22(^qWxCPi`f4|2>ITKjtco z=D6(%MSd!Jsut;qncv?Ddn!{;HNW+Q@HGl$r4|c)pXmi*Z3=s-I`yK^dM>>egngrL z668dvm#R}QW$FdR>r}|vFjVx0q7D_km7=#(s6CaxOZJAM9~Hf!$g^Q1W|rO=g$R4Q zGpqgX%&Ip`Z_p%b)7a$Ot`AIgY3id)eOMXy44)q1ePHTOQy*5w%`<&6CRv&FL2LEo z-m?!(Z_*@drg}8>RsE?iP1d{W`zljkW$LR;eU+)NGRfLJHu)aZ52pGw^;31~ zcbp#752gXEQ$JPLb>W zDsF#_vob`ZwrNkAZ$$GASE0` zVU82xYX2+hK}tAC2?r_RASE26YB)##)UnCWAwywmM$=GLwV^awck7`r4WVhMG7VLxp-{X-h3w)(#V{zE 
zQ!z{_hEdTxjqBXitjsVdhEg$1DTXPRGk_zkh(j%c5LB&X=7>UvJaFkC*(@13+smA+A2)lA;CMPwaa1?~C zDIBGQqbRJM#&x{foyv@Ya3qDJoRIgOQOYz5CjXZJM!~cws+yd5?kJi{P0uTT?`%zU zo+p9|YsBUmvH3=9fe~A1#N^wHW7~4E5nE!! zmKw2TMr^qeTVcdj8nJVZiOJ}f{uhree^(fdvDJoSYqXk!MssW>pH1*9rqOC_jaFl8 zv>IEZGowiq(&J-=pM%Cgk)~pd8sTH8uzsaAMk&T9#Tca+qel1`Ws(sWn#RJ^mZq`F zG?u1@X}M1CO~%4BinGpG&WmnuGFF+!s(biYm=<%cmJuJDd~1z^DTk(UsJmsG)^gGj)@h%wZ=oyj*9WBGUKVpL1lO~?_1-c7|qIzr^2nj#w*2mRHg%0 zH{)Si!pg`T7n^)#CcxC5rU|Mt6I5liX#z}RXquo*6EY^*-A-`3+qC=b#srv_(j;?i zXqpI92bv}-(?puARmMb^#?mxVnI>jTa%Y_ggE2VzT~TBkk_zlOY;U z(PWBR;Ur?R5=}<`@w@)X@GWCo$vrLfO@Xg7eN!^NM^f3TDXfxPm%5SV=fNq;G)0-F zDAN>Wnxaf{PYq2|Vd_HDRMm1*X-d8=+MA{-(^O@es!UUrX{s{GJv}r{gQ+V`)0AnN zGv!))uW2w%VB1ZjsS(<4nlep;$*;wx!L*!hCu@q>>FD z(U*#ON->X$&j2!o9P6o>I)i4CBvE=BXYeyMoX(AEtgZ%~z)RY%MpM z{MGJ`HD8(LE7N>sny*arl}UCbvB}SR3t;L`(*o6l796Lw7Qi%}by}cWYk@K?fZ`nX z8QBNL3g21_p?D^GDVntJLRFcCR9N39FI0+!O0iHW7OMNfLS>SDSZwl@Sp?GnR%Vf^ z%pz48Ud_A0BA8~dGK*+(-%Bl0rbSTXvNE#s3l)o@7)ZrprC6*gqZNyxm`TNAD%@RR zu~ICC;#?|ZuNfCO@;W>K+3)nbWKEJ6GDwfYj6&Z9~8x}j+) zOoM4!s`|`Qn#!he9j|tyX(>#zX@K!eP$Ua>g|_vDXW|1DFdWy-WnnU*Pq?CxWQZ>{A}45ebZs?74^D3&Y5a-~?V6w6g*mMfE- zE`+8PFb$(=g)*(6$$E?a3Yg}wwN|Jev_hQ^u28}iO1J{TKiPJ2))5PR+pUCfIE5>f za3zJ-+X7cYIG4he6uObNQkhmlQ8oH|;j#*p)0I$h4iqD(I47ZyE39`so`d}JnExE+ zcWdHvRQ_|2zgm=EP9(eHMAE&H)&J|7)HXb0xha*)lSg^t(*LLKEYe5hyi%UQ%#A+B zv8Pz)8nN??*!f0ml@VKQ#MT(G3yj!>M(iRZw$_MUY{V`xVwW1Rbw=zmBevd%U2eoS z7_p5;Y?Be&Y{afGVpkfmEk6IwED5S@SigTeDMa8)Zg$7esmrb3t5Hp zoMLni_5kN8#d&I;JrAZDoZ)12$0k3|o)6P#n$B0I^J%i4IyxVw1vH&cllyJm`O0)Y z6xFGa`6O2O%B+H792Kje@ZVRhqC&k3fu~2SRApAF%B)g~RjM+pRApp73r(wG8cWk^ zWm-*>^@PuAm=>`ztDPz3?m(-RX*Ep#L~AunHCZQ_Ph*pBtu-)>r)iC9tu-{cXKQ#h zSHWvwT1?X#Wm=<5YoMq_h0Nfg;sPipP;r4$TtJ2TWjPfWs6KN6Tg&yC3o;5BzjBud z^InL&6PfoymG?sCwf3?XBJYyu)62=^g~*%yRq=($Ex)aoegB1Sk3D-X#@B_YM{U+a z?o_eM_uh-(nnc${s;68;m-UXzia~_G>$%gla4ltTS?k&<*S&FhZKj>Xv=)jwtd88DL&e2VOrhdprMQ?1YlgX4^_GiO zZ@E}0E{38m6|y3T6~2uwfnq8Zm#8+n#8CJXh)bYY##&tB6sacY8B~Yx2GDr`W 
zHAtwq6pCq7T&fh8Qjt7q^@>ZC;!>r!R4Fc13R(5U3g0i*K{1_*b*e?yQDOb6bR87S zS&Ma4xV_psrC0|=J+_FfvtosBk;|Z%LB(ZCaTyiXj`1=mR#0)7Q>5K5^Dk41%P?a6 zQ|p((RG%hU=fx&pne{Nuq-nh}t*0sZjym5y>y>G}GObsp^-3Wt%UIznb2${VsJLA9 zlFL;u(d&fEmEv-xxLheNSCzS3nPg=fnl`}HEuUNaY*3~RG`Z(m^x5VHm{vxuC3Eiv zm^%0VJA2+eYn-;*}M)q}20swZut%X%_v6I{1OF1Z(Na&4G$tA|ah4L3p2kkyer zMX1;eMNcX=E5&9itabNhC~l)-vr=qUip@|oqC$2tq2dZCdQowOQd~iWbqBZtircBU zLMg6LiYuUKOoi;5LdBI(^rqrUrMQxcq@tBI6Mb; zhbv)fLX+&=a{#@2gv( z=u5>`Rhg~FX`ii7+`-ChrNW)9ZdHn{s6|uOLiVPi;wmWmQE`=0Tt!7r8rQk2xvIPh ziaV*eO7)VfG74#*t6cl!y1&bQ6-;kO7P()`-Z(T}4O4%bu1=Wb3TwxBH8giob2T+? z*1S5Sk(Rj{`Nwd_DEsg*|24=zfcdXc{o@+e#QF=M*FbSM71yYmT%#1%K+%jXBRl_4 zaV-=Bskk;#h3wR|PLZ2jwfK>9Efn`qaV-^YYEF}!JcOp} zU>Zczb;@*|s*JuLT?f;>G+jrNJDvwAtUy%5qrdlJ!-@rJ0>Qxt@OWmH2J&XHjJhr z98KHQY`0B~CSJ`uz&4Dg`#7ezso8Fu8co~OI&K@rmVY~!^#9o8?~OOWG?bev73rpl;RepxJ4;s z<_s0vp%_KQcBR-(h4oGFb}05xv7HLHBio))$enq+yECtMr(xS+YR$^X%pIC;g=sWR zw<^=E$1&Xs(_WfxrOEAlZdImRooVzqtQKyCsSQnX=Lk)=!8C@Z+mz`xnyeMfZ7}Vl z={9A$En||^!fj9t=4wvvRiWZ`D8^E8yHeavh4r20?NB^G#qCOQdqyFxb-Sxfk$GvE zW##S|Dt16Ij*1;hv4aZh$*CPsJV?b3rP!eqJDeib0PV8_mGMtb$=x|L-2u~hn(k1h zJC0MCJ79WZ#a z?t;(n9q(ez)YwnECob+%rn{8sE@iq)neIZ(yh&C{vB}>p?}lj-O?PK%CQsbnO_O@2 zig(MqVS0phx?8p5-5Hbg@Vi|*=DIhZ+znF>>m)0+*yP*s9+)Q6bdRdjJv3#faUHJq zZFdh$kJ5CHGv&H-#(Od*X{~!uC;!HQdthouldKeDldsdgFioN9UR9@iX-a;#VN2vB_6vCrs05+Nn%CX|m?XoiOdEX{R#nRHmI!v}gOsYCKl>%Itz-Iu*NApV>u4 z$uzFx)oyI3(IpUnyk&5i5LUc0)0firuO*yQxS%gY0|mZl&0*6uXsTx2nu;Ws?0*XxanQ zESmNx(;l{#`u>eG;U1U{u(kHk)El$&9%b4ClYbs=4@@1|TC)ENO?zRQP19aw+It++ zUS-;=Ona4SuQKgbCfWbRCf|eh!8C`aeX0lTW7{R)RPNshzE7F+NTt<OrYAuI9b;A(;G=84o$r zXt(ZpNSPjjqB9k;#}5?`L$QF0hn3=CD)_{KQar2_4=csP8HL=@ALeRtHmdfpGRYZ3 zXnF*ug)}{)OpnmSZ#9(Z5oLNrnI6fQ=3Ch4NbKEWtYE$)v#+p*`7j~lTA zM(hbAcF>4DX~do~Vow{fXN=ghM(jBw_Ph~$!HB(R#9lIDFB>s=?(Nw6y=uf>Gh(kB zu{Vs^n?~%A5qrys$+s-W`ra{O?;5f9jM)1|>;ohAp%MG&n3(L8rT@ia%g@03F}9X) zZ0%R`%zln7eus;B#*eN27+X(qZ0+aRawkpu)!5pf8BL;)9v>gKH*Fh> zd(#sz`6qm2jtNZ%VOl}cL1j8fQ*wRmO$T8bO4C7_+;?RMGbXt=9fYDgTT5oOSm9gi 
zNhnrQ@uX_4C#kT$SAJ3{o>Yn_mEuXXmVdV9NoA5*H8%OydJ3jHc#pzzOu$vhq_d}W@7BA1G%mEvhCtlw`x4aG3F&(o^UJgpQ@tI9kL zQxCS5+(%-Qugo(rolDa*%JhsgU@crTiC{|PPf~v&}R3vx&zC~VuViXlGP|*T&*b7SW0u=s4MphuP!q?(O zDArK%qN>GESCrxvrI0=wDqe+REfudS#j8|U zWULvbK(uzW2Tkd~my+M;Y zL*u;rhBCdOOmAdNQnfeKD1SqlWN#6he4XBeX&p^(D$|=ZS-(_}~-Q9~0p(n|?hScN`Gz+3{ z4c$K#ioW&nYpO%A^`lL8Rk6)i=q=dR)Ap9?NpHCd zZ1S~u2c``)y`yUJ4o%iuoZo?I5>4-@TD+5~g|yE*Q1oYu$X+#6ybHxfD&9>fvQqDw zz2#lxpUnL4vL>!aysK*RF7l7zcg(Uc4)ec<{F|8ny+r=()O#E=)?Yh#4~i*Nyhnwb zhu_O6WNvwnd!q5UYrY550Mm2;C`Q z^4!q>`|bG6^P1@0jEnJWXaC#hh4_8*;pp=ndy@1&BlfWo`^1QSYQ#P>VxJqaFO1li zM(isi_O%iF#)y4u#J)3P-y5+XjM$Gx>?b4kvl08ni2Z8Beludf8?hrs?5Gj@!-)N9 z#Qrj3e;cuXjM%?M?7w4TGQOn$h2!fm#@AMkufuA59p?Bdke2KCOj_EFufuA59S+9V zVKu%EtMPR>Grq(nV<2|n$9|0#)# zyjbFAppPNBnv#!Ig+8Vv`Q9@x`B+Il4kRBd$;YZfA1jxP{?PRaT-VU`iE@2Hm$lRX z1g;spH+=$EDpeJ;;wQ@V2~2)>`w2{gc)yg{Cp3Ku)3r2xnlNRhJ~i(VpF-oGRr%Cu z_^isO(8%AXl{@99(D>cTr_c}-OUQglY5c;U(nP1b4J*UI#@GJTyf$=Lc@DdcV*D|}!528x@h_(mzdVT)M1ly9Jz$I5)8`o%Zs ztMV-@-OOvdIedTiEa$ z&tmRm?OQdLzJ<@P8NY>S7)7$OiABEUzJq8xMc*Yv*{Sc?a=fqdYIk@24z2}seFqo* za%64}s_~tYe200*@4ml-YdBr9x{6)CEx(8BR=U1duJ7rx-Y@mNa(y4TzE`gAl|H0BIp{&%8 zW?%XdqJ?a;A0gUzPrffMsagkZ_M;O02!lUulXYxp_z8v`H2jn>WT$>&ZLB8>e}ZHY zN8C@4q*CrVjh~?K_tc-D7|FgOE9X%0GZc4F@v~C=OojXPDX->C{4*4bsrZ=+_k_{U zO7Sxkqo|OzdZ_pXiaV+JMJaxv!aAk>1&Sq9{6d90xA`Tbkoo2pH{X<YlWBsNSzd<4AX>z}o{ZOp%J&zW{b+3R zbKM^>-AmIS%Jc{8Wc^n6513Za^ao9Dp7}$W{(xdU6|w^k6@Nl;9~FNp#h-`DS1e#_X|EZqvUnZ~gg#YmA zCr6)NP91CG`9%5a)$-S><*!-GxuZK*yfL*QipqboDvG&h6q8RVy7S1zICV@#?=Ly_ z)G_)fK`gHk%V)&$8?h6N*oj8$BqLV9h!r$qCmXRsMy#+AJH?2dYQ#=6VnvMD=|(Ke zh@D}?iW;#qjo4X6te6qYHe$t%*x5#`gb^!g#7Y^l(nhSzF)x=>9w^qX`}F5BS?>Mbc_5K}qRe=C z@)nLHspuCjGRpJ7<=6c(>SC8)OXY>@0lMhW0gpNS$LOnyI-FRy%$ z?4MT12jNtWgd)$esy&M zgkMv5f)bv1SLE{2~WrfrN5ov`diAKV4R?YGAoC|6CwPD!V{J7 zMAp#yjsA%c7GW!&NTFN*ov4H-W`xq_C&Kg?Pe$dg5SmVc>06pkQl^t=O8(tb->xUY zi*o!PR_;onsQ^sh(NsX03ec4N?V~>%ECAE# ztWyEj$?ZW4C{qDv`q4f8QUG--Pm|mgW0RjN3c~a~O$C*yAWi%gg}a(7)PgW&(Nqwo 
zJhyK~EefLTqgheSVM4{{dg_Nlf6qTruHA<}TEm;_fpQtFT6opwQ>zSd#P@G9cVO6KXN>Lb< z@xSCM3{z#AWQ`P>PJ!uXnodcWvQnqywca9i3N&X?bBgL$r>LHE3N-#XnNy&tLXE7> zVvTR7Q=$2Vno|>+?9{2OkoAn*sW26z=~PvrQ&ojdg`z4IvL1{TzMW2k;#Vq8Q;O4$ z)1Oa+B0EyZs^K&!lJ65dO;zVKRh`pRb!1H%yL@$u!1WtlMU<-uUDoVW1g_$&P7zg| zBC0w?ps2>`$SOEgoDRkBRGgkrWTj3wd(7#`@848?I`g~v<#d(*bmSkyU8=0O^JgKypKY>Oh2$?Q{Cy(}inCdTEGpcJB}*x?&@%p8(JYv%vnI0Qk4?U1&VcDC zO=l!(lASt(ChN@S446vLbcSk~GnDBJC~8n4dyG&~6pBBnD4I}YrHYy@Qxy42GJjFz z&$H@E+#`x2zuXmMl`lJ(FyooX_$M=-naG$OU4d18Zu0!i-!IQZ-crnaCO@~ESI^9R zZpkaVq&Tnd17{)cU(9J1t`TJfmm_~5zEW5 zh3v#*g|AErDE_0Ogi@5C!utNF1QcainG#Us`NBQLTLKEdnk#|I_~+hAz*L7OISGhO zzA`0YiiQBze@0v06r~wh^mi#OfNcdPc0i z5o=(?8XB=iMy#S*u~tT`wGnG`OiWf9(*NSIDxYce&F! zzxFGyOy!lSyfT$n3K`>}q5>2rQBgrDDo|nVi7G%*iIu59gKqujNippfT;m1 zBXdk_@_nWvOa*AF2$SzK6=|||u@zye9GT<}S`nt?vq}}=@^|)%koc#7WM+#czKtqD zQjn5L8HxPguLMc*uOL_A9zox^DnV6+m8#?_wZA_02$fW&Dxp&Cd2&<Q&pJ! z$x&698nTzmT`o5HUS18RQ)sFNlker#XtMUa)nKYdQ#EC(mN7}6kvnOe*Z1k_$a^aD zR#&yC&b-OJpRYxAJ9%vKwWtA8ahhtVTGXJ) zI$y5=Q+=9h(Bx*J8p>1yiX&9W+96cbgyJ+RYAQudDy+Y$Qxl3BRMb?8no3a4{~qgdg4No^=jr=m6# zeiYX>YT?)EwW08*M75Qowo=qqy`(lwO=yz!S!k*QQx;8il&KC)>TgYOon8l~nrxps zs(tDxQyrM(FDghst^-q3nq++#n(D%I22FL9sV+^{w+nS)^54SLb*9ldxGU6Ern)fs zQ;WJV`EOxleHxnT!Bmu{ddgIfCTmw*52jkIQ$1CuddgG}CjUHbJ(!xYPO?6ZO@4&e zhv`h3>ciw`yZX+Q+arzZ+|^tM)`zJ!P4#JV-wf4PrutAcr$ScbvBLN222h+uMFS{& zuWoRh$~1tY4iycc@XyRQfWm*P-T;;H_kjj573X=Z>(O4-OD@9|}!vD1r*=L1{CQy{1qKQ&8 zVT)L`XaYsUNFnRPCQu~TpG_d~>(3^T_~#sCcNa>ULQ;~Frb^QEIQ3`>Nh8*ysp=a| zGxd=BL{omp+aEJeQ*SsX;ng%Xhv`6coz$YaGo^c@7R@sXF*QeJ z{Ch!VKOHOl7-|7U87f*p;b)f?R9LIb7Em-{Wm>4#YM~S@GGjv?qs5kGPQxCb)=Bf z#>$z$MJ;EGYg7I`j_$ch_pGElfAmjHuHhM@{PW`I9P&n-Hm0LbbL?qjTO*cZ#M&9L z_C~CO5$kBgIvKIfMy!hw>uSWh8L{q0tcMZnX~cRNvED|kj}hx@#QGVr{zgpp0>`$| zKqEHDhz&MkLyXu^BR0&44L4#VjMzvcHtLv|%sSHl!qJq*XsXE3lvd+Ct;V}P=S!>6 zlvblDtwvKiGnzypJw8;lg`yG_ZPl1=ONF(5XbVM4j_I~kxZP}9rDzL98_p~;#$tt^ zfpeg!Oht~WMGh6#+d^`nm=OKnlzU>1szr`cYmGdB4a z=?GJGnmQ(Gk(KIbKJ(BKnl`LLM{3;Nu%ps+M1KD?ip1 
zahgnL+%oxsWVJ9Y3i&@ooTYhZ)audtW2Gi zsk1V5RwlVig{Cer)uO43GIepLwEB*cb?O3BTh^%yTg%NUUDQgZixPH$(4XOTfv_E` zCil!(=zCIE2y0W=6+*v~>B_3PwU!=LT_Mb&uq%b`+^TEFBy&+$w-e~?{$5d6nA+1M z_w3l@dsjD@>d@3pRjnIM*6OO8GIdj?ZW)t|xNd61byFr;GlZt@Fx8`}yQ))nno?=G z&eh4EtaXQ}9qZJcCihnF?#k31COY+?Ml&Oa@$(kuN^@OP|O+A&Vr)oQYwfk1Sr!w_ark={wQ<-`yldRc7Q!khr(9|nq zlKp-!nB3nD`VN0LsF(Ua$iMZZ7kus6j=h|3zk7pcFXii{`c^OaIZ443ltNa_vBJ*;eW7SfMPDd<5AW*~xo-Z{mFcS#eU+lGQuI}o>8nh#VvkL} zGW}p`LQ_AOd}aF4)H{vq+|``Z`@z(KJ*b~E@mr~W%G3{vj_fnCM+g=Dp=e4)|AZnd z)!$qJ^hf@V%-^3iaqm{?pUE%lAlZS$8GTy}K*nawI3STRJGuf-J*KRc=>X*I#JmIe zx!t{NKqjxu$^+0A{yXOZsKSouDKRn04k$DYgsC}A0~02>!g`K-AT*t+8K^V^m1ZC` zem))uO((XA?7c$GAZS`pGYA^rTL-a1*7sq9VCq8CAe!81`5A$@k0>zKfCzO@M5Ge9&xd119Loy2KAF`tj^A1Jc*33IpC+97cW5TB^x2{K^xfD7vWE^8!=PwG#W1B9Mn&DUT&LF#!<1r}QVh!| zL^#Z?vGz}baF{a59zHht(L5ZcG)=={@_ludv;4GmOwQ z5~duQM#AKK?ns)f?>0ull%J-N$}|!y_yf_{k~)o4!jYmqi~cG zj-t@|E^`!wCsH`d2@hU~9yCgsM#1F&X51*4dPOEVne5AxNq6oj=aKTnqdW)c&Libt z<+RfM=X0Zfw+{a#XOix};Z7z;N1yH3Q?D^bY^)I*XT-)EF*zMMHpfIGHpz%hHeyqZ z*i<7n&4^7mVl#}`Od~eSh|M-)bBx$rBR0>7%{O8TjF@~Eer!z_8L`DiY>5$DYQ&Zq zvE@c=g%Mk6#LhV;Cif5Nf8p2~jj`39V{0_VmY*d@b8K1PnvKTTIw2Zcvd0*Wu~qPo zZIgC=T@vGKG{%?zPH{BGSNu=%+ryq5^D+ivm%pEmfvW>uW7L=*Lznfez!pnZd_F(U%IDd18gX-;Rf(GZo`A)sg-&9*X2MiR0DU)!%!^Lv%7l67vB^`}o}!O%AWzAp4lP*s}1DkXcmH%(Bc3Cc7Rfrn5T9(d<{!(;+#HlIgKztDB3bXCzXg z>8?U`-Fv5|!!?91S<8g38F2NbYld>opvyWBp8;19w%QEWYFlqW%gj)w8P1e;rWr5| zrAgLOvB}TOGhymQ(@dEB{b43e?ulM~f0zl=>8#aEm~fAuS_ZC}8JD!$Ow2N}^N>C* zE4WzV=apHI^rK{!s?ID*+ZA=Su=);*-#9o zVz#QsY$~$Ta-CMpR*Kn5FFD&{K1Tq^3N!&=ORq9|Kru2XQ1ovRdc(J%a)Vdla#g6$*g^4R2i$vl{b z(KJt)=F#Nd>Yyt#52iC|n&(X2-Q8!NGR=cxBo(p~2o>|87(~T9 ztM-|%6!TU4$lfDVEP!Gl6$_MN0Tq=E#R8>RpcD&~Vu4b~ZYNYMgrYwc3!(651Ph&l z@7ZuONKu~OFI0+!O0iHW7Al48uVRJotBar*PQ@anSj1XbPfaa?A}i__a`#yT#rkbW z>wR9^&0~v{WD)wSoa)FtCcC;=;(O>~NJdbySVDVC^Kk)3O(SPI2RDwe8REM+aMRoYT0 zim_FeLXo_?ELD=Fkc?(MWWO9rmO(O#l4VM=>^QBm43cb0mO*ml#8F$iZ+87-nUXAn zWDF&;OAjT>AsJ1{awS<#iCeqrJMMBwinD(#cP;bE)ws_sSBm9OjHN=(1wzFND8^8+ 
z0t(+hR#4&ANLsN%DOM=O3Z+<~6mk*~Dpo=;fZpp%J^th^;kZ7aOrljM$|{Y@HFi%!sWwVwW4S4MuFE5!+N@w zYQ(NGVpkinYmC^nM(jExcKtCix#LOy3&&3`#?N?;pIkM5ayg2NrsX=lCdrYPWLF&5R$}KjdN*kLM_s9v>>sg<=8~=PJdyRFpRq=PJdyN^!1IoD0PSDrAhs z3cn6H4~mIYoTn7$QQ^KF)U`NIDb7=h^OWK|bz*a#GRYW^O@2jkK1`EnI$xR2r^$T} zs7>c9)A`DDzA~M!6f&#C3SXI3P)w#`6%_uCv+6j#WR+5^Qi@edu}W2Dl`_e!7Mpx! zR>L%frq!x4t7%G}j{3FhYGqojOsg{{xf8ACIm#H^r&q(|`?1WbvB}qI4NOyMTBA&B zXtI9&um+}*(P)yD>>8MoE7>&}myD=2sFgpDTm#oM{%%`l_Rw_!T+`^f050Et7trN) zUV5E>fvVL7s#X^$(*>&iE>N|SJ4dSnEd(Zg)qhcB;QUfcYn5Ux6;>}=3q@Hf);a~(G;1@hCHDfkUx#@wM&1?7dol9* zUUD(>x-$daA{QfXMfTo{kr#he@N4%@@ryGRkleB^2yY(d96mPt)Zv`8K-TnR1=UKU3tMe!N_n zE?1_@mFaS&kdUDmEy^23F?FHgcUib@bnVZGfU0d&&kV@XqWU_qRSaWLib~ z>;~0SHo!HD)sfY8=-LR^YPvQm*T&S^wOP3~E0^qa zLe~{=T}amz%5?=@);BU&z*U|7>k7D%-wj;>m*4AOpa{r%dP+1)qeeOC0sQ)BU}lWUq4)#sg>lF9bK4r3-YdI-YqKc7Us3i2)7__ zP1a)z@+MECwq)udxn<`V=k_zsR^+~zxwop^TbbLMeYYZaE#}^e+{vdpwq`!JjPs2&av~5)u7%_>O0I>(kBw_7X`Ghp+|_<;TnkBE zO0I<@IcBeg#E;o)A(_Y4k#mSpavdb=DY;HbuA{_#N2w*(K~j&B>jKGjN^%_}hoa{a zrJu;@M<}@-lFKQ%UP-Q}#2UBPLsCDI$e!?eNb>N!@_MDX9*X%<201tUG4WTb-4lay zhUorg%<|}5P|gnBub{U?|IYo-;?(je9{-P=8*Ym}&9Uc(HyE)Sjo3{_>}Df&ixJyy z#BMcWw;8e9jo1z&c83wW(}>+=#O^j?_ZYEzjo5ugY^M?1WyJ0`V!Msl9wWBbi0w0C z4;Zlrjo3p*>|rDJh!K0#h&^^pOh%LRzi>2d!)V&T(X>sCrftWWVYXp3HQ;F4=0+2r z1ly)Y(>9Ez1yKf3NRJN{H$btGiW?FNxx)Gt#SO^ci1W$~$e+9u-T;G~8Aw(cQ(@K{ zk#!TZ-k8Xm9bJ(hS6Fx98n%_;rQ#MS zQYm*IzC|f+fno_2GFyg4n;SLzC zrQr@`xWhEu0YfX+;0_r42)P6K{Yv!?uR* zrOCSM-V0O7$Ryvb+zV6kZ1!GNoqHjfPl>D(L&<%R+(gNJkoZ1(A0^hW{O^OL6eagT z;=bi^zpc1WN$!JW0VT3J4JA7vxtWrkO0rY6lRg#Q2}xC=z z1;s5?>{5zdO5v{dE0JALlwmz~CFN7bN~0Tv>&OlKUaqPRadBa=(-0T2F)A z4@p@{?uR6~@4a71?uTR{TSnIYp=38Cw^Fhj5`UN5O^LPs*bPZ}N_Io?#OpOC6ff_7 z>#`dXnPFvw?RMW#-s|2_u^X<%bjkiBcKMOL2d>-b+5?yGyL;GH)*Q13u5xtkfvbA+ z4bPv`*|pW4j7!$9d(`OO1J@$DWDgX({OH~b*X?xeRj$2sS?2_M;i|xT?1d|pDuH=# zuQKh0VhI(p>k1Y7px8mhJ}7*v?W4kKwS7=jWOepIkvzfKrzHCzSxSlQ^+L%5klaDZ z14{D1aoXttNGh=&4?vP9>tfX70Vw1?AbT6xPlkCPMBY1@_d%8SLFTo-hkFotD>Ltd 
zD({0Tuk2aFybmGoUCjFs^7?-H&~YmN5b{=G-iK7)hg4qK7l(NtM&7%b_hFUyVdiz; zjJvD--1RW>R%PCYRo;g)d1Wj-jMiSp)|Q=ksCWd5d#HFsDITH1`sVc!D55`hjM?B3 zrFbNxkk*#newg=B<*L*DzC_c7%4{%oBoDqB8h`nIM zUNmAa8L^j**egctRU`JA5qsT;ye)ea^kIY*8F>Afa=jEiwhlmVexS&f5`B%e4tm}vv)4F@54i@ibSt5EVJBzq}&Qc0eq zB>DZdmplnc9ZH^rB)LC(Qc0eKRsSd*t9sT)aT zZT%D^{_gS=6#iu6DXee(+c=(r>1~>1mX1w+efu;_576|qGCl1~xjBaEX_)HK^t3WP ztxQit@eUPoUkDY?K=B|I&p_c$$yeJY-z!tF?&QSI{#6z@_YccM`7EEEq> z@vKrjdz_x~EEEmcQ=Ww)Id43xB+o+fFy9X<_rFl`93&4@@|==9M~Sr#c@C0>lspFs zM$h-9vC@7{wajynyw94*T{o8aKKnc*k5KZwk~~j|b(;S?B>pu2c}S9{`OmAGJg-jk zpNGqz=F6QwcKLq$0$h*M^@4J}K$kn~a98_L{Q_K#SfLk!wt6AslDnL&Fv8p~BKKp= z{i4eKB6GXbb3N0)=xV~;FCusH8G{#*+t2hbq6#0db!1%=N?w9wKP4|I$xD>DeW8}T zq$Do|l9!a^B_)y7R492FlE*1|SxH`|#5x6g8ImSZO=NC+8InAFHuU98TgY7hG87-O z9uF;UQvoyR6X3)zE`{gMN_uSD?zV#MM+*kZ}sm4eFd(M=#n*L?DDs94?l`iWZ^eSA%Uk2g?->$$s@oj@pg6OyMX zc~eQ=JdWf|NLsO1yvbhS&b!`JiZ`L?#ph3C?-48fm^lQ+GgKUc!mmmWQDL2_AA+I{ z6^E4KkWw5{r*4N}>P?gEePWXzFK;Q+TgvnnP1b48TQId|8@&aSJMVJ$zqgd@EhUj1 zS15TKl4n_+w;}N>lee8@Tc5OC=dSiM%G;2nS)I34b>7Y>WX^lrot2KBkd{6xd%RHb z4iwK(@s3iw;}p3a48=Q8v}HZsfg-swc?S}I@+N!9F!#I2{XBEOt8%}~+}7FkyU3lx z-0vcH^1I@9Gr47Ue-~}hhrL2}u%YBVNM4}iJtcXMy~28q?LA1^QSx3Oc~42+Qxe%X zhm!Xpd6AO$mE?U&tW)9lA!$#^`+?+rC3#;-WVaqlK7iyUN`3`kq;qxnUW8clzr{y}g7V+Pheh5j&NFt-*LrC(x?Vg1H zP$@pdJW?f-L4G&Wmpw+#C_>XmFulSnK2oNSXmZcMYtu){^pP@sq)Z?xRP| zTRCaD&Ry-_t#eq7t;1?;9adxOuo_#3Gh<6k(*I+VUpM>@rq^itA58vk{Xb_i|GM7) zVCuv<=zlQzb@u<1>3>l4wIB#p_gjtP~%!GS=PUV<r*9>87`E32FW2xK2ws< zj0*Yn-Di+wMW;A&*ZK?+|9$3XO7R&K6Il%6|K~ zQ*3g-IsF_8IZY7Z=WdmmDvwpB%;K@a_m(f9c$IBzN7{#MPG z%Jn5AgD8>vS19=kl6NWjN>%79O01Fn6(l_<`3e&JMZ`fBaW40jl6(b;Ukl4!GnV+a z`Wlk=DfwE}G*J^kR?s+Vz-ytD*QB3V)*TwQ8rYVH(Wp$Xz=$eFM{bG<~D0 z^9@bbncO!p^`z+=nEd(nH_G%4Oar1mEqjJ$`4RR#P43$oZTeoBzE`I2 zGbU-P@0CJUX`$i=C=OHcgR0CAR9O4FAE4;N(e(oq$(`m8O7a6F{(MT-hN0v~Nd8C3 zk4o|*CGLGn?rJ|1{s>85N`4F^KPt(ON+K)LQ1TNbA5-#^lKeyoPY%;^wU_(^Nq_d) zpIpm)cq4kmPfGC<6hql(WeprEeum-`Dt=aqpQ*5B;Gd!B$9nt>MRJ$6^c(3drg{RQw9XXH@*E6u(lzm6THa 
z3dI0c;Mbr6zbeVEN+NrTQ1TljpHuQ%LXwsG&HQ%vHy8%e@LR&*R&&28!*4JQV}Fo6 zOlUY!dyE6P97=BlV-!aeo?=gQ@BH2NOq9YJ}Nzsu+RkBk@ zT>GSR(sG@f!TdNm0@q)BM)?R_6Y>A*VXPcct|RK6cm%Eytd#8HLf28azM|_WTz+Of z%6?6@iT0W{=vXcy5f57!MU4OvkcS?WIWvw6nfa`CrD*u2hm8yVR z{h>^Mz~tYU{Rd1VSu5Ge#wLHS{u8EeX!=u`{-nwJe*90E{-NnlW%@H?k{L#J%W+=c zoBu-IADH(qmG>{^_4hrw+V5olLf(Iw_b=qlbH+B*;;&3znPL7yE&Sgj{0p@h&RWQB zJv99d)3-GJ4U_a6S-1QRQ>}7kew{w>NvvD`)?<#p%J^IP{#L%fmG5um`&;?s zw)zLY@96tS)$AYEOnnDR(?81ek23wEO#di_oK?gM-*W##@jVs)W~wA>sehd!H~ISr z-*W#d#lK4NuTuQ0D)X;0$yrHk@|F1yrWg25qyJQ8{$pjVwZnff{l}T;KbVrMnEx^^ zIa@4>zY-$niE@_c-sdO(Tb>@=l-dyee|OF(r-}0KRz)#+mQen7vHblIIct<(@uu>H zXN!4^SoAT1Gst{KEWZ&u!HAt`#7;6|1&mlhBX+V8D`dn98?jT2*r`VBG$U5Th@Ec4 zvW(anMy#k2JJX1rWyFdZv1}t&+=!iR#7Y>kl18kQ5i4!P${Z7u8Cm*YyJ$*vr|^CZ zMblgqeUW1*6?L?BDR+00E3Dtort+N>eH)O9&SPYTOyw((>X1r~o>V^n22aTJeqPg%7lxN<$eS=^r}E~@54E+@ z&I?I2h2XR&FC_j1GjB#ApIt_6_}TLz?<>rkPvy@8flg69+&{@&G=^h9?v$ac5L|E5RR}IWHVe^Z?GXyWRfyFo1XrGw8?i1f zq)dfi^81!TsMT1SpX3R;I!!qN;NWB;!~QS&_sNUyoBEd5@A)Ro^+) zNw@;ml2ai$m6B5-N!~9{g~We*aB9B7a(B(`f;K!AuJLrqswi}w2G{#^ou*u;Iaj*3 zQK8e|I*sjgnrf%hGVLTIT2@|hUf(iBkoN=TEu!)kIZpp5g1kkTw+Qn3zl&K!wMr56 zS^qx1BB+J`yF#+^i%q_bPKW73nod`y(^YTLXA`HxbUIC^JJY_+XoJ(0>2xS2utj98 z87q8cvY_~giYzGnXwIU-tt_-6ODVFHB1)W?8p*WL@Go9iE_l?e(N^vF>Q>c&~M5s6mihfj_ zl~81*&NA=AXCePt%zu{4&+j15LVkY_miyhP>HP zURm)ML*C^5zgQ->v}!SYcK^mU*%gJ7Y)A%BlC31!><4bm=TC#&do!~kDNadtAj!^1 zWZsh9SD3pvat~zg;wpD>=B}NV>vZno$X$%NiwC)jXL3swilYkt?H{t|3ngbmGKiA1 zmE>%;M<+~9%lwY8-1!{pXn=S#L647 z3P!A=5vydxDjTsXMy#q4t7gQi8?hQjtfmpGWyES5u{uVqt`Vze#OfQd21cx*5o=_` z8XK`DMy#n3Yi7in8?hEftfdiaWyD$=u{OuVWWJI9*De}8`rNH7M%4(8snP_auy|^sKR`OSyWi#VTa!WrCbC*Nzk<48Vx!sH}S6F9$<&d>3M|3$g zzm&_&FOpG4QJAqjGLB-#@`+C?S6CU#BV##cEUz+_R~coDh8Zg$<7j5ApfXl4Ggd&x z^2}Jl&02h3r-I5TGeDTJA~KF)#)^rI*{O|0YY@KSzUx_`m8uBMkzp5!iH5g{F5^~oH4b@?oL__t2 zAv;xF&wi>B)m0^`t4dT?it4Hoa-R$pHK3SGMGd8>L516mx}AW3-+B$DsG$@!l%j@G z$elS<)P!OR6*X0_s7Xamzxs6|$)mf0(6A%7K) zjan{0-;-Gj`TfakE#&tnv$F08^VdfHY0O_+<*#k#uZ{dwnZGvkyWdIH&U}8kC&{WN 
z%vc8*r!!-nM8@ps3U>##zCEskywy1x>LBl-CucOwJ>q`(R!4pII{55a*h3H*WQjyB2fTNABeOQ$O>GWrtIrC#apU&yp2r zEb(`!29V6AqyZ%Ue$YUTe0R0KS2uv9HYE)pN&Ygmfol5(>XfbldV+sPgsg%?S3|hw z(A6;E%1Sjf+od5yb=VUcvM0EmUqhv7sP38#p_#>5TGrp8rV%uAscEFzs}ZYIJ}uX= zO1x_}f~h`DjbO@C&^@iwDAQK5nrQ^nKz>uw2&UOI$@)Jw`8#=InC8*cSeY6-lX?!t z|DK_-GBsAF#>&)KDP+eHDw;qspNb|*(S!=?w4ezTby=AvPH~}oJ7yE5Xaa@*RkQ4I zVukO;O`%voMbkuY$xbz8`?%j%>yxaesuoREEt)DtQ`J6ARb^z46`OqfG=pg&P0dtg zn$e`r7*lSI(+sA1tV}a!;+a7+Woib6e>;Kf>Ow_xC{|L@Tq&AUQ9do#@nnlVw>cEm zsA%pKeRpDBZLSo}p_s$ID*MP#(E^GURJ2fv7F4*F(TWyI(LyO&C`AjUkezF&XbHtK zDq1QjnEtR6BQnXYG*(-;NR!}Ubq7@W=#%)DKMMKd_DOxE-E2U_q6tX)H z6|JFILPcw(XibG%@#z+6trV@5qP0@ARth-}2o-IhSV~13)go=EsA<%qjZ(BxiZ)8o zCZmv(w7fV;TOOT>$^WcP)G;wx z$4mcf7nM$*q@^*M7DaJcQKZ#qN^>;1d$F5qvpLtN)o4nq(UexBDV-TjqL3aRD%wJ^ zn2NS)W@+mbe4;v6b+R4sC-=#rM}+|_`>AUlH4edv`RZAX~$Nvp7&`7NkdB7#gYfyFC5z`Njpfs zqeSMDP|_Zfb17+`kYuIWn``R!Ff^i}eQYT14DB-p=?^kvg;_fw>v_!DA(1sZy29-) z8yYRq0l6D9cL(IgFO%cpsvR>5_n*0ERt968)Dfnc(Uam*9hs*?Qzw{K(bP$qIJY5qKi}9=HBz%C8Llwko#Sjw=42qz`R{m-mc7RJ$Km^ zd0RwzWv=LoyvaRqSLBwnAIUBE)i8HAuH5{ zZ(nl6{jJJw>J!VIJIvZ0SubMN?ka0{Gi!HbZN;qJgRI?ER#_W_S$iPsT4wE$$eNw% zq58W1#66I^b(CAiSP$2}{JW(+R9;zs#CiShuqX0f%)C8S`}S1*O6TpV^7d4DduH;= z80(4Mq5ms^vf_yqem~d?ic6^Ir9OQxDy(_17Zh#y>3c!pfBn=eqmUjZtEw<>Z{)p{ zd3&q8y$$<+$eU)1_eS32vo5`n+dr+>n`ce#JAGNpg_1sytfQollJud3X9a1w z+V}WAkhG-0$IuUepg&;tE45*ZEsF&gAoc+x9n9Ue>kXcz#=21*7b zByxrIJ?H=!+DFxpPdfkx|Fq8Xx#ekRQOLd`R1AV*6BUCJ3c14i+=Gz6BYVLh-$i1~?!AZsU9d}H; z57egT^>2BTvnzLE<(?8-7(Fc}f5SnZ2y<_FlPAI!M*m*^_Jez-%>8|b(b15kMr?u+n`p!)8L`PmY>E+^YQ&})vFS!^h7p@-#AX?>*+y)R5u0no<{7d1 zMr?r*TWG`<8L`DiY>5$DYQ&ZqvE@c=g%Mk6#LhV;CZkyTU%P13=u@rH7){r5G>yh+ z@^1|o&C$diGOqUbuF)7xJ)+Sh_pZ?xP08z|iM(=!wPqNLj6IogEHe7vij74^ zzh01W8)x)0*f?a|#*E_<8M9O4_(}O>4zBj!EsjIp-po7BebNT*S6JgRpHybBaq6Ba zGexKv55)~sj8}^BY94i~Rj(MY6yud*yi$x;3YnW?g};kTfZ|3fCP3l)-vp;Hf1x-5 ziaxBx1gEIF3bXlyj6zyn=D#rSMC84Rc_*s86OYr|CL(WXVq)S}#pP-oE_2lU!cjttY9xGLOf3eVr#G?=8$bIq`Y3Qj^VI 
zJ{cMNG2>)p^u2sCGWuRFcZoQo|D;opaXT|kNqo|*)D$!06lCllWt2H(3Nj{7ho@vd zr}XhDnY*;KvfPzoga6c1VYrossfkaW9bMs0N3Gw#PK9IuKl@Zy?JqAtD^JZ=F#6@( zf#};6QB2L4q(4k`^IU2q<~g}5#wK5vX)xVJ)3k&sD>com%QR>PQZvnIw!4+uv`l`{ zOj9+HJ8+!e*JL{K-_HEg6Zx}K(^(ViH=xs@7{u0@4n^|BVmc&#jVyQiF!v1P-oe~6 z61lTdGtAFE16c<%>kMQ~-e+cHKELz=SxbajXCmtz%sLZU{l0vr8gqJIJ`=fzucE-aDChmdZPedEK5^=beSTLz#D$$~#Ntl{HbAcQ*3g#k{jsm1mn( zo{fydSmoJ4m1nE0vg(Sn`hGnJS?^}nIf<;dgSrWD3;RPxC+OF*7 zd`j(@&vl*WuOYDxh+TzPCqYcn760@7pgQ2bC~2<)E?T$j#MM!3o+pRw55Vpw><<+C z17W)prtWG#m;3w?`~*mV`VuCS9&`gwL;u&W8XF4#C_d7b-G zZe7LZd4HPiYkobjdkDK8*sg!ci0YZ8To2TjrIhP|>c`f4AoAGCwX7cMJ3pJuWzK7i zkY6ACp2Dx6NIPF)&0y<;|B9r#KKK}EZ*X@W>&K~P5B1@JpXp~0yrxNe@HPEIc<3b_ zK1_H>(G{-Vxp!c?t9|}|2q&+KlMms-Um6YI zq>nghpqw-iC)R#Q130NJWzqmnvdKE4LF|G(HBfyJuierve10^9i@xHbp>oksTv)pl z4dLQ7$&ZF`k?eyS#!fgt8oK<*dIkBxE5^`CBRJ_NP8umEjhqwneFBZ(3 zUJHlZ#^ClAZezu5EL>}3Y7A}-$$`e;Ci}$3ip^{8G~3t4CSboN>?VrcMA$_$`8vJI zY6A9~!fpa~$qLgq+;+!#$o(cUn`>l~xJL4h)tlI{;xCNyUX=U7DF40%?ObKi&J^!i z)yF9;K8?luR=iuq-!pJ;7~`ESev?=hrRdZhtENV*nGtJl#9A1!mPV|V5o>M4+8D8q zjaXYF*3O8vH)0)(SVtrFi4p5$#5xt)1x8?in{tgjL4 zXT`KwPTmkFO;y`&>YQX*^PQ$}@|N^xP2q%$V!Uq}yI?N2tdQFb z+#14drnt?7Yn^#&2JYL!Z3eC%;hM!<9#@*lxKahZIk)A|MRT}#OI$QpE}Ba@Sf^T= ztJF7Fsc#;;;M8+}k>>jHZvpO`!fgSrFaH+8RXZqh4zGnO{}w^{w}{!?qqb1x&%IRI zi7)?_aPqb|X_?4@jHso|yJTf29$Lb~J5mBIT?srk5IuU!n9phFJ}@M<0`VOowo0U( zuW&OxO>CuztrW49B61HK5?h1#t`J+RG`2R0trfAgBDPjU?!QA~8xU&>v5iV&87UIFZIw^s&?^>rU3k;Jzzk^vB>P_a;78Y#xQuY+sw(g8iPb z+bVWjVOwpvE!Z`M-4^WR+@x*H=Gxp=)n*?1(oTGBZU-l|#7R5lq@C5SFM1JA&=c zOm>X9T=E@N$@45F?ZTJ*CvZ_$TzsNjd?GHa6MCP(MICYR30y>3Zr%Jz?1D?3XF?&j z6S(z++evXdshoFL`!!=Ha6gco?*#71aW6Mnvcj!iJ1I8LxI%VkuFS)AYs0Q^@|HO13Ma|lr)%tlOP}YhA-fyc4TarJvAanw zSS#diV81QwZs`&z>7I-2rr10`PP2UpbO*bUu)Blp=Mde6?N(*(YCnhU4)#03?hba# zMzcphyWk>>;oZUZdzIZ)+v8bz=%fdnd?ZeKs9fkFC6IiM%{%F#ob(8s^iWQEC?~uY zNIUU$p(mU)7AHLuPBNmNGVZ$Td^_w35ARBD^i0=+t*#H~8GGPbz^jdr+6&YsLhY5H zrsxXv{8`qinOMDT;CJzQgYWN<@d_#=_5rb}5c?=%ACuSz#P@~RM~LoO^gbZ^cUSa5iTmAYUah71 
zz7_NZznSp+g75b&`zcXcn*+OzBr7xl;mj{*JR;=LF1{9Wcz zcZ#lm?1bw@e_2bplcT)$4V`?doP4UBd@B9XU738HyV|$FPvNAlIQbM#l6Oo$jq3!r zKwe{~*}fKh26juyh0hXfzQWo){tVRmQVO4`Qur)RJMTnQiFcyh87cQ(Mk`Tydbr~%}>;3-eEeQv}C8nG{o*q26ZkP#bf#D*BLuZ-AG zBlfis8)n3Y8?g~aY@`wU#)y4u#J)3Pqm0;SBR0l}jWuH9jM(=^Y`hWs!HE56#C|ej zKO3=MjMxMt_UkDzZaG~4s#4q2XQu|B<+PHPGeEVR0n&2xnqdH1&WF-o2cYHP`*oY$ zS^EKKJ^mg#*YJ?~IjF6L`Z=h+*Z$n3ey*sW2h`6MmD^BA9SCY0p$=4O9cWSqg4#fO ziGiw@7#Q~woV(mYL*f@8ek{Z<6!8m__yvd!h4_UcexZom{zKxIAhs3amx}nM5Z!gY z|NRogMne1&#B7z_$&4=*k$abrI0(dcLL3C5pN9^TeqZLmnS8b94g$Baa0fYVhI=M! z5V$-K;JoD?EaVObx4m!&CsLZCE8Kcp?vW|(U~roVcd+6PR$T5S(_BB+3<0-;aEGY$ z4iRp0-tE8cG6dX@ggeA>uW_fYhQwT+cMQR}&tH`0)N{Way7&q%I*N<0RO-JH7w&$L z-+6XVDt@J0e5G7`rCfZaT<}N`x)=%<%j6u(~jG1CMbbdl=X& zggs1^!!TjHxxaVd%3&DTO@%!S?Bwcmm}2u-8nTCjy;9i2Rl0`@yGkZs=dSiOakxtN z@F3m86`RNGkUawIRl**TNOwjw!t7^8fZ9w_JOb3@uEPjW{kXxif;82)y^)}<7V5|Z zH5HAN6kAW|j0C&6ut%nITUL1^aUO1R{O|HtS^NrkBx)dkC5vkT&rH(3d=2~tzShX~ z-z0qT71sRo8#rn~j(C3k4IE{Yckq6ra)ReQA@N%f*9!4lRrcSyoXDz@$=A87eXsj1 zxGjbIt>Yf-fcpNeDtDe+h1~DJT_@b{R7$@yQ~Di+EfVxqrV^lehks5F9FpUAbwUqN1u>CV>V`46+eGJm>_tU zJ{B%EiHor+?PJZfj|IPt@W%?@tu4ob@AnmWew`-zbdLjZvk=E6h^c6tq}w_HGfw6C zxFFBRskDzn+8>a%&9nNn6QB0);be89v2-)Mo-YV?ziaq{c*yF)&E4e=&Y=4(@yyEf-CFK49?l$56ptwK$ORf6>+;+nK zL2-XjTwVpGxxUnY1b4e|e^lHb|AqS_xb21eqvJ|%`Xji$H~$eOewUOuue;JNe2M=A z7dynoPYD+p(N9v_^{nbA@HMx4=OF;caQF(P5QYV1ATc{Hhb%IHqpr{i9>I6mQ z^>0Z171TXK{Z&za6{@?gsNC!RRZ)KpsK3Tk-YHsJjGwZrOS?aAkBCnwxi><&^GS8- zjZw95YN-YNKmHx;?*HPe_$TiaO(gfHo>`h?#C|hkla1IEBR18D{cgnmFk;h;*mNWI zrxE+hh|Mr!GmY5aMr@W5n{CAA7_qrVY@QLDZ^RZDv4uu#kr7*L#FiMbrABO-5nFD= zRv59BMr_q7F&>M!{#B)Uquwc+h}N@LTF*qZ9{-fnMCqHYyZjT;hB`{$JP~ck-{+ke zw;$$m9S^yaz}+X@Ns2qkaWkzQ?n&T&BHT&fMibqM+)3c_n835|NwT~B2JWnIs|sEG z1{eFq#c!(b_)Yo_y-)TV_??9Ro8w29VLbQ^d_M!?HX9NrgLpuQlN0IAh$frF$%;5x z5hp7m_Xr_z3Wx`VI3+<$MN_1Nt7P(Z?rOiLm;$ap3pPc#?kw08Rl?lsgxsm%9un?U z#hvQ7S;_l{o;wxXCnSfbf}2ggCO=hixn~Qxzk_>NxW6mz?~-2YZ9>0;+gZ53OM2aD zjo%fQd(V*j2e?Os`vM%RO|+od)hv;Z9TBX_C*@ 
z9hzz2b`|b4Nw1sBPg7hT1w!t0aE}Rhy2|J2lHMwre4TDV)4}Z~-070uUT8tnV=k{@ zrejv;?-cU56T0{lF8&c0f2!2~DK6Y;3GL!f<>F7};!ow`PvwF~y0i;F!~F{`j*E-G zR9^fgF5EkQw2QysqPyh9U(Ur8w@dq%a`6{j_%{vl*qV0XXSg%q;)J-Ek#La_%}_HT z-_~Y;--GzPS2hFuRH#a*@`_oV9!=;o>PVF zIbhR&B5s*;66wyMKUq0F2h`q@;yIuu>(-o@%A@rhH(FP8&x`OJF7z-L9&(C@xd{)c zXs%0tmh~pBxp2})%6u-I_+68^u?wzcJWot>ee0eFZZ6@@Q`~uy@9JEyoB^B%ZeQWf z1J{2ecV5ip{G6v+H_uGdE_~~r4;Q(`#eC&rzNFrIv*moa=qKecA1;#5k;+&a_rn*&Y|e!R$OXSA%k%Eg$wD~ED^3=w zTv#YhtoiCfIQdkZEQAwxGQyqBTBw{XROP|5{j?Kb9*f{4pEy~RaFP)%GF!wVc=${_ zEP{vRp6R041D6D^Cqn9CQ1c6Qv7#Uec3Mov49Yl zB+5P`T4EBHfcUu(mw@O`A1wiqM-(pcB`TkJWfbz4f?rVhOA{$hMN6f`t@+$ixELt8 zy)<3oo!uF*rE%KX$x=A+`=z`dOFQu;z6?$ZiIZi@$ueg%d6s$x(d`{LS2=h zrlM6+s@6{MDzJx0YFB~nSHP=cF7F^6#SRkh8u1Pi?+m%0ZWoE)$mE`N<99N(!igh3 zYs4pw_(?hbo!dcLP5w{aL0V(P)*7*OMr^$i+hD{t8nI1AY_k#DV#Kx@v28|dyAj)A z#C967T}EuT5!++L_8PH$Mr^+kJ7B~P8nHu0?646#V#JObv13N;A0u|$h@CKECr^oS zKf(2{Dy`@IE_pWe;j7VpGNk>iPP8As!s;hhqvd=ht#&n94rT>%-*z=xkKfy0t;Pth z?`aRdXI=vj#l^!Kc<}R+HLeY3S!d?gz{yZ)!)sKJwI=Sdc-&kgyGajWH;G$V+Jzq} z*TO{!aj`bxA|qOB=EPd?zZU*l*N4a{owYHa$EUT(1%GOlTXUN4b739$C569EmBc!e zzYhFiQWEQel30g3=u>2Ew=OOht_|FKgdWzzLn-mFKH(unSGZHt*1q0)I2kTZ*2771 zZnR#pxt9vr8^A6t>PR5Co-Ee|8brg2b!|jfp zaQb;JmS+1L-vjp9!rr6Ud;Voa*#q|XlJ9#Y-`)Q99>wMPUdY`G?m5EUt4eyWneTf+ z952MZinv!1c{Ujm_knn>5cegBsc4_%vRmou{k?tQ{vf%$58UM5;y$qb_{Fo;l_oYYCm-@Rjye3P#@cr2_xTq{HjwM`VM90i@9|M1q zSaRxCqYfo74BTUwTt!-*uM$;AFz}2@_!VYSC=9CIM|m9 z`?z8s|Cf9}4)$bW9}n2a6`NPFA^QZ_mk9fWVxJJUb$99n*uM(s2OTYKesPXSO&Tm-a=h%23 z$vwm7-UH58@X4fzo=G@$Un!dr%WlLd3xjKO8nIkPEVmKMW5n_rv3y1>zY!~7#0na* zLPo5x5i4TEic;*vV{FY3=qr&w)?ZNle+2G+S@sJH3{AihtsySa_o$1I1C%=;u9u2d>Nj7;) zZ?>HLmfoqdr;-hxcy8wYPkcx05%)H7Zf|K%K3}rK)79cByYiG>Jf$-EI(M~yE+#uX z{UN06&QoLej_&O6^t*V<9(&?3Ej#kbpQh&a9(u|FPuGa29Eq|@MLEP%@(!u@ltX#S zp*-bKo^mKpIg}^vZ_=K8+2w?%YsFK}L_TFiInDZ$6Kx|B;Umj`m*DHn*o#^zErmV3`M-={h^_}2?Rw@P(x zlb>7hb1Q!Cn9nKZzBwf30r3VQ=265vWCSm7Gurs23W{UHHIz!4hAE9`o`#Q{4tXLl8Rqa@k=T` z&*js6U+Sg6zeo6`5~ZGsN=ePZg&F8EAQ`lwTVzJ~v8MxTC_S&)&8@e`7MmZ^?obdW9 
zbW#>h9uOyGRaul3C&{zLegrP7oRkfmlvPg3Dkr?^3!RjMlLy5~Ih7~n#7T0G*gGkw zoRkZklv7U1DJQ(vOgr)IqCA{DBu>h!JSp#-m~V$G4<}2cbjrg?a!e`@Cw@%gHEqbQ z0QSSeu8?47L>0_Fp#rE&g<1jBWUHv4sJvKULWNBuM6 zT<1}r*X1GgOi&*c>Y0fW=PRr=_?e(CCn}G6XM*ZSy)!}Nv6pwR>dNj_H+uJXH`M(iRZcCit=#E4yL#3~!H%Z%9N6gzeI?+PP!r4hTzh+S>Ot}$ZQ z8nNq)*!4#21|xQ(5xdEV-E72eF=DqGvD;3Gam(TQ7q*;=XgN*k^a8K3DysHc(QG*t z(Q;Nud#xDs5*1Z1!8JTh^<(E*pf(rkSqW+?IxDB!ueu)SRVyXl_@blS**XOq^6o^c<?N&K4)u`OdT9WR>(}XTym<-+8tw3+^#O?m6JL6z(~yEY1{h})SEcz}VO#UYbHQFCX+GDbS#}N1jk#P0&PC4qw-|C? z7`iwQE?SF=^OTG8#D(>?hx6cKt>nOYaDkCazW8&Va&jJ=_){C)1E-z%I&eOmv=Jxg ztFky>oLG1M&WDqA;^h3m$@#Gp?g_Z357`%h{jsnwQ0xmFJ4?QPj;sA0*9*X2FJ*B7 z*!~^H7sOmHiwklVq8;VO>8(-hg2$G$3tuBHgp0P~;zH%(LUCa|jd>wlY!DY0!i8@I z7b+JQq8#od0=wYREbYRV!$ok>PF!51TwL@o)aXRh9m%}A+(m|YD z0w?~Y>Lro`*2?M`<^5vj*7V71R zQqG7jH`95!O6TP&otG;j&&|?AU!qrl*hPp}sB~Uort=CAw@W&&05Q2!c7>wy>@TEV z32IlNUa6>8n$#;n-67?9rKHpCOkAmmJhu#qSAp0~h*v4%RVMK&5O)gkDn-0X5qahs z60ZiayAZEV5L40B(qgS=->wFCmvFBJH`{#oRqw08_3KQY9f#a&!0jR2Yryq;&DWSI zy#~bHLcB($^cs~?o_nW>zSdp~VoxDntBBW{#A`v^BgAVJ@!FWkz0$R6zQFVRkbfQc zy@Y?A;$LU-uLFNC@wu&D2Y&L7?{%>U?jd+35mK)QwYN~OPf$~Ig_|>5J@WNn?~}A& z@5)!^O4rBb%W1zJY4`h0ymkp)+yEDS#KjG8;qEkNL^qgezXAOH!oNY~{S9%tnSTTL z{tgeXlS2ND;P(~&jfq^yh;B6bH-dja^8QBfll|n4%EOInRN{46=;0=K=qDa-O5{N* zx=Bi+QYK&LuJ-#&H^Iq4adH!!RJ?u9D>I(Yhmq+fRT?)bC%hU=JMq2g&2Z9RoZPIO z+$>J4`NGX`Qk1Ua`M}MtPV{zf1HCzR;r`Ce6{=&d!0XS@#Vv60skpdBxwu7KB)<{t z^Wzq{I3)RT3tZsT?ceVH>MgMoZWX-F4cWJX{h6?DO|bb2Yv<=yP!CIeycN_a)4c)y zRz>7hbV$4n!~sIQ4MaaPx=m7ReS_~daF0l8Zv!{ETDUD{^R826>^ku&D1Nri?I-bn z-CMxt?0k*D|Gy<9-o=Yq%@qm({sBKt762e8nHW!*quh~E+clg z5xd8T-D|||Gh+7}u?LLUgGTHjBlfTndxT;IPT944)QCN1#2z}ezR zj1ha*i2cWiJ!i!JYs8*6VlNo67f*?CtL6F^wxLY4q0glaWhPo}imq^DnbmJ)q75CD zHk28(q0G1qF`Mgon(b!~w}U-U*taY8?ZUR6zPcUkW72Qj4z@pCdwa~~aqM>V-2Qzq z+_KUxe9v74F1`>KRg{Y=;=+1IL>0LBM_g2KE@ZY=C3e9*cNMqF%X|uBOBJlWpOdvW zx9reURe1VRJXKXaTUGI7%^<77({b@s)p?S4X;+Osaf#eva_<0lkZ|u%+&lgyFYW;M zgmCXr+&dJPd#EtIcY-@uxOb{Dxbt7QcY=FTxOXbm*>DC_epTS6Yi6W`=oHKFXTQ6?#1$5#V1{Qqf60_ 
zo>W|(XNTOUz#S#rrxM(Z=qYogdJ4owlG3MKN^f-E6L|_mzpuix_>lNCh@*x0bb^?Q zo|cqav(l%*{fM}{v-C8$$uk>ItJLypA!I)T_7B2-MzNm}_U7C8I@f3U_4_kmH!!J97CM#pM-A$bA;vABFoYxV|<&D_lP+IlAj5pHV(uRBBP3!wfg)E5%eRP=(BsMSBc0CqD;?F%loQ{4T!7ZjJ*wjuXLa3>1)Ma6wl zxYlzwFM``rxGyU1i!qn?fhOXeOuX~cjdpqXCqE6xJ3ap0Ozzk8fB6|Y_a}afQ(gK! z?*qL={!e}OtC|se*@(Sj#9lRG)s5I|M(lMX_J$FwVZ`1vVs9C-w~g34M(kZ9R?~>R zN3l~+AH8qHY8kQGMy!qz`@o3RHDdLQSbZb*p%H6f#2Om0Mn>!-Bi7i6H8~~5^Hr{Y zVf%Rr?Prp-pO;koc}d!jbx-3Zw4YYeeqKWR@o&m}DQ-W^aDNkSHN~x_ z+O0l+R1MtL!mZ}ot@Mu7VlM9iRl^>TKP%6zDs=HOTuc@hFDG1NL@!HUsYl$G!EZx+ z9_?QS|C|dNxBua}YccA+3=cfhU=J_DgP(VBD-J!p0uNKzXjxDo3vdj$TzBUR55r&q{mn zI~CR8;dk*+U3sXkJw$F!QXL*XmXfIs5BQGyUG82+b$IYQ72GR^)Ym}$L#VGM>O(4e zO=_j}1jlP&w-xqlU?)e=*TD97ak=*m*{_2=P1vs|Qk;rj7q+z{`#RX|g#9|$$@%B& zip{-z$bJLt>B4?PrTYzGTeJK(z-}+>Hv;w>ip`@*$gTnQpTe$@U}r=%%#yAFY6qd# z2&gp_mB*`)`X;D<3H8kcH5I)nDR%P)-T%A^c1L3K{=l1H`#tD46_>}vkoy+6Glct= zD&@C?Yn``y3*1kH`*!zWx?p?<9O>MDLiT{|+2=7Dw*{rTn_g%1?ODVkTNPb@ip#U4koz9EbAlelN)7_Y{@qVIlQ>Q0EHu zeNg?#_P$AdAJncweIHcRwi{d@{=TB}OfRI?0(G8HYpJx>GO4vd?IzS(peE1m)dJOz z;yjlOskK3!FVxzKTHB=72DOKjX>Cyb463$DC(l$vVjU0{2(eCrn2PF1U9!$;)&aM> zq_$2_vUL=jXT%}<1F#ng`vb-PK-ku5?*p)V3j2d}`Id3z`+;Kf%sXV)1$&XO>ne6# zVV{%9*Xfxb+p6S1BR)LvWW0_d~_~P`K8P@`sB1q2hk1xV#by zxedTwCfo*!+d#M#Gx<85&kYo}f#Nn$TwZO3+=k#T7j8pv{a#o@$5mgU@jJT>6}O?{ zHjKGETWIL+TfA2Xcd2-N7rJN!7c0a?Bjuuzq~2POHG&I&-=dLoA@?mB!G)iF@)|Pa zegy7H;eMoY{Ub@e^_GN>!0j!q>LbUMHg%NMo!u{F#4V7pFb=IY<+DSS=o!B+{M{H8DLjfrq}5 z>LxDLvb)_RPBrfy9gN@T-6HA|@sn!)TX4KX)C;>q{N`@{>E7*~-bZpfOf9f`lts=@ z-Ti24#F`ng=0>cA5o>A0S{bp{My!ny``C!JHDc|ISbHPZ!H9JtV!t8nIqRthW*CW5oIzv3^FZzY+V?h<$cSjOWo@|H77&g_g5MT27W~ zy;)|<$x2GjwP+igHi)~z-%mB-~aYG;&3inIqmE`JOUTgAi2D(^p*ytlq% z^f8=#E>1oUoO~QR;abikQpj!#_BLU+RqVFHzC4q!b65Kvg0^4}6n5Ky-Bz)A|hc*fcT~4a|aNUYxWMH`ZYVx z5<+T6P)}e*Uhp!sZ!;cR7mX%>RzFCR@BZW zwKJ$+Njf{bbPjjl-0KXYU&Zh|EhKgUai0*oB#5b~i!0GgYfq#LxI=~81zf)u(2@nv|1OxW;13smSC{UKe?pDw8uPjR@Vql5 zb_4N%5W6LaDZ0YViZ0FM>om8U;&xNqZZVhB-A!gik09MV(+yp8hl_*aqB~sp{l4yI 
zy1OfWcg61>^Eur-j}D1FKs+SG9*Wq*B=%6m9*WpQ5qXv#5_^KUK;CoGQxSWb#GZ=S zQxSVABCi2LVlNOE3bB_W_A-gR6tR~g_EJP%VT8oqATAPOZxHsLo$*%zhpf#ZVsoikD{uzMQG(uX_0N0wPxbu7i;?~4muHzy1 zb8uG)_wxjouaG^BOuo(&KL>H7^c|mrSoi&Mg;tNa27Slpp!$^nx1x|b5Y*K|9jK@S zP3k~J9T-ptDk`_rkopCvYlQlRqJCjgzfjaK0_qov$~{4v>ifwrL0u=*FG2P9Ilgoy z8=o@ftNmHyFTwssO7=@vvZ+4E<1cXvU^@LB_mf}7FUh-+M@-$dH`8!a_d>X7g4OX58 zD^G)!r@_h-_r#&6A@H<8JPk?Im{c@GJXtH&A@KCAl-&?`ilVboc0-hlA#ia!&8WGJ za^IhJ;roWK;9{e=_)59>N?ce^PJRU!-$_gT3NDg6L|?^Dn9XBM$Q}yzCSea%?4gdG zrD{s#PKONzdz7$;2JE3Rn^!wSWwrA@Ry#bhg-*VPlg;Af>x7ex=xg74ah>m}zJ`a< zQVL(YnsJWn8@~qM-+|_FGb9cJaf=X#DdI4bI1I!wlImd~CeMZqi&M=x&LeY39S-VN zp$=En;U;xBsADCa!$I}CQNvX_dE8GEeGfGP#BD+xkszj`5mLXBPmuZi9Rcn*;f@fl ztK}o&lyaXmLe+AfX{25FT0Rmkwu_6A2^SgBNHg6d!7ne*yo_|^+p8y9*~pmBInDE! zG|{K~8xVI0@f#5ReEl1f_zj5POZk2S;w_iV@3XOjdr!_cDz|y2l&1Q0ehcbOp?<5< z`K?L)7S!=V{T9?@JN{OsmFHq1^*c~^3H3Wg{m!I*2kNXG?&R%v0rfjY<=I|H9R=!c zp^i#WQ_(1?S=Qb4QDDy&_9(EEyN#p3_HzuLRfg=*VDAz3=ma|>8f~U{G^lfgIyy-4 zXq94~w}#X)pzamwm;^Nyjgb_qdkE5BjsbhFu*ZO%>^a6LHqVbk_E@m@345$!j}^AH zA{z_#JYsWSJQi&KCb_YS%X9EF*SDi_;I0+!IB@+8XPj`YGnV7PoiAw~2k!IlOg%B` zn0q(LxR}j-$T;*NkIG2NGyk*`-?qMolY`>qdzAyA^*NDwCV)KpI0wcE2h%GW=i;dV4Bev9t zEi+=vjo1n!w$g~LIwi)V57)o2@g=RHun=DdkWauggqs}&WNU%r7#85!?#pWI_Wd9B}eXbL^{d)|LJJmFBR||KVbhubU*KrM>9+da=*a`P+JR*hcKf%r;>^~LzPsh%*-iG`q*lXkK z`0J9~j=3EEbbC#YyobFe9`!=*U*P7I>;F>Rza-68-|`o@>*zX8^It)2`%9&nN7s-& z1MGalo}o&8hOn(Q^9-=p({-H68LrePbj;*(o}surf~UE@w#@`Lzi?-&G|v>S^`5Mm z;BFA^Ovjz#cG+geTy`-Nwaq`D#&d?W3t!v*hKmB?;&0{RZ*h_QUZw9(|Avc=;^J@T zLgs>h$1b?0@w_DD&H}ffaA&DppCw$kqH$OIUHVzzZW8V+$9=XFa($NCeV(OU@JuLl zF&i!liHq6F#cXlmcH;EO%h}4sY~^CMaxq)E;JH`Yg|7*7;G(d&n4`*Ij<~SS;?04J z&2$~t{yA`w9Mk5+Y_1J+P#gTaZ+XTSI++V6Ma0Qm% z<}Ltthj14-uDq#nLCoc{Uw~Zq_o#Swo_67LeIZ;F7Z(eai-qFCy7RCQF1FKkTniV% z1!kI+-5%}2*a_!6uL9C+p971)E+OniioNJxN_-L6JB7U{U@wZP3`l#cPcJ{E1r*V{wzWM z_#Ho9bERGR{8HCJI@UWMz;FK?ghiGCM^w>*dK3~r(x{$aG z#FP-1DdIAfYEN7S;yxiRQ^aM8$m_+BxE#bXLR_wh%T3~P5O)i4If(v?d&?D(SEFg7 
zFY6T`mKEX(5dF+^h15DX0_bsQg(}|_LHVwT*<8LWP`>_bG_QfvPJH>Ugp+dOWTi^` zN;B;%;bA{r!8LOwJS1!WN_gZ%0wa@4F`@$R`#P-llxQmqt+O)wMJ~6 z5nFG>HW;yuMr@N2+ib+P7_qHJY?~3=Zp3yNv7JV2ml4}-#P%4my+&-G5!-LX4j8e6 zM(mIgJ8Z;`7_p;9?3fYz$A}#_VkeB)$x~w7LvsB~w;tbntVZjpAgyP0qW9n{+?gWX zC$CnmXLZneR>!S}`;FD;H>%3G$@M+$!S@?$;NcAMuqM%hQ_&jf!K_`1HE?o}uH(^R z4V?IUVQXR+oOW(gA$KjfX9{<1g3DJ}&xoxB@sJSLg6Q|+)?)7dC*Aqwbgu>9@62)g z4f*T9uPFR=ioed}uLJ)uUBUh3I`ES_Bp?wBsOuBdRJ7jZc9yjZwjS)G z!d{;!clUmR^)Z`EW4+8$?#0TT`=!vy1~{oCPBthf8>HN=6~hKNIU-Ir1Wq=@PPpv3 z{|nh0!9H8q8xw54!YccXpdJ(IMo^RYxi^C9_h-0=O;deq+XU)4Lfw?0rlL)fVrzA> z3GBDz^u#8xc_qO$ZBv|D_a{D4=GREvr>9-`_O=-=&J`D%6D~5M&1Tv+gZ~d*!S!x) zkoL_g?L2CvsXpynKs{fmTN2b%v_;bH-qoPzDOk;;cd#vw!q0YD zl0DLPIPo($9{u4zZi=9;ksx!_v{ z&pJXUJK^MFak5i6+3B2QJ#6H{PB@9^-{ih;C!AzE*a7u(r*g4Vx!`$D+J!HRU2t)U zxY(5_i;QTOv`<|#cY&W>_`AT5`^ZEI2W1fJdU)6J<7!%kz~0yemjTa4L+(CsFBk4U zmHK_ctzwkHKE>Uqxcd~BXS^YIKe$&2cfU&Se&M?LfG+#}io0KN_bV>XszdGpaIX~Z zfkb*Uq66kAbO1!MgjRMy%GtH`1B%FV_KH$4}*J+a1Sf);eX*C1~x6q0TtBWK zmGsIuB6N5D`6#$~g?m)ESAu&~ae18O4I^d1wgyBi}XtRlxf25vs#9#h<7 zipy)kkoym~HwgD1#r;RPviGmj`;SWRKZ^U0;_`Yk+7IHeyALSW$|dx@%X=h-DbD;zq255i4oLN*S@zMl5B-${4Y- zMy#9>D{sUq7_l=>iSfLK>sr`eqFmW2dW*D|h>l29jeM^h(V)s#Sa+zRTzP0ETS(eV zlq+v^XOwI&Q7(T{fZ1H9Lv}W>Zxwbnu-((zDZ0Y3tsWyA*oB3i4eaEqCYxe&+X>m( z!M;t{*%dqczp%4|U4+;?Z_W;O6uD6_d(7n(P4-;;#qvy|a}IE84PE4bi%fBmBjJLt zu=Y7}fL~P7o&)^ko_vnj15>#_2&p+iyg#Gnma*FMY&xWXIeX#xxp?jsm(2A>}ImL zV=k9}?p*E*r?s$R;T|;Y!jFr2;NlK(kq0h(3(F%l&3dy(9=IqWIgke~k~>;?;KYxO zd2;d2lHXb3emm{NkBxcZ`C>MgMLsEu+9(SikwPc=;pA>{l3zK= zFHYPEEbSyeoRk(P`2#2UV<(&oJo2U4z8(|+`yOEz0NdAt0+Rb~-mTdMz)nf-7m(a{ z-*G9RxIDV1xxUm3f_txU3ntQ>5fwDsUO^Dc2(ciD+2ngf1r?FU?vPjr#QTI;ND&K} z#6lpJ6=ESrl>ViVBJ#{2Bo+pdE{nM5Ev$%zO=4jX%SnkA1~J)>7FJZAdxX>?pyn28 zkpwjr6_FCPR*pr$zLoB}@LaPc%bEhb#+j9D>oD+sq3xc)q6v6#!L zFXmGJ(o0A^&*MTD8E}zLTx29%WJDQe>1Tj{hVU~~x-()vr<-SqA+b1!`Gr_KK}^vV zZst-Yldp4E`}{5r?wP_ZE?hUV6jxlHiH6(~;1&>W3B@g;@>};ICBUsnT&{5?z{P2i z&6i*uT>@-BmXuIq3D1AiPJC}x5>5(=lag@ad$W?_q%v|qJ1ME0lnk7djGb`q^E^Av 
z_B~oDunP&hl*)xtjy>@%uyyX20{bk<{Ze52m1e1!%eAhQt94m!Ea91b=%O@S6cHDt z6D~5M(q^qI4Sprzmsb4JF`v`T>xPh+0P|K*2E+ZxF-q58>x(wK7OPwndlysSx&3$$m;q>!*D0EU5PKt?> zvI!@Ag_|vF4`r2yvVn)Pu?ME|iYug+12sda0az zTS2($9+;%Q0=VZ1w*t6PmYXwH0GIDNO^Av{XVCvyHU3#3S`<}@QuLJBGuiol)9wvL z{LM0cj*RzK_t?rR&xpq5=AF4r^ujJ}?eZ3l+7UPXqe@gE zDiY=3NfLK21#{8wviFYYPx)TYTRM6@@jqvevYn(KIqrz4zW(3#s{{J7W@k*_lwQIgsr{bOUU#b3flu5B@<16>CYdawu zexLPuo_pH3-)B8pd*kv4=JI1jr+=Kzfzvr~ItNbY!08+~odc(H;B*e0&Vkc8a5@K0 z=fLS4`2R!>{M-6J`+}JZ`c*Dhv};dV|6g)sB(MJ$lk$;x{r?E52#NOrN|Wl7c%7d` z;x%;}60h$&kxG$zk$BxdfW-R%V@ap3`=5NbM&0VQYS*aqVa1xSReY*Ot=DV3TceuW zUtk*l`~2g7`t=L~vp4ow$L%3p6&0i)gLw0RFMPUYt+#5wQN4Pd+Ss??|Nr!l(>ZWD z2Ttd}=^Qwn1E+J~bPk-(fzvr~ItNbY!08+~odf@W&H-Ng^3s^^W%IpzIxIpbJ)(#n zddEF{UPtr&eqJ{7nw{70g-N{b=QTbZ=)gUGUgwt}l_c?6pZ5b&q%x$kq;jP4qzWWn zhM!5QNIHvDiF7vU9MZX@^GN5DE+Abg;bSv2kB1IU8K87_mJ)--AB5g^Z@BW(nF+& zNso{oB|S!Zob&|gNzzlKr%BI{o+bT<^c?BGq~}R5kX|IcM5;!5ne+0MGy(tD)$Nwr9|NqnB+15#a5JyLzrholCihNMQMk4TM4O-Na! zrle-1=A;&+mZVms)+FA)`Iyv})Q;4i)PdBI^a-gGsWYhysVk`)sXM6$sVAuysW+() zsV}J?sXysc(r2Urq|ZqMNnen@Bn=`BCJiBdMH))_nly|woHT+olJpJfThe!=QKZqN zF{H7iais4_<4He|ekA=w`kC|#X#(k2(nQiE(r={6q$#ASq~A$@kfxEQll~_h;*2Agmjd2jPwubIOzoGB#Af5 zvXQcra*%S8a*=YA@{sb9@{#hB3Xu4mP9aiZQV~*7QZZ5nsW^$x29+e0B9$hkNM%T6 zN##i8Nfk(Ekj^AkB%MX7L^_*v4(VLdd8G467mzL_T|~N=bP4HFQf1O*q{~THkgg@cq*q9< zlB$zlBfU;~gH(g`Ch0BG+oX3$?~-bg-XpzFszs_zszdsKRF_nbRG;)AsR5}WsS)WT zQe#pRQWmKxsTrv`sRgMesTHX;sSW94Qd?3xQhQPdQb*D!q)w#Hq%Necq;90{q#mT6 zq+X=nq&}p+q<*CSq)$nokp_@HCk-ThLHd$3h%}fqg!C0@DCuj`Fw$_+2+~N>H>7V# z-;qXgPSP&YZqgpoUeZ3&e$oNbLDC`8VbT%OQPMHeKcwTN z6Qq+Q9{Bm_Uv^Rs5`jk zq|&4msSK$ssT`?1sRHQ?(wU@+q_aqsNN1DIA)QM)k90oi0@8(~i%1ufE+L(ulbV(3 z*ULzkldd3LNxF)3HHrWJTKe@m()FYpNH>yhBHc{lzdvq>t!HADv*WpMU+|UYPN0PW-RAXy*0-&8psssv+(7QZBp(PaLU3^RJqiZ~agI?ghl>rj lzvW-bSya^hi@9jlS(9de?4deJ;QyJQ>;GS*|C$8<{{a7tlDGf> diff --git a/examples/testing.py b/examples/testing.py deleted file mode 100644 index 0345284..0000000 --- a/examples/testing.py +++ /dev/null @@ -1,19 +0,0 @@ -#%% 
-import pandas as pd -import numpy as np -import sys -sys.path.insert(0,"C:/Local/pers/Documents/GitHub/Cobra") - -ROOT = "C:/Local/pers/Documents/GitHub/Cobra/" - -#%% -df_data = pd.read_csv(ROOT + "datasets/titanic_data.csv") - -#%% -from cobra.preprocessing import KBinsDiscretizer - -KBD = KBinsDiscretizer() -df_prep = KBD.fit_transform(data=df_data, column_names=['Age','Fare']) - -#%% -from cobra.preprocessing import TargetEncoder \ No newline at end of file From 946a500718050ccefc77c242f7643fb17b641ae8 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 24 Mar 2020 09:07:48 +0100 Subject: [PATCH 61/98] Change docstring format in univariate_selection.py --- cobra/model_building/univariate_selection.py | 64 +++++++++++++------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index afc96d5..72620bd 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -18,7 +18,7 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, preselect_auc_threshold: float, preselect_overtrain_threshold: float ) -> pd.DataFrame: - """ Perform a preselection of predictors based on an AUC threshold of + """Perform a preselection of predictors based on an AUC threshold of a univariate model on a train and selection dataset and return a datframe containing for each variable the train and selection AUC along with a boolean "preselection" column. @@ -33,18 +33,29 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, Therefore, no univariate model is trained here as the target encoded train and selection data is/must be used as inputs for this function. 
These will be used as predicted scores to compute the AUC with against the target - Args: - target_enc_train_data (pd.DataFrame): Train data - target_enc_selection_data (pd.DataFrame): Selection data - predictors (list): list of predictors (e.g. column names in the train - and selection data sets) - target_column (str): name of the target column - preselect_auc_threshold (float): Description - preselect_overtrain_threshold (float): Description - Returns: - pd.DataFrame: DataFrame containing for each variable the train auc and - selection auc allong with a boolean indicating whether or not it is - selected based on the criteria + + Parameters + ---------- + target_enc_train_data : pd.DataFrame + Train data + target_enc_selection_data : pd.DataFrame + Selection data + predictors : list + list of predictors (e.g. column names in the train + target_column : str + name of the target column + preselect_auc_threshold : float + Description + preselect_overtrain_threshold : float + Description + and selection data sets) + + Returns + ------- + pd.DataFrame + DataFrame containing for each variable the train auc and + selection auc allong with a boolean indicating whether or not it is + selected based on the criteria """ result = [] @@ -81,8 +92,8 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, def get_preselected_predictors(df_auc: pd.DataFrame) -> list: - """Wrapper function to extract a list of predictors - from df_auc + """Wrapper function to extract a list of predictors from df_auc + Parameters ---------- df_auc : pd.DataFrame @@ -105,13 +116,20 @@ def compute_correlations(target_enc_train_data: pd.DataFrame, predictors: list) -> pd.DataFrame: """Given a DataFrame and a list of predictors, compute the correlations amongst the predictors in the DataFrame - Args: - target_enc_train_data (pd.DataFrame): data to compute correlation - matrix from - predictors (list): List of column names of the DataFrame between which - to compute correlations - 
Returns: - pd.DataFrame: The correlation matrix of the training set + + Parameters + ---------- + target_enc_train_data : pd.DataFrame + data to compute correlation + predictors : list + List of column names of the DataFrame between which + matrix from + to compute correlations + + Returns + ------- + pd.DataFrame + The correlation matrix of the training set """ correlations = target_enc_train_data[predictors].corr() @@ -124,4 +142,4 @@ def compute_correlations(target_enc_train_data: pd.DataFrame, correlations.columns = predictors_cleaned correlations.index = predictors_cleaned - return correlations \ No newline at end of file + return correlations From e94f17d809afb20ca49b9e4727de6b12db1e4768 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 24 Mar 2020 09:39:55 +0100 Subject: [PATCH 62/98] Add docstrings to forward_selection.py --- cobra/model_building/forward_selection.py | 50 +++++++++++++---------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index 543334c..c9d4807 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -113,20 +113,20 @@ def compute_model_performances(self, data: pd.DataFrame, def fit(self, train_data: pd.DataFrame, target_column_name: str, predictors: list, forced_predictors: list=[], excluded_predictors: list=[]): - """Summary + """Fit the forward feature selection estimator Parameters ---------- data : pd.DataFrame - Description + Data on which to fit the model target_column_name : str - Description + Name of the target column predictors : list - Description + List of predictors on which to train the estimator forced_predictors : list, optional - Description + List of predictors to force in the estimator excluded_predictors : list, optional - Description + List of predictors to exclude from the estimator Raises ------ @@ -158,18 +158,21 @@ def fit(self, train_data: pd.DataFrame, 
target_column_name: str, def _forward_selection(self, train_data: pd.DataFrame, target_column_name: str, predictors: list, forced_predictors: list=[]): - """Summary + """Perform the forward feature selection algoritm to compute a list + of models (with increasing performance?). The length of the list, + i.e. the number of models is bounded by the max_predictors class + attribute. Parameters ---------- train_data : pd.DataFrame - Description + Data on which to fit the model target_column_name : str - Description + Name of the target column predictors : list - Description + List of predictors on which to train the models forced_predictors : list, optional - Description + List of predictors to force in the models """ current_predictors = [] @@ -205,23 +208,26 @@ def _find_next_best_model(self, train_data: pd.DataFrame, target_column_name: str, candidate_predictors: list, current_predictors: list) -> MLModel: - """Summary + """Given a list of current predictors which are already to selected to + be include in the model, Find amongst a list candidate predictors + the predictor to add to the selected list so that the resulting model + has the best performance. 
Parameters ---------- train_data : pd.DataFrame - Description + Data on which to fit the model target_column_name : str - Description + Name of the target column candidate_predictors : list - Description + List of candidate predictors to test current_predictors : list - Description + List of predictors on which to train the models Returns ------- MLModel - Description + Best performing model """ # placeholders best_model = None @@ -251,21 +257,21 @@ def _find_next_best_model(self, train_data: pd.DataFrame, def _train_model(self, train_data: pd.DataFrame, target_column_name: str, predictors: list) -> MLModel: - """Summary + """Train the model with a given set of predictors Parameters ---------- train_data : pd.DataFrame - Description + Data on which to fit the model target_column_name : str - Description + Name of the target column predictors : list - Description + List of predictors on which to train the models Returns ------- MLModel - Description + trained model """ model = MLModel() From 3b6523585e8a690675e39cc3925dae8e7c400321 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 25 Mar 2020 08:42:26 +0100 Subject: [PATCH 63/98] Add default args to univariate_selection --- cobra/model_building/univariate_selection.py | 21 ++++++++++---------- setup.py | 5 +++-- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index 72620bd..6b4dc38 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -15,8 +15,8 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, target_enc_selection_data: pd.DataFrame, predictors: list, target_column: str, - preselect_auc_threshold: float, - preselect_overtrain_threshold: float + preselect_auc_threshold: float=0.053, + preselect_overtrain_threshold: float=0.05 ) -> pd.DataFrame: """Perform a preselection of predictors based on an AUC threshold of a 
univariate model on a train and selection dataset and return a datframe @@ -41,14 +41,14 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, target_enc_selection_data : pd.DataFrame Selection data predictors : list - list of predictors (e.g. column names in the train + list of predictors (e.g. column names in the train set and selection + data sets) target_column : str name of the target column - preselect_auc_threshold : float - Description - preselect_overtrain_threshold : float - Description - and selection data sets) + preselect_auc_threshold : float, optional + threshold on AUC to select predictor + preselect_overtrain_threshold : float, optional + threshold on the difference between train and selection AUC Returns ------- @@ -83,12 +83,13 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, # Identify those variables for which the AUC difference between train # and selection is within a user-defined ratio - auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"]) * 100 + auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"]) < preselect_overtrain_threshold) df_auc["preselection"] = auc_thresh & auc_overtrain - return df_auc.sort_values(by='AUC selection', ascending=False) + return (df_auc.sort_values(by='AUC selection', ascending=False) + .reset_index()) def get_preselected_predictors(df_auc: pd.DataFrame) -> list: diff --git a/setup.py b/setup.py index e04e832..20ac799 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,11 @@ from distutils.core import setup + setup( name="cobra", - version="0.1.0", + version="1.0.0", description="Python Prediction's methodology for predictive analytics", - packages=["cobra"], + packages=["cobra", "cobra.preprocessing", "cobra.model_building"], url="https://github.com/PythonPredictions", #long_description=long_description, # TO DO #long_description_content_type="text/markdown", From e2635aea2ff4d706d5344efb9bae81e4878918f8 Mon Sep 17 00:00:00 2001 From: Matthias 
Roels Date: Wed, 25 Mar 2020 09:15:49 +0100 Subject: [PATCH 64/98] Change output of forward_selection.compute_model_performances --- cobra/model_building/forward_selection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index c9d4807..7924e79 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -58,7 +58,8 @@ def get_model_from_step(self, step: int) -> MLModel: def compute_model_performances(self, data: pd.DataFrame, target_column_name: str, splits: list=["train", "selection", - "validation"]) -> list: + "validation"] + ) -> pd.DataFrame: """Compute for each model the performance for train-selection-validation sets and return them along with a list of predictors used in the model. @@ -108,7 +109,7 @@ def compute_model_performances(self, data: pd.DataFrame, predictor_set = predictor_set.union(set(model.predictors)) - return results + return pd.DataFrame(results) def fit(self, train_data: pd.DataFrame, target_column_name: str, predictors: list, forced_predictors: list=[], From 741ab490b24cbc6d4fa9825e950f9a9b8d91c3bb Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 25 Mar 2020 10:01:45 +0100 Subject: [PATCH 65/98] Clean PreProcessor docstrings --- README.md | 4 +- cobra/preprocessing/preprocessor.py | 85 +++++++++++++++++++---------- 2 files changed, 59 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index cc7293b..9dd035c 100644 --- a/README.md +++ b/README.md @@ -74,8 +74,8 @@ preprocessor = PreProcessor.from_params(serialization_path=path) basetable = preprocessor.train_selection_validation_split( basetable, target_column_name=target_column_name, - train_pct=0.6, selection_pct=0.2, - validation_pct=0.2) + train_prop=0.6, selection_prop=0.2, + validation_prop=0.2) # create list containing the column names of the discrete resp. 
# continiuous variables diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 81c7677..8299b8f 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -205,9 +205,9 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, ---------- train_data : pd.DataFrame Data to be preprocessed - continuous_vars : list, optional + continuous_vars : list list of continuous variables - discrete_vars : list, optional + discrete_vars : list list of discrete variables target_column_name : str Name of the target column @@ -259,26 +259,26 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, def transform(self, data: pd.DataFrame, continuous_vars: list, discrete_vars: list) -> pd.DataFrame: - """Summary + """Transform the data by applying the preprocessing pipeline to the it Parameters ---------- data : pd.DataFrame - Description - continuous_vars : list, optional + Data to be preprocessed + continuous_vars : list list of continuous variables - discrete_vars : list, optional + discrete_vars : list list of discrete variables Returns ------- pd.DataFrame - Description + Transformed (preprocessed) data Raises ------ NotFittedError - Description + In case PreProcessor was not fitted first """ start = time.time() @@ -308,12 +308,33 @@ def transform(self, data: pd.DataFrame, continuous_vars: list, return data + def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list, + discrete_vars: list, target_column_name: str): + """Fit the data to the preprocessing pipeline and transform the data + + Parameters + ---------- + train_data : pd.DataFrame + Data to be preprocessed + continuous_vars : list + list of continuous variables + discrete_vars : list + list of discrete variables + target_column_name : str + Name of the target column + """ + + self.fit(train_data, continuous_vars, discrete_vars, + target_column_name) + + return self.transform(train_data, continuous_vars, discrete_vars) + 
@staticmethod def train_selection_validation_split(data: pd.DataFrame, target_column_name: str, - train_pct: float=0.6, - selection_pct: float=0.2, - validation_pct: float=0.2, + train_prop: float=0.6, + selection_prop: float=0.2, + validation_prop: float=0.2, stratify_split=True)->pd.DataFrame: """Split dataset into train-selection-validation datasets and merge them into one big DataFrame with an additional column "split" @@ -325,11 +346,11 @@ def train_selection_validation_split(data: pd.DataFrame, Input dataset to split into train-selection and validation sets target_column_name : str Name of the target column - train_pct : float, optional + train_prop : float, optional Percentage data to put in train set - selection_pct : float, optional + selection_prop : float, optional Percentage data to put in selection set - validation_pct : float, optional + validation_prop : float, optional Percentage data to put in validation set stratify_split : bool, optional Whether or not to stratify the train-test split @@ -339,17 +360,22 @@ def train_selection_validation_split(data: pd.DataFrame, pd.DataFrame Description """ + + if train_prop + selection_prop + validation_prop > 1: + raise ValueError("The sum of train_prop, selection_prop and " + "validation_prop cannot be larger than 1") + column_names = list(data.columns) predictors = [col for col in column_names if col != target_column_name] # for the first split, take sum of selection & validation pct as # test pct - test_pct = selection_pct + validation_pct + test_prop = selection_prop + validation_prop # To further split our test set into selection + validation set, - # we have to modify validation pct because we only have test_pct of + # we have to modify validation pct because we only have test_prop of # the data available anymore for further splitting! 
- validation_pct_modif = validation_pct / test_pct + validation_prop_modif = validation_prop / test_prop X = data[predictors] y = data[target_column_name] @@ -358,10 +384,12 @@ def train_selection_validation_split(data: pd.DataFrame, if stratify_split: stratify = y - X_train, X_test, y_train, y_test = train_test_split(X, y, - test_size=test_pct, - random_state=42, - stratify=stratify) + X_train, X_test, y_train, y_test = train_test_split( + X, y, + test_size=test_prop, + random_state=42, + stratify=stratify + ) df_train = pd.DataFrame(X_train, columns=predictors) df_train[target_column_name] = y_train @@ -369,7 +397,7 @@ def train_selection_validation_split(data: pd.DataFrame, # If there is no validation percentage, return train-selection sets # only - if validation_pct == 0.0: + if validation_prop == 0.0: df_selection = pd.DataFrame(X_test, columns=predictors) df_selection[target_column_name] = y_test df_selection["split"] = "selection" @@ -382,7 +410,7 @@ def train_selection_validation_split(data: pd.DataFrame, X_sel, X_val, y_sel, y_val = train_test_split( X_test, y_test, - test_size=validation_pct_modif, + test_size=validation_prop_modif, random_state=42, stratify=stratify ) @@ -462,24 +490,25 @@ def _is_valid_pipeline(pipeline: dict) -> bool: @staticmethod def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: - """Summary + """merge lists of continuous_vars and discrete_vars and add suffix + "_bin" resp. 
"_processed" to the predictors Parameters ---------- continuous_vars : list - Description + list of continuous variables discrete_vars : list - Description + list of discrete variables Returns ------- list - Description + Merged list of predictors with proper suffixes added Raises ------ ValueError - Description + in case both lists are empty """ var_list = ([col + "_processed" for col in discrete_vars] + [col + "_bin" for col in continuous_vars]) From 8d2a970425c31d6b2179a53624073ed4d73d22cf Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 25 Mar 2020 10:07:07 +0100 Subject: [PATCH 66/98] Delete scripts module --- cobra/scripts/__init__.py | 0 cobra/scripts/export_pigs.py | 143 ----------------------------------- 2 files changed, 143 deletions(-) delete mode 100644 cobra/scripts/__init__.py delete mode 100644 cobra/scripts/export_pigs.py diff --git a/cobra/scripts/__init__.py b/cobra/scripts/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/cobra/scripts/export_pigs.py b/cobra/scripts/export_pigs.py deleted file mode 100644 index 086f342..0000000 --- a/cobra/scripts/export_pigs.py +++ /dev/null @@ -1,143 +0,0 @@ -# third party lib imports -import pandas as pd -# custom imports -import cobra.utils as utils -from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer - - -def preprocess_categoricals(data: pd.DataFrame, - categorical_columns: list) -> pd.DataFrame: - - for column_name in categorical_columns: - - # change data to categorical - data[column_name] = data[column_name].astype("category") - - # check for null values - if data[column_name].isnull().sum() > 0: - - # Add an additional category - data[column_name].cat.add_categories(["Missing"], inplace=True) - - # Replace NULL with "Missing" - # Otherwise these will be ignored in groupby - data[column_name].fillna("Missing", inplace=True) - - return data - - -def compute_pig_table(data: pd.DataFrame, - column_name: str, - target_column_name: str, - id_column_name: str) -> 
pd.DataFrame: - """Compute the pig table of a given predictor for a given target - - Parameters - ---------- - data : pd.DataFrame - input data from which to compute the pig table - column_name : str - predictor name of which to compute the pig table - target_column_name : str - name of the target variable - id_column_name : str - name of the id column (used to count population size) - - Returns - ------- - pd.DataFrame - pig table as a DataFrame - """ - avg_incidence = data[target_column_name].mean() - - # group by the binned variable, compute the incidence - # (=mean of the target for the given bin) and compute the bin size - # (e.g. COUNT(id_column_name)). After that, rename the columns - res = (data.groupby(column_name) - .agg({target_column_name: "mean", id_column_name: "size"}) - .reset_index() - .rename(columns={column_name: "label", - target_column_name: "incidence", - id_column_name: "pop_size"})) - - # add the column name to a variable column - # add the average incidence - # replace population size by a percentage of total population - res["variable"] = column_name - res["avg_incidence"] = avg_incidence - res["pop_size"] = res["pop_size"]/len(data.index) - - # make sure to always return the data with the proper column order - column_order = ["variable", "label", "pop_size", - "avg_incidence", "incidence"] - - return res[column_order] - - -def generate_pig_tables(data: pd.DataFrame, - id_column_name: str, - target_column_name: str, - n_bins: int, - strategy: str, - label_format: str) -> pd.DataFrame: - """Summary - - Parameters - ---------- - data : pd.DataFrame - basetable to compute PIG tables of - id_column_name : str - column name of the id (e.g. customernumber) - target_column_name : str - column name of the target - n_bins : int - Number of bins to produce after discretization - strategy : str - Binning strategy. Currently only "uniform" and "quantile" - e.g. 
equifrequency is supported - label_format : str - format string to display the bin labels e.g. min - max, (min, max], ... - - Returns - ------- - pd.DataFrame - DataFrame containing a PIG table for all predictors - """ - - # Based on the data, get column names by datatype - # threshold to decide whether a numeric column should be considered - # a categorical variable (if the number of distinct values is smaller - # or equal to the number of requested bins) - categorical_threshold = n_bins - columns_by_type = utils.get_column_datatypes(data, id_column_name, - target_column_name, - categorical_threshold) - - # process continuous variables - discretizer = KBinsDiscretizer(n_bins=n_bins, - strategy=strategy, - label_format=label_format) - - # Transform the data - data = discretizer.fit_transform(data, - columns_by_type["numeric_variables"]) - - # Process categorical and dummy variables - categorical_vars = columns_by_type["categorical_variables"] - dummy_vars = columns_by_type["dummy_variables"] - relevant_columns = set(categorical_vars).union(set(dummy_vars)) - - data = preprocess_categoricals(data, list(relevant_columns)) - - # Get relevant columns, e.g. 
the ones that are transformed - # into categorical dtypes by the preprocessing steps - relevant_columns = set(data.dtypes[data.dtypes == "category"].index) - - pigs = [compute_pig_table(data, column_name, target_column_name, - id_column_name) - for column_name in sorted(relevant_columns) - if column_name not in [id_column_name, target_column_name]] - - output = pd.concat(pigs) - - return output From 234aba8551c03e05450a24ea8a50d5b56339e71f Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 25 Mar 2020 10:07:20 +0100 Subject: [PATCH 67/98] Add evaluation module with PIGs script --- cobra/evaluation/__init__.py | 5 ++ cobra/evaluation/pigs_tables.py | 107 ++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 cobra/evaluation/__init__.py create mode 100644 cobra/evaluation/pigs_tables.py diff --git a/cobra/evaluation/__init__.py b/cobra/evaluation/__init__.py new file mode 100644 index 0000000..c7d6820 --- /dev/null +++ b/cobra/evaluation/__init__.py @@ -0,0 +1,5 @@ +from .pig_tables import generate_pig_tables, compute_pig_table, plot_pig_graph + +__all__ = ['generate_pig_tables', + 'compute_pig_table', + 'plot_pig_graph'] diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py new file mode 100644 index 0000000..19305d1 --- /dev/null +++ b/cobra/evaluation/pigs_tables.py @@ -0,0 +1,107 @@ +# third party imports +import pandas as pd +#import matplotlib.pyplot as plt +#import seaborn as sns + + +def generate_pig_tables(data: pd.DataFrame, + id_column_name: str, + target_column_name: str, + preprocessed_predictors: list) -> pd.DataFrame: + """Summary + + Parameters + ---------- + data : pd.DataFrame + basetable to compute PIG tables of + id_column_name : str + column name of the id (e.g. 
customernumber) + target_column_name : str + column name of the target + predictors: list + list of preprocessed predictor names + + Returns + ------- + pd.DataFrame + DataFrame containing a PIG table for all predictors + """ + + # Based on the data, get column names by datatype + # threshold to decide whether a numeric column should be considered + # a categorical variable (if the number of distinct values is smaller + # or equal to the number of requested bins) + + pigs = [compute_pig_table(data, column_name, target_column_name, + id_column_name) + for column_name in sorted(preprocessed_predictors) + if column_name not in [id_column_name, target_column_name]] + + output = pd.concat(pigs) + + return output + + +def compute_pig_table(data: pd.DataFrame, + column_name: str, + target_column_name: str, + id_column_name: str) -> pd.DataFrame: + """Compute the pig table of a given predictor for a given target + + Parameters + ---------- + data : pd.DataFrame + input data from which to compute the pig table + column_name : str + predictor name of which to compute the pig table + target_column_name : str + name of the target variable + id_column_name : str + name of the id column (used to count population size) + + Returns + ------- + pd.DataFrame + pig table as a DataFrame + """ + avg_incidence = data[target_column_name].mean() + + # group by the binned variable, compute the incidence + # (=mean of the target for the given bin) and compute the bin size + # (e.g. COUNT(id_column_name)). 
After that, rename the columns + res = (data.groupby(column_name) + .agg({target_column_name: "mean", id_column_name: "size"}) + .reset_index() + .rename(columns={column_name: "label", + target_column_name: "incidence", + id_column_name: "pop_size"})) + + # add the column name to a variable column + # add the average incidence + # replace population size by a percentage of total population + res["variable"] = column_name + res["avg_incidence"] = avg_incidence + res["pop_size"] = res["pop_size"]/len(data.index) + + # make sure to always return the data with the proper column order + column_order = ["variable", "label", "pop_size", + "avg_incidence", "incidence"] + + return res[column_order] + + +def plot_pig_graph(pig_table: pd.DataFrame, + dim: tuple=(12, 8), + save_path: str=None): + """Create the Predictor Insights Graphs from a PIG table + + Parameters + ---------- + pig_table : pd.DataFrame + Description + dim : tuple, optional + Tuple with width and lentgh of the plot + save_path : str, optional + path to store the plot on disk + """ + pass From baf792f438ad4fc251897fa405f6b2cfeb84726a Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 25 Mar 2020 10:10:07 +0100 Subject: [PATCH 68/98] Add plotting functions to evaluation module --- cobra/evaluation/model_evaluator.py | 354 +++++++++++++++++++++++++ cobra/evaluation/performance_curves.py | 35 +++ cobra/evaluation/predictor_quality.py | 85 ++++++ 3 files changed, 474 insertions(+) create mode 100644 cobra/evaluation/model_evaluator.py create mode 100644 cobra/evaluation/performance_curves.py create mode 100644 cobra/evaluation/predictor_quality.py diff --git a/cobra/evaluation/model_evaluator.py b/cobra/evaluation/model_evaluator.py new file mode 100644 index 0000000..a0addd9 --- /dev/null +++ b/cobra/evaluation/model_evaluator.py @@ -0,0 +1,354 @@ +""" +Created on Fri Apr 12 09:36:37 2019 +@author: AP_JBENEK +""" +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as 
plt + +from sklearn.metrics import precision_score +from sklearn.metrics import recall_score +from sklearn.metrics import f1_score +from sklearn.metrics import accuracy_score +from sklearn.metrics import roc_curve +from sklearn.metrics import confusion_matrix +from sklearn.metrics import roc_auc_score + + +class Evaluator(): + + def __init__(self, y_true, y_pred_p, threshold=0.5, lift_at=0.1): + self.y_true = y_true + self.y_pred_p = y_pred_p # As probability + self.lift_at = lift_at + self.threshold = threshold + + #Convert to bool + self.y_pred_b = np.array([0 if pred <= self.threshold else 1 + for pred in self.y_pred_p]) + + def plotROCCurve(self, save_pth=None, desc=None): + ''' + Plot ROC curve and print best cutoff value + + Parameters + ---------- + y_true: True values of target y + proba: Predicted values of target y, probabilities + save: whether plot should be saved (if yes, then now shown) + desc: description of the plot, used also as a name of saved plot + ''' + if desc is None: + desc = '' + + fpr, tpr, thresholds = roc_curve(self.y_true, self.y_pred_p) + + #--------------------------- + #Calculate AUC + #-------------------------- + out_perfo = self.evaluation() + score = out_perfo['AUC'] + + fig, ax = plt.subplots(figsize=(8, 5)) + ax.plot(fpr, tpr, color='darkorange', lw=2, + label='ROC curve (area = {s:.3})'.format(s=score)) + ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') + ax.set_xlabel('False Positive Rate', fontsize=15) + ax.set_ylabel('True Positive Rate', fontsize=15) + ax.legend(loc="lower right") + ax.set_title('ROC Curve {}' .format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + #Best cutoff value + #i want value where FPR is highest and FPR is lowest + #https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python + i = np.arange(len(tpr)) + roc = pd.DataFrame({'tf': pd.Series(tpr-(1-fpr), index=i), + 'threshold': 
pd.Series(thresholds, index=i)}) + roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]] + + best_cutoff = list(roc_t['threshold']) + print(f'Best cutoff value for probability is: {best_cutoff[0]}') + + def plotConfusionMatrix(self, labels=None, color='Reds', save_pth=None, desc=None): + ''' + Plot Confusion matrix with performance measures + + Parameters + ---------- + y_test: True values of target y + pred: Predicted values of target y, boolean + labels: labels for the matrix, if empty, values from y_test_ are used + color: Color of the matrix, its a cmap, so many values possible + save: whether plot should be saved (if yes, then now shown) + desc: description of the plot, used also as a name of saved plot + ''' + if labels is None: + labels = [str(lab) for lab in np.unique(self.y_true)] + + if desc is None: + desc = '' + + cm = confusion_matrix(self.y_true, self.y_pred_b) + + fig, ax = plt.subplots(figsize=(8,5)) + ax = sns.heatmap(cm, annot=cm.astype(str), fmt="s", cmap=color, + xticklabels=labels, yticklabels=labels) + ax.set_title('Confusion matrix {}'.format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + out_perfo = self.evaluation() + + # If we mark customer as a churner, how often we are correct + print('Precision: {s:.3}'.format(s=out_perfo['precision'])) + # Overall performance + print('Accuracy: {s:.3}'.format(s=out_perfo['accuracy'])) + # How many churners can the model detect + print('Recall: {s:.3}'.format(s=out_perfo['recall'])) + # 2 * (precision * recall) / (precision + recall) + print('F1 Score: {s:.3}'.format(s=out_perfo['F1'])) + # 2 * (precision * recall) / (precision + recall) + print('Lift at top {l}%: {s:.3}' + .format(l=self.lift_at*100, s=out_perfo['lift'])) + # 2 * (precision * recall) / (precision + recall) + print('AUC: {s:.3}'.format(s=out_perfo['AUC'])) + + def plotCumulativeGains(self, save_pth=None, desc=None): + ''' + Functions plot cumulative 
gains + + Parameters + ---------- + save: whether plot should be saved (if yes, then now shown) + desc: description of the plot, used also as a name of saved plot + ''' + if desc is None: + desc = '' + + #--------------------------- + #Calculate cumulative gains + #-------------------------- + nrows = len(self.y_true) + npositives = self.y_true.sum() + df_y_pred = (pd.DataFrame({"y": self.y_true, "y_pred": self.y_pred_p}) + .sort_values(by='y_pred', ascending=False) + .reset_index(drop=True)) + cgains = [0] + for stop in (np.linspace(0.01, 1, 100) * nrows).astype(int): + cgains.append(round(df_y_pred.loc[:stop, 'y'].sum()/npositives*max(100, 1), 2)) + + #--------------------------- + #Plot it + #--------------------------- + plt.style.use('seaborn-darkgrid') + fig, ax_cgains = plt.subplots(figsize=(8, 5)) + ax_cgains.plot(cgains, color='blue', linewidth=3, + label='cumulative gains') + ax_cgains.plot(ax_cgains.get_xlim(), ax_cgains.get_ylim(), linewidth=3, + ls="--", color="darkorange", label='random selection') + ax_cgains.set_title('Cumulative Gains ' + desc, fontsize=20) + + ax_cgains.set_title('Cumulative Gains {}' .format(desc), fontsize=20) + #Format axes + ax_cgains.set_xlim([0, 100]) + ax_cgains.set_ylim([0, 100]) + #Format ticks + ax_cgains.set_yticklabels(['{:3.0f}%'.format(x) + for x in ax_cgains.get_yticks()]) + ax_cgains.set_xticklabels(['{:3.0f}%'.format(x) + for x in ax_cgains.get_xticks()]) + #Legend + ax_cgains.legend(loc='lower right') + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + def plotLift(self, desc=None, save_pth=None): + ''' + Method plots lift per decile + + Parameters + ---------- + save: whether plot should be saved (if yes, then now shown) + desc: description of the plot, used also as a name of saved plot + ''' + #--------------------- + #-- CALCULATE LIFT --- + #--------------------- +# inc_rate = self.y_true.mean() + lifts = 
[Evaluator.liftCalculator(y_true=self.y_true, + y_pred=self.y_pred_p, + lift_at=perc_lift) + for perc_lift in np.arange(0.1, 1.1, 0.1)] + + #--------------------- + #------- PLOT -------- + #--------------------- + if desc is None: + desc = '' + + fig, ax = plt.subplots(figsize=(8,5)) + plt.style.use('seaborn-darkgrid') + + nrows = len(lifts) + x_labels = [nrows-x for x in np.arange(0, nrows, 1)] + + plt.bar(x_labels[::-1], lifts, align='center', color="cornflowerblue") + plt.ylabel('lift', fontsize=15) + plt.xlabel('decile', fontsize=15) + ax.set_xticks(x_labels) + ax.set_xticklabels(x_labels) + + plt.axhline(y=1, color='darkorange', linestyle='--', + xmin=0.1, xmax=0.9, linewidth=3, label='Baseline') + + #Legend + ax.legend(loc='upper right') + + ##Set Axis - make them pretty + sns.despine(ax=ax, right=True, left=True) + + #Remove white lines from the second axis + ax.grid(False) + + ##Description + ax.set_title('Cumulative Lift {}'.format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + def plotCumulativeResponse(self, desc=None, save_pth=None): + #--------------------- + #-- CALCULATE LIFT --- + #--------------------- + inc_rate = self.y_true.mean() + lifts = [Evaluator.liftCalculator(y_true=self.y_true, + y_pred=self.y_pred_p, + lift_at=perc_lift) + for perc_lift in np.arange(0.1, 1.1, 0.1)] + lifts = np.array(lifts)*inc_rate*100 + #--------------------- + #------- PLOT -------- + #--------------------- + if desc is None: + desc = '' + + fig, ax = plt.subplots(figsize=(8, 5)) + #plt.style.use('seaborn-darkgrid') + plt.style.use('default') + + nrows = len(lifts) + x_labels = [nrows-x for x in np.arange(0, nrows, 1)] + + plt.bar(x_labels[::-1], lifts, align='center', color="#00ccff") + plt.ylabel('response (%)', fontsize=16) + plt.xlabel('decile', fontsize=16) + ax.set_xticks(x_labels) + ax.set_xticklabels(x_labels) + + plt.axhline(y=inc_rate*100, color='#ff9500', 
linestyle='--', + xmin=0.05, xmax=0.95, linewidth=3, label='Incidence') + + #Legend + ax.legend(loc='upper right') + + ##Set Axis - make them pretty + sns.despine(ax=ax, right=True, left=True) + + #Remove white lines from the second axis + ax.grid(False) + + ##Description + ax.set_title('Cumulative response {}'.format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + def evaluation(self): + ''' + Convenient function, returns various performance measures in a dict + + Parameters + ---------- + y_true: true values + y_pred: predictions as booleans + + Output + ------ + Returns dictionary with the measures + ''' + + dict_perfo = {'precision': precision_score(self.y_true, self.y_pred_b), + 'accuracy': accuracy_score(self.y_true, self.y_pred_b), + 'recall': recall_score(self.y_true, self.y_pred_b), + 'F1': f1_score(self.y_true, self.y_pred_b, + average=None)[1], + 'lift': np.round(Evaluator + .liftCalculator(y_true=self.y_true, + y_pred=self.y_pred_p, + lift_at=self.lift_at), + 2), + 'AUC': roc_auc_score(self.y_true, self.y_pred_p) + } + return dict_perfo + + @staticmethod + def liftCalculator(y_true, y_pred, lift_at=0.05, **kwargs): + ''' + Calculates lift given two arrays on specified level + + Parameters + ---------- + y_true: numpy array with true values + y_pred: numpy array with predictions (probabilities) + lift_at: lift at what top percentage + + Output + ------ + Scalar value, lift. + + 50.3 µs ± 1.94 µs per loop (mean ± std. dev. 
of 7 runs, + 10000 loops each) + ''' + #Make sure it is numpy array + y_true_ = np.array(y_true) + y_pred_ = np.array(y_pred) + + #Make sure it has correct shape + y_true_ = y_true_.reshape(len(y_true_), 1) + y_pred_ = y_pred_.reshape(len(y_pred_), 1) + + #Merge data together + y_data = np.hstack([y_true_, y_pred_]) + + #Calculate necessary variables + nrows = len(y_data) + stop = int(np.floor(nrows*lift_at)) + avg_incidence = np.einsum('ij->j', y_true_)/float(len(y_true_)) + + #Sort and filter data + data_sorted = (y_data[y_data[:, 1].argsort()[::-1]][:stop, 0] + .reshape(stop, 1)) + + #Calculate lift (einsum is very fast way of summing, + # needs specific shape) + inc_in_top_n = np.einsum('ij->j', data_sorted)/float(len(data_sorted)) + + lift = np.round(inc_in_top_n/avg_incidence, 2)[0] + + return lift diff --git a/cobra/evaluation/performance_curves.py b/cobra/evaluation/performance_curves.py new file mode 100644 index 0000000..9ce9be4 --- /dev/null +++ b/cobra/evaluation/performance_curves.py @@ -0,0 +1,35 @@ +# third party imports +import numpy as np +import pandas as pd + +import matplotlib.pyplot as plt + + +def plot_performance_curves(model_performances: list, + dim: tuple=(12, 8)): + + df_plt = pd.DataFrame(model_performances) + + highest_auc = np.round(max(max(df_plt['train_performance']), + max(df_plt['selection_performance']), + max(df_plt['validation_performance'])), 1) + + fig, ax = plt.subplots(figsize=dim) + + plt.plot(df_plt['train_performance'], marker=".", markersize=20, + linewidth=3, label='AUC train') + plt.plot(df_plt['selection_performance'], marker=".", markersize=20, + linewidth=3, label='AUC selection') + plt.plot(df_plt['validation_performance'], marker=".", markersize=20, + linewidth=3, label='AUC validation') + # Set x/yticks + ax.set_xticks(np.arange(len(df_plt['last_added_predictor']) + 1)) + ax.set_xticklabels(df_plt['last_added_predictor'].tolist(), + rotation=40, ha='right') + ax.set_yticks(np.arange(0.5, highest_auc + 0.02, 
0.05)) + #Make Pretty + ax.legend(loc='lower right') + fig.suptitle('Performance curves - forward feature selection', + fontsize=20) + plt.ylabel('Model performance') + plt.show() diff --git a/cobra/evaluation/predictor_quality.py b/cobra/evaluation/predictor_quality.py new file mode 100644 index 0000000..f2a2ea3 --- /dev/null +++ b/cobra/evaluation/predictor_quality.py @@ -0,0 +1,85 @@ +# third party imports +import pandas as pd + +import matplotlib.pyplot as plt +import seaborn as sns + + +def plot_variable_importance(importance_by_variable: dict, + title: str=None, + dim: tuple=(12, 8)): + """Plot variable importance of a given model + + Parameters + ---------- + importance_by_variable : dict + Map of predictor -> importance + title : str, optional + Title of the plot + dim : tuple, optional + tuple with width and lentgh of the plot + """ + df = pd.DataFrame.from_dict(importance_by_variable, + orient='index').reset_index() + + df.columns = ["predictor", "importance"] + + df = df.sort_values(by="importance", ascending=False) + + # plot data + fig, ax = plt.subplots(figsize=dim) + ax = sns.barplot(x="importance", y="predictor", data=df) + if title: + ax.set_title(title) + else: + ax.set_title("Variable importance") + plt.show() + + +def plot_predictor_quality(df_auc: pd.DataFrame, + dim: tuple=(12, 8)): + """Plot univariate quality of the predictors + + Parameters + ---------- + df_auc : pd.DatFrame + Contains for each variable the train auc and selection auc allong with + a boolean indicating whether or not it is selected based on the + criteria + dim : tuple, optional + tuple with width and lentgh of the plot + """ + + plt.style.use('seaborn-darkgrid') + + df = (df_auc[df_auc["preselection"]] + .sort_values(by='AUC train', ascending=False)) + + df = pd.melt(df, id_vars=["predictor"], + value_vars=["AUC train", "AUC selection"], + var_name="partition", + value_name="AUC") + + # plots + fig, ax = plt.subplots(figsize=dim) + + ax = sns.barplot(x="AUC", 
y="predictor", hue="partition", data=df) + ax.set_title('Univariate Quality of Predictors') + plt.show() + + +def plot_correlation_matrix(df_corr: pd.DataFrame, + dim: tuple=(12, 8)): + """Plot correlation matrix amongst the predictors + + Parameters + ---------- + df_corr : pd.DataFrame + Correlation matrix + dim : tuple, optional + tuple with width and lentgh of the plot + """ + fig, ax = plt.subplots(figsize=dim) + ax = sns.heatmap(df_corr, cmap='Blues') + ax.set_title('Correlation Matrix') + plt.show() From b6dac15cec4f810eb934ba38aef232b67ab1c222 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 25 Mar 2020 11:16:51 +0100 Subject: [PATCH 69/98] Fix a bug in univariate_selection.compute_univariate_preselection output --- cobra/model_building/univariate_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index 6b4dc38..ced16ac 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -89,7 +89,7 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, df_auc["preselection"] = auc_thresh & auc_overtrain return (df_auc.sort_values(by='AUC selection', ascending=False) - .reset_index()) + .reset_index(drop=True)) def get_preselected_predictors(df_auc: pd.DataFrame) -> list: From 1b6a058e68fcad6647a25fe797907a4683bcc15d Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 25 Mar 2020 11:17:22 +0100 Subject: [PATCH 70/98] Bug fix in evaluation __init__.py --- cobra/evaluation/__init__.py | 2 +- cobra/evaluation/pigs_tables.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cobra/evaluation/__init__.py b/cobra/evaluation/__init__.py index c7d6820..3172704 100644 --- a/cobra/evaluation/__init__.py +++ b/cobra/evaluation/__init__.py @@ -1,4 +1,4 @@ -from .pig_tables import generate_pig_tables, compute_pig_table, plot_pig_graph +from .pigs_tables 
import generate_pig_tables, compute_pig_table, plot_pig_graph __all__ = ['generate_pig_tables', 'compute_pig_table', diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py index 19305d1..ccd64e9 100644 --- a/cobra/evaluation/pigs_tables.py +++ b/cobra/evaluation/pigs_tables.py @@ -3,6 +3,8 @@ #import matplotlib.pyplot as plt #import seaborn as sns +import cobra.utils as utils + def generate_pig_tables(data: pd.DataFrame, id_column_name: str, @@ -79,7 +81,7 @@ def compute_pig_table(data: pd.DataFrame, # add the column name to a variable column # add the average incidence # replace population size by a percentage of total population - res["variable"] = column_name + res["variable"] = utils.clean_predictor_name(column_name) res["avg_incidence"] = avg_incidence res["pop_size"] = res["pop_size"]/len(data.index) From a0f9ee79a9f63e9403f9d56ad8c59ba3f2620d05 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 25 Mar 2020 11:17:40 +0100 Subject: [PATCH 71/98] Modified example notebook to new version --- examples/examples.ipynb | 2421 ++++++++++++--------------------------- 1 file changed, 751 insertions(+), 1670 deletions(-) diff --git a/examples/examples.ipynb b/examples/examples.ipynb index fd8f6fd..4611637 100644 --- a/examples/examples.ipynb +++ b/examples/examples.ipynb @@ -6,30 +6,22 @@ "source": [ "\n", "\n", - "# COBRA 1.0 as a Python library\n", + "# COBRA v1.0.0\n", "----------------------------------------------------------------------------------------------------------\n", - "**Autor**: Jan Benisek\n", + "**Autors**: Jan Benisek, Matthias Roels, Geert Verstraeten\n", "\n", - "**Date**: 05/03/2018\n", + "**Date**: 25/03/2020\n", "\n", - "**Purpose**: Show transformed COBRA 1.0 code into Python library for easy use\n", + "**Purpose**: Example usage of COBRA 1.0.0\n", "\n", - "**Requirements**: Python 3.6 (Conda distribution), COBRA library (to be specified)\n", + "**Requirements**: Python 3.6, COBRA library (installation, see 
README)\n", "\n", "----------------------------------------------------------------------------------------------------------\n", - "\n", - "**BEFORE YOU START**\n", - " * The dataset must contains a column name _ID_, which is a unique identifier of a row\n", - " * The column with target must be named _TARGET_\n", - " * .csv should be comma delimited (although Pandas will try to guess the sepator)\n", - " * Make sure you have the latest version of COBRA (follow the instructions on the main page of the repository)\n", - " \n", " \n", "## List of content\n", " * Data transformation\n", " * Univariate selection\n", - " * Model building\n", - " * Model comparison" + " * Model building" ] }, { @@ -38,24 +30,8 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "import warnings\n", - "warnings.filterwarnings(\"ignore\")\n", - "\n", - "#####################\n", - "import cobra.cobra as c\n", - "#####################\n", - "\n", - "data_path = 'C:/Local/pers/Documents/GitHub/COBRA/datasets/data.csv'\n", - "data_types_path = 'C:/Local/pers/Documents/GitHub/COBRA/datasets/data_types.csv'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data transformation\n", - "The first step is, as in the original version, to import and process the data." + "%reload_ext autoreload\n", + "%autoreload 2" ] }, { @@ -64,29 +40,34 @@ "metadata": {}, "outputs": [], "source": [ - "build = c.COBRA(data_path, #Path to .csv file which contains the data\n", - " data_types_path, #Path to .csv files which contains the metadata\n", - " partition_train=0.5, #Size of training set as int <0;1>\n", - " partition_select=0.3, #Size of selection set as int <0;1>\n", - " partition_valid=0.2, #Size of validation set as int <0;1>\n", - " sampling_1=1, #Size of sampling of target class (ie. 0.5 = take 50% of 1s)\n", - " sampling_0=1, #Size of sampling of non-target class (ie. 
0.5 = take 50% of 0s)\n", - " discret_nbins=5, #Number of bins when binning continuous variables\n", - " regroup_sign=0.001, #Threshold for regrouping cat. variables (p-value)\n", - " rseed=0) #Random seed for reproduction\n", - "df_transformed = build.transform()" + "# third party imports \n", + "import numpy as np\n", + "import pandas as pd \n", + "\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "# Custom imports\n", + "import sys\n", + "sys.path.append(\"/mnt/c/Users/matroe/Documents/workspace/cobra\")\n", + "from cobra.preprocessing import PreProcessor\n", + "from cobra.model_building import univariate_selection\n", + "from cobra.model_building import ForwardFeatureSelection\n", + "from cobra.evaluation import generate_pig_tables" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Below you can see first 10 rows of the transformed dataframe." + "## Data Loading\n", + "The first step is to load a dataset into a pandas DataFrame. 
This step is not part of COBRA as we want to keep the flexibility the load from different data sources" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -110,806 +91,339 @@ " \n", " \n", " \n", - " ID\n", - " TARGET\n", - " PARTITION\n", - " B_age\n", - " B_fnlwgt\n", - " B_education-num\n", - " B_capital-gain\n", - " B_capital-loss\n", - " B_hours-per-week\n", - " B_scont_1\n", - " B_scont_2\n", - " B_scont_3\n", - " B_scont_4\n", - " B_scont_5\n", - " B_scont_6\n", - " B_scont_7\n", - " B_scont_8\n", - " B_scont_9\n", - " B_scont_10\n", - " B_sflag_1\n", - " B_sflag_2\n", - " B_sflag_3\n", - " B_sflag_4\n", - " B_sflag_5\n", - " B_marital-status\n", + " age\n", + " workclass\n", + " fnlwgt\n", + " education\n", + " education-num\n", + " marital-status\n", + " occupation\n", + " relationship\n", + " race\n", + " sex\n", " ...\n", - " D_sex\n", - " D_native-country\n", - " D_hours-per-week\n", - " D_scont_2\n", - " D_sflag_2\n", - " D_scat_4\n", - " D_scont_4\n", - " D_sflag_1\n", - " D_scont_1\n", - " D_workclass\n", - " D_scont_9\n", - " D_scat_1\n", - " D_capital-loss\n", - " D_capital-gain\n", - " D_sflag_3\n", - " D_scat_2\n", - " D_fnlwgt\n", - " D_scont_7\n", - " D_sflag_4\n", - " D_relationship\n", - " D_education\n", - " D_scont_5\n", - " D_age\n", - " D_scont_10\n", - " D_race\n", + " scat_1\n", + " scat_2\n", + " scat_3\n", + " scat_4\n", + " scat_5\n", + " sflag_1\n", + " sflag_2\n", + " sflag_3\n", + " sflag_4\n", + " sflag_5\n", " \n", " \n", " \n", " \n", " 0\n", - " 6640.0\n", - " 1.0\n", - " train\n", - " (41, 51]\n", - " (195948, 260560]\n", - " (13,...]\n", - " [..., 1797]\n", - " (1258,...]\n", - " (35, 40]\n", - " [..., 2]\n", - " (2, 4]\n", - " (9, 12]\n", - " (16,...]\n", - " [..., 5]\n", - " (7, 12]\n", - " (21, 28]\n", - " [..., 9]\n", - " [..., 9]\n", - " (30, 40]\n", - " 1.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " Married-civ-spouse\n", + " 39\n", + " 
State-gov\n", + " 77516\n", + " Bachelors\n", + " 13\n", + " Never-married\n", + " Adm-clerical\n", + " Not-in-family\n", + " White\n", + " Male\n", " ...\n", - " 0.301925\n", - " 0.242674\n", - " 0.213624\n", - " 0.230408\n", - " 0.240833\n", - " 0.239292\n", - " 0.248492\n", - " 0.245857\n", - " 0.246575\n", - " 0.557160\n", - " 0.220824\n", - " 0.239292\n", - " 0.512613\n", - " 0.205332\n", - " 0.236874\n", - " 0.239292\n", - " 0.224519\n", - " 0.243842\n", - " 0.236838\n", - " 0.444804\n", - " 0.756410\n", - " 0.235711\n", - " 0.380231\n", - " 0.231227\n", - " 0.254518\n", + " A\n", + " C\n", + " C\n", + " A\n", + " A\n", + " 0\n", + " 1\n", + " 0\n", + " 0\n", + " 0\n", " \n", " \n", " 1\n", - " 8393.0\n", - " 1.0\n", - " train\n", - " (26, 33]\n", - " (195948, 260560]\n", - " (13,...]\n", - " [..., 1797]\n", - " [..., 1258]\n", - " (40, 50]\n", - " [..., 2]\n", - " (2, 4]\n", - " (6, 9]\n", - " [..., 5]\n", - " (21,...]\n", - " (18, 25]\n", - " [..., 7]\n", - " (9, 17]\n", - " (37,...]\n", - " (20, 30]\n", - " 1.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " Never-married\n", + " 50\n", + " Self-emp-not-inc\n", + " 83311\n", + " Bachelors\n", + " 13\n", + " Married-civ-spouse\n", + " Exec-managerial\n", + " Husband\n", + " White\n", + " Male\n", " ...\n", - " 0.113317\n", - " 0.242674\n", - " 0.395141\n", - " 0.230408\n", - " 0.240833\n", - " 0.239292\n", - " 0.237532\n", - " 0.245857\n", - " 0.246575\n", - " 0.217724\n", - " 0.237834\n", - " 0.239292\n", - " 0.226278\n", - " 0.205332\n", - " 0.236874\n", - " 0.239292\n", - " 0.224519\n", - " 0.231617\n", - " 0.236838\n", - " 0.101233\n", - " 0.560060\n", - " 0.250251\n", - " 0.182973\n", - " 0.236608\n", - " 0.254518\n", + " B\n", + " B\n", + " A\n", + " B\n", + " A\n", + " 1\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", " \n", " \n", " 2\n", - " 32199.0\n", - " 1.0\n", - " train\n", - " (51,...]\n", - " (156845, 195948]\n", - " (9, 10]\n", - " (1797,...]\n", - " [..., 1258]\n", - " [..., 35]\n", - 
" (4,...]\n", - " [..., 2]\n", - " [..., 3]\n", - " (12, 16]\n", - " (11, 16]\n", - " (18, 25]\n", - " [..., 7]\n", - " (24, 33]\n", - " (37,...]\n", - " [..., 10]\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", - " Married-civ-spouse\n", + " 38\n", + " Private\n", + " 215646\n", + " HS-grad\n", + " 9\n", + " Divorced\n", + " Handlers-cleaners\n", + " Not-in-family\n", + " White\n", + " Male\n", " ...\n", - " 0.301925\n", - " 0.242674\n", - " 0.082494\n", - " 0.239825\n", - " 0.237778\n", - " 0.239292\n", - " 0.244302\n", - " 0.232815\n", - " 0.241358\n", - " 0.557160\n", - " 0.237834\n", - " 0.239292\n", - " 0.226278\n", - " 0.646965\n", - " 0.236874\n", - " 0.239292\n", - " 0.251229\n", - " 0.231617\n", - " 0.241786\n", - " 0.444804\n", - " 0.182432\n", - " 0.239166\n", - " 0.310733\n", - " 0.241169\n", - " 0.254518\n", + " A\n", + " C\n", + " A\n", + " A\n", + " F\n", + " 0\n", + " 1\n", + " 0\n", + " 1\n", + " 0\n", " \n", " \n", " 3\n", - " 16168.0\n", - " 1.0\n", - " train\n", - " (41, 51]\n", - " (260560,...]\n", - " [..., 9]\n", - " (1797,...]\n", - " [..., 1258]\n", - " (40, 50]\n", - " [..., 2]\n", - " (6, 8]\n", - " (9, 12]\n", - " (8, 12]\n", - " (11, 16]\n", - " (12, 18]\n", - " (28,...]\n", - " (9, 17]\n", - " (37,...]\n", - " (30, 40]\n", - " 1.0\n", - " 1.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", + " 53\n", + " Private\n", + " 234721\n", + " 11th\n", + " 7\n", " Married-civ-spouse\n", + " Handlers-cleaners\n", + " Husband\n", + " Black\n", + " Male\n", " ...\n", - " 0.301925\n", - " 0.242674\n", - " 0.395141\n", - " 0.235776\n", - " 0.240833\n", - " 0.239292\n", - " 0.234490\n", - " 0.245857\n", - " 0.246575\n", - " 0.217724\n", - " 0.237834\n", - " 0.239292\n", - " 0.226278\n", - " 0.646965\n", - " 0.236874\n", - " 0.239292\n", - " 0.230596\n", - " 0.243021\n", - " 0.241786\n", - " 0.444804\n", - " 0.162138\n", - " 0.239166\n", - " 0.380231\n", - " 0.231227\n", - " 0.254518\n", + " B\n", + " B\n", + " C\n", + " A\n", + " B\n", + " 
0\n", + " 0\n", + " 1\n", + " 0\n", + " 0\n", " \n", " \n", " 4\n", - " 6469.0\n", - " 1.0\n", - " train\n", - " (33, 41]\n", - " (195948, 260560]\n", - " (13,...]\n", - " [..., 1797]\n", - " [..., 1258]\n", - " (40, 50]\n", - " [..., 2]\n", - " (8,...]\n", - " (12,...]\n", - " (8, 12]\n", - " [..., 5]\n", - " [..., 7]\n", - " [..., 7]\n", - " (24, 33]\n", - " (9, 18]\n", - " [..., 10]\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " Married-civ-spouse\n", - " ...\n", - " 0.301925\n", - " 0.242674\n", - " 0.395141\n", - " 0.252790\n", - " 0.237778\n", - " 0.239292\n", - " 0.234490\n", - " 0.232815\n", - " 0.246575\n", - " 0.217724\n", - " 0.246939\n", - " 0.239292\n", - " 0.226278\n", - " 0.205332\n", - " 0.241725\n", - " 0.239292\n", - " 0.224519\n", - " 0.231617\n", - " 0.236838\n", - " 0.444804\n", - " 0.560060\n", - " 0.235711\n", - " 0.312165\n", - " 0.241169\n", - " 0.254518\n", - " \n", - " \n", - " 5\n", - " 101.0\n", - " 1.0\n", - " train\n", - " (51,...]\n", - " (105659, 156845]\n", - " (13,...]\n", - " [..., 1797]\n", - " [..., 1258]\n", - " (35, 40]\n", - " [..., 2]\n", - " [..., 2]\n", - " (6, 9]\n", - " (5, 8]\n", - " (5, 11]\n", - " (12, 18]\n", - " (7, 14]\n", - " (17, 24]\n", - " (28, 37]\n", - " (40,...]\n", - " 1.0\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", - " 0.0\n", - " Married-civ-spouse\n", - " ...\n", - " 0.301925\n", - " 0.242674\n", - " 0.213624\n", - " 0.239825\n", - " 0.237778\n", - " 0.239292\n", - " 0.230127\n", - " 0.245857\n", - " 0.246575\n", - " 0.217724\n", - " 0.248562\n", - " 0.239292\n", - " 0.226278\n", - " 0.205332\n", - " 0.241725\n", - " 0.239292\n", - " 0.264947\n", - " 0.236933\n", - " 0.241786\n", - " 0.444804\n", - " 0.560060\n", - " 0.234372\n", - " 0.310733\n", - " 0.246743\n", - " 0.254518\n", - " \n", - " \n", - " 6\n", - " 22547.0\n", - " 1.0\n", - " train\n", - " (41, 51]\n", - " [..., 105659]\n", - " (10, 13]\n", - " [..., 1797]\n", - " [..., 1258]\n", - " (35, 40]\n", - " (3, 4]\n", - " (4, 
6]\n", - " [..., 3]\n", - " [..., 5]\n", - " (5, 11]\n", - " (18, 25]\n", - " (14, 21]\n", - " [..., 9]\n", - " [..., 9]\n", - " [..., 10]\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", - " 1.0\n", - " 0.0\n", - " Married-civ-spouse\n", - " ...\n", - " 0.301925\n", - " 0.242674\n", - " 0.213624\n", - " 0.237765\n", - " 0.240833\n", - " 0.239292\n", - " 0.237532\n", - " 0.232815\n", - " 0.230504\n", - " 0.217724\n", - " 0.220824\n", - " 0.239292\n", - " 0.226278\n", - " 0.205332\n", - " 0.241725\n", - " 0.239292\n", - " 0.225179\n", - " 0.241300\n", - " 0.241786\n", - " 0.444804\n", - " 0.246997\n", - " 0.234372\n", - " 0.380231\n", - " 0.241169\n", - " 0.254518\n", - " \n", - " \n", - " 7\n", - " 1319.0\n", - " 1.0\n", - " train\n", - " (33, 41]\n", - " (105659, 156845]\n", - " (10, 13]\n", - " [..., 1797]\n", - " [..., 1258]\n", - " (35, 40]\n", - " (4,...]\n", - " [..., 2]\n", - " [..., 3]\n", - " [..., 5]\n", - " (5, 11]\n", - " (12, 18]\n", - " [..., 7]\n", - " (17, 24]\n", - " (28, 37]\n", - " (20, 30]\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", - " 0.0\n", - " 1.0\n", - " Married-civ-spouse\n", - " ...\n", - " 0.301925\n", - " 0.242674\n", - " 0.213624\n", - " 0.239825\n", - " 0.240833\n", - " 0.239292\n", - " 0.237532\n", - " 0.232815\n", - " 0.241358\n", - " 0.273360\n", - " 0.248562\n", - " 0.239292\n", - " 0.226278\n", - " 0.205332\n", - " 0.241725\n", - " 0.239292\n", - " 0.264947\n", - " 0.231617\n", - " 0.236838\n", - " 0.444804\n", - " 0.415358\n", - " 0.234372\n", - " 0.312165\n", - " 0.236608\n", - " 0.254518\n", - " \n", - " \n", - " 8\n", - " 653.0\n", - " 1.0\n", - " train\n", - " (33, 41]\n", - " (260560,...]\n", - " (13,...]\n", - " [..., 1797]\n", - " [..., 1258]\n", - " (35, 40]\n", - " (4,...]\n", - " (6, 8]\n", - " (9, 12]\n", - " (5, 8]\n", - " (11, 16]\n", - " (7, 12]\n", - " (14, 21]\n", - " [..., 9]\n", - " (37,...]\n", - " (10, 20]\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", + " 28\n", + " Private\n", + " 338409\n", + " 
Bachelors\n", + " 13\n", " Married-civ-spouse\n", + " Prof-specialty\n", + " Wife\n", + " Black\n", + " Female\n", " ...\n", - " 0.301925\n", - " 0.242674\n", - " 0.213624\n", - " 0.235776\n", - " 0.240833\n", - " 0.239292\n", - " 0.230127\n", - " 0.232815\n", - " 0.241358\n", - " 0.217724\n", - " 0.237834\n", - " 0.239292\n", - " 0.226278\n", - " 0.205332\n", - " 0.236874\n", - " 0.239292\n", - " 0.230596\n", - " 0.241300\n", - " 0.236838\n", - " 0.444804\n", - " 0.762590\n", - " 0.239166\n", - " 0.312165\n", - " 0.240711\n", - " 0.254518\n", - " \n", - " \n", - " 9\n", - " 16288.0\n", - " 1.0\n", - " train\n", - " (41, 51]\n", - " (260560,...]\n", - " (9, 10]\n", - " [..., 1797]\n", - " [..., 1258]\n", - " (35, 40]\n", - " (3, 4]\n", - " (4, 6]\n", - " (12,...]\n", - " (16,...]\n", - " (11, 16]\n", - " (18, 25]\n", - " (21, 28]\n", - " (9, 17]\n", - " (9, 18]\n", - " (30, 40]\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", - " 0.0\n", - " Married-civ-spouse\n", - " ...\n", - " 0.301925\n", - " 0.242674\n", - " 0.213624\n", - " 0.237765\n", - " 0.237778\n", - " 0.239292\n", - " 0.248492\n", - " 0.232815\n", - " 0.230504\n", - " 0.217724\n", - " 0.246939\n", - " 0.239292\n", - " 0.226278\n", - " 0.205332\n", - " 0.241725\n", - " 0.239292\n", - " 0.230596\n", - " 0.243842\n", - " 0.241786\n", - " 0.444804\n", - " 0.182432\n", - " 0.239166\n", - " 0.380231\n", - " 0.231227\n", - " 0.254518\n", + " B\n", + " B\n", + " C\n", + " C\n", + " A\n", + " 1\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", " \n", " \n", "\n", - "

    10 rows × 71 columns

    \n", + "

    5 rows × 36 columns

    \n", "" ], "text/plain": [ - " ID TARGET PARTITION B_age B_fnlwgt B_education-num \\\n", - "0 6640.0 1.0 train (41, 51] (195948, 260560] (13,...] \n", - "1 8393.0 1.0 train (26, 33] (195948, 260560] (13,...] \n", - "2 32199.0 1.0 train (51,...] (156845, 195948] (9, 10] \n", - "3 16168.0 1.0 train (41, 51] (260560,...] [..., 9] \n", - "4 6469.0 1.0 train (33, 41] (195948, 260560] (13,...] \n", - "5 101.0 1.0 train (51,...] (105659, 156845] (13,...] \n", - "6 22547.0 1.0 train (41, 51] [..., 105659] (10, 13] \n", - "7 1319.0 1.0 train (33, 41] (105659, 156845] (10, 13] \n", - "8 653.0 1.0 train (33, 41] (260560,...] (13,...] \n", - "9 16288.0 1.0 train (41, 51] (260560,...] (9, 10] \n", - "\n", - " B_capital-gain B_capital-loss B_hours-per-week B_scont_1 B_scont_2 \\\n", - "0 [..., 1797] (1258,...] (35, 40] [..., 2] (2, 4] \n", - "1 [..., 1797] [..., 1258] (40, 50] [..., 2] (2, 4] \n", - "2 (1797,...] [..., 1258] [..., 35] (4,...] [..., 2] \n", - "3 (1797,...] [..., 1258] (40, 50] [..., 2] (6, 8] \n", - "4 [..., 1797] [..., 1258] (40, 50] [..., 2] (8,...] \n", - "5 [..., 1797] [..., 1258] (35, 40] [..., 2] [..., 2] \n", - "6 [..., 1797] [..., 1258] (35, 40] (3, 4] (4, 6] \n", - "7 [..., 1797] [..., 1258] (35, 40] (4,...] [..., 2] \n", - "8 [..., 1797] [..., 1258] (35, 40] (4,...] (6, 8] \n", - "9 [..., 1797] [..., 1258] (35, 40] (3, 4] (4, 6] \n", - "\n", - " B_scont_3 B_scont_4 B_scont_5 B_scont_6 B_scont_7 B_scont_8 B_scont_9 \\\n", - "0 (9, 12] (16,...] [..., 5] (7, 12] (21, 28] [..., 9] [..., 9] \n", - "1 (6, 9] [..., 5] (21,...] (18, 25] [..., 7] (9, 17] (37,...] \n", - "2 [..., 3] (12, 16] (11, 16] (18, 25] [..., 7] (24, 33] (37,...] \n", - "3 (9, 12] (8, 12] (11, 16] (12, 18] (28,...] (9, 17] (37,...] \n", - "4 (12,...] 
(8, 12] [..., 5] [..., 7] [..., 7] (24, 33] (9, 18] \n", - "5 (6, 9] (5, 8] (5, 11] (12, 18] (7, 14] (17, 24] (28, 37] \n", - "6 [..., 3] [..., 5] (5, 11] (18, 25] (14, 21] [..., 9] [..., 9] \n", - "7 [..., 3] [..., 5] (5, 11] (12, 18] [..., 7] (17, 24] (28, 37] \n", - "8 (9, 12] (5, 8] (11, 16] (7, 12] (14, 21] [..., 9] (37,...] \n", - "9 (12,...] (16,...] (11, 16] (18, 25] (21, 28] (9, 17] (9, 18] \n", + " age workclass fnlwgt education education-num \\\n", + "0 39 State-gov 77516 Bachelors 13 \n", + "1 50 Self-emp-not-inc 83311 Bachelors 13 \n", + "2 38 Private 215646 HS-grad 9 \n", + "3 53 Private 234721 11th 7 \n", + "4 28 Private 338409 Bachelors 13 \n", "\n", - " B_scont_10 B_sflag_1 B_sflag_2 B_sflag_3 B_sflag_4 B_sflag_5 \\\n", - "0 (30, 40] 1.0 1.0 0.0 0.0 1.0 \n", - "1 (20, 30] 1.0 1.0 0.0 0.0 0.0 \n", - "2 [..., 10] 0.0 0.0 0.0 1.0 1.0 \n", - "3 (30, 40] 1.0 1.0 0.0 1.0 0.0 \n", - "4 [..., 10] 0.0 0.0 1.0 0.0 0.0 \n", - "5 (40,...] 1.0 0.0 1.0 1.0 0.0 \n", - "6 [..., 10] 0.0 1.0 1.0 1.0 0.0 \n", - "7 (20, 30] 0.0 1.0 1.0 0.0 1.0 \n", - "8 (10, 20] 0.0 1.0 0.0 0.0 0.0 \n", - "9 (30, 40] 0.0 0.0 1.0 1.0 0.0 \n", + " marital-status occupation relationship race sex \\\n", + "0 Never-married Adm-clerical Not-in-family White Male \n", + "1 Married-civ-spouse Exec-managerial Husband White Male \n", + "2 Divorced Handlers-cleaners Not-in-family White Male \n", + "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", + "4 Married-civ-spouse Prof-specialty Wife Black Female \n", "\n", - " B_marital-status ... D_sex D_native-country D_hours-per-week \\\n", - "0 Married-civ-spouse ... 0.301925 0.242674 0.213624 \n", - "1 Never-married ... 0.113317 0.242674 0.395141 \n", - "2 Married-civ-spouse ... 0.301925 0.242674 0.082494 \n", - "3 Married-civ-spouse ... 0.301925 0.242674 0.395141 \n", - "4 Married-civ-spouse ... 0.301925 0.242674 0.395141 \n", - "5 Married-civ-spouse ... 0.301925 0.242674 0.213624 \n", - "6 Married-civ-spouse ... 
0.301925 0.242674 0.213624 \n", - "7 Married-civ-spouse ... 0.301925 0.242674 0.213624 \n", - "8 Married-civ-spouse ... 0.301925 0.242674 0.213624 \n", - "9 Married-civ-spouse ... 0.301925 0.242674 0.213624 \n", + " ... scat_1 scat_2 scat_3 scat_4 scat_5 sflag_1 sflag_2 sflag_3 \\\n", + "0 ... A C C A A 0 1 0 \n", + "1 ... B B A B A 1 0 0 \n", + "2 ... A C A A F 0 1 0 \n", + "3 ... B B C A B 0 0 1 \n", + "4 ... B B C C A 1 0 0 \n", "\n", - " D_scont_2 D_sflag_2 D_scat_4 D_scont_4 D_sflag_1 D_scont_1 D_workclass \\\n", - "0 0.230408 0.240833 0.239292 0.248492 0.245857 0.246575 0.557160 \n", - "1 0.230408 0.240833 0.239292 0.237532 0.245857 0.246575 0.217724 \n", - "2 0.239825 0.237778 0.239292 0.244302 0.232815 0.241358 0.557160 \n", - "3 0.235776 0.240833 0.239292 0.234490 0.245857 0.246575 0.217724 \n", - "4 0.252790 0.237778 0.239292 0.234490 0.232815 0.246575 0.217724 \n", - "5 0.239825 0.237778 0.239292 0.230127 0.245857 0.246575 0.217724 \n", - "6 0.237765 0.240833 0.239292 0.237532 0.232815 0.230504 0.217724 \n", - "7 0.239825 0.240833 0.239292 0.237532 0.232815 0.241358 0.273360 \n", - "8 0.235776 0.240833 0.239292 0.230127 0.232815 0.241358 0.217724 \n", - "9 0.237765 0.237778 0.239292 0.248492 0.232815 0.230504 0.217724 \n", + " sflag_4 sflag_5 \n", + "0 0 0 \n", + "1 0 0 \n", + "2 1 0 \n", + "3 0 0 \n", + "4 0 1 \n", "\n", - " D_scont_9 D_scat_1 D_capital-loss D_capital-gain D_sflag_3 D_scat_2 \\\n", - "0 0.220824 0.239292 0.512613 0.205332 0.236874 0.239292 \n", - "1 0.237834 0.239292 0.226278 0.205332 0.236874 0.239292 \n", - "2 0.237834 0.239292 0.226278 0.646965 0.236874 0.239292 \n", - "3 0.237834 0.239292 0.226278 0.646965 0.236874 0.239292 \n", - "4 0.246939 0.239292 0.226278 0.205332 0.241725 0.239292 \n", - "5 0.248562 0.239292 0.226278 0.205332 0.241725 0.239292 \n", - "6 0.220824 0.239292 0.226278 0.205332 0.241725 0.239292 \n", - "7 0.248562 0.239292 0.226278 0.205332 0.241725 0.239292 \n", - "8 0.237834 0.239292 0.226278 0.205332 0.236874 
0.239292 \n", - "9 0.246939 0.239292 0.226278 0.205332 0.241725 0.239292 \n", - "\n", - " D_fnlwgt D_scont_7 D_sflag_4 D_relationship D_education D_scont_5 \\\n", - "0 0.224519 0.243842 0.236838 0.444804 0.756410 0.235711 \n", - "1 0.224519 0.231617 0.236838 0.101233 0.560060 0.250251 \n", - "2 0.251229 0.231617 0.241786 0.444804 0.182432 0.239166 \n", - "3 0.230596 0.243021 0.241786 0.444804 0.162138 0.239166 \n", - "4 0.224519 0.231617 0.236838 0.444804 0.560060 0.235711 \n", - "5 0.264947 0.236933 0.241786 0.444804 0.560060 0.234372 \n", - "6 0.225179 0.241300 0.241786 0.444804 0.246997 0.234372 \n", - "7 0.264947 0.231617 0.236838 0.444804 0.415358 0.234372 \n", - "8 0.230596 0.241300 0.236838 0.444804 0.762590 0.239166 \n", - "9 0.230596 0.243842 0.241786 0.444804 0.182432 0.239166 \n", - "\n", - " D_age D_scont_10 D_race \n", - "0 0.380231 0.231227 0.254518 \n", - "1 0.182973 0.236608 0.254518 \n", - "2 0.310733 0.241169 0.254518 \n", - "3 0.380231 0.231227 0.254518 \n", - "4 0.312165 0.241169 0.254518 \n", - "5 0.310733 0.246743 0.254518 \n", - "6 0.380231 0.241169 0.254518 \n", - "7 0.312165 0.236608 0.254518 \n", - "8 0.312165 0.240711 0.254518 \n", - "9 0.380231 0.231227 0.254518 \n", - "\n", - "[10 rows x 71 columns]" + "[5 rows x 36 columns]" ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_transformed.head(n=10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The library also offers a function to print basic summary (will be improved in the future)" + "path = \"../datasets/data.csv\"\n", + "\n", + "basetable = pd.read_csv(path)\n", + "\n", + "basetable.head()" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 12, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "----------------- SUMMARY -----------------\n", - "Dataset has 48842 rows and 71 columns.\n", - "Train set has 24422 rows\n", 
- "Selection set has 14654 rows\n", - "Validation set has 9766 rows\n", - "Overall incidence rate is 23.93%\n", - "0.00% records in the dataset are missing.\n", - "-------------------------------------------\n" - ] + "data": { + "text/plain": [ + "Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',\n", + " 'marital-status', 'occupation', 'relationship', 'race', 'sex',\n", + " 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',\n", + " 'ID', 'scont_1', 'scont_2', 'scont_3', 'scont_4', 'scont_5', 'scont_6',\n", + " 'scont_7', 'scont_8', 'scont_9', 'scont_10', 'scat_1', 'scat_2',\n", + " 'scat_3', 'scat_4', 'scat_5', 'sflag_1', 'sflag_2', 'sflag_3',\n", + " 'sflag_4', 'sflag_5', 'TARGET', 'split'],\n", + " dtype='object')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "build.summary(df_transformed) #Dataframe with transformed dataset" + "basetable.columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Moreover, the class has few available attributes, which migth help you throughout your work." 
+ "## Data preparation" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "train 29305\n", + "validation 9769\n", + "selection 9768\n", + "Name: split, dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#Dictionary with variables used grouped by data type\n", - "#build._headers_dict\n", + "# Prepare data\n", + "path = \"../test_pipeline.json\"\n", + "preprocessor = PreProcessor.from_params(serialization_path=path)\n", "\n", - "#Size of partitionsd\n", - "#build._partitioning_settings" + "basetable = preprocessor.train_selection_validation_split(basetable, target_column_name=\"TARGET\",\n", + " train_prop=0.6, selection_prop=0.2,\n", + " validation_prop=0.2)\n", + "basetable.split.value_counts()" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 20, "metadata": {}, + "outputs": [], "source": [ - "## Univariate selection\n", - "The second step in the process is univariate selection." 
+ "# create lists containing the continuous and discrete variables \n", + "continuous_vars = [\"age\", \"capital-gain\", \"capital-loss\"] + [f\"scont_{i}\" for i in range(1, 11)]\n", + "discrete_vars = ([\"workclass\", \"fnlwgt\", \"education\", \"marital-status\",\n", + " \"occupation\", \"relationship\", \"race\", \"sex\",\n", + " \"hours-per-week\", \"native-country\"] \n", + " + [f\"scat_{i}\" for i in range(1, 6)] \n", + " + [f\"sflag_{i}\" for i in range(1, 6)])" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Only 1 bin was found for predictor 'capital-gain' so it will be ignored in computation\n", + "Only 1 bin was found for predictor 'capital-loss' so it will be ignored in computation\n", + "The number of actual bins for predictor 'scont_1' is 4 which is smaller than the requested number of bins 10\n", + "The number of actual bins for predictor 'scont_2' is 9 which is smaller than the requested number of bins 10\n", + "/home/matthias/.local/lib/python3.8/site-packages/pandas/core/indexing.py:845: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " self.obj[key] = _infer_fill_value(value)\n", + "/home/matthias/.local/lib/python3.8/site-packages/pandas/core/indexing.py:966: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " self.obj[item] = s\n", + "DataFrame has no column 
'capital-gain_bin', so it will be skipped in fitting\n", + "DataFrame has no column 'capital-loss_bin', so it will be skipped in fitting\n" + ] + } + ], "source": [ - "df_unisel, df_corr = build.fit_univariate(df_transformed, #Dataframe with transformed data\n", - " preselect_auc=0.53, #Minimal threshol for AUC selection\n", - " preselect_overtrain=5) #Threshold for difference between train and test performance (prevent overfitting)" + "preprocessor.fit(basetable[basetable[\"split\"]==\"train\"],\n", + " continuous_vars=continuous_vars, discrete_vars=discrete_vars,\n", + " target_column_name=\"TARGET\")" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 23, "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Unknown column 'capital-gain_bin' will be skipped\n", + "Unknown column 'capital-loss_bin' will be skipped\n" + ] + } + ], "source": [ - "Below is first 10 rows of the output of univariate selection. You can see AUC on train and test and column preselection, which shows which variables met the first selection criteria (AUC >= 0.53 and not overtrained)." 
+ "basetable = (preprocessor\n", + " .transform(basetable,\n", + " continuous_vars=continuous_vars, \n", + " discrete_vars=discrete_vars))" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -933,120 +447,244 @@ " \n", " \n", " \n", - " variable\n", - " AUC train\n", - " AUC selection\n", - " preselection\n", + " age\n", + " workclass\n", + " fnlwgt\n", + " education\n", + " education-num\n", + " marital-status\n", + " occupation\n", + " relationship\n", + " race\n", + " sex\n", + " ...\n", + " scont_1_enc\n", + " scont_2_enc\n", + " scont_3_enc\n", + " scont_4_enc\n", + " scont_5_enc\n", + " scont_6_enc\n", + " scont_7_enc\n", + " scont_8_enc\n", + " scont_9_enc\n", + " scont_10_enc\n", " \n", " \n", " \n", " \n", " 0\n", - " scont_6\n", - " 0.512\n", - " 0.505\n", - " False\n", + " 56\n", + " Self-emp-not-inc\n", + " 206149\n", + " 7th-8th\n", + " 4\n", + " Never-married\n", + " Other-service\n", + " Unmarried\n", + " Black\n", + " Female\n", + " ...\n", + " 0.240395\n", + " 0.248292\n", + " 0.234307\n", + " 0.240191\n", + " 0.248383\n", + " 0.236050\n", + " 0.245621\n", + " 0.228513\n", + " 0.230132\n", + " 0.229873\n", " \n", " \n", " 1\n", - " scont_3\n", - " 0.508\n", - " 0.507\n", - " False\n", + " 38\n", + " Self-emp-not-inc\n", + " 342635\n", + " Bachelors\n", + " 13\n", + " Married-civ-spouse\n", + " Prof-specialty\n", + " Husband\n", + " White\n", + " Male\n", + " ...\n", + " 0.230626\n", + " 0.223856\n", + " 0.235635\n", + " 0.233058\n", + " 0.237876\n", + " 0.247832\n", + " 0.242909\n", + " 0.237694\n", + " 0.244211\n", + " 0.227472\n", " \n", " \n", " 2\n", - " marital-status\n", - " 0.769\n", - " 0.773\n", - " True\n", + " 60\n", + " Self-emp-not-inc\n", + " 197060\n", + " HS-grad\n", + " 9\n", + " Married-civ-spouse\n", + " Craft-repair\n", + " Husband\n", + " White\n", + " Male\n", + " ...\n", + " 0.243971\n", + " 0.267262\n", + " 0.234307\n", + " 0.246637\n", + " 0.248383\n", + " 
0.225694\n", + " 0.233650\n", + " 0.232350\n", + " 0.244211\n", + " 0.229873\n", " \n", " \n", " 3\n", - " occupation\n", - " 0.729\n", - " 0.722\n", - " True\n", + " 21\n", + " Private\n", + " 113106\n", + " HS-grad\n", + " 9\n", + " Never-married\n", + " Sales\n", + " Other-relative\n", + " White\n", + " Female\n", + " ...\n", + " 0.240395\n", + " 0.244848\n", + " 0.234307\n", + " 0.233058\n", + " 0.237245\n", + " 0.244987\n", + " 0.245621\n", + " 0.245230\n", + " 0.244211\n", + " 0.229873\n", " \n", " \n", " 4\n", - " sflag_5\n", - " 0.506\n", - " 0.514\n", - " False\n", - " \n", - " \n", - " 5\n", - " scont_8\n", - " 0.507\n", - " 0.495\n", - " False\n", - " \n", - " \n", - " 6\n", - " scat_5\n", - " 0.500\n", - " 0.500\n", - " False\n", - " \n", - " \n", - " 7\n", - " education-num\n", - " 0.698\n", - " 0.695\n", - " True\n", - " \n", - " \n", - " 8\n", - " scat_3\n", - " 0.500\n", - " 0.500\n", - " False\n", - " \n", - " \n", - " 9\n", - " sex\n", - " 0.615\n", - " 0.621\n", - " True\n", + " 27\n", + " Private\n", + " 169117\n", + " HS-grad\n", + " 9\n", + " Married-civ-spouse\n", + " Adm-clerical\n", + " Wife\n", + " Black\n", + " Female\n", + " ...\n", + " 0.230626\n", + " 0.244848\n", + " 0.229594\n", + " 0.246637\n", + " 0.232236\n", + " 0.233454\n", + " 0.242909\n", + " 0.241368\n", + " 0.240012\n", + " 0.227472\n", " \n", " \n", "\n", + "

    5 rows × 99 columns

    \n", "" ], "text/plain": [ - " variable AUC train AUC selection preselection\n", - "0 scont_6 0.512 0.505 False\n", - "1 scont_3 0.508 0.507 False\n", - "2 marital-status 0.769 0.773 True\n", - "3 occupation 0.729 0.722 True\n", - "4 sflag_5 0.506 0.514 False\n", - "5 scont_8 0.507 0.495 False\n", - "6 scat_5 0.500 0.500 False\n", - "7 education-num 0.698 0.695 True\n", - "8 scat_3 0.500 0.500 False\n", - "9 sex 0.615 0.621 True" + " age workclass fnlwgt education education-num \\\n", + "0 56 Self-emp-not-inc 206149 7th-8th 4 \n", + "1 38 Self-emp-not-inc 342635 Bachelors 13 \n", + "2 60 Self-emp-not-inc 197060 HS-grad 9 \n", + "3 21 Private 113106 HS-grad 9 \n", + "4 27 Private 169117 HS-grad 9 \n", + "\n", + " marital-status occupation relationship race sex \\\n", + "0 Never-married Other-service Unmarried Black Female \n", + "1 Married-civ-spouse Prof-specialty Husband White Male \n", + "2 Married-civ-spouse Craft-repair Husband White Male \n", + "3 Never-married Sales Other-relative White Female \n", + "4 Married-civ-spouse Adm-clerical Wife Black Female \n", + "\n", + " ... scont_1_enc scont_2_enc scont_3_enc scont_4_enc scont_5_enc \\\n", + "0 ... 0.240395 0.248292 0.234307 0.240191 0.248383 \n", + "1 ... 0.230626 0.223856 0.235635 0.233058 0.237876 \n", + "2 ... 0.243971 0.267262 0.234307 0.246637 0.248383 \n", + "3 ... 0.240395 0.244848 0.234307 0.233058 0.237245 \n", + "4 ... 
0.230626 0.244848 0.229594 0.246637 0.232236 \n", + "\n", + " scont_6_enc scont_7_enc scont_8_enc scont_9_enc scont_10_enc \n", + "0 0.236050 0.245621 0.228513 0.230132 0.229873 \n", + "1 0.247832 0.242909 0.237694 0.244211 0.227472 \n", + "2 0.225694 0.233650 0.232350 0.244211 0.229873 \n", + "3 0.244987 0.245621 0.245230 0.244211 0.229873 \n", + "4 0.233454 0.242909 0.241368 0.240012 0.227472 \n", + "\n", + "[5 rows x 99 columns]" ] }, - "execution_count": 7, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_unisel.head(n=10)" + "basetable.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Second output is a correlation matrix." + "## Univariate selection\n" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['workclass_enc', 'fnlwgt_enc', 'education_enc', 'marital-status_enc', 'occupation_enc', 'relationship_enc', 'race_enc', 'sex_enc', 'hours-per-week_enc', 'native-country_enc', 'scat_1_enc', 'scat_2_enc', 'scat_3_enc', 'scat_4_enc', 'scat_5_enc', 'sflag_1_enc', 'sflag_2_enc', 'sflag_3_enc', 'sflag_4_enc', 'sflag_5_enc', 'age_enc', 'scont_1_enc', 'scont_2_enc', 'scont_3_enc', 'scont_4_enc', 'scont_5_enc', 'scont_6_enc', 'scont_7_enc', 'scont_8_enc', 'scont_9_enc', 'scont_10_enc']\n" + ] + } + ], + "source": [ + "preprocessed_predictors = [col for col in basetable.columns if col.endswith(\"_enc\")]\n", + "print(preprocessed_predictors)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "df_auc = univariate_selection.compute_univariate_preselection(\n", + " target_enc_train_data=basetable[basetable[\"split\"] == \"train\"],\n", + " target_enc_selection_data=basetable[basetable[\"split\"] == \"selection\"],\n", + " predictors=preprocessed_predictors,\n", + " target_column=\"TARGET\")" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "Below is first 10 rows of the output of univariate selection. You can see AUC on train and test and column preselection, which shows which variables met the first selection criteria (AUC >= 0.53 and not overtrained)." + ] + }, + { + "cell_type": "code", + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -1070,638 +708,165 @@ " \n", " \n", " \n", - " D_scont_6\n", - " D_scont_3\n", - " D_marital-status\n", - " D_occupation\n", - " D_sflag_5\n", - " D_scont_8\n", - " D_scat_5\n", - " D_education-num\n", - " D_scat_3\n", - " D_sex\n", - " D_native-country\n", - " D_hours-per-week\n", - " D_scont_2\n", - " D_sflag_2\n", - " D_scat_4\n", - " D_scont_4\n", - " D_sflag_1\n", - " D_scont_1\n", - " D_workclass\n", - " D_scont_9\n", - " D_scat_1\n", - " D_capital-loss\n", - " D_capital-gain\n", - " D_sflag_3\n", - " D_scat_2\n", - " D_fnlwgt\n", - " D_scont_7\n", - " D_sflag_4\n", - " D_relationship\n", - " D_education\n", - " D_scont_5\n", - " D_age\n", - " D_scont_10\n", - " D_race\n", + " predictor\n", + " AUC train\n", + " AUC selection\n", + " preselection\n", " \n", " \n", " \n", " \n", - " D_scont_6\n", - " 1.000000\n", - " 0.007128\n", - " 0.009146\n", - " 0.013050\n", - " -0.001755\n", - " -0.001888\n", - " 0.0\n", - " 0.016360\n", - " 0.0\n", - " -0.004987\n", - " 0.005869\n", - " 0.006357\n", - " 0.000534\n", - " -0.000288\n", - " 0.0\n", - " -0.010261\n", - " 0.006295\n", - " -0.002706\n", - " 0.008775\n", - " -0.002586\n", - " 0.0\n", - " -0.005740\n", - " 0.009891\n", - " -0.003451\n", - " 0.0\n", - " 0.011761\n", - " 0.006252\n", - " 0.006015\n", - " 0.011639\n", - " 0.016165\n", - " -0.005525\n", - " 0.013099\n", - " 0.000822\n", - " 0.010272\n", + " 0\n", + " relationship\n", + " 0.777379\n", + " 0.779450\n", + " True\n", " \n", " \n", - " D_scont_3\n", - " 0.007128\n", - " 1.000000\n", - " 0.008258\n", - " 0.008033\n", - " 0.008862\n", - " 0.004288\n", - " 0.0\n", - " 0.007674\n", - " 0.0\n", - " 
0.007207\n", - " 0.010168\n", - " 0.005486\n", - " 0.003948\n", - " -0.000631\n", - " 0.0\n", - " 0.008515\n", - " 0.009562\n", - " 0.001590\n", - " 0.015998\n", - " -0.005320\n", - " 0.0\n", - " 0.002691\n", - " -0.003005\n", - " 0.005161\n", - " 0.0\n", - " -0.005306\n", - " -0.018140\n", - " -0.005946\n", - " 0.008531\n", - " 0.005123\n", - " 0.004172\n", - " 0.011820\n", - " -0.008748\n", - " 0.004958\n", + " 1\n", + " marital-status\n", + " 0.767728\n", + " 0.771644\n", + " True\n", " \n", " \n", - " D_marital-status\n", - " 0.009146\n", - " 0.008258\n", - " 1.000000\n", - " 0.194836\n", - " 0.003072\n", - " 0.002741\n", - " 0.0\n", - " 0.110648\n", - " 0.0\n", - " 0.417974\n", - " 0.007336\n", - " 0.241776\n", - " 0.009113\n", - " 0.012480\n", - " 0.0\n", - " 0.005861\n", - " 0.014518\n", - " 0.004128\n", - " 0.153130\n", - " 0.008042\n", - " 0.0\n", - " 0.080024\n", - " 0.136724\n", - " 0.007978\n", - " 0.0\n", - " 0.023283\n", - " -0.007105\n", - " 0.003858\n", - " 0.968994\n", - " 0.124311\n", - " -0.005558\n", - " 0.406239\n", - " 0.005080\n", - " 0.127041\n", + " 2\n", + " occupation\n", + " 0.727460\n", + " 0.734469\n", + " True\n", " \n", " \n", - " D_occupation\n", - " 0.013050\n", - " 0.008033\n", - " 0.194836\n", - " 1.000000\n", - " 0.002311\n", - " 0.001243\n", - " 0.0\n", - " 0.498731\n", - " 0.0\n", - " 0.101265\n", - " 0.089346\n", - " 0.260357\n", - " 0.008588\n", - " 0.004220\n", - " 0.0\n", - " 0.008526\n", - " 0.006996\n", - " 0.008079\n", - " 0.265053\n", - " 0.007554\n", - " 0.0\n", - " 0.077405\n", - " 0.121000\n", - " -0.003457\n", - " 0.0\n", - " 0.018786\n", - " 0.005152\n", - " 0.009259\n", - " 0.213882\n", - " 0.514657\n", - " 0.000169\n", - " 0.207039\n", - " -0.000281\n", - " 0.102792\n", + " 3\n", + " age\n", + " 0.707488\n", + " 0.716277\n", + " True\n", " \n", " \n", - " D_sflag_5\n", - " -0.001755\n", - " 0.008862\n", - " 0.003072\n", - " 0.002311\n", - " 1.000000\n", - " -0.000989\n", - " 0.0\n", - " 0.000375\n", - " 0.0\n", 
- " -0.005644\n", - " 0.007657\n", - " 0.001318\n", - " -0.004619\n", - " -0.008614\n", - " 0.0\n", - " -0.003978\n", - " 0.009308\n", - " 0.000902\n", - " 0.002748\n", - " 0.003078\n", - " 0.0\n", - " 0.026558\n", - " -0.003853\n", - " -0.016438\n", - " 0.0\n", - " -0.004404\n", - " -0.000654\n", - " 0.003012\n", - " 0.003403\n", - " -0.000387\n", - " -0.002178\n", - " -0.002454\n", - " -0.000998\n", - " 0.013506\n", + " 4\n", + " education\n", + " 0.717664\n", + " 0.714776\n", + " True\n", " \n", " \n", - " D_scont_8\n", - " -0.001888\n", - " 0.004288\n", - " 0.002741\n", - " 0.001243\n", - " -0.000989\n", - " 1.000000\n", - " 0.0\n", - " -0.000552\n", - " 0.0\n", - " 0.005559\n", - " 0.005384\n", - " 0.003167\n", - " -0.005675\n", - " 0.003946\n", - " 0.0\n", - " 0.003569\n", - " -0.003863\n", - " -0.014552\n", - " 0.007620\n", - " 0.010211\n", - " 0.0\n", - " -0.000134\n", - " 0.008799\n", - " 0.008798\n", - " 0.0\n", - " 0.005380\n", - " 0.008139\n", - " -0.001740\n", - " 0.002170\n", - " -0.000085\n", - " 0.005394\n", - " 0.002401\n", - " -0.002377\n", - " 0.007083\n", + " 5\n", + " hours-per-week\n", + " 0.667920\n", + " 0.664445\n", + " True\n", " \n", " \n", - " D_scat_5\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", - " 0.000000\n", - " 0.0\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", + " 6\n", + " sex\n", + " 0.617710\n", + " 0.622441\n", + " True\n", " \n", " \n", - " D_education-num\n", - " 0.016360\n", - " 0.007674\n", - " 0.110648\n", - " 0.498731\n", - " 0.000375\n", - " -0.000552\n", - " 0.0\n", - " 1.000000\n", - " 
0.0\n", - " 0.028074\n", - " 0.099447\n", - " 0.170187\n", - " -0.000245\n", - " 0.006791\n", - " 0.0\n", - " -0.003311\n", - " 0.005988\n", - " 0.001628\n", - " 0.159168\n", - " 0.010261\n", - " 0.0\n", - " 0.081967\n", - " 0.136519\n", - " -0.005692\n", - " 0.0\n", - " 0.015937\n", - " 0.011899\n", - " 0.001293\n", - " 0.133911\n", - " 0.932743\n", - " -0.001994\n", - " 0.165189\n", - " 0.000595\n", - " 0.085323\n", + " 7\n", + " workclass\n", + " 0.584366\n", + " 0.584248\n", + " True\n", " \n", " \n", - " D_scat_3\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", - " 0.000000\n", - " 0.0\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", + " 8\n", + " race\n", + " 0.533289\n", + " 0.535948\n", + " True\n", " \n", " \n", - " D_sex\n", - " -0.004987\n", - " 0.007207\n", - " 0.417974\n", - " 0.101265\n", - " -0.005644\n", - " 0.005559\n", - " 0.0\n", - " 0.028074\n", - " 0.0\n", - " 1.000000\n", - " -0.022878\n", - " 0.245770\n", - " 0.005180\n", - " 0.014224\n", - " 0.0\n", - " 0.000182\n", - " 0.003786\n", - " 0.005233\n", - " 0.102203\n", - " 0.000280\n", - " 0.0\n", - " 0.049508\n", - " 0.069717\n", - " -0.001209\n", - " 0.0\n", - " -0.013472\n", - " -0.007598\n", - " 0.003615\n", - " 0.410982\n", - " 0.041657\n", - " 0.003662\n", - " 0.108070\n", - " 0.003784\n", - " 0.116125\n", + " 9\n", + " native-country\n", + " 0.514099\n", + " 0.513408\n", + " True\n", " \n", " \n", "\n", "" ], "text/plain": [ - " D_scont_6 D_scont_3 D_marital-status D_occupation \\\n", - "D_scont_6 1.000000 0.007128 0.009146 0.013050 \n", - "D_scont_3 0.007128 
1.000000 0.008258 0.008033 \n", - "D_marital-status 0.009146 0.008258 1.000000 0.194836 \n", - "D_occupation 0.013050 0.008033 0.194836 1.000000 \n", - "D_sflag_5 -0.001755 0.008862 0.003072 0.002311 \n", - "D_scont_8 -0.001888 0.004288 0.002741 0.001243 \n", - "D_scat_5 0.000000 0.000000 0.000000 0.000000 \n", - "D_education-num 0.016360 0.007674 0.110648 0.498731 \n", - "D_scat_3 0.000000 0.000000 0.000000 0.000000 \n", - "D_sex -0.004987 0.007207 0.417974 0.101265 \n", - "\n", - " D_sflag_5 D_scont_8 D_scat_5 D_education-num D_scat_3 \\\n", - "D_scont_6 -0.001755 -0.001888 0.0 0.016360 0.0 \n", - "D_scont_3 0.008862 0.004288 0.0 0.007674 0.0 \n", - "D_marital-status 0.003072 0.002741 0.0 0.110648 0.0 \n", - "D_occupation 0.002311 0.001243 0.0 0.498731 0.0 \n", - "D_sflag_5 1.000000 -0.000989 0.0 0.000375 0.0 \n", - "D_scont_8 -0.000989 1.000000 0.0 -0.000552 0.0 \n", - "D_scat_5 0.000000 0.000000 0.0 0.000000 0.0 \n", - "D_education-num 0.000375 -0.000552 0.0 1.000000 0.0 \n", - "D_scat_3 0.000000 0.000000 0.0 0.000000 0.0 \n", - "D_sex -0.005644 0.005559 0.0 0.028074 0.0 \n", - "\n", - " D_sex D_native-country D_hours-per-week D_scont_2 \\\n", - "D_scont_6 -0.004987 0.005869 0.006357 0.000534 \n", - "D_scont_3 0.007207 0.010168 0.005486 0.003948 \n", - "D_marital-status 0.417974 0.007336 0.241776 0.009113 \n", - "D_occupation 0.101265 0.089346 0.260357 0.008588 \n", - "D_sflag_5 -0.005644 0.007657 0.001318 -0.004619 \n", - "D_scont_8 0.005559 0.005384 0.003167 -0.005675 \n", - "D_scat_5 0.000000 0.000000 0.000000 0.000000 \n", - "D_education-num 0.028074 0.099447 0.170187 -0.000245 \n", - "D_scat_3 0.000000 0.000000 0.000000 0.000000 \n", - "D_sex 1.000000 -0.022878 0.245770 0.005180 \n", - "\n", - " D_sflag_2 D_scat_4 D_scont_4 D_sflag_1 D_scont_1 \\\n", - "D_scont_6 -0.000288 0.0 -0.010261 0.006295 -0.002706 \n", - "D_scont_3 -0.000631 0.0 0.008515 0.009562 0.001590 \n", - "D_marital-status 0.012480 0.0 0.005861 0.014518 0.004128 \n", - "D_occupation 0.004220 
0.0 0.008526 0.006996 0.008079 \n", - "D_sflag_5 -0.008614 0.0 -0.003978 0.009308 0.000902 \n", - "D_scont_8 0.003946 0.0 0.003569 -0.003863 -0.014552 \n", - "D_scat_5 0.000000 0.0 0.000000 0.000000 0.000000 \n", - "D_education-num 0.006791 0.0 -0.003311 0.005988 0.001628 \n", - "D_scat_3 0.000000 0.0 0.000000 0.000000 0.000000 \n", - "D_sex 0.014224 0.0 0.000182 0.003786 0.005233 \n", - "\n", - " D_workclass D_scont_9 D_scat_1 D_capital-loss \\\n", - "D_scont_6 0.008775 -0.002586 0.0 -0.005740 \n", - "D_scont_3 0.015998 -0.005320 0.0 0.002691 \n", - "D_marital-status 0.153130 0.008042 0.0 0.080024 \n", - "D_occupation 0.265053 0.007554 0.0 0.077405 \n", - "D_sflag_5 0.002748 0.003078 0.0 0.026558 \n", - "D_scont_8 0.007620 0.010211 0.0 -0.000134 \n", - "D_scat_5 0.000000 0.000000 0.0 0.000000 \n", - "D_education-num 0.159168 0.010261 0.0 0.081967 \n", - "D_scat_3 0.000000 0.000000 0.0 0.000000 \n", - "D_sex 0.102203 0.000280 0.0 0.049508 \n", - "\n", - " D_capital-gain D_sflag_3 D_scat_2 D_fnlwgt D_scont_7 \\\n", - "D_scont_6 0.009891 -0.003451 0.0 0.011761 0.006252 \n", - "D_scont_3 -0.003005 0.005161 0.0 -0.005306 -0.018140 \n", - "D_marital-status 0.136724 0.007978 0.0 0.023283 -0.007105 \n", - "D_occupation 0.121000 -0.003457 0.0 0.018786 0.005152 \n", - "D_sflag_5 -0.003853 -0.016438 0.0 -0.004404 -0.000654 \n", - "D_scont_8 0.008799 0.008798 0.0 0.005380 0.008139 \n", - "D_scat_5 0.000000 0.000000 0.0 0.000000 0.000000 \n", - "D_education-num 0.136519 -0.005692 0.0 0.015937 0.011899 \n", - "D_scat_3 0.000000 0.000000 0.0 0.000000 0.000000 \n", - "D_sex 0.069717 -0.001209 0.0 -0.013472 -0.007598 \n", - "\n", - " D_sflag_4 D_relationship D_education D_scont_5 D_age \\\n", - "D_scont_6 0.006015 0.011639 0.016165 -0.005525 0.013099 \n", - "D_scont_3 -0.005946 0.008531 0.005123 0.004172 0.011820 \n", - "D_marital-status 0.003858 0.968994 0.124311 -0.005558 0.406239 \n", - "D_occupation 0.009259 0.213882 0.514657 0.000169 0.207039 \n", - "D_sflag_5 0.003012 
0.003403 -0.000387 -0.002178 -0.002454 \n", - "D_scont_8 -0.001740 0.002170 -0.000085 0.005394 0.002401 \n", - "D_scat_5 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "D_education-num 0.001293 0.133911 0.932743 -0.001994 0.165189 \n", - "D_scat_3 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "D_sex 0.003615 0.410982 0.041657 0.003662 0.108070 \n", - "\n", - " D_scont_10 D_race \n", - "D_scont_6 0.000822 0.010272 \n", - "D_scont_3 -0.008748 0.004958 \n", - "D_marital-status 0.005080 0.127041 \n", - "D_occupation -0.000281 0.102792 \n", - "D_sflag_5 -0.000998 0.013506 \n", - "D_scont_8 -0.002377 0.007083 \n", - "D_scat_5 0.000000 0.000000 \n", - "D_education-num 0.000595 0.085323 \n", - "D_scat_3 0.000000 0.000000 \n", - "D_sex 0.003784 0.116125 " + " predictor AUC train AUC selection preselection\n", + "0 relationship 0.777379 0.779450 True\n", + "1 marital-status 0.767728 0.771644 True\n", + "2 occupation 0.727460 0.734469 True\n", + "3 age 0.707488 0.716277 True\n", + "4 education 0.717664 0.714776 True\n", + "5 hours-per-week 0.667920 0.664445 True\n", + "6 sex 0.617710 0.622441 True\n", + "7 workclass 0.584366 0.584248 True\n", + "8 race 0.533289 0.535948 True\n", + "9 native-country 0.514099 0.513408 True" ] }, - "execution_count": 8, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_corr.head(n=10)" + "df_auc.head(n=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Clearly, for meaningful inspection, we need to visualize the data. Therefore, below are plots for **Predictor quality**, **Correlation Matrix** and **Incidence**." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAwIAAAHoCAYAAAD6/Fh1AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzs3Xl8TXf+x/H3jUQWuRE0QstYYq2l9qUqTamOpRsPUVtaaqqipZhWCNIgCLG0dESj1UiqgplQWg9FdWqqrZYZRSm1Va0JScRNZL+/Pzx6f00jDXKTi/N6/pWc5Xs+5xOPh/O+33PuMVmtVqsAAAAAGIqTowsAAAAAUP4IAgAAAIABEQQAAAAAAyIIAAAAAAZEEAAAAAAMiCAAAAAAGBBBAADKWOPGjZWSklJo2ZYtWxQUFFTivi+99JKOHTtmlzrefvttbdiw4U+3uXr1qp5//vnbGn/37t0aPny4evXqpaefflrDhw/X999/f1tj/ebMmTNq3bq1JGn16tWKiYmRJK1bt06rVq0q1di/sVgsGjhwoPr06aOtW7cWWjdp0iR17dpVzzzzjJ599lk9+eSTCg4O1uXLl0t1zAMHDqhbt26SCp9Xcex5vgDwG2dHFwAAKN7y5cvtNtZrr71W4jZXrlzRgQMHbnnsL7/8UuHh4Vq0aJFatWolSdq3b5/Gjx+vsLAwPfbYY7c85h8NGjTI9vPevXvVsGHDUo8pSYcPH9bly5e1bdu2G64fNmyYRowYYfs9MjJS06dP1+LFi+1y/N+fV3Hseb4A8BuCAAA42JIlS3T27FklJyfr7Nmz8vX1VVRUlKpXr65u3brp7bffVmxsrJo1a6YXX3xRkvTRRx/pu+++08KFCzV79mz98MMPysjIkNVqVUREhNq2batJkyYpLS1Nv/76qwICAnT58mU1bNhQI0aM0D//+U+tWbNGubm5unLlil566SUNHjxYkydPVlZWlp555hklJibq1KlTmjVrltLS0pSfn6+goCD179+/yDnMmzdPkydPtoUASWrVqpVCQ0MVFRWlxx57TEuWLFFqaqrCwsJs5/3b7/v27VNUVJRycnKUnJyshx9+WLNnzy7Sp9TUVHXu3Fk7duzQrl275Obmpri4OIWFhalLly6SpClTpqhRo0Z64YUXCu2/fft2vfPOOyooKFClSpU0efJkeXp6KjQ0VBcvXtQzzzyjNWvWyM3N7U//Xp07d1ZUVJQkqVu3bmrZsqWOHDmiCRMmqGXLlpoxY4bOnz+v3Nxc9enTR6NGjbL9zVauXClPT081atSoyHmFhYXp5MmTCgsLU0pKipycnBQcHCwXF5dC5ztgwABFRkbqm2++UYUKFdSyZUvbufyxnuTkZCUkJMjFxUWurq6aMWOGGjRocFP/LgHc+wgCAHAH2LNnjzZs2CBPT0+NGjVKCQkJGjt2rG19YGCgZs2aZQsC69ev1/jx4/XDDz8oKSlJa9askZOTk2JiYrR8+XK1bdtWkpSVlaVPP/1U0vXbXCQpIyND69atU0xMjKpUqaJ9+/Zp+PDhGjx4sObMmaOnnnpKH3/8sfLy8jR27FjNmzdPzZo109WrV/Xcc8+pQYMGhS74r1y5omPHjql9+/ZFzuvhhx/Wq6++qitXrvzp+cfFxWns2LHq2LGjMjIy1L17dx08eFDe3t5Ftu3Ro4c+//xzNWzYUEOGDFFubq7Wrl2rLl26yGKxaMeOHQoJCSm0z/Hjx/Xmm28qISFBtWvX1jfffKPRo0dry5YtioiI0MyZM/Xxxx+X+HfKysrShg0b1LFjR9uyhg0b6q233pIkPf/88xo2bJi6deum7OxsvfTSS/rLX/6ievXq6Z1
33tHHH38sHx8fWxj6owkTJqh///4aMmSIzp8/r6CgIG3YsEHdunWzne/ixYuVlJSkjz/+WBUqVNCUKVM0b948zZgxo1A9+fn5euihh7Rjxw5Vr15dGzZs0N69ewkCAGwIAgBQxkwmU5FlBQUFcnL6/8e0OnToIE9PT0nSgw8+WOTCuWPHjsrOztaBAwfk7u6ulJQUde7cWSaTSZUrV1ZCQoJ+/fVX7d69W5UqVbLt91sg+L1KlSpp2bJl+vLLL3Xq1Cn99NNPyszMLLLdqVOndPr0aYWGhtqWZWVl6dChQ4WCwM0oKCj40/WRkZHauXOnli1bphMnTig7O1uZmZk3DAJ/1K9fP/3jH/9QSkqKtmzZooCAAHl5eRXa5ttvv1WnTp1Uu3ZtSdc/1a9ataoOHjx4w7/P78XGxmrjxo2SpPz8fLVv314TJkywrW/Xrp0kKTMzU99//72uXLmit99+27bsp59+0oULF9SlSxf5+PhIkp577jl99dVXhY6Tlpamn376SYGBgZKkmjVravv27UXq2blzp8aPHy8XFxdJUlBQkF555ZUi9VSoUEE9e/bUwIEDFRAQoEceeUSPPvron54rAGMhCABAGatSpYrS0tJUtWpV27LLly8Xusj9/e0oJpNJVqu10Bgmk0n9+/fXxx9/LBcXF/Xv318mk0n//ve/NWvWLA0fPlzdu3dX/fr1bRetkuTh4VGkngsXLui5557TgAED1LZtW/Xs2VNffPFFke3y8/NlNpsLfVJ+6dIlmc3mQttVrlxZfn5++u677/TXv/5VknTx4kX5+vrq22+/VZ06dVSlSpUi55Wbm2v7eejQoWrcuLG6du2qXr166YcffijSg+J4eXmpZ8+e2rhxozZt2qQ333yzyDYFBQVFLvitVqvy8vJsF9TF+eMzAn/0W48LCgpktVqVkJAgd3d3SVJKSopcXV21Zs2aQudToUKFIuM4O1//L/n3dZ44cUL333//n55LQUFBoV7+/m8+f/58HT16VF9//bViYmL08ccf20IKAPCtQQBQxvz9/RUfH2/7VPzKlStav379LX8627dvX+3YsUOfffaZ+vXrJ0natWuXHnvsMQ0ePFjNmzfX9u3blZ+f/6fjHDx4UFWrVtXo0aP1yCOP2EJAfn6+nJ2dlZ+fL6vVqnr16snNzc0WBM6fP68nn3xSBw8eLDJmSEiI5s6dq3379km6/szAkCFDNGvWLE2cOFHS9UD0448/ymq1ymKx2I6bnp6uAwcO6PXXX9cTTzyhCxcu6PTp0386i1ChQgXl5eXZfh8yZIji4uJktVrVsmXLItt37txZX331lX799VdJ0jfffKPz58/roYce+tNe3QpPT0+1atVKH3zwge28Bg0apM8//1xdunTRrl27dOHCBUnXb+260f7NmjWzfbPT+fPnNWjQIF29erXQ+Xbt2lWrV69Wbm6uCgoKtGrVKtvzEb+XkpKiRx99VN7e3ho2bJjGjRt3Ww+CA7h3MSMAAGVsypQpioyM1JNPPmn7JPiZZ55R3759b2kcHx8fPfjgg8rLy5Ovr68kaeDAgfr73/+up556Snl5eerSpYu2bt36pxfRXbp00T//+U/17NlTJpNJHTp0UNWqVfXLL7+oTp06atmypfr06aNVq1Zp6dKlmjVrlt577z3l5eXptddeu+HtRo8++qgiIyP19ttv6/z585KkatWq6f7779euXbvUrl07Pf300/rPf/6jJ554Qr6+vurQoYOsVqu8vLw0cuRI9e3bVx4eHvL19VWbNm30yy+/2G7l+SN/f39FRkZKkl5++WU1adJElStX1sCBA2+4fYMGDfTmm2/q1VdfVX5+vtzc3LRs2bIisxulNX/+fM2cOVNPPfWUcnJy9OSTT+rpp5+WJL3xxht64YUXVKlSpRuGFUlasGCBpk+frvj4eJlMJs2aNUs+Pj6Fzjc4OFhz587Vs88+q7y8PLVs2VLTpk0rMlbVqlUVHBysYcOGyc3
NTRUqVFBERIRdzxfA3c1kvdm5VwAAbpHVatXOnTvVoUMH2+0yZeH06dMKCgrSli1byvQ4AHAvIQgAAO5qb7/9ttauXavp06fr8ccfd3Q5AHDXIAgAAAAABsTDwgAAAIABEQQAAAAAA+Jbgxzg2rUcWSzZji7jrufp6Uof7YA+lh49tA/6aB/0sfTooX3QR/sobR99fIr/djRmBBzA2bnoi2Rw6+ijfdDH0qOH9kEf7YM+lh49tA/6aB9l2UeCAAAAAGBABAEAAADAgHhGwAE6ha5ydAkAAAAoB1sm3tpb5MsTMwIAAACAAREEAAAAAAMiCAAAAAAGRBAAAAAADIggAAAAABgQQQAAAAAwIIIAAAAAYEAEAQAAAMCACAIAAACAAREEAAAAAAMiCAAAAAAGdFcHgSVLlmj16tXFrj937px27NghSZo1a5bOnTtX6mOeOXNGAwYMKLI8JiZG+/fvL/X4AAAAQHlwdnQBZenbb7/ViRMn1K1bN02ZMqVMjzVy5MgyHR8AAACwpzs6CCQmJupf//qXCgoKFBQUpJUrV8rJyUlt27bV66+/btsuPz9fYWFhunDhglJTU+Xv768xY8YoJiZGWVlZat26tWJjYxUeHi4fHx+98cYbslgsys/P12uvvabOnTvrqaeeUocOHXTkyBGZTCYtXbpUubm5GjdunKxWq3JzczV9+nRVqlRJKSkpGj16tJKTk9W4cWNFRERo0qRJ6t27ty5duqTPP/9cFotFqampeuWVV/TXv/7VgV0EAAAAirqjg4AkeXl5ac6cORo8eLD+9a9/yd3dXW+88YZ27dpl2+b8+fNq1aqVAgMDlZ2dLX9/f40bN04jR47UiRMn1L17d8XGxkqSoqOj9fDDD+uFF17QxYsXNWjQIG3fvl0ZGRnq06ePpk2bpr///e/auXOnKlWqJLPZrAULFujYsWOyWCyqVKmSLBaL5syZI7PZrB49eujy5cuFas7MzNQHH3yglJQUBQYGqnv37nJ2/v9WrzdHlUvvAAAA4Gh9HV1Ase74IFCvXj2dPn1aKSkptttvMjIy9Ouvv9q28fb21oEDB/Ttt9/K09NTOTk5xY53/PhxPfXUU5IkX19feXp6KiUlRZL04IMPSpJq1qyp7Oxs9erVS6dOndLo0aPl7Oys4OBgSVLt2rVVuXJlSVK1atV07dq1Qsdo3769nJycdN9998nLy0spKSmqXr26nToCAAAAlN4d/7Cwk5OTatWqpZo1a2rFihWKj4/X0KFD9dBDD9m2SUxMtH1y/+KLLyorK0tWq1VOTk4qKCgoNJ6fn5/27NkjSbp48aLS09Pl7e0tSTKZTIW23b17t6pXr64VK1YoODhYCxcuvOF2f/Tjjz9Kki5duiSLxaJq1aqVrgkAAACAnd3xMwKSVLVqVQ0bNkxBQUHKz8/XAw88oF69etnWd+7cWRMmTNDevXvl7u6uOnXqKCkpSY0aNVJ0dLSaNWtm2/bll19WaGioPvvsM2VlZWnGjBmFbtv5vSZNmmj8+PG2ZxNeeeWVm6r30qVLeuGFF3T16lW9+eabqlChQukaAAAAANiZyWq1Wh1dxL0kMTFRJ06cKPQw8x+dntGiHCsCAACAo7gHf12q/b29PZSWlnnb+/v4mItdd8ffGgQAAADA/u6KW4PuJv369XN0CQAAAECJmBEAAAAADIggAAAAABgQQQAAAAAwIIIAAAAAYEAEAQAAAMCACAIAAACAAREEAAAAAAPizcIOkJubX6o3xOG60r5pD9fRx9Kjh/ZBH+2DPpYePbQP+mgfvFkYAAAAgF0RBAAAAAADIggAAAAABkQQAAAAAAyIIAAAAAAYEEEAAAAAMCBnRxdgRJ1CVzm6BAAAAJTClol9HV1CqTEjAAAAABgQQQAAAAAwIIIAAAAAYEAEAQAAAMCACAIAAACAAREEAAAAAAMiCAAAAAAGRBAAAAAADIggAAAAABgQQQAAAAAwIIIAAAAAYEB3bRB49dV
XJUlHjhzR999/X+x2u3fv1vjx429qzLS0NG3atOlPt9m2bZsuXrx484UCAAAAd6C7Ngi88847kqStW7fq2LFjdhnzyJEj2rFjx59uExcXJ4vFYpfjAQAAAI7i7IiDJiYm6osvvlBWVpaSk5P1/PPP6/PPP9fPP/+siRMn6sKFC9q6davy8vJkNpu1ZMkSffLJJ/rXv/6lgoICjR07Vq+//roSExO1fv16ubi4qFmzZjp37pxWrVplO87bb79dbA1bt27V8uXL5ezsrAceeEDz5s3TsmXL9NNPP2nNmjVq3bq1IiMjVVBQoPT0dE2dOlXp6ek6fPiwQkJCFBUVpZCQEK1du1aSNGDAAC1cuFAXL17U3Llz5ezsLC8vL82fP1+enp5l3lMAAADgVjgkCEhSRkaGVqxYoU8//VSxsbFau3atdu/erdjYWDVv3lyxsbFycnLSiBEjdODAAUmSl5eXoqOjbWP4+vqqb9++uu+++9SyZUt9/fXXiomJkbu7u8LCwvTVV1/J19f3hsf/5JNPNGzYMPXp00cbNmyQxWLRqFGjlJCQoOeee06bN29WSEiIGjdurE2bNikxMVERERFq2rSpwsPD5eLicsNxt2/frh49emjEiBHasWOH0tPTiwSB9eYoO3URAAAAjnAtuvD1nHvw1w6q5PY5LAg0bdpUkmQ2m+Xn5yeTyaTKlSsrNzdXLi4umjBhgjw8PHThwgXl5eVJkurVq/enY1arVk0hISGqVKmSTpw4oVatWtnW7dmzxzZDMGLECE2ePFnvvvuuVq9erfr16+vxxx8vNFb16tW1dOlSubm5KSMjo8RP9a1WqyRp1KhRWrZsmV544QX5+vqqZcuWt9YYAAAAoBw47BkBk8l0w+W5ubnavn273nrrLU2bNk0FBQW2i2wnp6LlmkwmFRQU6OrVq1q8eLEWLVqkiIgIubq62vaTpHbt2ik+Pl7x8fEKCAjQmjVrNGbMGH344YeSrj8E7OTkpIKCAknSrFmzNHbsWM2dO1eNGjWyjWUymWS1WuXq6qrLly8rPz9f6enpOnPmjCRp06ZN6tu3r+Lj49WwYUPbrUMAAADAncRhMwLFcXZ2lru7u/r166eKFSvKx8dHSUlJxW7fvHlzzZs3T35+fmrTpo369u0rDw8PeXl5KSkpSbVq1brhfi1bttTw4cPl7e2tSpUqKSAgQDk5OTp69KhiY2P19NNPa/To0apWrZpq1Kih1NRUSVLr1q01ceJErVixQl26dFH//v31l7/8RXXq1JEktWjRQpMmTZKHh4dcXFw0Y8YM+zcJAAAAKCWT9fcfm6NcnJ7RwtElAAAAwI7K6hkBb28PpaVl3vb+Pj7mYtfdtV8fCgAAAOD2EQQAAAAAAyIIAAAAAAZEEAAAAAAMiCAAAAAAGBBBAAAAADAgggAAAABgQAQBAAAAwIAIAgAAAIAB8WZhB8jNzS/VG+JwXWnftIfr6GPp0UP7oI/2QR9Ljx7aB320D94sDAAAAMCuCAIAAACAAREEAAAAAAMiCAAAAAAGRBAAAAAADIggAAAAABgQQQAAAAAwIGdHF2BEnUJXOboEAACAu9qWiX0dXcJdjxkBAAAAwIAIAgAAAIABEQQAAAAAAyIIAAAAAAZEEAAAAAAMiCAAAAAAGBBBAAAAADAgggAAAABgQAQBAAAAwIAIAgAAAIABEQRuwrZt23Tx4kUlJycrPDzc0eUAAAAApUYQuAlxcXGyWCzy8fEhCAAAAOCe4OzoAm5Gbm6uQkND9euvvyo/P1/Dhw/XAw88oFmzZslqtcrX11fz58/XkSNHiix76aWXFB4eLj8/P61evVqXLl1S37599dprr8nHx0cXL16Uv7+/xo8fr6NHjyoyMlIFBQVKT0/X1KlTlZ6ersOHDyskJERRUVEKCQnR2rVrtWvXLr311ltydXWVt7e3Zs+ercOHD2v58uVycXHRmTNn1Lt3bwUHBzu6fQAAAEARd0UQWLNmjapUqaK
oqChZLBb169dPLi4uWrx4sfz8/LRq1SodP35c06ZN06JFiwotK87Zs2f1/vvvy2w2a/Dgwfrxxx/1yy+/KCQkRI0bN9amTZuUmJioiIgINW3aVOHh4XJxcZEkWa1WTZs2TatXr5avr69Wrlyp6OhoBQQE6Ny5c9q4caNycnLUtWvXGwaB9eaoMusVAADAvcQ9+GtHl3DPuiuCwPHjx/Xwww9Lkjw9PeXn56cdO3bIz89PkjRkyBBJ0uXLl4ss+z2r1Wr7uUmTJvL29pYktWzZUidPnlSNGjW0dOlSubm5KSMjQ56enjesJzU1VZ6envL19ZUktW/fXgsXLlRAQIAaNWokZ2dnOTs7y83NzU4dAAAAAOzrrnhGwM/PT3v27JEkWSwWHT16VLVq1dKpU6ckSTExMdq2bZuqV69eZFnFihWVnJwsSTp06JBtzOPHj+vatWvKz8/X/v371aBBA82aNUtjx47V3Llz1ahRI1twMJlMhUJElSpVZLFYlJSUJEn67rvvVLduXdu2AAAAwJ3urpgRGDBggKZNm6ZBgwYpOztbr776qvz8/BQaGionJyf5+Pho2LBh8vX1LbKsYsWKmjFjhmrWrKnq1avbxnRxcdFrr72mS5cuqWfPnmrSpImefvppjR49WtWqVVONGjWUmpoqSWrdurUmTpyomTNnSrp+sR8REaExY8bIZDKpcuXKmjNnjn7++WeH9AcAAAC4VSbr7z/qNogzZ85owoQJWrt2rUOOf3pGC4ccFwAA4G5j9GcEvL09lJaWedv7+/iYi113V9waBAAAAMC+DBkEatWq5bDZAAAAAOBOYMggAAAAABgdQQAAAAAwIIIAAAAAYEAEAQAAAMCACAIAAACAAREEAAAAAAMiCAAAAAAGRBAAAAAADMhktVqtji7CaHJz80v1qmhcV9pXbuM6+lh69NA+6KN90MfSo4f2QR/to7R99PExF7uOGQEAAADAgAgCAAAAgAERBAAAAAADIggAAAAABkQQAAAAAAyIIAAAAAAYkLOjCzCiTqGrHF0CAADAHWfLxL6OLsFQmBEAAAAADIggAAAAABgQQQAAAAAwIIIAAAAAYEAEAQAAAMCACAIAAACAAREEAAAAAAMiCAAAAAAGRBAAAAAADIggAAAAABgQQQAAAAAwoHs2CMyfP1+JiYl2GWvbtm26ePGikpOTFR4ebpcxAQAAAEe6Z4OAPcXFxcliscjHx4cgAAAAgHuCs6MLuF25ubl688039csvv6igoEDjxo1TWlqaoqOjVbVqVeXm5qp+/fravXu3EhIStGjRIklSly5dtGvXLp06dUpTp05Vbm6u3NzctGjRIl26dEmRkZEqKChQenq6pk6dqvT0dB0+fFghISGKiopSSEiI1q5dq127dumtt96Sq6urvL29NXv2bB0+fFjLly+Xi4uLzpw5o969eys4OLhI7evNUeXdLgAAgDuOe/DXji7B0O7aILBu3TpVqVJFs2fPVmpqqoYOHars7GytW7dO3t7eGjly5J/uP3fuXI0cOVL+/v7avHmzDh06pPT0dIWEhKhx48batGmTEhMTFRERoaZNmyo8PFwuLi6SJKvVqmnTpmn16tXy9fXVypUrFR0drYCAAJ07d04bN25UTk6OunbtesMgAAAAADjaXRsEjh49qr1792r//v2SpGvXrkmSqlSpIklq3br1DfezWq2SpJMnT9q26d27tyRpz549Wrp0qdzc3JSRkSFPT88bjpGamipPT0/5+vpKktq3b6+FCxcqICBAjRo1krOzs5ydneXm5manswUAAADs664NAvXr11eNGjU0atQoZWVlKTo6Wp988olSUlJUtWpVHThwQDVq1JCrq6uSk5MlSWfPntWVK1ckSX5+fjpw4IAefvhhbdy4UVeuXFFiYqLmz58vPz8/LV68WGfPnpUkmUwmW4CQrocNi8WipKQkVa9eXd99953q1q1r2xYAAAC40921QWDgwIGaOnWqhg4dKov
FosGDB2vOnDkaMWKEKleuLGfn66fWvHlzmc1mBQYGys/PT7Vq1ZIkTZw4UWFhYYqOjpabm5uioqKUl5en0aNHq1q1aqpRo4ZSU1MlXZ9dmDhxombOnCnp+sV+RESExowZI5PJpMqVK2vOnDn6+eefHdMMAAAA4BaZrL//qBvl4vSMFo4uAQAAwOF4WLhk3t4eSkvLvO39fXzMxa7j60MBAAAAAyIIAAAAAAZEEAAAAAAMiCAAAAAAGBBBAAAAADAgggAAAABgQAQBAAAAwIAIAgAAAIABEQQAAAAAA+LNwg6Qm5tfqjfE4brSvmkP19HH0qOH9kEf7YM+lh49tA/6aB+8WRgAAACAXREEAAAAAAMiCAAAAAAGRBAAAAAADIggAAAAABgQQQAAAAAwIGdHF2BEnUJXOboEAACAMrFlYl9Hl4CbxIwAAAAAYEAEAQAAAMCACAIAAACAAREEAAAAAAMiCAAAAAAGRBAAAAAADIggAAAAABgQQQAAAAAwIIIAAAAAYEAEAQAAAMCACAIAAACAATk0CMyfP1+JiYl2GWvbtm26ePGikpOTFR4ebpcxAQAAgHvVPTMjEBcXJ4vFIh8fH4IAAAAAUALnshw8NzdXb775pn755RcVFBRo3LhxSktLU3R0tKpWrarc3FzVr19fu3fvVkJCghYtWiRJ6tKli3bt2qVTp05p6tSpys3NlZubmxYtWqRLly4pMjJSBQUFSk9P19SpU5Wenq7Dhw8rJCREUVFRCgkJ0dq1a7Vr1y699dZbcnV1lbe3t2bPnq3Dhw9r+fLlcnFx0ZkzZ9S7d28FBwcXqnv37t033GbSpEnq3bu3/P39tXPnTm3evFmRkZHq0aOHWrdurV9++UWdOnXS1atXtX//ftWrV09RUVFl2WIAAADgtpRpEFi3bp2qVKmi2bNnKzU1VUOHDlV2drbWrVsnb29vjRw58k/3nzt3rkaOHCl/f39t3rxZhw4dUnp6ukJCQtS4cWNt2rRJiYmJioiIUNOmTRUeHi4XFxdJktVq1bRp07R69Wr5+vpq5cqVio6OVkBAgM6dO6eNGzcqJydHXbt2LRIEJN3UNr85e/asVq5cKR8fH3Xo0EHr1q3TtGnT1L17d6Wnp8vLy6vQ9uvNhAMAAHBvcQ/+2tEl4BaVaRA4evSo9u7dq/3790uSrl27JkmqUqWKJKl169Y33M9qtUqSTp48adumd+/ekqQ9e/Zo6dKlcnNzU0ZGhjwj+jyIAAAgAElEQVQ9PW84Rmpqqjw9PeXr6ytJat++vRYuXKiAgAA1atRIzs7OcnZ2lpubmyTp5ZdfVmZmpho1aqQnnnjihtvcqEZJ8vb21v333y9J8vDwUIMGDSRJZrNZ2dnZN9suAAAAoNyUaRCoX7++atSooVGjRikrK0vR0dH65JNPlJKSoqpVq+rAgQOqUaOGXF1dlZycLOn6p+tXrlyRJPn5+enAgQN6+OGHtXHjRl25ckWJiYmaP3++/Pz8tHjxYp09e1aSZDKZCl2cV6lSRRaLRUlJSapevbq+++471a1b17btH7377ru2n3fv3n3DbSpWrGir89ChQ7blN9oWAAAAuJOVaRAYOHCgpk6dqqFDh8pisWjw4MGaM2eORowYocqVK8vZ+frhmzdvLrPZrMDAQPn5+alWrVqSpIkTJyosLEzR0dFyc3NTVFSU8vLyNHr0aFWrVk01atRQamqqpOuzCxMnTtTMmTMlXb84j4iI0JgxY2QymVS5cmXNmTNHP//8822fT2BgoEJDQ7Vp0yZbqAAAAADuRibr7z9GR7k4PaOFo0sAAACwqz8+I+Dt7aG0tEwHVXPvKG0ffXzMxa67Z74+FAAAAMDNIwgAAAAABkQQAAAAAAyIIAAAAAAYEEEAAAAAMCCCAAAAAGBABAEAAADAgAgCAAAAgAERBAAAAAADcnZ0AUZUc/I+3rRnB7yx0D7oY+nRQ/ugj/ZBH0uPHsIomBEAAAAADIggAAAAABg
QQQAAAAAwIIIAAAAAYEAEAQAAAMCACAIAAACAAfH1oQ7QKXSVo0sAAAAolS0T+zq6BJQSMwIAAACAAREEAAAAAAMq8dag/Px8JSYm6vz58+rYsaMaNmyoqlWrlkdtAAAAAMpIiTMCYWFhOnfunHbt2qWMjAyFhISUR10AAAAAylCJQeD06dN67bXX5Orqqm7duunq1avlURcAAACAMlRiEMjPz1dKSookyWKxyMmJxwoAAACAu12JzwiMGzdOgwYNUnJysp577jmFhoaWR10AAAAAylCJQaBDhw767LPPlJKSwkPCAAAAwD2ixCCQkJCgNWvWKDs727Zs8+bNZVoUAAAAgLJVYhCIi4tTTEyMKleuXB71AAAAACgHJQaBxo0bq2bNmqpQoUJ51AMAAACgHJQYBDp16qTHH39ctWvXltVqlclkUlxcXHnUBgAAAKCMlBgE1qxZo7feektms7k86gEAAABQDkoMAr6+vmrRooWh3x9gsVg0ZcoUXb16VampqQoMDFTz5s01ffp0VapUSdWqVZOrq6siIyMVHx+vTz75RCaTSb1799bzzz/v6PIBAACAIkoMAjk5OXrmmWfUsGFDmUwmSdKCBQvKvLA7yS+//KI+ffroiSee0MWLFxUUFKRKlSpp3rx5atiwoRYtWqSLFy/q2LFj2rx5sz766COZTCYNGzZMjzzyiOrXr19ovPXmKAedCQAAgH1ci/7z6xnvyfvKqRLcrhKDwMsvv1weddzR7rvvPq1cuVJbt26Vp6en8vLylJSUpIYNG0qS2rZtq82bN+vo0aM6d+6chg0bJkm6cuWKTp8+XSQIAAAAAI5W4v0+jRo1UlJSks6dO6ezZ8/qf//7X3nUdUdZsWKFWrVqpfnz56tnz56yWq2qUaOGjh07Jkn64YcfJEn169dXgwYNFBcXp/j4ePXr10+NGjVyZOkAAADADZU4IzB27FjVrVtXR48elaurq9zd3cujrjvKY489pvDwcG3atEne3t6qUKGCwsLCFBoaKg8PD7m4uMjX11dNmjRR586dNWjQIOXk5Khly5by9fV1dPkAAABAESUGAUmaMWOGJk+erFmzZmnIkCFlXdMdp1OnTtqyZUuhZatWrdKyZctUtWpVLVq0SC4uLpKkv/3tb/rb3/7miDIBAACAm3ZTQSA7O1vXrl2TyWRSZmZmWdd0V6hWrZpefPFFeXh4yGw2KzIy0tElAQAAADetxCAwZMgQxcbGqkuXLnr00UfVtm3b8qjrjtezZ0/17NnT0WUAAAAAt6XEIPDXv/7V9nOvXr3k6elZpgUBAAAAKHslBoFdu3YpNjZW2dnZtmVxcXFlWhQAAACAslViEJgzZ45CQ0NVo0aN8qgHAAAAQDkoMQjUrFlTDz/8cHnUAgAAAKCclBgEqlWrprCwMD344IMymUySpOeee67MCwMAAABQdkoMArVq1ZIkXbp0qcyLAQAAAFA+SgwCr776annUAQAAAKAc3dQLxWBfNSfvU1oaL2YrLW9vD/poB/Sx9OihfdBH+6CPpUcPYRROji4AAAAAQPkrcUbAYrFo+fLlSk5OVkBAgBo3bqw6deqUR20AAAAAykiJMwKhoaGqXbu2Tp06pfvuu09Tpkwpj7oAAAAAlKESg0BaWpr69+8vZ2dntWnTRlartTzqAgAAAFCGbuoZgePHj0uSLly4ICcnHisAAAAA7nYlXtVPmTJFoaGhOnTokMaOHatJkyaVR10AAAAAypDJyr0+5a7tG3GOLgEAABjIlol9y/2YfA2rfZS2jz4+5mLXFfutQY888kixO3311Ve3XQwAAAAAxys2CHCxDwAAANy7SnxG4MCBA+rXr5/8/f01cOBAHT16tDzqAgAAAFCGSnyh2KxZszRv3jw1aNBAR44cUXh4uD766KPyqA0AAABAGSlxRsDV1VUNGjSQJDVu3FguLi5lXhQAAACAslXsjMCaNWuub+DsrPDwcLVv31779++Xp6dnuRUHAAA
AoGwUGwSSk5MlSa1bt5YknTx5UmazWU2bNi2fygAAAACUmWKDwKuvvmr7OSkpSXl5ebJarUpKSiqXwgAAAACUnRIfFg4NDdW+fft07do1ZWVlqXbt2lq7dm151AYAAACgjJT4sPCJEyf06aef6pFHHtGnn34qV1fX8qgLAAAAQBkqMQhUqlRJJpNJmZmZqlq1qnJzc8ujLgAAAABlqMQg0KxZM73//vuqXr26xo8fr7y8vBIHTUxM1Pz58+1S4N0oKChIx48fd3QZAAAAQLFKfEZgwoQJysjIkKurq3bu3KmHHnqoPOoCAAAAUIaKDQLr1q1TYGCgFixYIJPJZFu+b98+TZgwocSBf/jhB7344otKSUnRoEGDVKtWLb311ltydXWVt7e3Zs+ercOHDyshIUGLFi2SJHXp0kW7du3SpEmTlJaWprS0NC1dulTjxo2T1WpVbm6upk+frsaNGxc6VlBQkOrVq6eTJ0/KarVq0aJF8vHx0YIFC/T999/LarVq2LBh6tWrl4KCglSlShWlp6fr/fffV4UKFSRJsbGxys/P14gRIxQWFqaKFStq6tSpWrp0qWrXrq1GjRopIiJCkmz1m83mGx7jNzt27NAHH3ygf/zjH/Ly8rqFPwsAAABQtooNAjVq1JAk1alTx3axfEsDOzvr/fff19mzZ/XSSy8pOztbq1evlq+vr1auXKno6GgFBAQUu3+nTp00bNgw/fvf/7ZdcB87dkwWi+WG27dp00YzZszQqlWr9O6776pr1646c+aMEhISlJ2drQEDBqhLly6SpKeeeko9evQotP8TTzyh0NBQjRgxQidPnlRWVpYk6auvvlJMTIxefPFFzZ49Ww0aNNC6dev03nvvqU2bNsUeY9u2bfr+++/17rvvysPDo9Cx1pujbrmfAAAAt+tadNFrD/fgrx1QCe4kxQaBrl27SpI2b96sFStW3PLADz74oEwmk3x8fHT+/Hn95S9/ka+vrySpffv2WrhwYZEgYLVabT/Xq1dPkuTv769Tp05p9OjRcnZ2VnBwsLZs2aJVq1ZJkkJCQiRdDw7S9UCwY8cO+fr66scff1RQUJAkKS8vT+fOnSs09qJFi/Tf//5X0vUZgaysLO3fv19+fn46d+6c9u/fL7PZLE9PTx0/flzTp0+XJOXm5qpevXo6evRoscf45ptvZLFY5Oxc4t1XAAAAQLkr8SrVbDbr888/V926deXkdP3Z4t8upP/M728nqlKliiwWi5KSklS9enV99913qlu3rlxdXW1vMD579qyuXLlSZP/du3erevXqWrFihf73v/9p4cKFio+PV8+ePQsd7+DBg6pRo4b++9//qkGDBqpfv746duyomTNnqqCgQEuXLlWtWrUKjT1+/PhCYzz66KOKiorSCy+8oHPnzikiIkKBgYG2c547d67uv/9+7d27V8nJyXJxcSn2GGFhYdq4caMWL16s119/vcR+AQAAAOWpxCCQkpKi2NhY2+8mk0lxcXG3dBCTyaSIiAiNGTNGJpNJlStX1pw5c+Tl5SWz2azAwED5+fnZLqJ/r0mTJho/frxWrlwpJycnvfLKKzc8xvr16xUbGyt3d3fNmzdP3t7e+u677zR48GBlZmbq8ccfl6en55/W+cQTT+idd95RdHS0kpKSFBkZqWXLlkmSwsPDFRISovz8fEnSrFmzVLdu3T89xiuvvKLAwEAFBASoXbt2t9QzAAAAoCyZrL+/H+cm5OTkqGLFimVVz20JCgpSeHi4/Pz8HF3KTTk9o4WjSwAAAAZX1s8IeHt7KC0ts0yPYQSl7aOPj7nYdSXOCCQkJOiDDz5QXl6erFarXFxc9Nlnn912MQAAAAAcr8QXiq1du1bx8fHy9/fXnDlz7shP3ePj4+/IugAAAIA7VYlBoEqVKqpevboyMjLUsWPHQg/0AgAAALg7lRgEzGaztm/fLpPJpISEBKWkpJRHXQAAAADKUIlBICIiQvfff7/+/ve/69SpUwo
PDy+HsgAAAACUpRKDQEhIiJKTk+Xj46NJkyapY8eO5VEXAAAAgDJUYhAYNWqUvvzySz377LNasmSJzp8/Xx51AQAAAChDJX59aIsWLdSiRQtduXJF4eHh6tGjhw4ePFgetQEAAAAoIyXOCOzZs0ehoaEaOnSoGjRooO3bt5dHXQAAAADKUIlvFh4zZowCAwPVtWtXmUym8qrrnpabm8+b9uyANxbaB30sPXpoH/TRPuhj6dFD+6CP9uHQNwsvWbLktg8MAAAA4M5U4q1BAAAAAO49BAEAAADAgAgCAAAAgAERBAAAAAADIggAAAAABkQQAAAAAAyoxK8Phf11Cl3l6BIAAICdbJnY19ElALeFGQEAAADAgAgCAAAAgAERBAAAAAADIggAAAAABkQQAAAAAAyIIAAAAAAYEEEAAAAAMCCCAAAAAGBABAEAAADAgAgCAAAAgAERBAAAAAADIggAAAAABkQQAAAAAAzIZLVarY4u4m5x8uRJTZ48Wc7OzqpQoYLmzZunDz/8UN9//72sVquGDRumHj16aOjQoXrllVfUtGlTvfDCC3rvvfdUs2ZN2zinZ7Rw4FkAAICy5h78taNLcDhvbw+lpWU6uoy7Xmn76ONjLnad822PakBff/21mjVrpkmTJmnPnj3aunWrzpw5o4SEBGVnZ2vAgAHq0qWL5s+fr1GjRsnHx0cTJ04sFAIAAACAOwFB4Bb0799fy5cv19/+9jeZzWY1adJEP/74o4KCgiRJeXl5OnfunJo0aaI2bdpo37598vf3d3DVAAAAQFE8I3ALPv/8c7Vt21YrV65Uz549lZiYqI4dOyo+Pl4rV65Ur169VKtWLe3bt08///yz2rdvrxUrVji6bAAAAKAIZgRuQfPmzfXGG29oyZIlcnJy0uLFi7Vp0yYNHjxYmZmZevzxx2W1WjVlyhS98847uv/++xUYGKgOHTqoRQueCwAAAMCdg4eFHYCHhQEAuLfxsDAPC9tLWT4szK1BAAAAgAERBAAAAAADIggAAAAABkQQAAAAAAyIIAAAAAAYEEEAAAAAMCCCAAAAAGBABAEAAADAgAgCAAAAgAE5O7oAI6o5eR9v2rMD3lhoH/Sx9OihfdBH+6CPpUcPYRTMCAAAAAAGRBAAAAAADIggAAAAABgQQQAAAAAwIIIAAAAAYEAEAQAAAMCA+PpQB+gUusrRJQAAYChbJvZ1dAnAHYcZAQAAAMCACAIAAACAAREEAAAAAAMiCAAAAAAGRBAAAAAADIggAAAAABgQQQAAAAAwIIIAAAAAYEAEAQAAAMCACAIAAACAAREEAAAAAAO6Z4PA4cOH9c4770iStm3bposXLxa77ZIlS7R69epSHS8mJkb79+8v1RgAAABAeblng0DTpk316quvSpLi4uJksVjK9HgjR45Uy5Yty/QYAAAAgL04O7qAP5OVlaXJkyfr3Llzys3N1aRJk7Rq1SpdvXpVqampCgwM1ODBgxUUFKR69erp5MmTslqtWrRokU6cOKGEhAQ988wzOnz4sEJCQvTRRx9pyZIlOnjwoDIyMuTn56c5c+YUe/wvvvhCixcvlqenpypXrqzGjRtr9OjRCgsL04ULF5Samip/f3+NGzdOkyZNUu/evXXp0iV9+eWXysrK0unTp/XSSy+pX79+hcZdb44q69YBAIDfuRZd9P9e9+CvHVAJcOe4o2cEEhIS9MADD2jNmjWKjIzUjz/+qD59+mjFihVatmyZYmNjbdu2adNG8fHx6tWrl959913b8oCAADVt2lRz585VTk6OvLy89MEHHyghIUH79u0r9pah/Px8RUREaPny5YqPj5erq6sk6fz582rVqpXef/99rV69+oa3FFksFr377ruKjo5WTEyMfZsCAAAA2MEdPSNw4sQJ+fv7S5IaNWqkypUra8GCBdq6das8PT2Vl5dn27ZTp06SrgeCHTt23HA8V1dXpaSkaMKECfLw8FBmZqZyc3Nt6z/88EN99tl
nkqT58+fL09NT9913nySpXbt2unTpkry9vXXgwAF9++238vT0VE5OTpHjNGnSRJJUs2bNG64HAAAAHO2OnhHw8/PTgQMHJEm//vqrZs6cqVatWmn+/Pnq2bOnrFarbduDBw9Kkv773/+qQYMGhcYxmUyyWq3auXOnzp8/r4ULF2rChAnKysoqNMbQoUMVHx+v+Ph4+fj4KCMjQykpKZKkH374QZKUmJgos9msBQsW6MUXXywyxm/HAwAAAO5kd/SMwMCBAxUaGqqhQ4cqPz9f3bt3V1xcnDZt2iRvb29VqFDB9on7+vXrFRsbK3d3d82bN09Hjx61jdO6dWtNnDhR0dHRWrp0qQYMGKCKFSuqdu3aSkpKuuGxnZycNG3aNL300ksym80qKChQnTp11LlzZ02YMEF79+6Vu7u76tSpU+wYAAAAwJ3KZP3jx9l3oaCgIIWHh8vPz8+u47777rsaPny4KlasqNdff12PPPKInn322VKPe3pGCztUBwAASqO4h4W9vT2UlpZZztXce+ijfZS2jz4+5mLX3dEzAo5WqVIlDRgwQG5ubnrggQfUu3dvR5cEAAAA2MU9MSNwt2FGAAAAx2NGoGzRR/soyxmBO/phYQAAAABlgyAAAAAAGBBBAAAAADAgggAAAABgQAQBAAAAwIAIAgAAAIABEQQAAAAAA+KFYg5Qc/I+vlfXDvh+Yvugj6VHD+2DPtoHfQRws5gRAAAAAAyIIAAAAAAYEEEAAAAAMCCCAAAAAGBABAEAAADAgAgCAAAAgAHx9aEO0Cl0laNLAADAMLZM7OvoEoA7EjMCAAAAgAERBAAAAAADIggAAAAABkQQAAAAAAyIIAAAAAAYEEEAAAAAMCCCAAAAAGBABAEAAADAgAgCAAAAgAERBAAAAAADIggAAAAABmS4IHDmzBkNGDDAbtsBAAAAdyPDBQEAAAAAkrOjC7gdffv21XvvvScvLy917NhRH374oR588EH17dtXvXr10meffSZnZ2e1a9dOb7zxhpYsWaL//e9/yszM1KxZsyRJ+fn5mjRpkho2bKiRI0dq6dKl2r59u/Lz8zVo0CA98sgjtuNt2bJFq1atsv3+9ttvS5LGjRsnq9Wq3NxcTZ8+XXXr1tVrr70mi8WirKwsvfHGG+rYsWP5NgcAAAC4CXdlEOjevbv+85//qEaNGqpVq5Z27dqlihUrqlatWtq2bZsSEhLk7OysMWPG6IsvvpAk1a9fX1OnTtWZM2eUl5en119/Xe3atdOQIUN06NAh7dy5U+vWrVNOTo4WLFigLl262I536tQpxcTEyN3dXWFhYfrqq6/k5eUls9msBQsW6NixY7JYLDp9+rQuXbqk2NhYXb58WadOnbph/evNUeXRJgAADMs9+GtHlwDc8e7KIPDEE09o2bJlqlmzpsaPH6/4+HhZrVb17t1be/fulYuLiySpXbt2+vnnnyVJ9erVs+1/5MgReXp6KjMzU5J08uRJtWzZUhUqVJC7u7stMPymWrVqCgkJUaVKlXTixAm1atVK/v7+OnXqlEaPHi1nZ2cFBwerYcOGGjJkiCZMmKC8vDwFBQWVY1cAAACAm3dXPiPQqFEjnTlzRvv379ejjz6qzMxMff7556pfv77279+vvLw8Wa1Wff/997YA4OT0/6farFkzxcTEaOPGjfrpp59Uv359HTp0SAUFBcrNzdXw4cOVk5MjSbp69aoWL16sRYsWKSIiQq6urrJardq9e7eqV6+uFStWKDg4WAsXLtSRI0eUkZGhmJgYRUZGaubMmQ7pDwAAAFCSu3JGQJLat2+vM2fOyMnJSe3bt9exY8fUuHFj9erVS4MGDVJBQYHatm2rxx9/XD/99FOR/d3c3BQeHq6QkBCtW7dOXbt2te03aNAgVaxYUZLk6empNm3aqG/fvvLw8JCXl5eSkpLUrVs3jR8/XitXrpSTk5NeeeUV1a1bV//4xz+0YcMGubi4aOzYseXdFgAAAOCmmKx
Wq9XRRRjN6RktHF0CAAD3tNI8I+Dt7aG0tEw7VmNM9NE+SttHHx9zsevuyluDAAAAAJQOQQAAAAAwIIIAAAAAYEAEAQAAAMCACAIAAACAAREEAAAAAAMiCAAAAAAGRBAAAAAADIggAAAAABiQs6MLMKKak/fxpj074I2F9kEfS48e2gd9tA/6COBmMSMAAAAAGBBBAAAAADAgggAAAABgQAQBAAAAwIAIAgAAAIABEQQAAAAAA+LrQx2gU+gqR5cAAP/X3r3HRVnm/x9/zcigjCKI4gEV8YRm5Gnb8ruth6JaExU1D+RiHsrTag/TTVOMNNcjeajFU5RJIWliaqJmqVva11LzgKJfytLUdj2QAirKaeD+/eGv2WVVsByZwXk//4G572uu+3N/mAE+c133fYk41JYJPZ0dgoj8ShoREBERERFxQyoERERERETckAoBERERERE3pEJARERERMQNqRAQEREREXFDKgRERERERNyQCgERERERETekQkBERERExA2pEBARERERcUMqBERERERE3JAKARERERERN6RCQERERETEDakQEBERERFxQx7ODsCVrV27lo8++oiioiI6d+7M9u3bsdlseHt7ExsbS1FREZMmTeLMmTMUFBQQHR1NSEgIU6ZM4dSpUxQVFfHiiy/y8MMPO/tURERERESKUSFQiqpVq7Jo0SIWL15MfHw8ZrOZ5557jtTUVFJTU6lbty4LFizg2LFjfPXVV6SlpVGtWjVmzpxJZmYmkZGRbNq0qVif67xfd9LZiIiIOJbXyK+cHYKI/EYqBErRsGFDzGYzFouFcePGYbVaOXfuHDabjRMnTtChQwcAgoODCQ4OZurUqezfv5/Dhw8DYLPZyMzMpFq1as48DRERERGRYlQIlMJsNvPtt9+ybds2kpKSyMnJoVevXhiGQePGjUlNTeXxxx/np59+4o033qBVq1bUrl2bESNGkJuby5IlS/Dx8XH2aYiIiIiIFKNC4DY0aNAALy8vevXqhaenJ/7+/qSnpxMREUFUVBSRkZEUFhYSFRVFs2bNeOWVV4iMjCQ7O5v+/ftjNuuabBERERFxLSbDMAxnB+FuTk97wNkhiIiIOMS9eI2Ar6+VrKxrzg6j3FMeHeNO8+jv733LffqoWkRERETEDakQEBERERFxQyoERERERETckAoBERERERE3pEJARERERNxOSsoBfvjhewCiosYDcPz4D6SkHABgypRJFBQUOC2+sqBCQERERETczqZNG7hw4WcAZs58HYAvvtjOyZMnAHjttVlYLBanxVcWtI6AiIiIiJRbmzcn8+WXO7h27SpZWVkMHvw8hmGwdm0Sv9wlf/r0GE6c+IElS2KxWCw8+OBD7NnzNceOfUtQUCOGDRvIsmUJfPLJRjw8LAQHN+fVVyeRmLiGjIyLzJ79N2w2GyaTiTFjXqJp02AiInrywAOtOH36FH5+fkyfHkOFChWcnI1fR4WAiIiIiJRrOTnXWLBgEVlZmQwdOpCuXcN5/fU3qVSpEjExM9i792tq1PAnPz+ft99+D4CzZ88QGvoktWvXBsDfvyZPPdWV6tWr06JFiL3vRYveoHfvfrRv34nvv/+O2bP/xrJlCZw58y/efHMJtWrVZuTIIaSl/R8hIeVrrSgVAiIiIiJSrrVu3Raz2YyfX3W8vatiMpmYPn0KVquVU6dOEhLSEoDAwAa/uu+TJ0/SqlVbAJo2bUZ6+nkAfHx8qVXrehFRs2Yt8vPzHHQ2ZUeFgBPUmZSilfYcQCsWOobyeOeUQ8dQHh1DeRR39N133wKQkXGRq1ezWbcuibVrNwMwduwo+xQhs9lkf47JZMIwior1YzabKSoyim0LCgri8OGD/PGPHfn+++/w86tuf355p0JARERERMq1jIyLjBkzkuzsbMaNe5nNm5MZMiQSLy8vvL29uXDhZ+rUCSj2nBYtQli6dCF16tS1b2vW7D4WL36ToKCG9m2jRr3InDnTWblyBTa
bjUmTosvsvO42k/FLiSRlpqCgUJ/WOIA+9XIM5fHOKYeOoTw6hvJ455RDxyirPG7enMypUycZOfKFu34sZ7jTPPr7e99yn24fKiIiIiLihjQ1SERERETKrS5dujk7hHJLIwIiIiIiIm5IhYCIiIiIiBtSISAiIiIi4oZ0jYATtItKdHYIIiIidyMZio0AABObSURBVGzLhJ7ODkFcUOeYdQ7tT6+zu0cjAiIiIiJyT1ixIp7w8D+Rl3d9ld8ZM6aye/dXxdp07/4n+/c7d37BCy8MZ/ToYQwdOpDPP992Q5/Hj/9ASsqB245hypRJFBQU/MYzKFsaERARERGRe8LWrVsIDX2S7ds/K/VuQqmph1i9+gNiYt7AarVy6VIWw4cPJiioEQ0bNrK3++KL7VSvXp3WrdveVgyvvTbrjs6hLKkQEBEREZFy78CBfQQE1KNHj6eZNu3VUguB5OT19OnzDFarFQAfH1/i4t7D2/vfC3D9/HM6n3yyEQ8PC8HBzZk1axr16zfAYrEwatQY5s6dTX5+HpcvX2LQoKF06NCJ3r27kZi4hrlzZ2GxWDh37iwXL14gKmoqzZo1v6s5+LVUCIiIiIhIubdx48d069aDwMAgLBYLR48euWk7k+n61wsXfiYgoG6xfVWrVi322N+/Jk891ZXq1avTokUIOTk5DBr0HMHBzfnmmz1ERPyZtm0fJDX1EMuWvUWHDp2KPb927TpMmDCZDRvWsWHDWsaPj3LY+TqCCgERERERKdcuX77M11/vIjMzgzVrPuTq1WzWrv0QLy8rBQX5xdoWFhYCUKtWHdLTz9O0abB93+HDKfj5Vadevfq3PFZgYBAA1avX4L33lrFp08eACZvNdkPbpk2bAVCzZi1SUw/d4Vk6ni4WFhEREZFy7bPPNtO1azgLFixi/vxY4uLeY+/ePQQE1GXHjs/t7Q4dOkhQ0PX5/2Fh3Vi5MoGcnBwAMjMzmDlzGrm5ucX6NpvNFBUZ9sem/z+k8M47S+ncOYzo6L/Rtu2DN43rl7auSiMCIiIiIuIwv9zu09fXSlbWtTI5ZnLyx0RHT7M/rlSpEh07PkZubi5eXlYGDeqP1WrFYrEwYcL16TkhIS3p3r0nY8eOwsPDg7y8XEaMGEWTJk2L9d2s2X0sXvwmQUENi21/9NFQ3nxzLgkJy6lZsxZZWVl3/0QdzGQYhlF6M3Gk341/39khiIiI3LF79f7uZfkP7L1MeXSMO82jv7/3LfdpapCIiIiIiBtSISAiIiIi4oZUCIiIiIiIuKF7uhBIS0tj4cKFAGzdupXz58/fsm1sbCwrV668Yftjjz1mX6ZaRERERORecU8XAvfddx+jR48G4P333yc7O9vJEYmIiIiIuAaXv31obm4ukyZN4syZMxQUFDBx4kQSExO5cuUKmZmZ9OnTh/79+zNgwAAaNmzIjz/+iGEYLFiwgBMnTrBq1SrCw8NJS0vj5Zdf5oMPPiA2NpYjR45w9epVGjduzKxZs0qN45///CeTJ0/GZrNhMpl45ZVXaN68ORMnTuT06dPk5eXx3HPP0aVLFxYsWMDu3bspKioiLCyMQYMGFetrnffrdylbIiIiZenevGuQ3JmcJX+4/tVB/XmN/MpBPcl/c/kRgVWrVlG3bl0+/PBDZs+ezdGjRwkLC+Pdd99l6dKlxMfH29u2bduWhIQEnnrqKd566y379k6dOnHfffcxZ84c8vPzqVq1KsuXL2fVqlWkpKSUOGXoFzExMQwYMIDExEQmT55MVFQU2dnZ7Nmzh4ULF/L222/bV6pbv349c+fOJTExkUqVKjk8JyIiIiJyoxUr4gkP/5N9WveMGVPZvbt4IdG9+5/s3+/c+QUvvDCc0aOHMXToQD7/fNtvPvbZs2cYNmzQr3pOXl4eycnrAdi8OZn//d8dv/n4v4XLjwicOHGCDh06ABAcHIyPjw/z5s3js88+o0qVKsWWc27Xrh1wvSD4xz/+cdP+KlasSEZGBuP
GjcNqtXLt2jUKCgrs+1esWMGnn34KwNy5c+3bjx8/zu9//3vg+pSjc+fOUaVKFaKjo4mOjiY7O5vu3bsDMH/+fObPn8+FCxdo3769A7MhIiIiIreydesWQkOfZPv2z+jSpVuJbVNTD7F69QfExLyB1Wrl0qUshg8fTFBQIxo2bFQm8WZkXCQ5eT3duvUoNd67weULgcaNG5Oamsrjjz/OTz/9xJw5c/jDH/5A//792b17Nzt2/LtyOnLkCLVr1+bAgQM0adKkWD8mkwnDMNi5cydnz57ljTfeICMjg61bt/Kfa6pFRkYSGRl50zj27dtHaGgoaWlp1KhRg/T0dI4ePcqiRYvIy8ujY8eOdOvWjS1btjB//nwMwyAsLIywsDDq1q1795IkIiIi4uYOHNhHQEA9evR4mmnTXi31H+vk5PX06fMMVqsVAB8fX+Li3sPbu/gCXGvXJvHJJxsxm820bNmaUaPGcP78OWJiZpKfn4enZ0X7asW/OHhwP3Fxi6lQoQIBAXWZMGEyhYU2Zs58jXPnzmGz2Rg7djybNm3g5MkfWb78bYqKiqhevTo9evQmNnYBhw+nANC9e3e6dXuaGTOmYrFYOHfuLBcvXiAqairNmjW/o5y5fCEQERFBVFQUkZGRFBYWEhoayvvvv09ycjK+vr5UqFCB/Px8ANatW0d8fDxeXl7ExMRw7Ngxez9t2rRhwoQJLFmyhMWLF9O3b188PT2pX78+6enppcYxYcIEoqOjeffdd7HZbMyYMQN/f39+/vlnevTogdVqZciQIXh6euLj40N4eDg+Pj488sgjBAQE3LX8iIiIiAhs3Pgx3br1IDAwCIvFwtGjR27azmS6/vXChZ8JCCj+QW3VqlVvaL95czIvvjiekJAHWLduDTabjUWL3qR37378z/88wr59e1m6dCHDhv0FAMMwmDNnBkuWvEO1an68/fYSNm9OJifnGrVrB/Daa7M4ceIH9u3by7PPDuH48R8YPHgoy5Zdn9a+a9eXnD17hri4eAoLC3nhhaG0aNEKgNq16zBhwmQ2bFjHhg1rGT8+6oZ4fw2XLwQqVqzIvHnzim17/vnnb9p23LhxNG7c2P744Ycf5uGHHwZg7NixjB07FoCPPvrohuf+7ne/u2mfv0wxqlevHsuXL79h/7Rp027YNnr0aPvdikRERETk7rp8+TJff72LzMwM1qz5kKtXs1m79kO8vKwUFOQXa/vLNZ21atUhPf08TZsG2/cdPpyCn1916tWrb98WFfUqK1euYOnSWO6//wEATpz4gYSE5SQmvgeAh8e//6XOysrk4sULREdPBK5fB/DQQ+3IysqkXbvrF1I3atSERo2acPbsmRvO5dSpH2nVqjUmkwkPDw9atmzFyZMnAGjatBkANWvWIjX10J0ljXJwsbCIiIiISEk++2wzXbuGs2DBIubPjyUu7j327t1DQEBdduz43N7u0KGDBAVdn/8fFtaNlSsTyMm5fn+jzMwMZs6cRm5ubrG+N2xYz0svTWLhwji+//47UlMPERgYxMiRL7BwYRzjx0fRqVOovb2Pjy81a9Zk9uz5LFwYx8CBQ2jb9kEaNGhIWtr/AfCvf/2TqVMnYzKZMYyiYsdr0KChfVqQzWYjJSWFevUCgetT3R3J5UcEbldCQoKzQxARERFxe7/c7tPX10pW1rUyOWZy8sdER/97lkalSpXo2PExcnNz8fKyMmhQf6xWKxaLxT6fPySkJd2792Ts2FF4eHiQl5fLiBGjaNKkabG+GzduwtChz+LrWw1/f39atAhh1KgxzJs3m/z8fPLychkz5iV7e7PZzJgxLzF+/BgMw8BqrUx09GuEhLRk1qxpjB49jMLCQsaM+SvVqlWjoMDG4sV/p2LFigA88kh7Dh7cz/DhgykoKCAsrMsdXwtwKybjP6+UlTJxetoDzg5BRETkjt2r93cvy39g72XKo2PcaR79/b1vuU9Tg0RERERE3JAKARERERERN6RCQER
ERETEDakQEBERERFxQyoERERERETckAoBERERERE3dM+sI1Ce1JmUottpOYBuS+YYyuOdUw4dQ3l0DOVRRG6XRgRERERERNyQCgERERERETeklYVFRERERNyQRgRERERERNyQCgERERERETekQkBERERExA2pEBARERERcUNaR+AuKioqYurUqXz33Xd4enoyffp0GjRoYN+/evVqVq1ahYeHByNHjuTRRx91YrSuqbQcAmRkZBAREUFycjIVK1Z0UqSurbQ8xsfHs2nTJgA6duzI6NGjnRWqSystj4mJiaxduxaTycSoUaP0nr6J23lPFxUVMWzYMEJDQ3nmmWecFKlrKy2P06dP58CBA1SuXBmAxYsX4+3t7axwXVZpedyxYweLFi0CoEWLFkyZMgWTyeSscF1SSTlMS0tj5syZ9rYpKSksWrSIDh06OCtcl1Xaa3HZsmVs2rQJk8nEiBEjeOKJJxxzYEPumk8//dR4+eWXDcMwjIMHDxojRoyw70tPTze6du1q5OXlGZcvX7Z/L8WVlEPDMIydO3ca4eHhRps2bYzc3FxnhFgulJTH06dPGz179jRsNptRWFho9OvXz0hLS3NWqC6tpDxevHjR6NKli5Gfn29cuXLF6NChg1FUVOSsUF1Wae9pwzCMefPmGb179zY++OCDsg6v3CgtjxEREcbFixedEVq5UlIer1y5YoSFhdnzGBcXp5zexO28pw3DMDZv3myMGzeuLEMrV0rK46VLl4yOHTsaeXl5RlZWltGpUyeHHVdTg+6i/fv30759ewBat27NkSNH7PsOHz5MmzZt8PT0xNvbm8DAQL799ltnheqySsohgNlsZvny5fj6+jojvHKjpDzWrl2bd955hwoVKmA2m7HZbBpZuYWS8ujn58fHH3+MxWLhwoULVK1aVZ8c3kRp7+ktW7ZgMpn0iWEpSspjUVERp06d4tVXXyUiIoI1a9Y4K0yXV1IeDx48SHBwMHPmzKF///7UqFEDPz8/Z4Xqskp7TwNcu3aN2NhYJk+eXNbhlRsl5dHLy4uAgABycnLIyclx6N8WTQ26i7Kzs6lSpYr9cYUKFbDZbHh4eJCdnV1smLZy5cpkZ2c7I0yXVlIOAR555BFnhVaulJRHi8WCn58fhmEQExNDixYtaNiwoROjdV2lvR49PDxYsWIFsbGxDBgwwFlhurSScnjs2DE2btzI3//+d/t0DLm5kvJ47do1IiMjGTx4MIWFhTz77LOEhITQvHlzJ0bsmkrKY2ZmJnv27GH9+vVYrVb+/Oc/07p1a/1+/C+l/V4EWLNmDZ07d1YhVYLS8linTh3CwsIoLCxk+PDhDjuuRgTuoipVqnD16lX746KiIvsP9L/3Xb16VfM3b6KkHMrtKy2PeXl5vPTSS1y9epUpU6Y4I8Ry4XZej5GRkXz55Zd888037N69u6xDdHkl5XD9+vWcP3+egQMHsm7dOuLj49m5c6ezQnVpJeXRy8uLZ599Fi8vL6pUqUK7du004nwLJeXR19eXBx54AH9/fypXrsyDDz5IWlqas0J1WbfzezE5OZk+ffqUdWjlSkl53LlzJ+np6Wzfvp0vvviCbdu2cfjwYYccV4XAXdS2bVv7H7GUlBSCg4Pt+1q2bMn+/fvJy8vjypUrHD9+vNh+ua6kHMrtKymPhmHwl7/8hWbNmjFt2jQqVKjgrDBdXkl5PHHiBKNHj8YwDCwWC56enpjN+hX730rK4YQJE0hKSiIhIYGePXsyaNAgTRG6hZLyePLkSfr3709hYSEFBQUcOHCA+++/31mhurSS8hgSEsKxY8fIyMjAZrNx6NAhmjRp4qxQXVZpf6evXLlCfn4+derUcUZ45UZJefTx8aFSpUp4enpSsWJFvL29uXz5skOOq49W76InnniCXbt2ERERgWEYzJw5k+XLlxMYGEhoaCgDBgygf//+GIbB2LFjNS/7JkrLodyekvJYVFTE3r17yc/P58svvwRg3LhxtGn
TxslRu57SXo/NmzenX79+mEwm2rdvz0MPPeTskF2O3tOOUVoeu3XrRt++fbFYLISHh9O0aVNnh+ySSsvjX//6V55//nkAOnfurA+jbqK0HP7444/UrVvX2WG6vNLy+NVXX9G3b1/MZjNt27Z12NRok2EYhkN6EhERERGRckPj1iIiIiIibkiFgIiIiIiIG1IhICIiIiLihlQIiIiIiIi4IRUCIiIiIiJuSIWAiIi4jLi4OP74xz+Sl5cHwMSJE29YVOw/b5u3bds2BgwYwIABA+jTpw9btmwp03hFRMozrSMgIiIuIzk5mS5durBp0yZ69epVYtsDBw4QHx/PW2+9ReXKlcnMzKRfv340adJECz+JiNwGjQiIiIhL2LNnD4GBgURERJCYmFhq+6SkJAYOHEjlypUBqFatGklJSTRu3Phuhyoick/QiICIiLiEpKQk+vTpQ6NGjfD09OTQoUM3bWcymQBIT0+nfv36xfb5+Pjc9ThFRO4VKgRERMTpLl26xM6dO8nIyCAhIYHs7GxWrFiB1WolPz+/WFubzQZAQEAAZ8+epXnz5vZ9+/fvp0aNGjRo0KBM4xcRKY80NUhERJxuw4YNPP3007z77rssW7aM1atXs2vXLurXr8/WrVvt7fbt22ef/9+rVy+WLVvGtWvXALh48SJRUVHk5OQ45RxERMobjQiIiIjTJSUlERMTY3/s5eXFk08+SW5uLlarlfDwcCpXrozFYmHatGkAtGnThr59+zJkyBA8PDzIzc1l3LhxxUYIRETk1kyGYRjODkJERERERMqWpgaJiIiIiLghFQIiIiIiIm5IhYCIiIiIiBtSISAiIiIi4oZUCIiIiIiIuCEVAiIiIiIibkiFgIiIiIiIG/p/XNZEXbrf2TIAAAAASUVORK5CYII=\n", - "text/plain": [ - "
    " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "build.plotPredictorQuality(df_unisel, #Dataframe with univariate selection\n", - " dim=(12,8)) #Size of the figure, if not specified, automatically set to (12,8)" + "Second output is a correlation matrix." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 32, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
    " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "build.plotCorrMatrix(df_corr, #Dataframe with correlation\n", - " dim=(12,8)) #Size of the figure, if not specified, automatically set to (12,8)" + "df_corr = (univariate_selection\n", + " .compute_correlations(basetable[basetable[\"split\"] == \"train\"],\n", + " predictors=preprocessed_predictors))" ] }, { - "cell_type": "code", - "execution_count": 11, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
    " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ - "build.plotIncidence(df_transformed, #Dataframe with transformed dataset\n", - " variable='age', #Which variable should be visualized\n", - " dim=(12,8)) #Size of the figure, if not specified, automatically set to (12,8)" + "Clearly, for meaningful inspection, we need to visualize the data. Therefore, below are plots for **Predictor quality** and **Correlation Matrix**." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Model building\n", - "The next step is to find the best model. The library allows to build a model and specify which variables will be forced or excluded." + "### Create pig tables " ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ - "df_model1 = build.fit_model(df_transformed, #Dataframe with transformed data \n", - " df_unisel, #Dataframe with univariate selection\n", - " modeling_nsteps=30, #How many variables will be used for modelling\n", - " forced_vars=['scont_1','scont_2'], #List of variables forced to be in the models, list (or None))\n", - " excluded_vars=None, #List of variables to be excluded, list (or None)\n", - " name='Experiment', #Name of the model\n", - " verbose=False, #Verbose=True will print extra output about skipped models\n", - " positive_only=True) #positive_only=True [recommended, set automatically]:\n", - " #(whether only positive coeficients should be considered)" + "columns = [col for col in basetable.columns if col.endswith(\"_bin\") or col.endswith(\"_processed\")]" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 36, "metadata": {}, + "outputs": [], "source": [ - "We can inspect what the output is. 
It contais all the necessary information about the model building:\n", - " * model coefficients\n", - " * predictors from forward selection\n", - " * AUC performance on train/selection/validation set\n", - " * cumulative response/gains\n", - " * variable importance\n", - " * etc.\n", - " \n", - "Ever row contains one step in the forward selection process, thus one model with selected variables." + "pigs = generate_pig_tables(basetable[basetable[\"split\"] == \"train\"],\n", + " id_column_name=\"ID\", target_column_name=\"TARGET\",\n", + " preprocessed_predictors= columns)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -1725,362 +890,278 @@ " \n", " \n", " \n", - " step\n", - " coef\n", - " intercept\n", - " auc_train\n", - " auc_selection\n", - " auc_validation\n", - " predictors_subset\n", - " last_var_added\n", - " pred_training\n", - " pred_selection\n", - " pred_validation\n", - " cum_response\n", - " cum_gains\n", - " importance\n", + " variable\n", + " label\n", + " pop_size\n", + " avg_incidence\n", + " incidence\n", " \n", " \n", " \n", " \n", + " 1\n", + " sflag_4_processed\n", + " 1\n", + " 0.499198\n", + " 0.239277\n", + " 0.243216\n", + " \n", + " \n", " 0\n", + " sflag_5_processed\n", + " 0\n", + " 0.503395\n", + " 0.239277\n", + " 0.235426\n", + " \n", + " \n", + " 1\n", + " sflag_5_processed\n", " 1\n", - " [[4.8883387625778765]]\n", - " [-2.326640746611817]\n", - " 0.510697\n", - " 0.505875\n", - " 0.502901\n", - " [D_scont_2]\n", - " D_scont_2\n", - " [[0.768583054973849, 0.231416945026151], [0.76...\n", - " [[0.768583054973849, 0.231416945026151], [0.76...\n", - " [[0.7638833845524492, 0.23611661544755083], [0...\n", - " [21.77, 24.49, 26.36, 26.58, 24.83, 25.11, 24....\n", - " [0, 0.91, 2.05, 3.31, 4.45, 5.19, 6.3, 7.04, 8...\n", - " {'scont_2': 0.9999602164427089}\n", + " 0.496605\n", + " 0.239277\n", + " 0.243180\n", + " \n", + " \n", + " 0\n", + " workclass_processed\n", + " 
?\n", + " 0.057430\n", + " 0.239277\n", + " 0.093880\n", " \n", " \n", " 1\n", - " 2\n", - " [[5.6064185594945055, 5.4040124941185645]]\n", - " [-3.792097034819709]\n", - " 0.515900\n", - " 0.500047\n", - " 0.495731\n", - " [D_scont_2, D_scont_1]\n", - " D_scont_1\n", - " [[0.762753676141239, 0.23724632385876096], [0....\n", - " [[0.762753676141239, 0.23724632385876096], [0....\n", - " [[0.7572651247462866, 0.24273487525371337], [0...\n", - " [38.78, 40.48, 40.91, 35.26, 28.24, 23.86, 20....\n", - " [0, 1.63, 3.39, 5.13, 5.9, 5.9, 5.99, 6.07, 7....\n", - " {'scont_2': 0.7487108741471443, 'scont_1': 0.6...\n", + " workclass_processed\n", + " Federal-gov\n", + " 0.029790\n", + " 0.239277\n", + " 0.388316\n", " \n", " \n", " 2\n", - " 3\n", - " [[5.236067628970329, 5.8321544245240755, 6.589...\n", - " [-5.805207410483017]\n", - " 0.784975\n", - " 0.779989\n", - " 0.774255\n", - " [D_scont_2, D_scont_1, D_relationship]\n", - " D_relationship\n", - " [[0.5571763524627912, 0.4428236475372088], [0....\n", - " [[0.5571763524627912, 0.4428236475372088], [0....\n", - " [[0.5502313500711142, 0.4497686499288857], [0....\n", - " [43.54, 46.94, 45.68, 45.66, 44.75, 45.57, 45....\n", - " [0, 1.82, 3.93, 5.73, 7.64, 9.35, 11.43, 13.34...\n", - " {'scont_2': 0.03837649674025111, 'scont_1': 0....\n", + " workclass_processed\n", + " Local-gov\n", + " 0.065791\n", + " 0.239277\n", + " 0.303423\n", " \n", " \n", " 3\n", - " 4\n", - " [[5.829258106552866, 7.1572932587913565, 6.910...\n", - " [-7.817413679010036]\n", - " 0.858115\n", - " 0.856198\n", - " 0.853614\n", - " [D_scont_2, D_scont_1, D_relationship, D_educa...\n", - " D_education\n", - " [[0.07688945912663137, 0.9231105408733686], [0...\n", - " [[0.3481266903051372, 0.6518733096948628], [0....\n", - " [[0.3410603659913337, 0.6589396340086663], [0....\n", - " [82.99, 83.33, 81.59, 80.75, 78.17, 75.57, 74....\n", - " [0, 3.48, 6.99, 10.24, 13.52, 16.34, 18.96, 21...\n", - " {'scont_2': 0.02268402069355277, 'scont_1': 0....\n", + 
" workclass_processed\n", + " Other\n", + " 0.040915\n", + " 0.239277\n", + " 0.262719\n", " \n", " \n", " 4\n", - " 5\n", - " [[6.308402146245059, 6.6555736472448945, 6.864...\n", - " [-8.726303718168493]\n", - " 0.871315\n", - " 0.871706\n", - " 0.868557\n", - " [D_scont_2, D_scont_1, D_relationship, D_educa...\n", - " D_capital-gain\n", - " [[0.09781071808143116, 0.9021892819185688], [0...\n", - " [[0.39754912429362177, 0.6024508757063782], [0...\n", - " [[0.3894682127983645, 0.6105317872016355], [0....\n", - " [94.56, 88.44, 86.36, 87.39, 84.04, 82.5, 79.7...\n", - " [0, 3.96, 7.41, 10.84, 14.63, 17.56, 20.7, 23....\n", - " {'scont_2': 0.024497630031203383, 'scont_1': 0...\n", + " workclass_processed\n", + " Private\n", + " 0.690599\n", + " 0.239277\n", + " 0.218006\n", + " \n", + " \n", + " 5\n", + " workclass_processed\n", + " Self-emp-inc\n", + " 0.034977\n", + " 0.239277\n", + " 0.541463\n", + " \n", + " \n", + " 6\n", + " workclass_processed\n", + " Self-emp-not-inc\n", + " 0.080498\n", + " 0.239277\n", + " 0.274693\n", " \n", " \n", "\n", "" ], "text/plain": [ - " step coef \\\n", - "0 1 [[4.8883387625778765]] \n", - "1 2 [[5.6064185594945055, 5.4040124941185645]] \n", - "2 3 [[5.236067628970329, 5.8321544245240755, 6.589... \n", - "3 4 [[5.829258106552866, 7.1572932587913565, 6.910... \n", - "4 5 [[6.308402146245059, 6.6555736472448945, 6.864... \n", - "\n", - " intercept auc_train auc_selection auc_validation \\\n", - "0 [-2.326640746611817] 0.510697 0.505875 0.502901 \n", - "1 [-3.792097034819709] 0.515900 0.500047 0.495731 \n", - "2 [-5.805207410483017] 0.784975 0.779989 0.774255 \n", - "3 [-7.817413679010036] 0.858115 0.856198 0.853614 \n", - "4 [-8.726303718168493] 0.871315 0.871706 0.868557 \n", - "\n", - " predictors_subset last_var_added \\\n", - "0 [D_scont_2] D_scont_2 \n", - "1 [D_scont_2, D_scont_1] D_scont_1 \n", - "2 [D_scont_2, D_scont_1, D_relationship] D_relationship \n", - "3 [D_scont_2, D_scont_1, D_relationship, D_educa... 
D_education \n", - "4 [D_scont_2, D_scont_1, D_relationship, D_educa... D_capital-gain \n", - "\n", - " pred_training \\\n", - "0 [[0.768583054973849, 0.231416945026151], [0.76... \n", - "1 [[0.762753676141239, 0.23724632385876096], [0.... \n", - "2 [[0.5571763524627912, 0.4428236475372088], [0.... \n", - "3 [[0.07688945912663137, 0.9231105408733686], [0... \n", - "4 [[0.09781071808143116, 0.9021892819185688], [0... \n", - "\n", - " pred_selection \\\n", - "0 [[0.768583054973849, 0.231416945026151], [0.76... \n", - "1 [[0.762753676141239, 0.23724632385876096], [0.... \n", - "2 [[0.5571763524627912, 0.4428236475372088], [0.... \n", - "3 [[0.3481266903051372, 0.6518733096948628], [0.... \n", - "4 [[0.39754912429362177, 0.6024508757063782], [0... \n", - "\n", - " pred_validation \\\n", - "0 [[0.7638833845524492, 0.23611661544755083], [0... \n", - "1 [[0.7572651247462866, 0.24273487525371337], [0... \n", - "2 [[0.5502313500711142, 0.4497686499288857], [0.... \n", - "3 [[0.3410603659913337, 0.6589396340086663], [0.... \n", - "4 [[0.3894682127983645, 0.6105317872016355], [0.... \n", - "\n", - " cum_response \\\n", - "0 [21.77, 24.49, 26.36, 26.58, 24.83, 25.11, 24.... \n", - "1 [38.78, 40.48, 40.91, 35.26, 28.24, 23.86, 20.... \n", - "2 [43.54, 46.94, 45.68, 45.66, 44.75, 45.57, 45.... \n", - "3 [82.99, 83.33, 81.59, 80.75, 78.17, 75.57, 74.... \n", - "4 [94.56, 88.44, 86.36, 87.39, 84.04, 82.5, 79.7... \n", - "\n", - " cum_gains \\\n", - "0 [0, 0.91, 2.05, 3.31, 4.45, 5.19, 6.3, 7.04, 8... \n", - "1 [0, 1.63, 3.39, 5.13, 5.9, 5.9, 5.99, 6.07, 7.... \n", - "2 [0, 1.82, 3.93, 5.73, 7.64, 9.35, 11.43, 13.34... \n", - "3 [0, 3.48, 6.99, 10.24, 13.52, 16.34, 18.96, 21... \n", - "4 [0, 3.96, 7.41, 10.84, 14.63, 17.56, 20.7, 23.... \n", - "\n", - " importance \n", - "0 {'scont_2': 0.9999602164427089} \n", - "1 {'scont_2': 0.7487108741471443, 'scont_1': 0.6... \n", - "2 {'scont_2': 0.03837649674025111, 'scont_1': 0.... \n", - "3 {'scont_2': 0.02268402069355277, 'scont_1': 0.... 
\n", - "4 {'scont_2': 0.024497630031203383, 'scont_1': 0... " + " variable label pop_size avg_incidence incidence\n", + "1 sflag_4_processed 1 0.499198 0.239277 0.243216\n", + "0 sflag_5_processed 0 0.503395 0.239277 0.235426\n", + "1 sflag_5_processed 1 0.496605 0.239277 0.243180\n", + "0 workclass_processed ? 0.057430 0.239277 0.093880\n", + "1 workclass_processed Federal-gov 0.029790 0.239277 0.388316\n", + "2 workclass_processed Local-gov 0.065791 0.239277 0.303423\n", + "3 workclass_processed Other 0.040915 0.239277 0.262719\n", + "4 workclass_processed Private 0.690599 0.239277 0.218006\n", + "5 workclass_processed Self-emp-inc 0.034977 0.239277 0.541463\n", + "6 workclass_processed Self-emp-not-inc 0.080498 0.239277 0.274693" ] }, - "execution_count": 13, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_model1.head(n=5)" + "# As you can see below, the pigs DataFrame is idealy suited to export to csv and load into Jan's Excel template!\n", + "pigs.tail(n=10)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 39, "metadata": {}, + "outputs": [], "source": [ - "Clearly, we again need visual inspection. Therefore, below are plots for **AUC**, **Variable importance** and **Cumulative gain/response**.\n", - "\n", - "AUC plots is supposed to help choosing optimal number of variables in the model. Further down, it needs to be specified which model should be used." + "preselected_predictors = univariate_selection.get_preselected_predictors(df_auc)" ] }, { - "cell_type": "code", - "execution_count": 14, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
    " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ - "build.plotAUC(df=df_model1, #Dataframe with models\n", - " dim=(12,8)) #Size of the figure, if not specified, automatically set to (12,8)" + "## Model building\n", + "The next step is to find the best model. COBRA allows you to build a model and specify which variables will be forced or excluded." ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 40, "metadata": {}, + "outputs": [], "source": [ - "For example below, we plot variable importance for 5th model with 5 variables.\n", + "forward_selection = ForwardFeatureSelection(pos_only=True)\n", "\n", - "Keep in mind that the `fit_model()` method will return multiple models, each with different set of variables." + "forward_selection.fit(basetable[basetable[\"split\"] == \"train\"], \n", + " \"TARGET\", \n", + " preselected_predictors,\n", + " forced_predictors=[\"scont_1_enc\", \"scont_1_enc\"])" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 41, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
    " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "build.plotVariableImportance(df_model1, #Dataframe with models\n", - " step=5, #What model should be plotted\n", - " dim=(12,8)) #Size of the figure, if not specified, automatically set to (12,8)" + "performances = forward_selection.compute_model_performances(basetable, \"TARGET\")" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 42, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    predictorslast_added_predictortrain_performanceselection_performancevalidation_performance
    0[scont_1_enc]scont_1_enc0.5060350.4891550.497746
    1[scont_1_enc, relationship_enc]relationship_enc0.7796810.7752670.787203
    2[scont_1_enc, relationship_enc, education_enc]education_enc0.8580480.8563760.859707
    3[scont_1_enc, education_enc, relationship_enc,...age_enc0.8705360.8721490.871496
    4[scont_1_enc, education_enc, relationship_enc,...occupation_enc0.8783800.8824680.880178
    \n", + "
    " + ], "text/plain": [ - "
    " + " predictors last_added_predictor \\\n", + "0 [scont_1_enc] scont_1_enc \n", + "1 [scont_1_enc, relationship_enc] relationship_enc \n", + "2 [scont_1_enc, relationship_enc, education_enc] education_enc \n", + "3 [scont_1_enc, education_enc, relationship_enc,... age_enc \n", + "4 [scont_1_enc, education_enc, relationship_enc,... occupation_enc \n", + "\n", + " train_performance selection_performance validation_performance \n", + "0 0.506035 0.489155 0.497746 \n", + "1 0.779681 0.775267 0.787203 \n", + "2 0.858048 0.856376 0.859707 \n", + "3 0.870536 0.872149 0.871496 \n", + "4 0.878380 0.882468 0.880178 " ] }, + "execution_count": 42, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ - "build.plotCumulatives([(df_model1,5)], #List of tuples (dataframe with models, model number)\n", - " df_transformed, #Dataframe with transformed dataset\n", - " dim=(12,8)) #Size of the figure, if not specified, automatically set to (12,8)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After fitting the model, following class attributes are available" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "#Dictionary containing paritioned data in forms of dataframes\n", - "#partition_dict = build._partition_dict \n", - "#print(partition_dict.values)" + "performances.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Model comparison\n", - "In the next step, the analyst needs to try different models. Thus, we can build multiple models and compare them. Below we build another two models and give them a name." 
+ "## Model selection and evaluation" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "df_model2 = build.fit_model(df_transformed, \n", - " df_unisel,\n", - " modeling_nsteps=30,\n", - " forced_vars=None,\n", - " excluded_vars=None,\n", - " name='All variables')\n", - "\n", - "df_model3 = build.fit_model(df_transformed, \n", - " df_unisel,\n", - " modeling_nsteps=5,\n", - " forced_vars=['capital-gain','hours-per-week'],\n", - " excluded_vars=['age','relationship', 'sex'],\n", - " name='Limited model')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And now we can compare all the three models as we are used to from the web version of COBRA - with **AUC comparison** and **Cumulative gains/response** plots." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
    " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "build.plotAUCComparison([(df_model1,3), (df_model2,5), (df_model3, 4)])" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
    " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "build.plotCumulatives([(df_model1,3), (df_model2,5), (df_model3, 4)], df_transformed)" - ] + "source": [] } ], "metadata": { @@ -2099,9 +1180,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + "version": "3.8.0" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } From ba01627a4081d42a0daa0cdab86708fca1bedbae Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 25 Mar 2020 11:22:31 +0100 Subject: [PATCH 72/98] Modify evaluation's __init__.py --- cobra/evaluation/__init__.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/cobra/evaluation/__init__.py b/cobra/evaluation/__init__.py index 3172704..4575e23 100644 --- a/cobra/evaluation/__init__.py +++ b/cobra/evaluation/__init__.py @@ -1,5 +1,17 @@ -from .pigs_tables import generate_pig_tables, compute_pig_table, plot_pig_graph +from .pigs_tables import generate_pig_tables +from .pigs_tables import compute_pig_table +from .pigs_tables import plot_pig_graph + +from .performance_curves import plot_performance_curves + +from .predictor_quality import plot_variable_importance +from .predictor_quality import plot_predictor_quality +from .predictor_quality import plot_correlation_matrix __all__ = ['generate_pig_tables', 'compute_pig_table', - 'plot_pig_graph'] + 'plot_pig_graph', + 'plot_performance_curves', + 'plot_variable_importance', + 'plot_predictor_quality', + 'plot_correlation_matrix'] From 2c47dd215168738a1b96c63a90300f215b899e8c Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 25 Mar 2020 11:27:05 +0100 Subject: [PATCH 73/98] Add evaluation to setup.py --- setup.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 20ac799..a13d0d8 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,10 @@ name="cobra", version="1.0.0", description="Python Prediction's methodology for 
predictive analytics", - packages=["cobra", "cobra.preprocessing", "cobra.model_building"], + packages=["cobra", + "cobra.preprocessing", + "cobra.model_building", + "cobra.evaluation"], url="https://github.com/PythonPredictions", #long_description=long_description, # TO DO #long_description_content_type="text/markdown", @@ -16,5 +19,5 @@ "scikit_learn>=0.22.1", "matplotlib>=3.0.2", "seaborn>=0.9.0"], - python_requires=">=3.5", + python_requires=">=3.6", ) From 746c99b7fac3528b1d1adf4932f020248395d7b6 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 25 Mar 2020 14:14:09 +0100 Subject: [PATCH 74/98] Change line endings to linux style in evaluation module --- cobra/evaluation/__init__.py | 34 +- cobra/evaluation/model_evaluator.py | 708 ++++++++++++------------- cobra/evaluation/performance_curves.py | 70 +-- cobra/evaluation/pigs_tables.py | 218 ++++---- cobra/evaluation/predictor_quality.py | 170 +++--- 5 files changed, 600 insertions(+), 600 deletions(-) diff --git a/cobra/evaluation/__init__.py b/cobra/evaluation/__init__.py index 4575e23..5a3f437 100644 --- a/cobra/evaluation/__init__.py +++ b/cobra/evaluation/__init__.py @@ -1,17 +1,17 @@ -from .pigs_tables import generate_pig_tables -from .pigs_tables import compute_pig_table -from .pigs_tables import plot_pig_graph - -from .performance_curves import plot_performance_curves - -from .predictor_quality import plot_variable_importance -from .predictor_quality import plot_predictor_quality -from .predictor_quality import plot_correlation_matrix - -__all__ = ['generate_pig_tables', - 'compute_pig_table', - 'plot_pig_graph', - 'plot_performance_curves', - 'plot_variable_importance', - 'plot_predictor_quality', - 'plot_correlation_matrix'] +from .pigs_tables import generate_pig_tables +from .pigs_tables import compute_pig_table +from .pigs_tables import plot_pig_graph + +from .performance_curves import plot_performance_curves + +from .predictor_quality import plot_variable_importance +from .predictor_quality 
import plot_predictor_quality +from .predictor_quality import plot_correlation_matrix + +__all__ = ['generate_pig_tables', + 'compute_pig_table', + 'plot_pig_graph', + 'plot_performance_curves', + 'plot_variable_importance', + 'plot_predictor_quality', + 'plot_correlation_matrix'] diff --git a/cobra/evaluation/model_evaluator.py b/cobra/evaluation/model_evaluator.py index a0addd9..f2d1093 100644 --- a/cobra/evaluation/model_evaluator.py +++ b/cobra/evaluation/model_evaluator.py @@ -1,354 +1,354 @@ -""" -Created on Fri Apr 12 09:36:37 2019 -@author: AP_JBENEK -""" -import numpy as np -import pandas as pd -import seaborn as sns -import matplotlib.pyplot as plt - -from sklearn.metrics import precision_score -from sklearn.metrics import recall_score -from sklearn.metrics import f1_score -from sklearn.metrics import accuracy_score -from sklearn.metrics import roc_curve -from sklearn.metrics import confusion_matrix -from sklearn.metrics import roc_auc_score - - -class Evaluator(): - - def __init__(self, y_true, y_pred_p, threshold=0.5, lift_at=0.1): - self.y_true = y_true - self.y_pred_p = y_pred_p # As probability - self.lift_at = lift_at - self.threshold = threshold - - #Convert to bool - self.y_pred_b = np.array([0 if pred <= self.threshold else 1 - for pred in self.y_pred_p]) - - def plotROCCurve(self, save_pth=None, desc=None): - ''' - Plot ROC curve and print best cutoff value - - Parameters - ---------- - y_true: True values of target y - proba: Predicted values of target y, probabilities - save: whether plot should be saved (if yes, then now shown) - desc: description of the plot, used also as a name of saved plot - ''' - if desc is None: - desc = '' - - fpr, tpr, thresholds = roc_curve(self.y_true, self.y_pred_p) - - #--------------------------- - #Calculate AUC - #-------------------------- - out_perfo = self.evaluation() - score = out_perfo['AUC'] - - fig, ax = plt.subplots(figsize=(8, 5)) - ax.plot(fpr, tpr, color='darkorange', lw=2, - label='ROC curve (area 
= {s:.3})'.format(s=score)) - ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') - ax.set_xlabel('False Positive Rate', fontsize=15) - ax.set_ylabel('True Positive Rate', fontsize=15) - ax.legend(loc="lower right") - ax.set_title('ROC Curve {}' .format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - #Best cutoff value - #i want value where FPR is highest and FPR is lowest - #https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python - i = np.arange(len(tpr)) - roc = pd.DataFrame({'tf': pd.Series(tpr-(1-fpr), index=i), - 'threshold': pd.Series(thresholds, index=i)}) - roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]] - - best_cutoff = list(roc_t['threshold']) - print(f'Best cutoff value for probability is: {best_cutoff[0]}') - - def plotConfusionMatrix(self, labels=None, color='Reds', save_pth=None, desc=None): - ''' - Plot Confusion matrix with performance measures - - Parameters - ---------- - y_test: True values of target y - pred: Predicted values of target y, boolean - labels: labels for the matrix, if empty, values from y_test_ are used - color: Color of the matrix, its a cmap, so many values possible - save: whether plot should be saved (if yes, then now shown) - desc: description of the plot, used also as a name of saved plot - ''' - if labels is None: - labels = [str(lab) for lab in np.unique(self.y_true)] - - if desc is None: - desc = '' - - cm = confusion_matrix(self.y_true, self.y_pred_b) - - fig, ax = plt.subplots(figsize=(8,5)) - ax = sns.heatmap(cm, annot=cm.astype(str), fmt="s", cmap=color, - xticklabels=labels, yticklabels=labels) - ax.set_title('Confusion matrix {}'.format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - out_perfo = self.evaluation() - - # If we mark customer as a churner, how often we are correct - print('Precision: 
{s:.3}'.format(s=out_perfo['precision'])) - # Overall performance - print('Accuracy: {s:.3}'.format(s=out_perfo['accuracy'])) - # How many churners can the model detect - print('Recall: {s:.3}'.format(s=out_perfo['recall'])) - # 2 * (precision * recall) / (precision + recall) - print('F1 Score: {s:.3}'.format(s=out_perfo['F1'])) - # 2 * (precision * recall) / (precision + recall) - print('Lift at top {l}%: {s:.3}' - .format(l=self.lift_at*100, s=out_perfo['lift'])) - # 2 * (precision * recall) / (precision + recall) - print('AUC: {s:.3}'.format(s=out_perfo['AUC'])) - - def plotCumulativeGains(self, save_pth=None, desc=None): - ''' - Functions plot cumulative gains - - Parameters - ---------- - save: whether plot should be saved (if yes, then now shown) - desc: description of the plot, used also as a name of saved plot - ''' - if desc is None: - desc = '' - - #--------------------------- - #Calculate cumulative gains - #-------------------------- - nrows = len(self.y_true) - npositives = self.y_true.sum() - df_y_pred = (pd.DataFrame({"y": self.y_true, "y_pred": self.y_pred_p}) - .sort_values(by='y_pred', ascending=False) - .reset_index(drop=True)) - cgains = [0] - for stop in (np.linspace(0.01, 1, 100) * nrows).astype(int): - cgains.append(round(df_y_pred.loc[:stop, 'y'].sum()/npositives*max(100, 1), 2)) - - #--------------------------- - #Plot it - #--------------------------- - plt.style.use('seaborn-darkgrid') - fig, ax_cgains = plt.subplots(figsize=(8, 5)) - ax_cgains.plot(cgains, color='blue', linewidth=3, - label='cumulative gains') - ax_cgains.plot(ax_cgains.get_xlim(), ax_cgains.get_ylim(), linewidth=3, - ls="--", color="darkorange", label='random selection') - ax_cgains.set_title('Cumulative Gains ' + desc, fontsize=20) - - ax_cgains.set_title('Cumulative Gains {}' .format(desc), fontsize=20) - #Format axes - ax_cgains.set_xlim([0, 100]) - ax_cgains.set_ylim([0, 100]) - #Format ticks - ax_cgains.set_yticklabels(['{:3.0f}%'.format(x) - for x in 
ax_cgains.get_yticks()]) - ax_cgains.set_xticklabels(['{:3.0f}%'.format(x) - for x in ax_cgains.get_xticks()]) - #Legend - ax_cgains.legend(loc='lower right') - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - def plotLift(self, desc=None, save_pth=None): - ''' - Method plots lift per decile - - Parameters - ---------- - save: whether plot should be saved (if yes, then now shown) - desc: description of the plot, used also as a name of saved plot - ''' - #--------------------- - #-- CALCULATE LIFT --- - #--------------------- -# inc_rate = self.y_true.mean() - lifts = [Evaluator.liftCalculator(y_true=self.y_true, - y_pred=self.y_pred_p, - lift_at=perc_lift) - for perc_lift in np.arange(0.1, 1.1, 0.1)] - - #--------------------- - #------- PLOT -------- - #--------------------- - if desc is None: - desc = '' - - fig, ax = plt.subplots(figsize=(8,5)) - plt.style.use('seaborn-darkgrid') - - nrows = len(lifts) - x_labels = [nrows-x for x in np.arange(0, nrows, 1)] - - plt.bar(x_labels[::-1], lifts, align='center', color="cornflowerblue") - plt.ylabel('lift', fontsize=15) - plt.xlabel('decile', fontsize=15) - ax.set_xticks(x_labels) - ax.set_xticklabels(x_labels) - - plt.axhline(y=1, color='darkorange', linestyle='--', - xmin=0.1, xmax=0.9, linewidth=3, label='Baseline') - - #Legend - ax.legend(loc='upper right') - - ##Set Axis - make them pretty - sns.despine(ax=ax, right=True, left=True) - - #Remove white lines from the second axis - ax.grid(False) - - ##Description - ax.set_title('Cumulative Lift {}'.format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - def plotCumulativeResponse(self, desc=None, save_pth=None): - #--------------------- - #-- CALCULATE LIFT --- - #--------------------- - inc_rate = self.y_true.mean() - lifts = [Evaluator.liftCalculator(y_true=self.y_true, - y_pred=self.y_pred_p, - lift_at=perc_lift) 
- for perc_lift in np.arange(0.1, 1.1, 0.1)] - lifts = np.array(lifts)*inc_rate*100 - #--------------------- - #------- PLOT -------- - #--------------------- - if desc is None: - desc = '' - - fig, ax = plt.subplots(figsize=(8, 5)) - #plt.style.use('seaborn-darkgrid') - plt.style.use('default') - - nrows = len(lifts) - x_labels = [nrows-x for x in np.arange(0, nrows, 1)] - - plt.bar(x_labels[::-1], lifts, align='center', color="#00ccff") - plt.ylabel('response (%)', fontsize=16) - plt.xlabel('decile', fontsize=16) - ax.set_xticks(x_labels) - ax.set_xticklabels(x_labels) - - plt.axhline(y=inc_rate*100, color='#ff9500', linestyle='--', - xmin=0.05, xmax=0.95, linewidth=3, label='Incidence') - - #Legend - ax.legend(loc='upper right') - - ##Set Axis - make them pretty - sns.despine(ax=ax, right=True, left=True) - - #Remove white lines from the second axis - ax.grid(False) - - ##Description - ax.set_title('Cumulative response {}'.format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - def evaluation(self): - ''' - Convenient function, returns various performance measures in a dict - - Parameters - ---------- - y_true: true values - y_pred: predictions as booleans - - Output - ------ - Returns dictionary with the measures - ''' - - dict_perfo = {'precision': precision_score(self.y_true, self.y_pred_b), - 'accuracy': accuracy_score(self.y_true, self.y_pred_b), - 'recall': recall_score(self.y_true, self.y_pred_b), - 'F1': f1_score(self.y_true, self.y_pred_b, - average=None)[1], - 'lift': np.round(Evaluator - .liftCalculator(y_true=self.y_true, - y_pred=self.y_pred_p, - lift_at=self.lift_at), - 2), - 'AUC': roc_auc_score(self.y_true, self.y_pred_p) - } - return dict_perfo - - @staticmethod - def liftCalculator(y_true, y_pred, lift_at=0.05, **kwargs): - ''' - Calculates lift given two arrays on specified level - - Parameters - ---------- - y_true: numpy array with true values - y_pred: 
numpy array with predictions (probabilities) - lift_at: lift at what top percentage - - Output - ------ - Scalar value, lift. - - 50.3 µs ± 1.94 µs per loop (mean ± std. dev. of 7 runs, - 10000 loops each) - ''' - #Make sure it is numpy array - y_true_ = np.array(y_true) - y_pred_ = np.array(y_pred) - - #Make sure it has correct shape - y_true_ = y_true_.reshape(len(y_true_), 1) - y_pred_ = y_pred_.reshape(len(y_pred_), 1) - - #Merge data together - y_data = np.hstack([y_true_, y_pred_]) - - #Calculate necessary variables - nrows = len(y_data) - stop = int(np.floor(nrows*lift_at)) - avg_incidence = np.einsum('ij->j', y_true_)/float(len(y_true_)) - - #Sort and filter data - data_sorted = (y_data[y_data[:, 1].argsort()[::-1]][:stop, 0] - .reshape(stop, 1)) - - #Calculate lift (einsum is very fast way of summing, - # needs specific shape) - inc_in_top_n = np.einsum('ij->j', data_sorted)/float(len(data_sorted)) - - lift = np.round(inc_in_top_n/avg_incidence, 2)[0] - - return lift +""" +Created on Fri Apr 12 09:36:37 2019 +@author: AP_JBENEK +""" +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + +from sklearn.metrics import precision_score +from sklearn.metrics import recall_score +from sklearn.metrics import f1_score +from sklearn.metrics import accuracy_score +from sklearn.metrics import roc_curve +from sklearn.metrics import confusion_matrix +from sklearn.metrics import roc_auc_score + + +class Evaluator(): + + def __init__(self, y_true, y_pred_p, threshold=0.5, lift_at=0.1): + self.y_true = y_true + self.y_pred_p = y_pred_p # As probability + self.lift_at = lift_at + self.threshold = threshold + + #Convert to bool + self.y_pred_b = np.array([0 if pred <= self.threshold else 1 + for pred in self.y_pred_p]) + + def plotROCCurve(self, save_pth=None, desc=None): + ''' + Plot ROC curve and print best cutoff value + + Parameters + ---------- + y_true: True values of target y + proba: Predicted values of target y, 
probabilities + save: whether plot should be saved (if yes, then now shown) + desc: description of the plot, used also as a name of saved plot + ''' + if desc is None: + desc = '' + + fpr, tpr, thresholds = roc_curve(self.y_true, self.y_pred_p) + + #--------------------------- + #Calculate AUC + #-------------------------- + out_perfo = self.evaluation() + score = out_perfo['AUC'] + + fig, ax = plt.subplots(figsize=(8, 5)) + ax.plot(fpr, tpr, color='darkorange', lw=2, + label='ROC curve (area = {s:.3})'.format(s=score)) + ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') + ax.set_xlabel('False Positive Rate', fontsize=15) + ax.set_ylabel('True Positive Rate', fontsize=15) + ax.legend(loc="lower right") + ax.set_title('ROC Curve {}' .format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + #Best cutoff value + #i want value where FPR is highest and FPR is lowest + #https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python + i = np.arange(len(tpr)) + roc = pd.DataFrame({'tf': pd.Series(tpr-(1-fpr), index=i), + 'threshold': pd.Series(thresholds, index=i)}) + roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]] + + best_cutoff = list(roc_t['threshold']) + print(f'Best cutoff value for probability is: {best_cutoff[0]}') + + def plotConfusionMatrix(self, labels=None, color='Reds', save_pth=None, desc=None): + ''' + Plot Confusion matrix with performance measures + + Parameters + ---------- + y_test: True values of target y + pred: Predicted values of target y, boolean + labels: labels for the matrix, if empty, values from y_test_ are used + color: Color of the matrix, its a cmap, so many values possible + save: whether plot should be saved (if yes, then now shown) + desc: description of the plot, used also as a name of saved plot + ''' + if labels is None: + labels = [str(lab) for lab in np.unique(self.y_true)] + + if desc is None: + desc = '' + + cm = 
confusion_matrix(self.y_true, self.y_pred_b) + + fig, ax = plt.subplots(figsize=(8,5)) + ax = sns.heatmap(cm, annot=cm.astype(str), fmt="s", cmap=color, + xticklabels=labels, yticklabels=labels) + ax.set_title('Confusion matrix {}'.format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + out_perfo = self.evaluation() + + # If we mark customer as a churner, how often we are correct + print('Precision: {s:.3}'.format(s=out_perfo['precision'])) + # Overall performance + print('Accuracy: {s:.3}'.format(s=out_perfo['accuracy'])) + # How many churners can the model detect + print('Recall: {s:.3}'.format(s=out_perfo['recall'])) + # 2 * (precision * recall) / (precision + recall) + print('F1 Score: {s:.3}'.format(s=out_perfo['F1'])) + # 2 * (precision * recall) / (precision + recall) + print('Lift at top {l}%: {s:.3}' + .format(l=self.lift_at*100, s=out_perfo['lift'])) + # 2 * (precision * recall) / (precision + recall) + print('AUC: {s:.3}'.format(s=out_perfo['AUC'])) + + def plotCumulativeGains(self, save_pth=None, desc=None): + ''' + Functions plot cumulative gains + + Parameters + ---------- + save: whether plot should be saved (if yes, then now shown) + desc: description of the plot, used also as a name of saved plot + ''' + if desc is None: + desc = '' + + #--------------------------- + #Calculate cumulative gains + #-------------------------- + nrows = len(self.y_true) + npositives = self.y_true.sum() + df_y_pred = (pd.DataFrame({"y": self.y_true, "y_pred": self.y_pred_p}) + .sort_values(by='y_pred', ascending=False) + .reset_index(drop=True)) + cgains = [0] + for stop in (np.linspace(0.01, 1, 100) * nrows).astype(int): + cgains.append(round(df_y_pred.loc[:stop, 'y'].sum()/npositives*max(100, 1), 2)) + + #--------------------------- + #Plot it + #--------------------------- + plt.style.use('seaborn-darkgrid') + fig, ax_cgains = plt.subplots(figsize=(8, 5)) + ax_cgains.plot(cgains, 
color='blue', linewidth=3, + label='cumulative gains') + ax_cgains.plot(ax_cgains.get_xlim(), ax_cgains.get_ylim(), linewidth=3, + ls="--", color="darkorange", label='random selection') + ax_cgains.set_title('Cumulative Gains ' + desc, fontsize=20) + + ax_cgains.set_title('Cumulative Gains {}' .format(desc), fontsize=20) + #Format axes + ax_cgains.set_xlim([0, 100]) + ax_cgains.set_ylim([0, 100]) + #Format ticks + ax_cgains.set_yticklabels(['{:3.0f}%'.format(x) + for x in ax_cgains.get_yticks()]) + ax_cgains.set_xticklabels(['{:3.0f}%'.format(x) + for x in ax_cgains.get_xticks()]) + #Legend + ax_cgains.legend(loc='lower right') + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + def plotLift(self, desc=None, save_pth=None): + ''' + Method plots lift per decile + + Parameters + ---------- + save: whether plot should be saved (if yes, then now shown) + desc: description of the plot, used also as a name of saved plot + ''' + #--------------------- + #-- CALCULATE LIFT --- + #--------------------- +# inc_rate = self.y_true.mean() + lifts = [Evaluator.liftCalculator(y_true=self.y_true, + y_pred=self.y_pred_p, + lift_at=perc_lift) + for perc_lift in np.arange(0.1, 1.1, 0.1)] + + #--------------------- + #------- PLOT -------- + #--------------------- + if desc is None: + desc = '' + + fig, ax = plt.subplots(figsize=(8,5)) + plt.style.use('seaborn-darkgrid') + + nrows = len(lifts) + x_labels = [nrows-x for x in np.arange(0, nrows, 1)] + + plt.bar(x_labels[::-1], lifts, align='center', color="cornflowerblue") + plt.ylabel('lift', fontsize=15) + plt.xlabel('decile', fontsize=15) + ax.set_xticks(x_labels) + ax.set_xticklabels(x_labels) + + plt.axhline(y=1, color='darkorange', linestyle='--', + xmin=0.1, xmax=0.9, linewidth=3, label='Baseline') + + #Legend + ax.legend(loc='upper right') + + ##Set Axis - make them pretty + sns.despine(ax=ax, right=True, left=True) + + #Remove white lines from the second axis + 
ax.grid(False) + + ##Description + ax.set_title('Cumulative Lift {}'.format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + def plotCumulativeResponse(self, desc=None, save_pth=None): + #--------------------- + #-- CALCULATE LIFT --- + #--------------------- + inc_rate = self.y_true.mean() + lifts = [Evaluator.liftCalculator(y_true=self.y_true, + y_pred=self.y_pred_p, + lift_at=perc_lift) + for perc_lift in np.arange(0.1, 1.1, 0.1)] + lifts = np.array(lifts)*inc_rate*100 + #--------------------- + #------- PLOT -------- + #--------------------- + if desc is None: + desc = '' + + fig, ax = plt.subplots(figsize=(8, 5)) + #plt.style.use('seaborn-darkgrid') + plt.style.use('default') + + nrows = len(lifts) + x_labels = [nrows-x for x in np.arange(0, nrows, 1)] + + plt.bar(x_labels[::-1], lifts, align='center', color="#00ccff") + plt.ylabel('response (%)', fontsize=16) + plt.xlabel('decile', fontsize=16) + ax.set_xticks(x_labels) + ax.set_xticklabels(x_labels) + + plt.axhline(y=inc_rate*100, color='#ff9500', linestyle='--', + xmin=0.05, xmax=0.95, linewidth=3, label='Incidence') + + #Legend + ax.legend(loc='upper right') + + ##Set Axis - make them pretty + sns.despine(ax=ax, right=True, left=True) + + #Remove white lines from the second axis + ax.grid(False) + + ##Description + ax.set_title('Cumulative response {}'.format(desc), fontsize=20) + + if save_pth is not None: + plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') + + plt.show() + + def evaluation(self): + ''' + Convenient function, returns various performance measures in a dict + + Parameters + ---------- + y_true: true values + y_pred: predictions as booleans + + Output + ------ + Returns dictionary with the measures + ''' + + dict_perfo = {'precision': precision_score(self.y_true, self.y_pred_b), + 'accuracy': accuracy_score(self.y_true, self.y_pred_b), + 'recall': recall_score(self.y_true, self.y_pred_b), + 
'F1': f1_score(self.y_true, self.y_pred_b, + average=None)[1], + 'lift': np.round(Evaluator + .liftCalculator(y_true=self.y_true, + y_pred=self.y_pred_p, + lift_at=self.lift_at), + 2), + 'AUC': roc_auc_score(self.y_true, self.y_pred_p) + } + return dict_perfo + + @staticmethod + def liftCalculator(y_true, y_pred, lift_at=0.05, **kwargs): + ''' + Calculates lift given two arrays on specified level + + Parameters + ---------- + y_true: numpy array with true values + y_pred: numpy array with predictions (probabilities) + lift_at: lift at what top percentage + + Output + ------ + Scalar value, lift. + + 50.3 µs ± 1.94 µs per loop (mean ± std. dev. of 7 runs, + 10000 loops each) + ''' + #Make sure it is numpy array + y_true_ = np.array(y_true) + y_pred_ = np.array(y_pred) + + #Make sure it has correct shape + y_true_ = y_true_.reshape(len(y_true_), 1) + y_pred_ = y_pred_.reshape(len(y_pred_), 1) + + #Merge data together + y_data = np.hstack([y_true_, y_pred_]) + + #Calculate necessary variables + nrows = len(y_data) + stop = int(np.floor(nrows*lift_at)) + avg_incidence = np.einsum('ij->j', y_true_)/float(len(y_true_)) + + #Sort and filter data + data_sorted = (y_data[y_data[:, 1].argsort()[::-1]][:stop, 0] + .reshape(stop, 1)) + + #Calculate lift (einsum is very fast way of summing, + # needs specific shape) + inc_in_top_n = np.einsum('ij->j', data_sorted)/float(len(data_sorted)) + + lift = np.round(inc_in_top_n/avg_incidence, 2)[0] + + return lift diff --git a/cobra/evaluation/performance_curves.py b/cobra/evaluation/performance_curves.py index 9ce9be4..1e0032d 100644 --- a/cobra/evaluation/performance_curves.py +++ b/cobra/evaluation/performance_curves.py @@ -1,35 +1,35 @@ -# third party imports -import numpy as np -import pandas as pd - -import matplotlib.pyplot as plt - - -def plot_performance_curves(model_performances: list, - dim: tuple=(12, 8)): - - df_plt = pd.DataFrame(model_performances) - - highest_auc = np.round(max(max(df_plt['train_performance']), - 
max(df_plt['selection_performance']), - max(df_plt['validation_performance'])), 1) - - fig, ax = plt.subplots(figsize=dim) - - plt.plot(df_plt['train_performance'], marker=".", markersize=20, - linewidth=3, label='AUC train') - plt.plot(df_plt['selection_performance'], marker=".", markersize=20, - linewidth=3, label='AUC selection') - plt.plot(df_plt['validation_performance'], marker=".", markersize=20, - linewidth=3, label='AUC validation') - # Set x/yticks - ax.set_xticks(np.arange(len(df_plt['last_added_predictor']) + 1)) - ax.set_xticklabels(df_plt['last_added_predictor'].tolist(), - rotation=40, ha='right') - ax.set_yticks(np.arange(0.5, highest_auc + 0.02, 0.05)) - #Make Pretty - ax.legend(loc='lower right') - fig.suptitle('Performance curves - forward feature selection', - fontsize=20) - plt.ylabel('Model performance') - plt.show() +# third party imports +import numpy as np +import pandas as pd + +import matplotlib.pyplot as plt + + +def plot_performance_curves(model_performance: pd.DataFrame, + dim: tuple=(12, 8)): + + highest_auc = np.round(max(max(model_performance['train_performance']), + max(model_performance['selection_performance']), + max(model_performance['validation_performance']) + ), 1) + + fig, ax = plt.subplots(figsize=dim) + + plt.plot(model_performance['train_performance'], marker=".", markersize=20, + linewidth=3, label='AUC train') + plt.plot(model_performance['selection_performance'], marker=".", + markersize=20, linewidth=3, label='AUC selection') + plt.plot(model_performance['validation_performance'], marker=".", + markersize=20, linewidth=3, label='AUC validation') + # Set x/yticks + ax.set_xticks(np.arange(len(model_performance['last_added_predictor']) + + 1)) + ax.set_xticklabels(model_performance['last_added_predictor'].tolist(), + rotation=40, ha='right') + ax.set_yticks(np.arange(0.5, highest_auc + 0.02, 0.05)) + #Make Pretty + ax.legend(loc='lower right') + fig.suptitle('Performance curves - forward feature selection', + 
fontsize=20) + plt.ylabel('Model performance') + plt.show() diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py index ccd64e9..8afb4f7 100644 --- a/cobra/evaluation/pigs_tables.py +++ b/cobra/evaluation/pigs_tables.py @@ -1,109 +1,109 @@ -# third party imports -import pandas as pd -#import matplotlib.pyplot as plt -#import seaborn as sns - -import cobra.utils as utils - - -def generate_pig_tables(data: pd.DataFrame, - id_column_name: str, - target_column_name: str, - preprocessed_predictors: list) -> pd.DataFrame: - """Summary - - Parameters - ---------- - data : pd.DataFrame - basetable to compute PIG tables of - id_column_name : str - column name of the id (e.g. customernumber) - target_column_name : str - column name of the target - predictors: list - list of preprocessed predictor names - - Returns - ------- - pd.DataFrame - DataFrame containing a PIG table for all predictors - """ - - # Based on the data, get column names by datatype - # threshold to decide whether a numeric column should be considered - # a categorical variable (if the number of distinct values is smaller - # or equal to the number of requested bins) - - pigs = [compute_pig_table(data, column_name, target_column_name, - id_column_name) - for column_name in sorted(preprocessed_predictors) - if column_name not in [id_column_name, target_column_name]] - - output = pd.concat(pigs) - - return output - - -def compute_pig_table(data: pd.DataFrame, - column_name: str, - target_column_name: str, - id_column_name: str) -> pd.DataFrame: - """Compute the pig table of a given predictor for a given target - - Parameters - ---------- - data : pd.DataFrame - input data from which to compute the pig table - column_name : str - predictor name of which to compute the pig table - target_column_name : str - name of the target variable - id_column_name : str - name of the id column (used to count population size) - - Returns - ------- - pd.DataFrame - pig table as a DataFrame - """ - 
avg_incidence = data[target_column_name].mean() - - # group by the binned variable, compute the incidence - # (=mean of the target for the given bin) and compute the bin size - # (e.g. COUNT(id_column_name)). After that, rename the columns - res = (data.groupby(column_name) - .agg({target_column_name: "mean", id_column_name: "size"}) - .reset_index() - .rename(columns={column_name: "label", - target_column_name: "incidence", - id_column_name: "pop_size"})) - - # add the column name to a variable column - # add the average incidence - # replace population size by a percentage of total population - res["variable"] = utils.clean_predictor_name(column_name) - res["avg_incidence"] = avg_incidence - res["pop_size"] = res["pop_size"]/len(data.index) - - # make sure to always return the data with the proper column order - column_order = ["variable", "label", "pop_size", - "avg_incidence", "incidence"] - - return res[column_order] - - -def plot_pig_graph(pig_table: pd.DataFrame, - dim: tuple=(12, 8), - save_path: str=None): - """Create the Predictor Insights Graphs from a PIG table - - Parameters - ---------- - pig_table : pd.DataFrame - Description - dim : tuple, optional - Tuple with width and lentgh of the plot - save_path : str, optional - path to store the plot on disk - """ - pass +# third party imports +import pandas as pd +#import matplotlib.pyplot as plt +#import seaborn as sns + +import cobra.utils as utils + + +def generate_pig_tables(data: pd.DataFrame, + id_column_name: str, + target_column_name: str, + preprocessed_predictors: list) -> pd.DataFrame: + """Summary + + Parameters + ---------- + data : pd.DataFrame + basetable to compute PIG tables of + id_column_name : str + column name of the id (e.g. 
customernumber) + target_column_name : str + column name of the target + predictors: list + list of preprocessed predictor names + + Returns + ------- + pd.DataFrame + DataFrame containing a PIG table for all predictors + """ + + # Based on the data, get column names by datatype + # threshold to decide whether a numeric column should be considered + # a categorical variable (if the number of distinct values is smaller + # or equal to the number of requested bins) + + pigs = [compute_pig_table(data, column_name, target_column_name, + id_column_name) + for column_name in sorted(preprocessed_predictors) + if column_name not in [id_column_name, target_column_name]] + + output = pd.concat(pigs) + + return output + + +def compute_pig_table(data: pd.DataFrame, + column_name: str, + target_column_name: str, + id_column_name: str) -> pd.DataFrame: + """Compute the pig table of a given predictor for a given target + + Parameters + ---------- + data : pd.DataFrame + input data from which to compute the pig table + column_name : str + predictor name of which to compute the pig table + target_column_name : str + name of the target variable + id_column_name : str + name of the id column (used to count population size) + + Returns + ------- + pd.DataFrame + pig table as a DataFrame + """ + avg_incidence = data[target_column_name].mean() + + # group by the binned variable, compute the incidence + # (=mean of the target for the given bin) and compute the bin size + # (e.g. COUNT(id_column_name)). 
After that, rename the columns + res = (data.groupby(column_name) + .agg({target_column_name: "mean", id_column_name: "size"}) + .reset_index() + .rename(columns={column_name: "label", + target_column_name: "incidence", + id_column_name: "pop_size"})) + + # add the column name to a variable column + # add the average incidence + # replace population size by a percentage of total population + res["variable"] = utils.clean_predictor_name(column_name) + res["avg_incidence"] = avg_incidence + res["pop_size"] = res["pop_size"]/len(data.index) + + # make sure to always return the data with the proper column order + column_order = ["variable", "label", "pop_size", + "avg_incidence", "incidence"] + + return res[column_order] + + +def plot_pig_graph(pig_table: pd.DataFrame, + dim: tuple=(12, 8), + save_path: str=None): + """Create the Predictor Insights Graphs from a PIG table + + Parameters + ---------- + pig_table : pd.DataFrame + Description + dim : tuple, optional + Tuple with width and lentgh of the plot + save_path : str, optional + path to store the plot on disk + """ + pass diff --git a/cobra/evaluation/predictor_quality.py b/cobra/evaluation/predictor_quality.py index f2a2ea3..327d592 100644 --- a/cobra/evaluation/predictor_quality.py +++ b/cobra/evaluation/predictor_quality.py @@ -1,85 +1,85 @@ -# third party imports -import pandas as pd - -import matplotlib.pyplot as plt -import seaborn as sns - - -def plot_variable_importance(importance_by_variable: dict, - title: str=None, - dim: tuple=(12, 8)): - """Plot variable importance of a given model - - Parameters - ---------- - importance_by_variable : dict - Map of predictor -> importance - title : str, optional - Title of the plot - dim : tuple, optional - tuple with width and lentgh of the plot - """ - df = pd.DataFrame.from_dict(importance_by_variable, - orient='index').reset_index() - - df.columns = ["predictor", "importance"] - - df = df.sort_values(by="importance", ascending=False) - - # plot data - fig, ax = 
plt.subplots(figsize=dim) - ax = sns.barplot(x="importance", y="predictor", data=df) - if title: - ax.set_title(title) - else: - ax.set_title("Variable importance") - plt.show() - - -def plot_predictor_quality(df_auc: pd.DataFrame, - dim: tuple=(12, 8)): - """Plot univariate quality of the predictors - - Parameters - ---------- - df_auc : pd.DatFrame - Contains for each variable the train auc and selection auc allong with - a boolean indicating whether or not it is selected based on the - criteria - dim : tuple, optional - tuple with width and lentgh of the plot - """ - - plt.style.use('seaborn-darkgrid') - - df = (df_auc[df_auc["preselection"]] - .sort_values(by='AUC train', ascending=False)) - - df = pd.melt(df, id_vars=["predictor"], - value_vars=["AUC train", "AUC selection"], - var_name="partition", - value_name="AUC") - - # plots - fig, ax = plt.subplots(figsize=dim) - - ax = sns.barplot(x="AUC", y="predictor", hue="partition", data=df) - ax.set_title('Univariate Quality of Predictors') - plt.show() - - -def plot_correlation_matrix(df_corr: pd.DataFrame, - dim: tuple=(12, 8)): - """Plot correlation matrix amongst the predictors - - Parameters - ---------- - df_corr : pd.DataFrame - Correlation matrix - dim : tuple, optional - tuple with width and lentgh of the plot - """ - fig, ax = plt.subplots(figsize=dim) - ax = sns.heatmap(df_corr, cmap='Blues') - ax.set_title('Correlation Matrix') - plt.show() +# third party imports +import pandas as pd + +import matplotlib.pyplot as plt +import seaborn as sns + + +def plot_variable_importance(importance_by_variable: dict, + title: str=None, + dim: tuple=(12, 8)): + """Plot variable importance of a given model + + Parameters + ---------- + importance_by_variable : dict + Map of predictor -> importance + title : str, optional + Title of the plot + dim : tuple, optional + tuple with width and lentgh of the plot + """ + df = pd.DataFrame.from_dict(importance_by_variable, + orient='index').reset_index() + + df.columns = 
["predictor", "importance"] + + df = df.sort_values(by="importance", ascending=False) + + # plot data + fig, ax = plt.subplots(figsize=dim) + ax = sns.barplot(x="importance", y="predictor", data=df) + if title: + ax.set_title(title) + else: + ax.set_title("Variable importance") + plt.show() + + +def plot_predictor_quality(df_auc: pd.DataFrame, + dim: tuple=(12, 8)): + """Plot univariate quality of the predictors + + Parameters + ---------- + df_auc : pd.DatFrame + Contains for each variable the train auc and selection auc allong with + a boolean indicating whether or not it is selected based on the + criteria + dim : tuple, optional + tuple with width and lentgh of the plot + """ + + plt.style.use('seaborn-darkgrid') + + df = (df_auc[df_auc["preselection"]] + .sort_values(by='AUC train', ascending=False)) + + df = pd.melt(df, id_vars=["predictor"], + value_vars=["AUC train", "AUC selection"], + var_name="partition", + value_name="AUC") + + # plots + fig, ax = plt.subplots(figsize=dim) + + ax = sns.barplot(x="AUC", y="predictor", hue="partition", data=df) + ax.set_title('Univariate Quality of Predictors') + plt.show() + + +def plot_correlation_matrix(df_corr: pd.DataFrame, + dim: tuple=(12, 8)): + """Plot correlation matrix amongst the predictors + + Parameters + ---------- + df_corr : pd.DataFrame + Correlation matrix + dim : tuple, optional + tuple with width and lentgh of the plot + """ + fig, ax = plt.subplots(figsize=dim) + ax = sns.heatmap(df_corr, cmap='Blues') + ax.set_title('Correlation Matrix') + plt.show() From 45c1ee6b56ca3ba5dc0c6d3141ce920dc34c82ae Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Wed, 25 Mar 2020 14:15:57 +0100 Subject: [PATCH 75/98] Type fix in README, add examples.ipynb --- README.md | 13 +- examples/examples.ipynb | 356 ++++++++++++++++++++++++++++++++++------ 2 files changed, 318 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 9dd035c..1c7bcc6 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,7 @@ 
preprocessor.fit(basetable[basetable["split"]=="train"], basetable = preprocessor.transform(basetable, continuous_vars=continuous_vars, discrete_vars=discrete_vars) + ``` Once the preprocessing pipeline is fitted and applied to your data, it is time for the actual modelling. In this part of the process, @@ -103,14 +104,18 @@ we first start with the _univariate preselection_: ```python from cobra.model_building import univariate_selection +# Get list of predictor names to use for univariate_selection +preprocessed_predictors = [col for col in basetable.columns if col.endswith("_enc")] + # perform univariate selection on preprocessed predictors: df_auc = univariate_selection.compute_univariate_preselection( target_enc_train_data=basetable[basetable["split"] == "train"], target_enc_selection_data=basetable[basetable["split"] == "selection"], predictors=preprocessed_predictors, target_column=target_column_name, - preselect_auc_threshold=0.5, - preselect_overtrain_threshold=5) + preselect_auc_threshold=0.53, # if auc_selection <= 0.53 exclude predictor + preselect_overtrain_threshold=0.05 # if (auc_train - auc_selection) >= 0.05 --> overfitting! 
+ ) # compute correlations between preprocessed predictors: df_corr = (univariate_selection @@ -142,13 +147,13 @@ performances = (forward_selection # After plotting the performances and selecting the model, # we can extract this model from the forward_selection class: -model = forward_selection.get_model_from_step(5) +model = forward_selection.get_model_from_step(5) # Python indexing starts from 0, so this model has 6 predictors # Note that model has 6 variables (python lists start with index 0), # which can be obtained as follows: final_predictors = model.predictors # We can also compute the importance of each predictor in the model (dict): -variable_importance = model.compute_variable_importance(transformed_data) +variable_importance = model.compute_variable_importance(basetable) ``` ## Development diff --git a/examples/examples.ipynb b/examples/examples.ipynb index 4611637..146ec0c 100644 --- a/examples/examples.ipynb +++ b/examples/examples.ipynb @@ -41,20 +41,18 @@ "outputs": [], "source": [ "# third party imports \n", - "import numpy as np\n", "import pandas as pd \n", "\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", "# Custom imports\n", - "import sys\n", - "sys.path.append(\"/mnt/c/Users/matroe/Documents/workspace/cobra\")\n", "from cobra.preprocessing import PreProcessor\n", "from cobra.model_building import univariate_selection\n", "from cobra.model_building import ForwardFeatureSelection\n", - "from cobra.evaluation import generate_pig_tables" + "\n", + "from cobra.evaluation import generate_pig_tables\n", + "from cobra.evaluation import plot_performance_curves\n", + "from cobra.evaluation import plot_variable_importance\n", + "from cobra.evaluation import plot_predictor_quality\n", + "from cobra.evaluation import plot_correlation_matrix" ] }, { @@ -67,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -272,13 +270,13 @@ "[5 rows 
x 36 columns]" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "path = \"../datasets/data.csv\"\n", + "path = \"\"\n", "\n", "basetable = pd.read_csv(path)\n", "\n", @@ -287,7 +285,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -296,14 +294,14 @@ "Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',\n", " 'marital-status', 'occupation', 'relationship', 'race', 'sex',\n", " 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',\n", - " 'ID', 'scont_1', 'scont_2', 'scont_3', 'scont_4', 'scont_5', 'scont_6',\n", - " 'scont_7', 'scont_8', 'scont_9', 'scont_10', 'scat_1', 'scat_2',\n", - " 'scat_3', 'scat_4', 'scat_5', 'sflag_1', 'sflag_2', 'sflag_3',\n", - " 'sflag_4', 'sflag_5', 'TARGET', 'split'],\n", + " 'TARGET', 'ID', 'scont_1', 'scont_2', 'scont_3', 'scont_4', 'scont_5',\n", + " 'scont_6', 'scont_7', 'scont_8', 'scont_9', 'scont_10', 'scat_1',\n", + " 'scat_2', 'scat_3', 'scat_4', 'scat_5', 'sflag_1', 'sflag_2', 'sflag_3',\n", + " 'sflag_4', 'sflag_5'],\n", " dtype='object')" ] }, - "execution_count": 12, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -321,7 +319,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -333,14 +331,14 @@ "Name: split, dtype: int64" ] }, - "execution_count": 8, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Prepare data\n", - "path = \"../test_pipeline.json\"\n", + "path = \"\"\n", "preprocessor = PreProcessor.from_params(serialization_path=path)\n", "\n", "basetable = preprocessor.train_selection_validation_split(basetable, target_column_name=\"TARGET\",\n", @@ -351,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -366,7 +364,7 @@ }, { "cell_type": "code", 
- "execution_count": 22, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -402,7 +400,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -423,7 +421,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -628,7 +626,7 @@ "[5 rows x 99 columns]" ] }, - "execution_count": 24, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -646,7 +644,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -664,7 +662,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -684,7 +682,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -803,7 +801,7 @@ "9 native-country 0.514099 0.513408 True" ] }, - "execution_count": 30, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -821,7 +819,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -837,6 +835,46 @@ "Clearly, for meaningful inspection, we need to visualize the data. Therefore, below are plots for **Predictor quality** and **Correlation Matrix**." ] }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_predictor_quality(df_auc)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_correlation_matrix(df_corr)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -846,7 +884,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -855,7 +893,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -866,7 +904,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -996,7 +1034,7 @@ "6 workclass_processed Self-emp-not-inc 0.080498 0.239277 0.274693" ] }, - "execution_count": 38, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1008,7 +1046,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -1025,7 +1063,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -1039,7 +1077,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -1048,7 +1086,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1106,15 +1144,15 @@ " \n", " \n", " 3\n", - " [scont_1_enc, education_enc, relationship_enc,...\n", + " [scont_1_enc, relationship_enc, education_enc,...\n", " age_enc\n", - " 0.870536\n", + " 0.870537\n", " 0.872149\n", " 0.871496\n", " \n", " \n", " 4\n", - " [scont_1_enc, education_enc, relationship_enc,...\n", + " [scont_1_enc, relationship_enc, age_enc, educa...\n", " occupation_enc\n", " 0.878380\n", " 0.882468\n", @@ -1129,18 +1167,18 @@ "0 [scont_1_enc] scont_1_enc \n", "1 [scont_1_enc, relationship_enc] relationship_enc \n", "2 [scont_1_enc, relationship_enc, education_enc] education_enc \n", - "3 [scont_1_enc, education_enc, relationship_enc,... 
age_enc \n", - "4 [scont_1_enc, education_enc, relationship_enc,... occupation_enc \n", + "3 [scont_1_enc, relationship_enc, education_enc,... age_enc \n", + "4 [scont_1_enc, relationship_enc, age_enc, educa... occupation_enc \n", "\n", " train_performance selection_performance validation_performance \n", "0 0.506035 0.489155 0.497746 \n", "1 0.779681 0.775267 0.787203 \n", "2 0.858048 0.856376 0.859707 \n", - "3 0.870536 0.872149 0.871496 \n", + "3 0.870537 0.872149 0.871496 \n", "4 0.878380 0.882468 0.880178 " ] }, - "execution_count": 42, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1149,6 +1187,26 @@ "performances.head()" ] }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_performance_curves(performances)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1158,10 +1216,214 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['age_enc', 'occupation_enc', 'scont_1_enc', 'relationship_enc', 'education_enc', 'hours-per-week_enc']\n" + ] + } + ], + "source": [ + "model = forward_selection.get_model_from_step(5) # Python starts to count from 0!\n", + "print(model.predictors)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'age': 0.5172855940047185,\n", + " 'occupation': 0.5634536788555236,\n", + " 'scont_1': 0.012280802872168558,\n", + " 'relationship': 0.7363851709976446,\n", + " 'education': 0.5989073013361791,\n", + " 'hours-per-week': 0.44453822509112334}" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "importance_by_variable = model.compute_variable_importance(basetable)\n", + "importance_by_variable" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_variable_importance(importance_by_variable)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "from cobra.evaluation.model_evaluator import Evaluator" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "y_true = basetable[basetable[\"split\"] == \"selection\"][\"TARGET\"].values\n", + "y_pred = model.score_model(basetable[basetable[\"split\"] == \"selection\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 32, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "evaluator = Evaluator(y_true, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best cutoff value for probability is: 0.28123688299785216\n" + ] + } + ], + "source": [ + "evaluator.plotROCCurve()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Precision: 0.712\n", + "Accuracy: 0.836\n", + "Recall: 0.526\n", + "F1 Score: 0.605\n", + "Lift at top 10.0%: 3.32\n", + "AUC: 0.885\n" + ] + } + ], + "source": [ + "evaluator.plotConfusionMatrix()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "evaluator.plotCumulativeGains()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "evaluator.plotCumulativeResponse()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "evaluator.plotLift()" + ] } ], "metadata": { From d1a67bbadc5060d41af9032d0ffdd19cf9cbedd7 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Thu, 26 Mar 2020 09:25:22 +0100 Subject: [PATCH 76/98] Update README --- README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1c7bcc6..53ce70a 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,13 @@ As this package is an internal package that is not open-sourced, it is not avail ### Usage -This section contains detailed examples for each step. We assume the data for model building is available in a pandas DataFrame called `basetable`. +This section contains detailed examples for each step on how to use COBRA for building a predictive model. All classes and functions contain detailed documentation, so in case you want more information on a class or function, simply run the following python snippet: + +```python +help(function_or_class_you_want_info_from) +``` + +In the examples below, we assume the data for model building is available in a pandas DataFrame called `basetable`. ```python from cobra.preprocessing import PreProcessor @@ -158,4 +164,4 @@ variable_importance = model.compute_variable_importance(basetable) ## Development -We'd love you to contribute to the development of Cobra! To do so, clone the repo and create a _feature branch_ to do your development. Once your are finished, you can create a _pull request_ to merge it back into the main branch. Make sure to write or modify unit test for your changes if they are related to preprocessing! +We'd love you to contribute to the development of Cobra! To do so, clone the repo and create a _feature branch_ to do your development. Once your are finished, you can create a _pull request_ to merge it back into the main branch. Make sure to follow the _PEP 8_ styleguide if you make any changes to COBRA. 
You should also write or modify unit test for your changes if they are related to preprocessing! From 21cc7a735517bd549bb22fae276edab18bc73db7 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 30 Mar 2020 14:57:08 +0200 Subject: [PATCH 77/98] Change return type of Models.compute_variable_importance --- cobra/evaluation/predictor_quality.py | 15 +++++---------- cobra/model_building/models.py | 16 +++++++++++----- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/cobra/evaluation/predictor_quality.py b/cobra/evaluation/predictor_quality.py index 327d592..3fac91f 100644 --- a/cobra/evaluation/predictor_quality.py +++ b/cobra/evaluation/predictor_quality.py @@ -5,30 +5,25 @@ import seaborn as sns -def plot_variable_importance(importance_by_variable: dict, +def plot_variable_importance(df_variable_importance: pd.DataFrame, title: str=None, dim: tuple=(12, 8)): """Plot variable importance of a given model Parameters ---------- - importance_by_variable : dict - Map of predictor -> importance + df_variable_importance : pd.DataFrame + DataFrame containing columns predictor and importance title : str, optional Title of the plot dim : tuple, optional tuple with width and lentgh of the plot """ - df = pd.DataFrame.from_dict(importance_by_variable, - orient='index').reset_index() - - df.columns = ["predictor", "importance"] - - df = df.sort_values(by="importance", ascending=False) # plot data fig, ax = plt.subplots(figsize=dim) - ax = sns.barplot(x="importance", y="predictor", data=df) + ax = sns.barplot(x="importance", y="predictor", + data=df_variable_importance) if title: ax.set_title(title) else: diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 53f8c3c..597117a 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -124,9 +124,9 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, return self._eval_metrics_by_split[split] - def compute_variable_importance(self, data: pd.DataFrame) -> dict: + 
def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: """Compute the importance of each predictor in the model and return - it as a dictionary + it as a DataFrame Parameters ---------- @@ -135,16 +135,22 @@ def compute_variable_importance(self, data: pd.DataFrame) -> dict: Returns ------- - dict - Map of predictor -> importance + pd.DataFrame + DataFrame containing columns predictor and importance """ y_pred = self.score_model(data) - return { + importance_by_variable = { utils.clean_predictor_name(predictor): stats.pearsonr( data[predictor], y_pred )[0] for predictor in self.predictors } + + df = pd.DataFrame.from_dict(importance_by_variable, + orient='index').reset_index() + df.columns = ["predictor", "importance"] + + return df.sort_values(by="importance", ascending=False) From 0d4b1571e4837b061542795043cf2861786490ca Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 30 Mar 2020 15:07:57 +0200 Subject: [PATCH 78/98] Add Evaluator class --- cobra/evaluation/__init__.py | 17 +- cobra/evaluation/evaluator.py | 456 ++++++++++++++++++++++++++++++++++ 2 files changed, 466 insertions(+), 7 deletions(-) create mode 100644 cobra/evaluation/evaluator.py diff --git a/cobra/evaluation/__init__.py b/cobra/evaluation/__init__.py index 5a3f437..b7e5f8e 100644 --- a/cobra/evaluation/__init__.py +++ b/cobra/evaluation/__init__.py @@ -7,11 +7,14 @@ from .predictor_quality import plot_variable_importance from .predictor_quality import plot_predictor_quality from .predictor_quality import plot_correlation_matrix +from .evaluator import Evaluator -__all__ = ['generate_pig_tables', - 'compute_pig_table', - 'plot_pig_graph', - 'plot_performance_curves', - 'plot_variable_importance', - 'plot_predictor_quality', - 'plot_correlation_matrix'] + +__all__ = ["generate_pig_tables", + "compute_pig_table", + "plot_pig_graph", + "plot_performance_curves", + "plot_variable_importance", + "plot_predictor_quality", + "plot_correlation_matrix", + "Evaluator"] diff --git 
a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py new file mode 100644 index 0000000..6583670 --- /dev/null +++ b/cobra/evaluation/evaluator.py @@ -0,0 +1,456 @@ +import numpy as np +import pandas as pd + +import matplotlib.pyplot as plt +import seaborn as sns + +from sklearn.metrics import precision_score +from sklearn.metrics import recall_score +from sklearn.metrics import f1_score +from sklearn.metrics import accuracy_score +from sklearn.metrics import roc_curve +from sklearn.metrics import confusion_matrix +from sklearn.metrics import roc_auc_score +from sklearn.exceptions import NotFittedError + + +class Evaluator(): + + """Summary + + Attributes + ---------- + confusion_matrix : np.ndarray + Confusion matrix computed for a particular cut-off + cumulative_gains : tuple + data for plotting cumulative gains curve + evaluation_metrics : dict + map containing various scalar evaluation metics such as AUC, ... + lift_at : float + parameter to determine at which top level percentage the lift of the + model should be computed + lift_curve : tuple + data for plotting lift curve(s) + probability_cutoff : float + probability cut off to convert probability scores to a binary score + roc_curve : dict + map containing true-positive-rate, false-positve-rate at various + thresholds (also incl.) + """ + + def __init__(self, probability_cutoff: float=None, + lift_at: float=0.05): + + self.lift_at = lift_at + self.probability_cutoff = probability_cutoff + + # Placeholder to store fitted output + self.evaluation_metrics = None + self.roc_curve = None + self.confusion_matrix = None + self.lift_curve = None + self.cumulative_gains = None + + def fit(self, y_true: np.ndarray, y_pred: np.ndarray): + + fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_pred) + + # if probability_cutoff is not set, take the optimal cut off + if not self.probability_cutoff: + self.probability_cutoff = (Evaluator. 
+ _compute_optimal_cutoff(fpr, tpr, + thresholds)) + + # Transform probabilities to binary array using cut off: + y_pred_b = np.array([0 if pred <= self.probability_cutoff else 1 + for pred in y_pred]) + + # Compute the various evaluation metrics + self.evaluation_metrics = Evaluator.compute_evaluation_metrics( + y_true, + y_pred, + y_pred_b, + self.lift_at + ) + + self.roc_curve = {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} + self.confusion_matrix = confusion_matrix(y_true, y_pred_b) + self.lift_curve = Evaluator._compute_lift_per_decile(y_true, y_pred) + self.cumulative_gains = Evaluator._compute_cumulative_gains(y_true, + y_pred) + + def get_metrics(self) -> pd.Series: + """Get the evaluation_metrics attribute as a pandas Series + + Returns + ------- + pd.Series + Score of various scalar evaluation metrics for the model + """ + return pd.Series(self.evaluation_metrics) + + @staticmethod + def compute_evaluation_metrics(y_true: np.ndarray, + y_pred: np.ndarray, + y_pred_b: np.ndarray, + lift_at: float) -> dict: + """Convenient function to compute various performance measures and + return them in a dict + + Parameters + ---------- + y_true : np.ndarray + True binary target data labels + y_pred : np.ndarray + Target scores of the model + y_pred_b : np.ndarray + Predicted target data labels (binary) + lift_at : float + At what top level percentage the lift should be computed + + Returns + ------- + dict + contains various performance measures of the model + """ + return { + "accuracy": accuracy_score(y_true, y_pred_b), + "AUC": roc_auc_score(y_true, y_pred), + "precision": precision_score(y_true, y_pred_b), + "recall": recall_score(y_true, y_pred_b), + "F1": f1_score(y_true, y_pred_b, average=None)[1], + f"lift at {lift_at}": np.round(Evaluator + ._compute_lift(y_true=y_true, + y_pred=y_pred, + lift_at=lift_at), 2) + } + + def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): + """Plot ROC curves of the model + + Parameters + ---------- + path : str, 
optional + path to store the figure + dim : tuple, optional + tuple with width and lentgh of the plot + """ + + if self.roc_curve is None: + msg = ("This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + auc = self.evaluation_metrics["AUC"] + + fig, ax = plt.subplots(figsize=dim) + + ax.plot(self.roc_curve["fpr"], + self.roc_curve["tpr"], + color="darkorange", lw=2, + label="ROC curve (area = {s:.3})".format(s=auc)) + + ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--") + ax.set_xlabel("False Positive Rate", fontsize=15) + ax.set_ylabel("True Positive Rate", fontsize=15) + ax.legend(loc="lower right") + ax.set_title("ROC Curve", fontsize=20) + + if path: + plt.savefig(path, format="png", dpi=300, bbox_inches="tight") + + plt.show() + + def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), + labels: list=["0", "1"]): + """Plot the confusion matrix + + Parameters + ---------- + path : str, optional + path to store the figure + dim : tuple, optional + tuple with width and lentgh of the plot + labels : list, optional + Optional list of labels, default "0" and "1" + """ + + if self.confusion_matrix is None: + msg = ("This {} instance is not fitted yet. 
Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + fig, ax = plt.subplots(figsize=dim) + ax = sns.heatmap(self.confusion_matrix, + annot=self.confusion_matrix.astype(str), + fmt="s", cmap="Reds", + xticklabels=labels, yticklabels=labels) + ax.set_title("Confusion matrix", fontsize=20) + + if path: + plt.savefig(path, format="png", dpi=300, bbox_inches="tight") + + plt.show() + + def plot_cumulative_response_curve(self, path: str=None, + dim: tuple=(12, 8)): + """Plot cumulative response curve + + Parameters + ---------- + path : str, optional + path to store the figure + dim : tuple, optional + tuple with width and lentgh of the plot + """ + + if self.lift_curve is None: + msg = ("This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + x_labels, lifts, inc_rate = self.lift_curve + + lifts = np.array(lifts)*inc_rate*100 + + fig, ax = plt.subplots(figsize=dim) + plt.style.use("default") + + plt.bar(x_labels[::-1], lifts, align="center", color="cornflowerblue") + plt.ylabel("response (%)", fontsize=16) + plt.xlabel("decile", fontsize=16) + ax.set_xticks(x_labels) + ax.set_xticklabels(x_labels) + + plt.axhline(y=inc_rate*100, color="darkorange", linestyle="--", + xmin=0.05, xmax=0.95, linewidth=3, label="Incidence") + + #Legend + ax.legend(loc="upper right") + + ##Set Axis - make them pretty + sns.despine(ax=ax, right=True, left=True) + + #Remove white lines from the second axis + ax.grid(False) + + ##Description + ax.set_title("Cumulative response", fontsize=20) + + if path is not None: + plt.savefig(path, format="png", dpi=300, bbox_inches="tight") + + plt.show() + + def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): + """Plot lift per decile + + Parameters + ---------- + path : str, optional + path to store the figure + dim : tuple, optional + tuple with 
width and lentgh of the plot + """ + + if self.lift_curve is None: + msg = ("This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + x_labels, lifts, _ = self.lift_curve + + fig, ax = plt.subplots(figsize=dim) + plt.style.use("default") + + plt.bar(x_labels[::-1], lifts, align="center", color="cornflowerblue") + plt.ylabel("lift", fontsize=16) + plt.xlabel("decile", fontsize=16) + ax.set_xticks(x_labels) + ax.set_xticklabels(x_labels) + + plt.axhline(y=1, color="darkorange", linestyle="--", + xmin=0.05, xmax=0.95, linewidth=3, label="Baseline") + + #Legend + ax.legend(loc="upper right") + + ##Set Axis - make them pretty + sns.despine(ax=ax, right=True, left=True) + + #Remove white lines from the second axis + ax.grid(False) + + ##Description + ax.set_title("Cumulative Lift", fontsize=20) + + if path is not None: + plt.savefig(path, format="png", dpi=300, bbox_inches="tight") + + plt.show() + + def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): + """Plot lift per decile + + Parameters + ---------- + path : str, optional + path to store the figure + dim : tuple, optional + tuple with width and lentgh of the plot + """ + pass + + @staticmethod + def find_optimal_cutoff(y_true: np.ndarray, + y_pred: np.ndarray) -> float: + """Find the optimal probability cut off point for a + classification model. 
Wrapper around _compute_optimal_cutoff + + Parameters + ---------- + y_true : np.ndarray + True binary target data labels + y_pred : np.ndarray + Target scores of the model + + Returns + ------- + float + Optimal cut off probability for the model + """ + return Evaluator._compute_optimal_cutoff(roc_curve(y_true=y_true, + y_score=y_pred)) + + @staticmethod + def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, + thresholds: np.ndarray) -> float: + """Find the optimal probability cut off point for a + classification model + + Parameters + ---------- + fpr : np.ndarray + false positive rate for various thresholds + tpr : np.ndarray + true positive rate for various thresholds + thresholds : np.ndarray + list of thresholds for which fpr and tpr were computed + + Returns + ------- + float + Description + """ + + # The optimal cut off would be where tpr is high and fpr is low, hence + # tpr - (1-fpr) should be zero or close to zero for the optimal cut off + temp = np.absolute(tpr - (1-fpr)) + + # index for optimal value is the one for which temp is minimal + optimal_index = np.where(temp == min(temp))[0] + + return thresholds[optimal_index][0] + + @staticmethod + def _compute_cumulative_gains(y_true: np.ndarray, + y_pred: np.ndarray) -> tuple: + """Compute lift of the model per decile, returns x-labels, lifts and + the target incidence to create cummulative response curves + + Parameters + ---------- + y_true : np.ndarray + True binary target data labels + y_pred : np.ndarray + Target scores of the model + + Returns + ------- + tuple + x-labels, lifts per decile and target incidence + """ + pass + + @staticmethod + def _compute_lift_per_decile(y_true: np.ndarray, + y_pred: np.ndarray) -> tuple: + """Compute lift of the model per decile, returns x-labels, lifts and + the target incidence to create cummulative response curves + + Parameters + ---------- + y_true : np.ndarray + True binary target data labels + y_pred : np.ndarray + Target scores of the model + + 
Returns + ------- + tuple + x-labels, lifts per decile and target incidence + """ + + lifts = [Evaluator._compute_lift(y_true=y_true, + y_pred=y_pred, + lift_at=perc_lift) + for perc_lift in np.arange(0.1, 1.1, 0.1)] + + x_labels = [len(lifts)-x for x in np.arange(0, len(lifts), 1)] + + return x_labels, lifts, y_true.mean() + + @staticmethod + def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, + lift_at: float=0.05) -> float: + """Calculates lift given two arrays on specified level + %timeit + 50.3 µs ± 1.94 µs per loop (mean ± std. dev. of 7 runs, + 10000 loops each) + + Parameters + ---------- + y_true : np.ndarray + True binary target data labels + y_pred : np.ndarray + Target scores of the model + lift_at : float, optional + At what top level percentage the lift should be computed + + Returns + ------- + float + lift of the model + """ + + #Make sure it is numpy array + y_true_ = np.array(y_true) + y_pred_ = np.array(y_pred) + + #Make sure it has correct shape + y_true_ = y_true_.reshape(len(y_true_), 1) + y_pred_ = y_pred_.reshape(len(y_pred_), 1) + + #Merge data together + y_data = np.hstack([y_true_, y_pred_]) + + #Calculate necessary variables + nrows = len(y_data) + stop = int(np.floor(nrows*lift_at)) + avg_incidence = np.einsum("ij->j", y_true_)/float(len(y_true_)) + + #Sort and filter data + data_sorted = (y_data[y_data[:, 1].argsort()[::-1]][:stop, 0] + .reshape(stop, 1)) + + #Calculate lift (einsum is very fast way of summing, + # needs specific shape) + inc_in_top_n = np.einsum("ij->j", data_sorted)/float(len(data_sorted)) + + lift = np.round(inc_in_top_n/avg_incidence, 2)[0] + + return lift From ec1903a60d15ea2e68d0e3ac4415dfcdb7282eb8 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 30 Mar 2020 15:38:22 +0200 Subject: [PATCH 79/98] Add cumulative gains metric to Evaluator --- cobra/evaluation/evaluator.py | 50 +++- cobra/evaluation/model_evaluator.py | 354 ---------------------------- 2 files changed, 48 insertions(+), 356 
deletions(-) delete mode 100644 cobra/evaluation/model_evaluator.py diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 6583670..69fa997 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -303,7 +303,33 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): dim : tuple, optional tuple with width and lentgh of the plot """ - pass + plt.style.use('seaborn-darkgrid') + + fig, ax = plt.subplots(figsize=dim) + + ax.plot(self.cumulative_gains[0]*100, self.cumulative_gains[1]*100, + color='blue', linewidth=3, + label='cumulative gains') + ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3, + ls="--", color="darkorange", label='random selection') + + ax.set_title('Cumulative Gains', fontsize=20) + + #Format axes + ax.set_xlim([0, 100]) + ax.set_ylim([0, 100]) + #Format ticks + ax.set_yticklabels(['{:3.0f}%'.format(x) + for x in ax.get_yticks()]) + ax.set_xticklabels(['{:3.0f}%'.format(x) + for x in ax.get_xticks()]) + #Legend + ax.legend(loc='lower right') + + if path is not None: + plt.savefig(path, format='png', dpi=300, bbox_inches='tight') + + plt.show() @staticmethod def find_optimal_cutoff(y_true: np.ndarray, @@ -362,6 +388,10 @@ def _compute_cumulative_gains(y_true: np.ndarray, """Compute lift of the model per decile, returns x-labels, lifts and the target incidence to create cummulative response curves + Code from (https://github.com/reiinakano/scikit-plot/blob/ + 2dd3e6a76df77edcbd724c4db25575f70abb57cb/ + scikitplot/helpers.py#L157) + Parameters ---------- y_true : np.ndarray @@ -374,7 +404,23 @@ def _compute_cumulative_gains(y_true: np.ndarray, tuple x-labels, lifts per decile and target incidence """ - pass + + # make y_true a boolean vector + y_true = (y_true == 1) + + sorted_indices = np.argsort(y_pred)[::-1] + y_true = y_true[sorted_indices] + gains = np.cumsum(y_true) + + percentages = np.arange(start=1, stop=len(y_true) + 1) + + gains = gains / float(np.sum(y_true)) + percentages = 
percentages / float(len(y_true)) + + gains = np.insert(gains, 0, [0]) + percentages = np.insert(percentages, 0, [0]) + + return percentages, gains @staticmethod def _compute_lift_per_decile(y_true: np.ndarray, diff --git a/cobra/evaluation/model_evaluator.py b/cobra/evaluation/model_evaluator.py deleted file mode 100644 index f2d1093..0000000 --- a/cobra/evaluation/model_evaluator.py +++ /dev/null @@ -1,354 +0,0 @@ -""" -Created on Fri Apr 12 09:36:37 2019 -@author: AP_JBENEK -""" -import numpy as np -import pandas as pd -import seaborn as sns -import matplotlib.pyplot as plt - -from sklearn.metrics import precision_score -from sklearn.metrics import recall_score -from sklearn.metrics import f1_score -from sklearn.metrics import accuracy_score -from sklearn.metrics import roc_curve -from sklearn.metrics import confusion_matrix -from sklearn.metrics import roc_auc_score - - -class Evaluator(): - - def __init__(self, y_true, y_pred_p, threshold=0.5, lift_at=0.1): - self.y_true = y_true - self.y_pred_p = y_pred_p # As probability - self.lift_at = lift_at - self.threshold = threshold - - #Convert to bool - self.y_pred_b = np.array([0 if pred <= self.threshold else 1 - for pred in self.y_pred_p]) - - def plotROCCurve(self, save_pth=None, desc=None): - ''' - Plot ROC curve and print best cutoff value - - Parameters - ---------- - y_true: True values of target y - proba: Predicted values of target y, probabilities - save: whether plot should be saved (if yes, then now shown) - desc: description of the plot, used also as a name of saved plot - ''' - if desc is None: - desc = '' - - fpr, tpr, thresholds = roc_curve(self.y_true, self.y_pred_p) - - #--------------------------- - #Calculate AUC - #-------------------------- - out_perfo = self.evaluation() - score = out_perfo['AUC'] - - fig, ax = plt.subplots(figsize=(8, 5)) - ax.plot(fpr, tpr, color='darkorange', lw=2, - label='ROC curve (area = {s:.3})'.format(s=score)) - ax.plot([0, 1], [0, 1], color='navy', lw=2, 
linestyle='--') - ax.set_xlabel('False Positive Rate', fontsize=15) - ax.set_ylabel('True Positive Rate', fontsize=15) - ax.legend(loc="lower right") - ax.set_title('ROC Curve {}' .format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - #Best cutoff value - #i want value where FPR is highest and FPR is lowest - #https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python - i = np.arange(len(tpr)) - roc = pd.DataFrame({'tf': pd.Series(tpr-(1-fpr), index=i), - 'threshold': pd.Series(thresholds, index=i)}) - roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]] - - best_cutoff = list(roc_t['threshold']) - print(f'Best cutoff value for probability is: {best_cutoff[0]}') - - def plotConfusionMatrix(self, labels=None, color='Reds', save_pth=None, desc=None): - ''' - Plot Confusion matrix with performance measures - - Parameters - ---------- - y_test: True values of target y - pred: Predicted values of target y, boolean - labels: labels for the matrix, if empty, values from y_test_ are used - color: Color of the matrix, its a cmap, so many values possible - save: whether plot should be saved (if yes, then now shown) - desc: description of the plot, used also as a name of saved plot - ''' - if labels is None: - labels = [str(lab) for lab in np.unique(self.y_true)] - - if desc is None: - desc = '' - - cm = confusion_matrix(self.y_true, self.y_pred_b) - - fig, ax = plt.subplots(figsize=(8,5)) - ax = sns.heatmap(cm, annot=cm.astype(str), fmt="s", cmap=color, - xticklabels=labels, yticklabels=labels) - ax.set_title('Confusion matrix {}'.format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - out_perfo = self.evaluation() - - # If we mark customer as a churner, how often we are correct - print('Precision: {s:.3}'.format(s=out_perfo['precision'])) - # Overall performance - print('Accuracy: 
{s:.3}'.format(s=out_perfo['accuracy'])) - # How many churners can the model detect - print('Recall: {s:.3}'.format(s=out_perfo['recall'])) - # 2 * (precision * recall) / (precision + recall) - print('F1 Score: {s:.3}'.format(s=out_perfo['F1'])) - # 2 * (precision * recall) / (precision + recall) - print('Lift at top {l}%: {s:.3}' - .format(l=self.lift_at*100, s=out_perfo['lift'])) - # 2 * (precision * recall) / (precision + recall) - print('AUC: {s:.3}'.format(s=out_perfo['AUC'])) - - def plotCumulativeGains(self, save_pth=None, desc=None): - ''' - Functions plot cumulative gains - - Parameters - ---------- - save: whether plot should be saved (if yes, then now shown) - desc: description of the plot, used also as a name of saved plot - ''' - if desc is None: - desc = '' - - #--------------------------- - #Calculate cumulative gains - #-------------------------- - nrows = len(self.y_true) - npositives = self.y_true.sum() - df_y_pred = (pd.DataFrame({"y": self.y_true, "y_pred": self.y_pred_p}) - .sort_values(by='y_pred', ascending=False) - .reset_index(drop=True)) - cgains = [0] - for stop in (np.linspace(0.01, 1, 100) * nrows).astype(int): - cgains.append(round(df_y_pred.loc[:stop, 'y'].sum()/npositives*max(100, 1), 2)) - - #--------------------------- - #Plot it - #--------------------------- - plt.style.use('seaborn-darkgrid') - fig, ax_cgains = plt.subplots(figsize=(8, 5)) - ax_cgains.plot(cgains, color='blue', linewidth=3, - label='cumulative gains') - ax_cgains.plot(ax_cgains.get_xlim(), ax_cgains.get_ylim(), linewidth=3, - ls="--", color="darkorange", label='random selection') - ax_cgains.set_title('Cumulative Gains ' + desc, fontsize=20) - - ax_cgains.set_title('Cumulative Gains {}' .format(desc), fontsize=20) - #Format axes - ax_cgains.set_xlim([0, 100]) - ax_cgains.set_ylim([0, 100]) - #Format ticks - ax_cgains.set_yticklabels(['{:3.0f}%'.format(x) - for x in ax_cgains.get_yticks()]) - ax_cgains.set_xticklabels(['{:3.0f}%'.format(x) - for x in 
ax_cgains.get_xticks()]) - #Legend - ax_cgains.legend(loc='lower right') - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - def plotLift(self, desc=None, save_pth=None): - ''' - Method plots lift per decile - - Parameters - ---------- - save: whether plot should be saved (if yes, then now shown) - desc: description of the plot, used also as a name of saved plot - ''' - #--------------------- - #-- CALCULATE LIFT --- - #--------------------- -# inc_rate = self.y_true.mean() - lifts = [Evaluator.liftCalculator(y_true=self.y_true, - y_pred=self.y_pred_p, - lift_at=perc_lift) - for perc_lift in np.arange(0.1, 1.1, 0.1)] - - #--------------------- - #------- PLOT -------- - #--------------------- - if desc is None: - desc = '' - - fig, ax = plt.subplots(figsize=(8,5)) - plt.style.use('seaborn-darkgrid') - - nrows = len(lifts) - x_labels = [nrows-x for x in np.arange(0, nrows, 1)] - - plt.bar(x_labels[::-1], lifts, align='center', color="cornflowerblue") - plt.ylabel('lift', fontsize=15) - plt.xlabel('decile', fontsize=15) - ax.set_xticks(x_labels) - ax.set_xticklabels(x_labels) - - plt.axhline(y=1, color='darkorange', linestyle='--', - xmin=0.1, xmax=0.9, linewidth=3, label='Baseline') - - #Legend - ax.legend(loc='upper right') - - ##Set Axis - make them pretty - sns.despine(ax=ax, right=True, left=True) - - #Remove white lines from the second axis - ax.grid(False) - - ##Description - ax.set_title('Cumulative Lift {}'.format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - def plotCumulativeResponse(self, desc=None, save_pth=None): - #--------------------- - #-- CALCULATE LIFT --- - #--------------------- - inc_rate = self.y_true.mean() - lifts = [Evaluator.liftCalculator(y_true=self.y_true, - y_pred=self.y_pred_p, - lift_at=perc_lift) - for perc_lift in np.arange(0.1, 1.1, 0.1)] - lifts = np.array(lifts)*inc_rate*100 - 
#--------------------- - #------- PLOT -------- - #--------------------- - if desc is None: - desc = '' - - fig, ax = plt.subplots(figsize=(8, 5)) - #plt.style.use('seaborn-darkgrid') - plt.style.use('default') - - nrows = len(lifts) - x_labels = [nrows-x for x in np.arange(0, nrows, 1)] - - plt.bar(x_labels[::-1], lifts, align='center', color="#00ccff") - plt.ylabel('response (%)', fontsize=16) - plt.xlabel('decile', fontsize=16) - ax.set_xticks(x_labels) - ax.set_xticklabels(x_labels) - - plt.axhline(y=inc_rate*100, color='#ff9500', linestyle='--', - xmin=0.05, xmax=0.95, linewidth=3, label='Incidence') - - #Legend - ax.legend(loc='upper right') - - ##Set Axis - make them pretty - sns.despine(ax=ax, right=True, left=True) - - #Remove white lines from the second axis - ax.grid(False) - - ##Description - ax.set_title('Cumulative response {}'.format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - def evaluation(self): - ''' - Convenient function, returns various performance measures in a dict - - Parameters - ---------- - y_true: true values - y_pred: predictions as booleans - - Output - ------ - Returns dictionary with the measures - ''' - - dict_perfo = {'precision': precision_score(self.y_true, self.y_pred_b), - 'accuracy': accuracy_score(self.y_true, self.y_pred_b), - 'recall': recall_score(self.y_true, self.y_pred_b), - 'F1': f1_score(self.y_true, self.y_pred_b, - average=None)[1], - 'lift': np.round(Evaluator - .liftCalculator(y_true=self.y_true, - y_pred=self.y_pred_p, - lift_at=self.lift_at), - 2), - 'AUC': roc_auc_score(self.y_true, self.y_pred_p) - } - return dict_perfo - - @staticmethod - def liftCalculator(y_true, y_pred, lift_at=0.05, **kwargs): - ''' - Calculates lift given two arrays on specified level - - Parameters - ---------- - y_true: numpy array with true values - y_pred: numpy array with predictions (probabilities) - lift_at: lift at what top percentage - - 
Output - ------ - Scalar value, lift. - - 50.3 µs ± 1.94 µs per loop (mean ± std. dev. of 7 runs, - 10000 loops each) - ''' - #Make sure it is numpy array - y_true_ = np.array(y_true) - y_pred_ = np.array(y_pred) - - #Make sure it has correct shape - y_true_ = y_true_.reshape(len(y_true_), 1) - y_pred_ = y_pred_.reshape(len(y_pred_), 1) - - #Merge data together - y_data = np.hstack([y_true_, y_pred_]) - - #Calculate necessary variables - nrows = len(y_data) - stop = int(np.floor(nrows*lift_at)) - avg_incidence = np.einsum('ij->j', y_true_)/float(len(y_true_)) - - #Sort and filter data - data_sorted = (y_data[y_data[:, 1].argsort()[::-1]][:stop, 0] - .reshape(stop, 1)) - - #Calculate lift (einsum is very fast way of summing, - # needs specific shape) - inc_in_top_n = np.einsum('ij->j', data_sorted)/float(len(data_sorted)) - - lift = np.round(inc_in_top_n/avg_incidence, 2)[0] - - return lift From 237ed45ee7c1a918fe020af2e67d7c0f81e1e071 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 30 Mar 2020 15:42:35 +0200 Subject: [PATCH 80/98] Cleanup of Evaluator --- cobra/evaluation/evaluator.py | 1006 +++++++++++++++++---------------- 1 file changed, 504 insertions(+), 502 deletions(-) diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 69fa997..de09f77 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -1,502 +1,504 @@ -import numpy as np -import pandas as pd - -import matplotlib.pyplot as plt -import seaborn as sns - -from sklearn.metrics import precision_score -from sklearn.metrics import recall_score -from sklearn.metrics import f1_score -from sklearn.metrics import accuracy_score -from sklearn.metrics import roc_curve -from sklearn.metrics import confusion_matrix -from sklearn.metrics import roc_auc_score -from sklearn.exceptions import NotFittedError - - -class Evaluator(): - - """Summary - - Attributes - ---------- - confusion_matrix : np.ndarray - Confusion matrix computed for a particular cut-off - 
cumulative_gains : tuple - data for plotting cumulative gains curve - evaluation_metrics : dict - map containing various scalar evaluation metics such as AUC, ... - lift_at : float - parameter to determine at which top level percentage the lift of the - model should be computed - lift_curve : tuple - data for plotting lift curve(s) - probability_cutoff : float - probability cut off to convert probability scores to a binary score - roc_curve : dict - map containing true-positive-rate, false-positve-rate at various - thresholds (also incl.) - """ - - def __init__(self, probability_cutoff: float=None, - lift_at: float=0.05): - - self.lift_at = lift_at - self.probability_cutoff = probability_cutoff - - # Placeholder to store fitted output - self.evaluation_metrics = None - self.roc_curve = None - self.confusion_matrix = None - self.lift_curve = None - self.cumulative_gains = None - - def fit(self, y_true: np.ndarray, y_pred: np.ndarray): - - fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_pred) - - # if probability_cutoff is not set, take the optimal cut off - if not self.probability_cutoff: - self.probability_cutoff = (Evaluator. 
- _compute_optimal_cutoff(fpr, tpr, - thresholds)) - - # Transform probabilities to binary array using cut off: - y_pred_b = np.array([0 if pred <= self.probability_cutoff else 1 - for pred in y_pred]) - - # Compute the various evaluation metrics - self.evaluation_metrics = Evaluator.compute_evaluation_metrics( - y_true, - y_pred, - y_pred_b, - self.lift_at - ) - - self.roc_curve = {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} - self.confusion_matrix = confusion_matrix(y_true, y_pred_b) - self.lift_curve = Evaluator._compute_lift_per_decile(y_true, y_pred) - self.cumulative_gains = Evaluator._compute_cumulative_gains(y_true, - y_pred) - - def get_metrics(self) -> pd.Series: - """Get the evaluation_metrics attribute as a pandas Series - - Returns - ------- - pd.Series - Score of various scalar evaluation metrics for the model - """ - return pd.Series(self.evaluation_metrics) - - @staticmethod - def compute_evaluation_metrics(y_true: np.ndarray, - y_pred: np.ndarray, - y_pred_b: np.ndarray, - lift_at: float) -> dict: - """Convenient function to compute various performance measures and - return them in a dict - - Parameters - ---------- - y_true : np.ndarray - True binary target data labels - y_pred : np.ndarray - Target scores of the model - y_pred_b : np.ndarray - Predicted target data labels (binary) - lift_at : float - At what top level percentage the lift should be computed - - Returns - ------- - dict - contains various performance measures of the model - """ - return { - "accuracy": accuracy_score(y_true, y_pred_b), - "AUC": roc_auc_score(y_true, y_pred), - "precision": precision_score(y_true, y_pred_b), - "recall": recall_score(y_true, y_pred_b), - "F1": f1_score(y_true, y_pred_b, average=None)[1], - f"lift at {lift_at}": np.round(Evaluator - ._compute_lift(y_true=y_true, - y_pred=y_pred, - lift_at=lift_at), 2) - } - - def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): - """Plot ROC curves of the model - - Parameters - ---------- - path : str, 
optional - path to store the figure - dim : tuple, optional - tuple with width and lentgh of the plot - """ - - if self.roc_curve is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - - raise NotFittedError(msg.format(self.__class__.__name__)) - - auc = self.evaluation_metrics["AUC"] - - fig, ax = plt.subplots(figsize=dim) - - ax.plot(self.roc_curve["fpr"], - self.roc_curve["tpr"], - color="darkorange", lw=2, - label="ROC curve (area = {s:.3})".format(s=auc)) - - ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--") - ax.set_xlabel("False Positive Rate", fontsize=15) - ax.set_ylabel("True Positive Rate", fontsize=15) - ax.legend(loc="lower right") - ax.set_title("ROC Curve", fontsize=20) - - if path: - plt.savefig(path, format="png", dpi=300, bbox_inches="tight") - - plt.show() - - def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), - labels: list=["0", "1"]): - """Plot the confusion matrix - - Parameters - ---------- - path : str, optional - path to store the figure - dim : tuple, optional - tuple with width and lentgh of the plot - labels : list, optional - Optional list of labels, default "0" and "1" - """ - - if self.confusion_matrix is None: - msg = ("This {} instance is not fitted yet. 
Call 'fit' with " - "appropriate arguments before using this method.") - - raise NotFittedError(msg.format(self.__class__.__name__)) - - fig, ax = plt.subplots(figsize=dim) - ax = sns.heatmap(self.confusion_matrix, - annot=self.confusion_matrix.astype(str), - fmt="s", cmap="Reds", - xticklabels=labels, yticklabels=labels) - ax.set_title("Confusion matrix", fontsize=20) - - if path: - plt.savefig(path, format="png", dpi=300, bbox_inches="tight") - - plt.show() - - def plot_cumulative_response_curve(self, path: str=None, - dim: tuple=(12, 8)): - """Plot cumulative response curve - - Parameters - ---------- - path : str, optional - path to store the figure - dim : tuple, optional - tuple with width and lentgh of the plot - """ - - if self.lift_curve is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - - raise NotFittedError(msg.format(self.__class__.__name__)) - - x_labels, lifts, inc_rate = self.lift_curve - - lifts = np.array(lifts)*inc_rate*100 - - fig, ax = plt.subplots(figsize=dim) - plt.style.use("default") - - plt.bar(x_labels[::-1], lifts, align="center", color="cornflowerblue") - plt.ylabel("response (%)", fontsize=16) - plt.xlabel("decile", fontsize=16) - ax.set_xticks(x_labels) - ax.set_xticklabels(x_labels) - - plt.axhline(y=inc_rate*100, color="darkorange", linestyle="--", - xmin=0.05, xmax=0.95, linewidth=3, label="Incidence") - - #Legend - ax.legend(loc="upper right") - - ##Set Axis - make them pretty - sns.despine(ax=ax, right=True, left=True) - - #Remove white lines from the second axis - ax.grid(False) - - ##Description - ax.set_title("Cumulative response", fontsize=20) - - if path is not None: - plt.savefig(path, format="png", dpi=300, bbox_inches="tight") - - plt.show() - - def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): - """Plot lift per decile - - Parameters - ---------- - path : str, optional - path to store the figure - dim : tuple, optional - tuple with 
width and lentgh of the plot - """ - - if self.lift_curve is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - - raise NotFittedError(msg.format(self.__class__.__name__)) - - x_labels, lifts, _ = self.lift_curve - - fig, ax = plt.subplots(figsize=dim) - plt.style.use("default") - - plt.bar(x_labels[::-1], lifts, align="center", color="cornflowerblue") - plt.ylabel("lift", fontsize=16) - plt.xlabel("decile", fontsize=16) - ax.set_xticks(x_labels) - ax.set_xticklabels(x_labels) - - plt.axhline(y=1, color="darkorange", linestyle="--", - xmin=0.05, xmax=0.95, linewidth=3, label="Baseline") - - #Legend - ax.legend(loc="upper right") - - ##Set Axis - make them pretty - sns.despine(ax=ax, right=True, left=True) - - #Remove white lines from the second axis - ax.grid(False) - - ##Description - ax.set_title("Cumulative Lift", fontsize=20) - - if path is not None: - plt.savefig(path, format="png", dpi=300, bbox_inches="tight") - - plt.show() - - def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): - """Plot lift per decile - - Parameters - ---------- - path : str, optional - path to store the figure - dim : tuple, optional - tuple with width and lentgh of the plot - """ - plt.style.use('seaborn-darkgrid') - - fig, ax = plt.subplots(figsize=dim) - - ax.plot(self.cumulative_gains[0]*100, self.cumulative_gains[1]*100, - color='blue', linewidth=3, - label='cumulative gains') - ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3, - ls="--", color="darkorange", label='random selection') - - ax.set_title('Cumulative Gains', fontsize=20) - - #Format axes - ax.set_xlim([0, 100]) - ax.set_ylim([0, 100]) - #Format ticks - ax.set_yticklabels(['{:3.0f}%'.format(x) - for x in ax.get_yticks()]) - ax.set_xticklabels(['{:3.0f}%'.format(x) - for x in ax.get_xticks()]) - #Legend - ax.legend(loc='lower right') - - if path is not None: - plt.savefig(path, format='png', dpi=300, bbox_inches='tight') - - 
plt.show() - - @staticmethod - def find_optimal_cutoff(y_true: np.ndarray, - y_pred: np.ndarray) -> float: - """Find the optimal probability cut off point for a - classification model. Wrapper around _compute_optimal_cutoff - - Parameters - ---------- - y_true : np.ndarray - True binary target data labels - y_pred : np.ndarray - Target scores of the model - - Returns - ------- - float - Optimal cut off probability for the model - """ - return Evaluator._compute_optimal_cutoff(roc_curve(y_true=y_true, - y_score=y_pred)) - - @staticmethod - def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, - thresholds: np.ndarray) -> float: - """Find the optimal probability cut off point for a - classification model - - Parameters - ---------- - fpr : np.ndarray - false positive rate for various thresholds - tpr : np.ndarray - true positive rate for various thresholds - thresholds : np.ndarray - list of thresholds for which fpr and tpr were computed - - Returns - ------- - float - Description - """ - - # The optimal cut off would be where tpr is high and fpr is low, hence - # tpr - (1-fpr) should be zero or close to zero for the optimal cut off - temp = np.absolute(tpr - (1-fpr)) - - # index for optimal value is the one for which temp is minimal - optimal_index = np.where(temp == min(temp))[0] - - return thresholds[optimal_index][0] - - @staticmethod - def _compute_cumulative_gains(y_true: np.ndarray, - y_pred: np.ndarray) -> tuple: - """Compute lift of the model per decile, returns x-labels, lifts and - the target incidence to create cummulative response curves - - Code from (https://github.com/reiinakano/scikit-plot/blob/ - 2dd3e6a76df77edcbd724c4db25575f70abb57cb/ - scikitplot/helpers.py#L157) - - Parameters - ---------- - y_true : np.ndarray - True binary target data labels - y_pred : np.ndarray - Target scores of the model - - Returns - ------- - tuple - x-labels, lifts per decile and target incidence - """ - - # make y_true a boolean vector - y_true = (y_true == 1) 
- - sorted_indices = np.argsort(y_pred)[::-1] - y_true = y_true[sorted_indices] - gains = np.cumsum(y_true) - - percentages = np.arange(start=1, stop=len(y_true) + 1) - - gains = gains / float(np.sum(y_true)) - percentages = percentages / float(len(y_true)) - - gains = np.insert(gains, 0, [0]) - percentages = np.insert(percentages, 0, [0]) - - return percentages, gains - - @staticmethod - def _compute_lift_per_decile(y_true: np.ndarray, - y_pred: np.ndarray) -> tuple: - """Compute lift of the model per decile, returns x-labels, lifts and - the target incidence to create cummulative response curves - - Parameters - ---------- - y_true : np.ndarray - True binary target data labels - y_pred : np.ndarray - Target scores of the model - - Returns - ------- - tuple - x-labels, lifts per decile and target incidence - """ - - lifts = [Evaluator._compute_lift(y_true=y_true, - y_pred=y_pred, - lift_at=perc_lift) - for perc_lift in np.arange(0.1, 1.1, 0.1)] - - x_labels = [len(lifts)-x for x in np.arange(0, len(lifts), 1)] - - return x_labels, lifts, y_true.mean() - - @staticmethod - def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, - lift_at: float=0.05) -> float: - """Calculates lift given two arrays on specified level - %timeit - 50.3 µs ± 1.94 µs per loop (mean ± std. dev. 
of 7 runs, - 10000 loops each) - - Parameters - ---------- - y_true : np.ndarray - True binary target data labels - y_pred : np.ndarray - Target scores of the model - lift_at : float, optional - At what top level percentage the lift should be computed - - Returns - ------- - float - lift of the model - """ - - #Make sure it is numpy array - y_true_ = np.array(y_true) - y_pred_ = np.array(y_pred) - - #Make sure it has correct shape - y_true_ = y_true_.reshape(len(y_true_), 1) - y_pred_ = y_pred_.reshape(len(y_pred_), 1) - - #Merge data together - y_data = np.hstack([y_true_, y_pred_]) - - #Calculate necessary variables - nrows = len(y_data) - stop = int(np.floor(nrows*lift_at)) - avg_incidence = np.einsum("ij->j", y_true_)/float(len(y_true_)) - - #Sort and filter data - data_sorted = (y_data[y_data[:, 1].argsort()[::-1]][:stop, 0] - .reshape(stop, 1)) - - #Calculate lift (einsum is very fast way of summing, - # needs specific shape) - inc_in_top_n = np.einsum("ij->j", data_sorted)/float(len(data_sorted)) - - lift = np.round(inc_in_top_n/avg_incidence, 2)[0] - - return lift +import numpy as np +import pandas as pd + +import matplotlib.pyplot as plt +import seaborn as sns + +from sklearn.metrics import precision_score +from sklearn.metrics import recall_score +from sklearn.metrics import f1_score +from sklearn.metrics import accuracy_score +from sklearn.metrics import roc_curve +from sklearn.metrics import confusion_matrix +from sklearn.metrics import roc_auc_score +from sklearn.exceptions import NotFittedError + + +class Evaluator(): + + """Summary + + Attributes + ---------- + confusion_matrix : np.ndarray + Confusion matrix computed for a particular cut-off + cumulative_gains : tuple + data for plotting cumulative gains curve + evaluation_metrics : dict + map containing various scalar evaluation metics such as AUC, ... 
+ lift_at : float + parameter to determine at which top level percentage the lift of the + model should be computed + lift_curve : tuple + data for plotting lift curve(s) + probability_cutoff : float + probability cut off to convert probability scores to a binary score + roc_curve : dict + map containing true-positive-rate, false-positve-rate at various + thresholds (also incl.) + """ + + def __init__(self, probability_cutoff: float=None, + lift_at: float=0.05): + + self.lift_at = lift_at + self.probability_cutoff = probability_cutoff + + # Placeholder to store fitted output + self._scalar_metrics = None + self.roc_curve = None + self.confusion_matrix = None + self.lift_curve = None + self.cumulative_gains = None + + def fit(self, y_true: np.ndarray, y_pred: np.ndarray): + + fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_pred) + + # if probability_cutoff is not set, take the optimal cut off + if not self.probability_cutoff: + self.probability_cutoff = (Evaluator. + _compute_optimal_cutoff(fpr, tpr, + thresholds)) + + # Transform probabilities to binary array using cut off: + y_pred_b = np.array([0 if pred <= self.probability_cutoff else 1 + for pred in y_pred]) + + # Compute the various evaluation metrics + self._scalar_metrics = Evaluator.compute_scalar_metrics( + y_true, + y_pred, + y_pred_b, + self.lift_at + ) + + self.roc_curve = {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} + self.confusion_matrix = confusion_matrix(y_true, y_pred_b) + self.lift_curve = Evaluator._compute_lift_per_decile(y_true, y_pred) + self.cumulative_gains = Evaluator._compute_cumulative_gains(y_true, + y_pred) + + def get_scalar_scalar_metrics(self) -> pd.Series: + """Get the evaluation_metrics attribute as a pandas Series + + Returns + ------- + pd.Series + Score of various scalar evaluation metrics for the model + """ + return pd.Series(self._scalar_metrics) + + @staticmethod + def compute_scalar_metrics(y_true: np.ndarray, + y_pred: np.ndarray, + y_pred_b: np.ndarray, + 
lift_at: float) -> dict: + """Convenient function to compute various scalar performance measures + and return them in a dict + + Parameters + ---------- + y_true : np.ndarray + True binary target data labels + y_pred : np.ndarray + Target scores of the model + y_pred_b : np.ndarray + Predicted target data labels (binary) + lift_at : float + At what top level percentage the lift should be computed + + Returns + ------- + dict + contains various performance measures of the model + """ + return { + "accuracy": accuracy_score(y_true, y_pred_b), + "AUC": roc_auc_score(y_true, y_pred), + "precision": precision_score(y_true, y_pred_b), + "recall": recall_score(y_true, y_pred_b), + "F1": f1_score(y_true, y_pred_b, average=None)[1], + f"lift at {lift_at}": np.round(Evaluator + ._compute_lift(y_true=y_true, + y_pred=y_pred, + lift_at=lift_at), 2) + } + + def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): + """Plot ROC curves of the model + + Parameters + ---------- + path : str, optional + path to store the figure + dim : tuple, optional + tuple with width and lentgh of the plot + """ + + if self.roc_curve is None: + msg = ("This {} instance is not fitted yet. 
Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + auc = self._scalar_metrics["AUC"] + + fig, ax = plt.subplots(figsize=dim) + + ax.plot(self.roc_curve["fpr"], + self.roc_curve["tpr"], + color="darkorange", lw=2, + label="ROC curve (area = {s:.3})".format(s=auc)) + + ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--") + ax.set_xlabel("False Positive Rate", fontsize=15) + ax.set_ylabel("True Positive Rate", fontsize=15) + ax.legend(loc="lower right") + ax.set_title("ROC Curve", fontsize=20) + + if path: + plt.savefig(path, format="png", dpi=300, bbox_inches="tight") + + plt.show() + + def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), + labels: list=["0", "1"]): + """Plot the confusion matrix + + Parameters + ---------- + path : str, optional + path to store the figure + dim : tuple, optional + tuple with width and lentgh of the plot + labels : list, optional + Optional list of labels, default "0" and "1" + """ + + if self.confusion_matrix is None: + msg = ("This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + fig, ax = plt.subplots(figsize=dim) + ax = sns.heatmap(self.confusion_matrix, + annot=self.confusion_matrix.astype(str), + fmt="s", cmap="Reds", + xticklabels=labels, yticklabels=labels) + ax.set_title("Confusion matrix", fontsize=20) + + if path: + plt.savefig(path, format="png", dpi=300, bbox_inches="tight") + + plt.show() + + def plot_cumulative_response_curve(self, path: str=None, + dim: tuple=(12, 8)): + """Plot cumulative response curve + + Parameters + ---------- + path : str, optional + path to store the figure + dim : tuple, optional + tuple with width and lentgh of the plot + """ + + if self.lift_curve is None: + msg = ("This {} instance is not fitted yet. 
Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + x_labels, lifts, inc_rate = self.lift_curve + + lifts = np.array(lifts)*inc_rate*100 + + with plt.style.context("seaborn-ticks"): + fig, ax = plt.subplots(figsize=dim) + + plt.bar(x_labels[::-1], lifts, align="center", + color="cornflowerblue") + plt.ylabel("response (%)", fontsize=16) + plt.xlabel("decile", fontsize=16) + ax.set_xticks(x_labels) + ax.set_xticklabels(x_labels) + + plt.axhline(y=inc_rate*100, color="darkorange", linestyle="--", + xmin=0.05, xmax=0.95, linewidth=3, label="Incidence") + + #Legend + ax.legend(loc="upper right") + + ##Set Axis - make them pretty + sns.despine(ax=ax, right=True, left=True) + + #Remove white lines from the second axis + ax.grid(False) + + ##Description + ax.set_title("Cumulative response", fontsize=20) + + if path is not None: + plt.savefig(path, format="png", dpi=300, bbox_inches="tight") + + plt.show() + + def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): + """Plot lift per decile + + Parameters + ---------- + path : str, optional + path to store the figure + dim : tuple, optional + tuple with width and lentgh of the plot + """ + + if self.lift_curve is None: + msg = ("This {} instance is not fitted yet. 
Call 'fit' with " + "appropriate arguments before using this method.") + + raise NotFittedError(msg.format(self.__class__.__name__)) + + x_labels, lifts, _ = self.lift_curve + + with plt.style.context("seaborn-ticks"): + fig, ax = plt.subplots(figsize=dim) + + plt.bar(x_labels[::-1], lifts, align="center", + color="cornflowerblue") + plt.ylabel("lift", fontsize=16) + plt.xlabel("decile", fontsize=16) + ax.set_xticks(x_labels) + ax.set_xticklabels(x_labels) + + plt.axhline(y=1, color="darkorange", linestyle="--", + xmin=0.05, xmax=0.95, linewidth=3, label="Baseline") + + #Legend + ax.legend(loc="upper right") + + ##Set Axis - make them pretty + sns.despine(ax=ax, right=True, left=True) + + #Remove white lines from the second axis + ax.grid(False) + + ##Description + ax.set_title("Cumulative Lift", fontsize=20) + + if path is not None: + plt.savefig(path, format="png", dpi=300, bbox_inches="tight") + + plt.show() + + def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): + """Plot lift per decile + + Parameters + ---------- + path : str, optional + path to store the figure + dim : tuple, optional + tuple with width and lentgh of the plot + """ + + with plt.style.context("seaborn-whitegrid"): + fig, ax = plt.subplots(figsize=dim) + + ax.plot(self.cumulative_gains[0]*100, self.cumulative_gains[1]*100, + color="blue", linewidth=3, + label="cumulative gains") + ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3, + ls="--", color="darkorange", label="random selection") + + ax.set_title("Cumulative Gains", fontsize=20) + + #Format axes + ax.set_xlim([0, 100]) + ax.set_ylim([0, 100]) + #Format ticks + ax.set_yticklabels(["{:3.0f}%".format(x) + for x in ax.get_yticks()]) + ax.set_xticklabels(["{:3.0f}%".format(x) + for x in ax.get_xticks()]) + #Legend + ax.legend(loc="lower right") + + if path is not None: + plt.savefig(path, format="png", dpi=300, bbox_inches="tight") + + plt.show() + + @staticmethod + def find_optimal_cutoff(y_true: np.ndarray, + y_pred: 
np.ndarray) -> float: + """Find the optimal probability cut off point for a + classification model. Wrapper around _compute_optimal_cutoff + + Parameters + ---------- + y_true : np.ndarray + True binary target data labels + y_pred : np.ndarray + Target scores of the model + + Returns + ------- + float + Optimal cut off probability for the model + """ + return Evaluator._compute_optimal_cutoff(roc_curve(y_true=y_true, + y_score=y_pred)) + + @staticmethod + def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, + thresholds: np.ndarray) -> float: + """Find the optimal probability cut off point for a + classification model + + Parameters + ---------- + fpr : np.ndarray + false positive rate for various thresholds + tpr : np.ndarray + true positive rate for various thresholds + thresholds : np.ndarray + list of thresholds for which fpr and tpr were computed + + Returns + ------- + float + Description + """ + + # The optimal cut off would be where tpr is high and fpr is low, hence + # tpr - (1-fpr) should be zero or close to zero for the optimal cut off + temp = np.absolute(tpr - (1-fpr)) + + # index for optimal value is the one for which temp is minimal + optimal_index = np.where(temp == min(temp))[0] + + return thresholds[optimal_index][0] + + @staticmethod + def _compute_cumulative_gains(y_true: np.ndarray, + y_pred: np.ndarray) -> tuple: + """Compute lift of the model per decile, returns x-labels, lifts and + the target incidence to create cummulative response curves + + Code from (https://github.com/reiinakano/scikit-plot/blob/ + 2dd3e6a76df77edcbd724c4db25575f70abb57cb/ + scikitplot/helpers.py#L157) + + Parameters + ---------- + y_true : np.ndarray + True binary target data labels + y_pred : np.ndarray + Target scores of the model + + Returns + ------- + tuple + x-labels, lifts per decile and target incidence + """ + + # make y_true a boolean vector + y_true = (y_true == 1) + + sorted_indices = np.argsort(y_pred)[::-1] + y_true = y_true[sorted_indices] + 
gains = np.cumsum(y_true) + + percentages = np.arange(start=1, stop=len(y_true) + 1) + + gains = gains / float(np.sum(y_true)) + percentages = percentages / float(len(y_true)) + + gains = np.insert(gains, 0, [0]) + percentages = np.insert(percentages, 0, [0]) + + return percentages, gains + + @staticmethod + def _compute_lift_per_decile(y_true: np.ndarray, + y_pred: np.ndarray) -> tuple: + """Compute lift of the model per decile, returns x-labels, lifts and + the target incidence to create cummulative response curves + + Parameters + ---------- + y_true : np.ndarray + True binary target data labels + y_pred : np.ndarray + Target scores of the model + + Returns + ------- + tuple + x-labels, lifts per decile and target incidence + """ + + lifts = [Evaluator._compute_lift(y_true=y_true, + y_pred=y_pred, + lift_at=perc_lift) + for perc_lift in np.arange(0.1, 1.1, 0.1)] + + x_labels = [len(lifts)-x for x in np.arange(0, len(lifts), 1)] + + return x_labels, lifts, y_true.mean() + + @staticmethod + def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, + lift_at: float=0.05) -> float: + """Calculates lift given two arrays on specified level + %timeit + 50.3 µs ± 1.94 µs per loop (mean ± std. dev. 
of 7 runs, + 10000 loops each) + + Parameters + ---------- + y_true : np.ndarray + True binary target data labels + y_pred : np.ndarray + Target scores of the model + lift_at : float, optional + At what top level percentage the lift should be computed + + Returns + ------- + float + lift of the model + """ + + #Make sure it is numpy array + y_true_ = np.array(y_true) + y_pred_ = np.array(y_pred) + + #Make sure it has correct shape + y_true_ = y_true_.reshape(len(y_true_), 1) + y_pred_ = y_pred_.reshape(len(y_pred_), 1) + + #Merge data together + y_data = np.hstack([y_true_, y_pred_]) + + #Calculate necessary variables + nrows = len(y_data) + stop = int(np.floor(nrows*lift_at)) + avg_incidence = np.einsum("ij->j", y_true_)/float(len(y_true_)) + + #Sort and filter data + data_sorted = (y_data[y_data[:, 1].argsort()[::-1]][:stop, 0] + .reshape(stop, 1)) + + #Calculate lift (einsum is very fast way of summing, + # needs specific shape) + inc_in_top_n = np.einsum("ij->j", data_sorted)/float(len(data_sorted)) + + lift = np.round(inc_in_top_n/avg_incidence, 2)[0] + + return lift From 63b584ad0b5258348b12e68ef7e361b30bc20648 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 30 Mar 2020 16:10:11 +0200 Subject: [PATCH 81/98] Add documentation to evaluation module --- cobra/evaluation/performance_curves.py | 10 ++++++++++ cobra/evaluation/pigs_tables.py | 9 +++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/cobra/evaluation/performance_curves.py b/cobra/evaluation/performance_curves.py index 1e0032d..0ff5aa8 100644 --- a/cobra/evaluation/performance_curves.py +++ b/cobra/evaluation/performance_curves.py @@ -7,7 +7,17 @@ def plot_performance_curves(model_performance: pd.DataFrame, dim: tuple=(12, 8)): + """Plot performance curves generated by the forward feature selection + for the train-selection-validation sets + Parameters + ---------- + model_performance : pd.DataFrame + contains train-selection-validation performance for each model trained + 
in the forward feature selection + dim : tuple, optional + tuple with width and lentgh of the plot + """ highest_auc = np.round(max(max(model_performance['train_performance']), max(model_performance['selection_performance']), max(model_performance['validation_performance']) diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py index 8afb4f7..c2e53a9 100644 --- a/cobra/evaluation/pigs_tables.py +++ b/cobra/evaluation/pigs_tables.py @@ -10,7 +10,9 @@ def generate_pig_tables(data: pd.DataFrame, id_column_name: str, target_column_name: str, preprocessed_predictors: list) -> pd.DataFrame: - """Summary + """Compute PIG tables for all predictors in preprocessed_predictors. The + output is a DataFrame with columns "variable", "label", "pop_size", + "avg_incidence" and "incidence" Parameters ---------- @@ -29,11 +31,6 @@ def generate_pig_tables(data: pd.DataFrame, DataFrame containing a PIG table for all predictors """ - # Based on the data, get column names by datatype - # threshold to decide whether a numeric column should be considered - # a categorical variable (if the number of distinct values is smaller - # or equal to the number of requested bins) - pigs = [compute_pig_table(data, column_name, target_column_name, id_column_name) for column_name in sorted(preprocessed_predictors) From 2c441f5361343d9f1c2b904e11ad278d7c0d0323 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 30 Mar 2020 16:31:34 +0200 Subject: [PATCH 82/98] Update README --- README.md | 70 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 53ce70a..b723c29 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ This section contains detailed examples for each step on how to use COBRA for bu help(function_or_class_you_want_info_from) ``` -In the examples below, we assume the data for model building is available in a pandas DataFrame called `basetable`. 
+In the examples below, we assume the data for model building is available in a pandas DataFrame called `basetable`. This DataFrame should contain an ID columns (e.g. customernumber), a target column (e.g. "TARGET") and a number of candidate predictors to build or model with. ```python from cobra.preprocessing import PreProcessor @@ -104,11 +104,23 @@ basetable = preprocessor.transform(basetable, ``` -Once the preprocessing pipeline is fitted and applied to your data, it is time for the actual modelling. In this part of the process, -we first start with the _univariate preselection_: +Once the preprocessing pipeline is fitted and applied to your data, we are ready to start modelling. However, we could already compute the PIG tables here for later use: + +```python +from cobra.evaluation import generate_pig_tables + +pig_tables = generate_pig_tables(basetable[basetable["split"] == "selection"], + id_column_name, + target_column_name + preprocessed_predictors) +``` + +Once these PIG tables are computed, we can start with the _univariate preselection_: ```python from cobra.model_building import univariate_selection +from cobra.evaluation import plot_predictor_quality +from cobra.evaluation import plot_correlation_matrix # Get list of predictor names to use for univariate_selection preprocessed_predictors = [col for col in basetable.columns if col.endswith("_enc")] @@ -123,20 +135,28 @@ df_auc = univariate_selection.compute_univariate_preselection( preselect_overtrain_threshold=0.05 # if (auc_train - auc_selection) >= 0.05 --> overfitting! 
) +# Plot df_auc to get a horizontal barplot: +plot_predictor_quality(df_auc) + # compute correlations between preprocessed predictors: df_corr = (univariate_selection .compute_correlations(basetable[basetable["split"] == "train"], preprocessed_predictors)) +# plot correlation matrix +plot_correlation_matrix(df_corr) + # get a list of predictors selection by the univariate selection preselected_predictors = (univariate_selection .get_preselected_predictors(df_auc)) ``` -After a preselection is done on the predictors, we can start the model building itself using _forward feature selection_ to choose the right set of predictors: +After a preselection is done on the predictors, we can start the model building itself using forward feature selection to choose the right set of predictors: ```python from cobra.model_building import ForwardFeatureSelection +from cobra.evaluation import plot_performance_curves +from cobra.evaluation import plot_variable_importance forward_selection = ForwardFeatureSelection(max_predictors=30, pos_only=True) @@ -151,15 +171,49 @@ forward_selection.fit(basetable[basetable["split"] == "train"], performances = (forward_selection .compute_model_performances(basetable, target_column_name)) +# plot performance curves +plot_performance_curves(performances) + # After plotting the performances and selecting the model, # we can extract this model from the forward_selection class: -model = forward_selection.get_model_from_step(5) # Python indexing starts from 0, so this model has 6 predictors +model = forward_selection.get_model_from_step(5) -# Note that model has 6 variables (python lists start with index 0), +# Note that chosen model has 6 variables (python lists start with index 0), # which can be obtained as follows: final_predictors = model.predictors -# We can also compute the importance of each predictor in the model (dict): -variable_importance = model.compute_variable_importance(basetable) +# We can also compute and plot the importance of each 
predictor in the model: +variable_importance = model.compute_variable_importance( + basetable[basetable["split"] == "selection"] +) +plot_variable_importance(variable_importance) +``` + +Now that we have build and selected a final model, it is time to evaluate it against various evaluation metrics: + +```python +from cobra.evaluation import Evaluator + +# get numpy array of True target labels and predicted scores: +y_true = basetable[basetable["split"] == "selection"][target_column_name].values +y_pred = model.score_model(basetable[basetable["split"] == "selection"]) + +evaluator = Evaluator() +evaluator.fit(y_true, y_pred) # Automatically find the best cut-off probability + +# Get various scalar metrics such as accuracy, AUC, precision, recall, ... +evaluator.get_scalar_scalar_metrics() + +# Plot non-scalar evaluation metrics: +evaluator.plot_roc_curve() + +evaluator.plot_confusion_matrix() + +evaluator.plot_cumulative_gains() + +evaluator.plot_lift_curve() + +evaluator.plot_cumulative_response_curve() + ``` ## Development From 748f6819ce237e93b92ea26592bb07f4ddb8b8e0 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 30 Mar 2020 17:05:16 +0200 Subject: [PATCH 83/98] Change color scheme in Evaluator plots --- cobra/evaluation/evaluator.py | 31 +++++++++++++++++-------------- cobra/model_building/models.py | 3 ++- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index de09f77..39f8f7c 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -144,21 +144,24 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): auc = self._scalar_metrics["AUC"] - fig, ax = plt.subplots(figsize=dim) + with plt.style.context("seaborn-whitegrid"): - ax.plot(self.roc_curve["fpr"], - self.roc_curve["tpr"], - color="darkorange", lw=2, - label="ROC curve (area = {s:.3})".format(s=auc)) + fig, ax = plt.subplots(figsize=dim) - ax.plot([0, 1], [0, 1], color="navy", lw=2, 
linestyle="--") - ax.set_xlabel("False Positive Rate", fontsize=15) - ax.set_ylabel("True Positive Rate", fontsize=15) - ax.legend(loc="lower right") - ax.set_title("ROC Curve", fontsize=20) + ax.plot(self.roc_curve["fpr"], + self.roc_curve["tpr"], + color="cornflowerblue", linewidth=3, + label="ROC curve (area = {s:.3})".format(s=auc)) - if path: - plt.savefig(path, format="png", dpi=300, bbox_inches="tight") + ax.plot([0, 1], [0, 1], color="darkorange", linewidth=3, + linestyle="--") + ax.set_xlabel("False Positive Rate", fontsize=15) + ax.set_ylabel("True Positive Rate", fontsize=15) + ax.legend(loc="lower right") + ax.set_title("ROC Curve", fontsize=20) + + if path: + plt.savefig(path, format="png", dpi=300, bbox_inches="tight") plt.show() @@ -310,7 +313,7 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): fig, ax = plt.subplots(figsize=dim) ax.plot(self.cumulative_gains[0]*100, self.cumulative_gains[1]*100, - color="blue", linewidth=3, + color="cornflowerblue", linewidth=3, label="cumulative gains") ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3, ls="--", color="darkorange", label="random selection") @@ -319,7 +322,7 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): #Format axes ax.set_xlim([0, 100]) - ax.set_ylim([0, 100]) + ax.set_ylim([0, 105]) #Format ticks ax.set_yticklabels(["{:3.0f}%".format(x) for x in ax.get_yticks()]) diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 597117a..a736c96 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -153,4 +153,5 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: orient='index').reset_index() df.columns = ["predictor", "importance"] - return df.sort_values(by="importance", ascending=False) + return (df.sort_values(by="importance", ascending=False) + .reset_index(drop=True)) From 0030b3e9e0372aca3a742fe48cdaa24139fc8ba8 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 30 
Mar 2020 17:05:32 +0200 Subject: [PATCH 84/98] Update example notebook --- examples/examples.ipynb | 286 +++++++++++++++++++++++++++------------- 1 file changed, 192 insertions(+), 94 deletions(-) diff --git a/examples/examples.ipynb b/examples/examples.ipynb index 146ec0c..1cf1527 100644 --- a/examples/examples.ipynb +++ b/examples/examples.ipynb @@ -52,7 +52,8 @@ "from cobra.evaluation import plot_performance_curves\n", "from cobra.evaluation import plot_variable_importance\n", "from cobra.evaluation import plot_predictor_quality\n", - "from cobra.evaluation import plot_correlation_matrix" + "from cobra.evaluation import plot_correlation_matrix\n", + "from cobra.evaluation import Evaluator" ] }, { @@ -276,7 +277,7 @@ } ], "source": [ - "path = \"\"\n", + "path = \"\"\"\n", "\n", "basetable = pd.read_csv(path)\n", "\n", @@ -1072,7 +1073,7 @@ "forward_selection.fit(basetable[basetable[\"split\"] == \"train\"], \n", " \"TARGET\", \n", " preselected_predictors,\n", - " forced_predictors=[\"scont_1_enc\", \"scont_1_enc\"])" + " forced_predictors=[\"age_enc\"])" ] }, { @@ -1120,43 +1121,83 @@ " \n", " \n", " 0\n", - " [scont_1_enc]\n", - " scont_1_enc\n", - " 0.506035\n", - " 0.489155\n", - " 0.497746\n", + " [age_enc]\n", + " age_enc\n", + " 0.707488\n", + " 0.716277\n", + " 0.703346\n", " \n", " \n", " 1\n", - " [scont_1_enc, relationship_enc]\n", + " [age_enc, relationship_enc]\n", " relationship_enc\n", - " 0.779681\n", - " 0.775267\n", - " 0.787203\n", + " 0.817672\n", + " 0.823040\n", + " 0.823285\n", " \n", " \n", " 2\n", - " [scont_1_enc, relationship_enc, education_enc]\n", + " [age_enc, relationship_enc, education_enc]\n", " education_enc\n", - " 0.858048\n", - " 0.856376\n", - " 0.859707\n", + " 0.870507\n", + " 0.872161\n", + " 0.871720\n", " \n", " \n", " 3\n", - " [scont_1_enc, relationship_enc, education_enc,...\n", - " age_enc\n", - " 0.870537\n", - " 0.872149\n", - " 0.871496\n", + " [age_enc, relationship_enc, education_enc, occ...\n", + " 
occupation_enc\n", + " 0.878316\n", + " 0.882509\n", + " 0.880340\n", " \n", " \n", " 4\n", - " [scont_1_enc, relationship_enc, age_enc, educa...\n", - " occupation_enc\n", - " 0.878380\n", - " 0.882468\n", - " 0.880178\n", + " [age_enc, relationship_enc, occupation_enc, ed...\n", + " hours-per-week_enc\n", + " 0.882722\n", + " 0.885464\n", + " 0.885040\n", + " \n", + " \n", + " 5\n", + " [occupation_enc, age_enc, hours-per-week_enc, ...\n", + " fnlwgt_enc\n", + " 0.883650\n", + " 0.885744\n", + " 0.885099\n", + " \n", + " \n", + " 6\n", + " [fnlwgt_enc, occupation_enc, age_enc, hours-pe...\n", + " native-country_enc\n", + " 0.884048\n", + " 0.886110\n", + " 0.885772\n", + " \n", + " \n", + " 7\n", + " [fnlwgt_enc, occupation_enc, native-country_en...\n", + " marital-status_enc\n", + " 0.884377\n", + " 0.886766\n", + " 0.886017\n", + " \n", + " \n", + " 8\n", + " [fnlwgt_enc, occupation_enc, native-country_en...\n", + " scont_2_enc\n", + " 0.884610\n", + " 0.886622\n", + " 0.885594\n", + " \n", + " \n", + " 9\n", + " [fnlwgt_enc, occupation_enc, native-country_en...\n", + " workclass_enc\n", + " 0.884795\n", + " 0.886878\n", + " 0.885854\n", " \n", " \n", "\n", @@ -1164,18 +1205,28 @@ ], "text/plain": [ " predictors last_added_predictor \\\n", - "0 [scont_1_enc] scont_1_enc \n", - "1 [scont_1_enc, relationship_enc] relationship_enc \n", - "2 [scont_1_enc, relationship_enc, education_enc] education_enc \n", - "3 [scont_1_enc, relationship_enc, education_enc,... age_enc \n", - "4 [scont_1_enc, relationship_enc, age_enc, educa... occupation_enc \n", + "0 [age_enc] age_enc \n", + "1 [age_enc, relationship_enc] relationship_enc \n", + "2 [age_enc, relationship_enc, education_enc] education_enc \n", + "3 [age_enc, relationship_enc, education_enc, occ... occupation_enc \n", + "4 [age_enc, relationship_enc, occupation_enc, ed... hours-per-week_enc \n", + "5 [occupation_enc, age_enc, hours-per-week_enc, ... 
fnlwgt_enc \n", + "6 [fnlwgt_enc, occupation_enc, age_enc, hours-pe... native-country_enc \n", + "7 [fnlwgt_enc, occupation_enc, native-country_en... marital-status_enc \n", + "8 [fnlwgt_enc, occupation_enc, native-country_en... scont_2_enc \n", + "9 [fnlwgt_enc, occupation_enc, native-country_en... workclass_enc \n", "\n", " train_performance selection_performance validation_performance \n", - "0 0.506035 0.489155 0.497746 \n", - "1 0.779681 0.775267 0.787203 \n", - "2 0.858048 0.856376 0.859707 \n", - "3 0.870537 0.872149 0.871496 \n", - "4 0.878380 0.882468 0.880178 " + "0 0.707488 0.716277 0.703346 \n", + "1 0.817672 0.823040 0.823285 \n", + "2 0.870507 0.872161 0.871720 \n", + "3 0.878316 0.882509 0.880340 \n", + "4 0.882722 0.885464 0.885040 \n", + "5 0.883650 0.885744 0.885099 \n", + "6 0.884048 0.886110 0.885772 \n", + "7 0.884377 0.886766 0.886017 \n", + "8 0.884610 0.886622 0.885594 \n", + "9 0.884795 0.886878 0.885854 " ] }, "execution_count": 22, @@ -1184,17 +1235,17 @@ } ], "source": [ - "performances.head()" + "performances.head(n=10)" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
    " ] @@ -1216,39 +1267,86 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['age_enc', 'occupation_enc', 'scont_1_enc', 'relationship_enc', 'education_enc', 'hours-per-week_enc']\n" + "['age_enc', 'relationship_enc', 'education_enc', 'occupation_enc']\n" ] } ], "source": [ - "model = forward_selection.get_model_from_step(5) # Python starts to count from 0!\n", + "model = forward_selection.get_model_from_step(3) # Python starts to count from 0!\n", "print(model.predictors)" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    predictorimportance
    0relationship0.747456
    1education0.609713
    2occupation0.572291
    3age0.524466
    \n", + "
    " + ], "text/plain": [ - "{'age': 0.5172855940047185,\n", - " 'occupation': 0.5634536788555236,\n", - " 'scont_1': 0.012280802872168558,\n", - " 'relationship': 0.7363851709976446,\n", - " 'education': 0.5989073013361791,\n", - " 'hours-per-week': 0.44453822509112334}" + " predictor importance\n", + "0 relationship 0.747456\n", + "1 education 0.609713\n", + "2 occupation 0.572291\n", + "3 age 0.524466" ] }, - "execution_count": 28, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1260,12 +1358,12 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
    " ] @@ -1280,89 +1378,89 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ - "from cobra.evaluation.model_evaluator import Evaluator" + "y_true = basetable[basetable[\"split\"] == \"selection\"][\"TARGET\"].values\n", + "y_pred = model.score_model(basetable[basetable[\"split\"] == \"selection\"])" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ - "y_true = basetable[basetable[\"split\"] == \"selection\"][\"TARGET\"].values\n", - "y_pred = model.score_model(basetable[basetable[\"split\"] == \"selection\"])" + "evaluator = Evaluator()\n", + "evaluator.fit(y_true, y_pred)" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 29, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "accuracy 0.798628\n", + "AUC 0.882509\n", + "precision 0.555125\n", + "recall 0.797176\n", + "F1 0.654488\n", + "lift at 0.05 3.420000\n", + "dtype: float64" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "evaluator = Evaluator(y_true, y_pred)" + "# Get various scalar metrics such as accuracy, AUC, precision, recall, ...\n", + "evaluator.get_scalar_scalar_metrics()" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 36, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
    " ] }, "metadata": {}, "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best cutoff value for probability is: 0.28123688299785216\n" - ] } ], "source": [ - "evaluator.plotROCCurve()" + "evaluator.plot_roc_curve(dim=(8, 5))" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 31, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
    " ] }, "metadata": {}, "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Precision: 0.712\n", - "Accuracy: 0.836\n", - "Recall: 0.526\n", - "F1 Score: 0.605\n", - "Lift at top 10.0%: 3.32\n", - "AUC: 0.885\n" - ] } ], "source": [ - "evaluator.plotConfusionMatrix()" + "evaluator.plot_confusion_matrix(dim=(8, 5))" ] }, { @@ -1372,7 +1470,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
    " ] @@ -1382,17 +1480,17 @@ } ], "source": [ - "evaluator.plotCumulativeGains()" + "evaluator.plot_cumulative_gains(dim=(8, 5))" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 33, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
    " ] @@ -1402,19 +1500,19 @@ } ], "source": [ - "evaluator.plotCumulativeResponse()" + "evaluator.plot_lift_curve(dim=(8, 5))" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 34, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ - "
    " + "
    " ] }, "metadata": {}, @@ -1422,7 +1520,7 @@ } ], "source": [ - "evaluator.plotLift()" + "evaluator.plot_cumulative_response_curve(dim=(8, 5))" ] } ], From c1fb00276f4a7180bc129d7c14d8a5628de507ff Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 31 Mar 2020 14:36:44 +0200 Subject: [PATCH 85/98] Clean up setup and requirements.txt --- requirements.txt | 12 ++++++------ setup.py | 4 +--- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index 784dea5..c5d375c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -pandas==0.25.1 -matplotlib==3.0.2 -scipy==1.2.0 -seaborn==0.9.0 -numpy==1.17.2 -scikit_learn==0.22.1 +numpy>=1.17.2 +pandas>=0.25.1 +scipy>=1.2.0 +scikit_learn>=0.22.1 +matplotlib>=3.0.2 +seaborn>=0.9.0 diff --git a/setup.py b/setup.py index a13d0d8..5944027 100644 --- a/setup.py +++ b/setup.py @@ -10,11 +10,9 @@ "cobra.model_building", "cobra.evaluation"], url="https://github.com/PythonPredictions", - #long_description=long_description, # TO DO - #long_description_content_type="text/markdown", install_requires=[ - "pandas>=0.25.1", "numpy>=1.17.2", + "pandas>=0.25.1", "scipy>=1.2.0", "scikit_learn>=0.22.1", "matplotlib>=3.0.2", From 6774f99498a15f1db0a73e0fa460945643887999 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 31 Mar 2020 16:04:58 +0200 Subject: [PATCH 86/98] Refactor plotting functions from evaluation - Rename performance_curves.py to plotting_utils.py - Remove predictor_quality.py and move content to plotting_utils - Make style of plots compatible with the ones from Evaluator - README and examples.ipynb are modified accordingly --- README.md | 4 +- cobra/evaluation/__init__.py | 11 +- cobra/evaluation/performance_curves.py | 45 -------- cobra/evaluation/plotting_utils.py | 142 +++++++++++++++++++++++++ cobra/evaluation/predictor_quality.py | 80 -------------- examples/examples.ipynb | 6 +- 6 files changed, 153 insertions(+), 135 deletions(-) delete mode 100644 
cobra/evaluation/performance_curves.py create mode 100644 cobra/evaluation/plotting_utils.py delete mode 100644 cobra/evaluation/predictor_quality.py diff --git a/README.md b/README.md index b723c29..a6ac9d7 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ Once these PIG tables are computed, we can start with the _univariate preselecti ```python from cobra.model_building import univariate_selection -from cobra.evaluation import plot_predictor_quality +from cobra.evaluation import plot_univariate_predictor_quality from cobra.evaluation import plot_correlation_matrix # Get list of predictor names to use for univariate_selection @@ -136,7 +136,7 @@ df_auc = univariate_selection.compute_univariate_preselection( ) # Plot df_auc to get a horizontal barplot: -plot_predictor_quality(df_auc) +plot_univariate_predictor_quality(df_auc) # compute correlations between preprocessed predictors: df_corr = (univariate_selection diff --git a/cobra/evaluation/__init__.py b/cobra/evaluation/__init__.py index b7e5f8e..c648cb2 100644 --- a/cobra/evaluation/__init__.py +++ b/cobra/evaluation/__init__.py @@ -2,11 +2,12 @@ from .pigs_tables import compute_pig_table from .pigs_tables import plot_pig_graph -from .performance_curves import plot_performance_curves +from .plotting_utils import plot_performance_curves +from .plotting_utils import plot_variable_importance + +from .plotting_utils import plot_univariate_predictor_quality +from .plotting_utils import plot_correlation_matrix -from .predictor_quality import plot_variable_importance -from .predictor_quality import plot_predictor_quality -from .predictor_quality import plot_correlation_matrix from .evaluator import Evaluator @@ -15,6 +16,6 @@ "plot_pig_graph", "plot_performance_curves", "plot_variable_importance", - "plot_predictor_quality", + "plot_univariate_predictor_quality", "plot_correlation_matrix", "Evaluator"] diff --git a/cobra/evaluation/performance_curves.py b/cobra/evaluation/performance_curves.py deleted file 
mode 100644 index 0ff5aa8..0000000 --- a/cobra/evaluation/performance_curves.py +++ /dev/null @@ -1,45 +0,0 @@ -# third party imports -import numpy as np -import pandas as pd - -import matplotlib.pyplot as plt - - -def plot_performance_curves(model_performance: pd.DataFrame, - dim: tuple=(12, 8)): - """Plot performance curves generated by the forward feature selection - for the train-selection-validation sets - - Parameters - ---------- - model_performance : pd.DataFrame - contains train-selection-validation performance for each model trained - in the forward feature selection - dim : tuple, optional - tuple with width and lentgh of the plot - """ - highest_auc = np.round(max(max(model_performance['train_performance']), - max(model_performance['selection_performance']), - max(model_performance['validation_performance']) - ), 1) - - fig, ax = plt.subplots(figsize=dim) - - plt.plot(model_performance['train_performance'], marker=".", markersize=20, - linewidth=3, label='AUC train') - plt.plot(model_performance['selection_performance'], marker=".", - markersize=20, linewidth=3, label='AUC selection') - plt.plot(model_performance['validation_performance'], marker=".", - markersize=20, linewidth=3, label='AUC validation') - # Set x/yticks - ax.set_xticks(np.arange(len(model_performance['last_added_predictor']) - + 1)) - ax.set_xticklabels(model_performance['last_added_predictor'].tolist(), - rotation=40, ha='right') - ax.set_yticks(np.arange(0.5, highest_auc + 0.02, 0.05)) - #Make Pretty - ax.legend(loc='lower right') - fig.suptitle('Performance curves - forward feature selection', - fontsize=20) - plt.ylabel('Model performance') - plt.show() diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py new file mode 100644 index 0000000..8b71ab7 --- /dev/null +++ b/cobra/evaluation/plotting_utils.py @@ -0,0 +1,142 @@ +# third party imports +import numpy as np +import pandas as pd + +import matplotlib.pyplot as plt +import seaborn as sns + + +def 
plot_univariate_predictor_quality(df_auc: pd.DataFrame, + dim: tuple=(12, 8)): + """Plot univariate quality of the predictors + + Parameters + ---------- + df_auc : pd.DatFrame + Contains for each variable the train auc and selection auc allong with + a boolean indicating whether or not it is selected based on the + criteria + dim : tuple, optional + tuple with width and lentgh of the plot + """ + + df = (df_auc[df_auc["preselection"]] + .sort_values(by='AUC train', ascending=False)) + + df = pd.melt(df, id_vars=["predictor"], + value_vars=["AUC train", "AUC selection"], + var_name="split", + value_name="AUC") + + # plot data + with plt.style.context("seaborn-ticks"): + fig, ax = plt.subplots(figsize=dim) + + ax = sns.barplot(x="AUC", y="predictor", hue="split", data=df) + ax.set_title('Univariate Quality of Predictors') + + # Set Axis - make them pretty + sns.despine(ax=ax, right=True) + + # Remove white lines from the second axis + ax.grid(False) + + plt.show() + + +def plot_correlation_matrix(df_corr: pd.DataFrame, + dim: tuple=(12, 8)): + """Plot correlation matrix amongst the predictors + + Parameters + ---------- + df_corr : pd.DataFrame + Correlation matrix + dim : tuple, optional + tuple with width and lentgh of the plot + """ + fig, ax = plt.subplots(figsize=dim) + ax = sns.heatmap(df_corr, cmap='Blues') + ax.set_title('Correlation Matrix') + plt.show() + + +def plot_performance_curves(model_performance: pd.DataFrame, + dim: tuple=(12, 8), + colors: dict={"train": "#0099bf", + "selection": "#ff9500", + "validation": "#8064a2"}): + """Plot performance curves generated by the forward feature selection + for the train-selection-validation sets + + Parameters + ---------- + model_performance : pd.DataFrame + contains train-selection-validation performance for each model trained + in the forward feature selection + dim : tuple, optional + tuple with width and lentgh of the plot + """ + highest_auc = np.round(max(max(model_performance['train_performance']), + 
max(model_performance['selection_performance']), + max(model_performance['validation_performance']) + ), 1) + + with plt.style.context("seaborn-whitegrid"): + + fig, ax = plt.subplots(figsize=dim) + + plt.plot(model_performance['train_performance'], marker=".", + markersize=20, linewidth=3, label='AUC train', + color=colors["train"]) + plt.plot(model_performance['selection_performance'], marker=".", + markersize=20, linewidth=3, label='AUC selection', + color=colors["selection"]) + plt.plot(model_performance['validation_performance'], marker=".", + markersize=20, linewidth=3, label='AUC validation', + color=colors["validation"]) + # Set x/yticks + ax.set_xticks(np.arange(len(model_performance['last_added_predictor']) + + 1)) + ax.set_xticklabels(model_performance['last_added_predictor'].tolist(), + rotation=40, ha='right') + ax.set_yticks(np.arange(0.5, highest_auc + 0.02, 0.05)) + #Make Pretty + ax.legend(loc='lower right') + fig.suptitle('Performance curves - forward feature selection', + fontsize=20) + plt.ylabel('Model performance') + plt.show() + + +def plot_variable_importance(df_variable_importance: pd.DataFrame, + title: str=None, + dim: tuple=(12, 8)): + """Plot variable importance of a given model + + Parameters + ---------- + df_variable_importance : pd.DataFrame + DataFrame containing columns predictor and importance + title : str, optional + Title of the plot + dim : tuple, optional + tuple with width and lentgh of the plot + """ + with plt.style.context("seaborn-ticks"): + fig, ax = plt.subplots(figsize=dim) + ax = sns.barplot(x="importance", y="predictor", + data=df_variable_importance, + color="cornflowerblue") + if title: + ax.set_title(title) + else: + ax.set_title("Variable importance") + + # Set Axis - make them pretty + sns.despine(ax=ax, right=True) + + # Remove white lines from the second axis + ax.grid(False) + + plt.show() diff --git a/cobra/evaluation/predictor_quality.py b/cobra/evaluation/predictor_quality.py deleted file mode 100644 
index 3fac91f..0000000 --- a/cobra/evaluation/predictor_quality.py +++ /dev/null @@ -1,80 +0,0 @@ -# third party imports -import pandas as pd - -import matplotlib.pyplot as plt -import seaborn as sns - - -def plot_variable_importance(df_variable_importance: pd.DataFrame, - title: str=None, - dim: tuple=(12, 8)): - """Plot variable importance of a given model - - Parameters - ---------- - df_variable_importance : pd.DataFrame - DataFrame containing columns predictor and importance - title : str, optional - Title of the plot - dim : tuple, optional - tuple with width and lentgh of the plot - """ - - # plot data - fig, ax = plt.subplots(figsize=dim) - ax = sns.barplot(x="importance", y="predictor", - data=df_variable_importance) - if title: - ax.set_title(title) - else: - ax.set_title("Variable importance") - plt.show() - - -def plot_predictor_quality(df_auc: pd.DataFrame, - dim: tuple=(12, 8)): - """Plot univariate quality of the predictors - - Parameters - ---------- - df_auc : pd.DatFrame - Contains for each variable the train auc and selection auc allong with - a boolean indicating whether or not it is selected based on the - criteria - dim : tuple, optional - tuple with width and lentgh of the plot - """ - - plt.style.use('seaborn-darkgrid') - - df = (df_auc[df_auc["preselection"]] - .sort_values(by='AUC train', ascending=False)) - - df = pd.melt(df, id_vars=["predictor"], - value_vars=["AUC train", "AUC selection"], - var_name="partition", - value_name="AUC") - - # plots - fig, ax = plt.subplots(figsize=dim) - - ax = sns.barplot(x="AUC", y="predictor", hue="partition", data=df) - ax.set_title('Univariate Quality of Predictors') - plt.show() - - -def plot_correlation_matrix(df_corr: pd.DataFrame, - dim: tuple=(12, 8)): - """Plot correlation matrix amongst the predictors - - Parameters - ---------- - df_corr : pd.DataFrame - Correlation matrix - dim : tuple, optional - tuple with width and lentgh of the plot - """ - fig, ax = plt.subplots(figsize=dim) - ax = 
sns.heatmap(df_corr, cmap='Blues') - ax.set_title('Correlation Matrix') - plt.show() diff --git a/examples/examples.ipynb b/examples/examples.ipynb index 1cf1527..71fe896 100644 --- a/examples/examples.ipynb +++ b/examples/examples.ipynb @@ -51,7 +51,7 @@ "from cobra.evaluation import generate_pig_tables\n", "from cobra.evaluation import plot_performance_curves\n", "from cobra.evaluation import plot_variable_importance\n", - "from cobra.evaluation import plot_predictor_quality\n", + "from cobra.evaluation import plot_univariate_predictor_quality\n", "from cobra.evaluation import plot_correlation_matrix\n", "from cobra.evaluation import Evaluator" ] @@ -853,7 +853,7 @@ } ], "source": [ - "plot_predictor_quality(df_auc)" + "plot_univariate_predictor_quality(df_auc)" ] }, { @@ -1420,7 +1420,7 @@ ], "source": [ "# Get various scalar metrics such as accuracy, AUC, precision, recall, ...\n", - "evaluator.get_scalar_scalar_metrics()" + "evaluator.scalar_metrics" ] }, { From a68ef3814a4c01c879ac6771e3f3dbd241494bc3 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 31 Mar 2020 16:25:38 +0200 Subject: [PATCH 87/98] Change datatype of Evaluator.scalar_metrics to pd.Series --- cobra/evaluation/evaluator.py | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 39f8f7c..04b95c1 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -45,7 +45,7 @@ def __init__(self, probability_cutoff: float=None, self.probability_cutoff = probability_cutoff # Placeholder to store fitted output - self._scalar_metrics = None + self.scalar_metrics = None self.roc_curve = None self.confusion_matrix = None self.lift_curve = None @@ -66,7 +66,7 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): for pred in y_pred]) # Compute the various evaluation metrics - self._scalar_metrics = Evaluator.compute_scalar_metrics( + self.scalar_metrics = 
Evaluator.compute_scalar_metrics( y_true, y_pred, y_pred_b, @@ -79,23 +79,13 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): self.cumulative_gains = Evaluator._compute_cumulative_gains(y_true, y_pred) - def get_scalar_scalar_metrics(self) -> pd.Series: - """Get the evaluation_metrics attribute as a pandas Series - - Returns - ------- - pd.Series - Score of various scalar evaluation metrics for the model - """ - return pd.Series(self._scalar_metrics) - @staticmethod def compute_scalar_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_pred_b: np.ndarray, - lift_at: float) -> dict: + lift_at: float) -> pd.Series: """Convenient function to compute various scalar performance measures - and return them in a dict + and return them in a pd.Series Parameters ---------- @@ -110,10 +100,10 @@ def compute_scalar_metrics(y_true: np.ndarray, Returns ------- - dict + pd.Series contains various performance measures of the model """ - return { + return pd.Series({ "accuracy": accuracy_score(y_true, y_pred_b), "AUC": roc_auc_score(y_true, y_pred), "precision": precision_score(y_true, y_pred_b), @@ -123,7 +113,7 @@ def compute_scalar_metrics(y_true: np.ndarray, ._compute_lift(y_true=y_true, y_pred=y_pred, lift_at=lift_at), 2) - } + }) def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): """Plot ROC curves of the model @@ -142,7 +132,7 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): raise NotFittedError(msg.format(self.__class__.__name__)) - auc = self._scalar_metrics["AUC"] + auc = float(self.scalar_metrics.loc["AUC"]) with plt.style.context("seaborn-whitegrid"): @@ -188,7 +178,7 @@ def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), fig, ax = plt.subplots(figsize=dim) ax = sns.heatmap(self.confusion_matrix, annot=self.confusion_matrix.astype(str), - fmt="s", cmap="Reds", + fmt="s", cmap="Blues", xticklabels=labels, yticklabels=labels) ax.set_title("Confusion matrix", fontsize=20) From ac42d57c98e9921462ba177b98f23af20e818ba0 Mon 
Sep 17 00:00:00 2001 From: JanBenisek Date: Mon, 6 Apr 2020 08:06:51 +0200 Subject: [PATCH 88/98] updated gitignore Added .vscode/ settings to ignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 5a98460..a24d78a 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,6 @@ ENV/ # mypy .mypy_cache/ + +# vscode settings +.vscode/ From 69041d3da31b426cd2fa24b030587b69dd662720 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 20 Apr 2020 08:43:07 +0200 Subject: [PATCH 89/98] Fix typo in README --- README.md | 2 +- cobra/evaluation/evaluator.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a6ac9d7..2a517b7 100644 --- a/README.md +++ b/README.md @@ -201,7 +201,7 @@ evaluator = Evaluator() evaluator.fit(y_true, y_pred) # Automatically find the best cut-off probability # Get various scalar metrics such as accuracy, AUC, precision, recall, ... -evaluator.get_scalar_scalar_metrics() +evaluator.scalar_metrics # Plot non-scalar evaluation metrics: evaluator.plot_roc_curve() diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 04b95c1..b2f2531 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -109,10 +109,10 @@ def compute_scalar_metrics(y_true: np.ndarray, "precision": precision_score(y_true, y_pred_b), "recall": recall_score(y_true, y_pred_b), "F1": f1_score(y_true, y_pred_b, average=None)[1], - f"lift at {lift_at}": np.round(Evaluator - ._compute_lift(y_true=y_true, - y_pred=y_pred, - lift_at=lift_at), 2) + "lift at".format(lift_at): np.round(Evaluator + ._compute_lift(y_true=y_true, + y_pred=y_pred, + lift_at=lift_at), 2) }) def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): From 0b45b2bb452e9f99293712481f64478cc7533ff1 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Mon, 20 Apr 2020 08:44:33 +0200 Subject: [PATCH 90/98] Clean up repo --- cobra/metrics/__init__.py | 3 - 
cobra/metrics/all_metrics_plots.py | 527 ----------------------------- 2 files changed, 530 deletions(-) delete mode 100644 cobra/metrics/__init__.py delete mode 100644 cobra/metrics/all_metrics_plots.py diff --git a/cobra/metrics/__init__.py b/cobra/metrics/__init__.py deleted file mode 100644 index 67656f7..0000000 --- a/cobra/metrics/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .all_metrics_plots import Evaluator - -__all__ = ['Evaluator'] \ No newline at end of file diff --git a/cobra/metrics/all_metrics_plots.py b/cobra/metrics/all_metrics_plots.py deleted file mode 100644 index d18bb64..0000000 --- a/cobra/metrics/all_metrics_plots.py +++ /dev/null @@ -1,527 +0,0 @@ -""" -====================================================================================== ---------------------------------------- Evaluation Class code ------------------------ -====================================================================================== -author: jan.benisek@pythonpredictins.com - benoit.vandekerkhove@pythonpredictions.com -date: 23/09/2019 -purpose: library for model evaluation class - -""" -#%% -import numpy as np -import pandas as pd -import seaborn as sns -import matplotlib.pyplot as plt -import sklearn.metrics as mt -from typing import Tuple -#%% - - -class Evaluator(): - ''' - Class to evaluate models - - Parameters - ----------- - y_true : array, shape = [1, n_features] - array with true values - - y_pred_p : array, shape = [1, n_features] - array with predicted values (probabilities) - - lift_at : int , default=0.05 - calculate lift at given level (0-1) - - save_pth : str, default=None - path to where save the plot - - binary_cutoff : float, default=0.5 - cutoff to convert predictions to binary - - ''' - - def __init__(self, y_true: np.ndarray, y_pred_p: np.ndarray, - lift_at: float=0.05, save_pth: str=None, binary_cutoff: int=0.5): - - self.y_true = y_true.flatten() - self.y_pred_p = y_pred_p.flatten() #As probability - self.lift_at = lift_at - 
self.save_pth = save_pth - self.binary_cutoff = binary_cutoff - - self.y_pred_b = np.where(self.y_pred_p > self.binary_cutoff,1,0) - - - - - '''============================================================= - ----------------------------- PLOTS ---------------------------- - =============================================================''' - def plotROCCurve(self, desc: str=None): - ''' - Plot ROC curve and print best cutoff value - Transform probabilities predictions to bool based on best AUC based cutoff - - Parameters - ---------- - desc : str, default=None - description of the plot, used also as a name of saved plot - - ''' - if desc is None: - desc = '' - - fpr,tpr,thresholds = mt.roc_curve(self.y_true,self.y_pred_p) - - #--------------------------- - #Calculate AUC - #-------------------------- - score = mt.roc_auc_score(self.y_true, self.y_pred_p) - - fig, ax = plt.subplots(figsize=(8,5)) - ax.plot(fpr,tpr, color='darkorange', lw=2, label='ROC curve (area = {s:.3})'.format(s=score)) - ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') - ax.set_xlabel('False Positive Rate', fontsize=15) - ax.set_ylabel('True Positive Rate', fontsize=15) - ax.legend(loc="lower right") - ax.set_title('ROC Curve {}' .format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - '''============================================================= - ---------------------------- METRICS --------------------------- - =============================================================''' - - def printPerformance(self): - ''' - Print out performance measures - - EV.printPerformance() - %timeit 2min 19s ± 784 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each) - ''' - - if self.threshold != np.nan : - out_perfo = self._evaluation() - - print('=== Test on', self.test_on, '===') - print('Precision: {s:.3}'.format(s=out_perfo['precision'])) #If we mark customer as a churner, how often we are correct - print('Accuracy: {s:.3}'.format(s=out_perfo['accuracy'])) #Overall performance - print('Recall: {s:.3}'.format(s=out_perfo['recall'])) #How many churners can the model detect - print('F1 Score: {s:.3}'.format(s=out_perfo['F1'])) # 2 * (precision * recall) / (precision + recall) - print('Lift at top {l}%: {s:.3}'.format(l=self.lift_at*100, s=out_perfo['lift'])) # 2 * (precision * recall) / (precision + recall) - print('AUC: {s:.3}'.format(s=out_perfo['AUC'])) # 2 * (precision * recall) / (precision + recall) - - else : - raise ValueError('Please call .plotROCCurve() method first to get the best threshold for probabilities, and try again') - - def plotLift(self, desc : str=None, save_pth : str=None): - ''' - Method plots lift per decile - - Parameters - ---------- - save: whether plot should be saved (if yes, then now shown) - desc: description of the plot, used also as a name of saved plot - ''' - #--------------------- - #-- CALCULATE LIFT --- - #--------------------- -# inc_rate = self.y_true.mean() - lifts = [Evaluator.liftCalculator(y_true=self.y_true, y_pred=self.y_pred_p, lift_at=perc_lift) - for perc_lift in np.arange(0.05,1.05,0.05)] - - #--------------------- - #------- PLOT -------- - #--------------------- - if desc is None: - desc = '' - - fig, ax = plt.subplots(figsize=(8,5)) - plt.style.use('seaborn-darkgrid') - - nrows = len(lifts) - x_labels = [nrows/2-x/2 for x in np.arange(0,nrows,1)] - - #plt.bar(x_labels[::-1], df['lift'].values.tolist(), align='center', color="cornflowerblue") - plt.bar(x_labels[::-1], lifts, align='center', color="green", width=0.2) - plt.ylabel('lift', fontsize=15) - plt.xlabel('decile', fontsize=15) - ax.set_xticks(x_labels) - ax.set_xticklabels(x_labels) - - 
plt.axhline(y=1, color='darkorange', linestyle='--', - xmin=0.05, xmax=0.9, linewidth=3, label='Baseline') - - #Legend - ax.legend(loc='upper right') - - ##Set Axis - make them pretty - sns.despine(ax=ax, right=True, left=True) - - #Remove white lines from the second axis - ax.grid(False) - - ##Description - ax.set_title('Cumulative Lift {}'.format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - - - '''------------------------------------------------------------------- - -------------------------------- UTILS ------------------------------- - -------------------------------------------------------------------''' - def estimateCutoff(self) -> float: - ''' - Estimates optimal cutoff based on maximization of AUC curve - https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python - - Parameters - ---------- - None - - Returns - ------- - best_cutoff : float - optimal cutoff as a float <0;1> - - ''' - fpr,tpr,thresholds = mt.roc_curve(self.y_true,self.y_pred_p) - i = np.arange(len(tpr)) - roc = pd.DataFrame({'tf' : pd.Series(tpr-(1-fpr), index=i), - 'threshold' : pd.Series(thresholds, index=i)}) - roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]] - - best_cutoff = list(roc_t['threshold']) - - return best_cutoff[0] - - - def _testA(self, test : np.ndarray, pred : np.ndarray, train_M : np.ndarray) -> Tuple[np.ndarray, np.ndarray]: - ''' - Limits the evaluation to potential A offers - (that a customer has not purchase in the train timeframe) - - Parameters - ---------- - test: true values -> array - pred: predictions as probabilities -> array - train_M : train matrix of interactions -> ndarray - - Output - ------ - testA: vector of interaction on potential A offers -> array - predA: vector of predictions on potential A offers -> array - ''' - - train = train_M.flatten() - testA = np.where(train>0, np.nan, test) - predA = np.where(train>0, np.nan, pred) - testA = 
testA[testA>=0] - predA = predA[predA>=0] - - return testA, predA - - def _evaluation(self): - ''' - Convenient function, returns various performance measures in a dict - - Parameters - ---------- - y_true: true values - y_pred: predictions as booleans - - Output - ------ - Returns dictionary with the measures - ''' - - dict_perfo = {'precision': mt.precision_score(self.y_true, self.y_pred_b), - 'accuracy': mt.accuracy_score(self.y_true, self.y_pred_b), - 'recall': mt.recall_score(self.y_true, self.y_pred_b), - 'F1': mt.f1_score(self.y_true, self.y_pred_b, average=None)[1], - 'lift': np.round(Evaluator.liftCalculator(y_true=self.y_true, - y_pred=self.y_pred_p, - lift_at=self.lift_at),2), - 'AUC': mt.roc_auc_score(self.y_true, self.y_pred_p) - } - return dict_perfo - - @staticmethod - def liftCalculator(y_true : np.ndarray, y_pred : np.ndarray, lift_at : float=0.05, **kwargs) -> float: - ''' - Calculates lift given two arrays on specified level - - Parameters - ---------- - y_true: numpy array with true values - y_pred: numpy array with predictions (probabilities) - lift_at: lift at what top percentage - - Output - ------ - Scalar value, lift. - - 50.3 µs ± 1.94 µs per loop (mean ± std. dev. 
of 7 runs, 10000 loops each) - ''' - #Make sure it is numpy array - y_true_ = np.array(y_true) - y_pred_ = np.array(y_pred) - - #Make sure it has correct shape - y_true_ = y_true_.reshape(len(y_true_),1) - y_pred_ = y_pred_.reshape(len(y_pred_),1) - - #Merge data together - y_data = np.hstack([y_true_, y_pred_]) - - #Calculate necessary variables - nrows = len(y_data) - stop = int(np.floor(nrows*lift_at)) - avg_incidence = np.einsum('ij->j',y_true_)/float(len(y_true_)) - - #Sort and filter data - data_sorted = y_data[y_data[:,1].argsort()[::-1]][:stop,0].reshape(stop, 1) - - #Calculate lift (einsum is very fast way of summing, needs specific shape) - inc_in_top_n = np.einsum('ij->j',data_sorted)/float(len(data_sorted)) - - lift = np.round(inc_in_top_n/avg_incidence,2)[0] - - return lift - - '''------------------------------------------------------------------- - ------------------------JUST IN CASE ------------------------------- - -------------------------------------------------------------------''' - - def plotConfusionMatrix(self, labels : list=None, color : str='Reds', - save_pth : str=None, desc : str=None): - ''' - Plot Confusion matrix - - Parameters - ---------- - y_test: True values of target y - pred: Predicted values of target y, boolean - labels: labels for the matrix, if empty, values from y_test_ are used - color: Color of the matrix, its a cmap, so many values possible - save: whether plot should be saved (if yes, then now shown) - desc: description of the plot, used also as a name of saved plot - ''' - if labels is None: - labels = [str(lab) for lab in np.unique(self.y_true)] - - if desc is None: - desc = '' - - cm = mt.confusion_matrix(self.y_true, self.y_pred_b) - - fig, ax = plt.subplots(figsize=(8,5)) - ax = sns.heatmap(cm, annot=cm.astype(str), fmt="s", cmap=color, xticklabels=labels, yticklabels=labels) - ax.set_title('Confusion matrix {}'.format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, 
bbox_inches='tight') - - plt.show() - - def plotCumulativeGains(self, save_pth : str=None, desc : str=None): - ''' - Functions plot cumulative gains - - Parameters - ---------- - save: whether plot should be saved (if yes, then now shown) - desc: description of the plot, used also as a name of saved plot - ''' - if desc is None: - desc = '' - - #--------------------------- - #Calculate cumulative gains - #-------------------------- - nrows = len(self.y_true) - npositives = self.y_true.sum() - df_y_pred = pd.DataFrame({"y":self.y_true, "y_pred":self.y_pred_p}).sort_values(by='y_pred', ascending=False).reset_index(drop=True) - cgains = [0] - for stop in (np.linspace(0.01,1,100)*nrows).astype(int): - cgains.append(round(df_y_pred.loc[:stop,'y'].sum()/npositives*max(100,1),2)) - - #--------------------------- - #Plot it - #--------------------------- - plt.style.use('seaborn-darkgrid') - fig, ax_cgains = plt.subplots(figsize=(8,5)) - ax_cgains.plot(cgains, color='blue', linewidth=3, label='cumulative gains') - ax_cgains.plot(ax_cgains.get_xlim(), ax_cgains.get_ylim(), linewidth=3, ls="--", color="darkorange", label='random selection') - ax_cgains.set_title('Cumulative Gains ' + desc, fontsize=20) - - ax_cgains.set_title('Cumulative Gains {}' .format(desc), fontsize=20) - #Format axes - ax_cgains.set_xlim([0,100]) - ax_cgains.set_ylim([0,100]) - #Format ticks - ax_cgains.set_yticklabels(['{:3.0f}%'.format(x) for x in ax_cgains.get_yticks()]) - ax_cgains.set_xticklabels(['{:3.0f}%'.format(x) for x in ax_cgains.get_xticks()]) - #Legend - ax_cgains.legend(loc='lower right') - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - - def plotCumulativeResponse(self, desc : str=None, save_pth : str=None): - #--------------------- - #-- CALCULATE LIFT --- - #--------------------- - inc_rate = self.y_true.mean() - lifts = [Evaluator.liftCalculator(y_true=self.y_true, y_pred=self.y_pred_p, lift_at=perc_lift) - for 
perc_lift in np.arange(0.1,1.1,0.1)] - lifts = np.array(lifts)*inc_rate*100 - #--------------------- - #------- PLOT -------- - #--------------------- - if desc is None: - desc = '' - - fig, ax = plt.subplots(figsize=(8,5)) - #plt.style.use('seaborn-darkgrid') - plt.style.use('default') - - nrows = len(lifts) - x_labels = [nrows-x for x in np.arange(0,nrows,1)] - - #plt.bar(x_labels[::-1], df['lift'].values.tolist(), align='center', color="cornflowerblue") - plt.bar(x_labels[::-1], lifts, align='center', color="#00ccff") - plt.ylabel('response (%)', fontsize=16) - plt.xlabel('decile', fontsize=16) - ax.set_xticks(x_labels) - ax.set_xticklabels(x_labels) - - plt.axhline(y=inc_rate*100, color='#ff9500', linestyle='--', - xmin=0.05, xmax=0.95, linewidth=3, label='Incidence') - - #Legend - ax.legend(loc='upper right') - - ##Set Axis - make them pretty - sns.despine(ax=ax, right=True, left=True) - - #Remove white lines from the second axis - ax.grid(False) - - ##Description - ax.set_title('Cumulative response {}'.format(desc), fontsize=20) - - if save_pth is not None: - plt.savefig(save_pth, format='png', dpi=300, bbox_inches='tight') - - plt.show() - -def plotIncidence(df, variable, dim=(12,8)): - ''' - Method plots Incidence plot on train partition - Returns plot - ---------------------------------------------------- - df: dataframe with cleaned, binned, partitioned and prepared data - variable: variable for which the incidence plot will be shown` - dim: tuple with width and lentgh of the plot - ---------------------------------------------------- - ''' - def masterOfOrder(x): - ''' - Function converts interval or string (category) to a number, so the incidence plot can be orderd. - In case of interval -> '(151, 361]' to integer 151. 
- In case of string -> order is alphabetical - Missings and Non-significants are always put at the end - - Parameters - ---------- - x: value to be converted - - Output - ------ - Order of given value - ''' - x_split = x.split(',')[0] - replace_strings = (('...', '0'),('Missing','999999999999'), ('Non-significants','999999999999')) - for repl_str in replace_strings: - x_split = x_split.replace(repl_str[0], repl_str[1]) - x_split = x_split.strip("()[]") - - try: - order = float(x_split) - except: - LETTERS = {letter: index for index, letter in enumerate(ascii_lowercase, start=1)} - order = LETTERS[x[0].lower()] - - return order - - plt.style.use('seaborn-darkgrid') - - #---------------------------------- - #------ Prepare the data -------- - #---------------------------------- - #Set up the variable and dataframe - var_prefix = 'B_' + variable - df_plt = df[['TARGET', var_prefix]][df['PARTITION'] == 'train'].copy() - - #Aggregate the data - avg_inc_rate = df_plt['TARGET'].mean() - - aggregations = { - 'bin_inc_rate': 'mean', - 'bin_size': 'count' - } - df_plt = df_plt.groupby(var_prefix, as_index=False)['TARGET'].agg(aggregations) - df_plt['avg_inc_rate'] = avg_inc_rate - - #create a sort column and sort by it - df_plt['sort_by'] = df_plt[var_prefix].apply(lambda x: masterOfOrder(x)) - df_plt.sort_values(by='sort_by', ascending=True, inplace=True) - df_plt.reset_index(inplace=True) - - #---------------------------------- - #----- Plot the incidence ------- - #---------------------------------- - fig, ax = plt.subplots(figsize=dim) - ##First Axis - #Bin size - y_pos = np.arange(len(df_plt[var_prefix])) - plt.bar(y_pos, df_plt['bin_size'].values.tolist(), align='center', color="cornflowerblue") - plt.xticks(y_pos, df_plt[var_prefix]) - plt.ylabel('Bin Size') - plt.xlabel(variable + ' Bins') - - max_inc = max(df_plt['bin_inc_rate']) - - ##Second Axis - ax2 = ax.twinx() - #incidence rate per bin - plt.plot(df_plt['bin_inc_rate'], color="darkorange", marker=".", 
markersize=20, linewidth=3, label='incidence rate per bin') - plt.plot(df_plt['avg_inc_rate'], color="dimgrey", linewidth=4, label='average incidence rate') - ax2.plot(np.nan, "cornflowerblue", linewidth=6, label = 'bin size') #dummy line to have label on second axis from first - ax2.set_yticks(np.arange(0, max_inc+0.05, 0.05)) - ax2.set_yticklabels(['{:3.1f}%'.format(x*100) for x in ax2.get_yticks()]) - plt.ylabel('Incidence') - - ##Set Axis - sns.despine(ax=ax, right=True, left=True) - sns.despine(ax=ax2, left=True, right=False) - ax2.spines['right'].set_color('white') - - #remove white line from second grid axes - #the white lines are reguler, Spyder sometimes fails to visualize it (try to export the pic!) - ax2.grid(False) - - ##Description - fig.suptitle('Incidence Plot - ' + variable, fontsize=20, y=1.02) - ax2.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,ncol=1, mode="expand", borderaxespad=0.) - plt.show() From 9a5299ac771ede3fc48d8c67110fd0b5c10abe46 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 21 Apr 2020 10:06:35 +0200 Subject: [PATCH 91/98] Fix random state in models, bug fixing in evaluator --- cobra/evaluation/evaluator.py | 5 +++-- cobra/model_building/models.py | 2 +- cobra/utils.py | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index b2f2531..2a6a657 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -109,8 +109,9 @@ def compute_scalar_metrics(y_true: np.ndarray, "precision": precision_score(y_true, y_pred_b), "recall": recall_score(y_true, y_pred_b), "F1": f1_score(y_true, y_pred_b, average=None)[1], - "lift at".format(lift_at): np.round(Evaluator - ._compute_lift(y_true=y_true, + "lift at {}".format(lift_at): np.round(Evaluator + ._compute_lift( + y_true=y_true, y_pred=y_pred, lift_at=lift_at), 2) }) diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index a736c96..3ce6964 100644 --- 
a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -24,7 +24,7 @@ class LogisticRegressionModel: def __init__(self): self.logit = LogisticRegression(fit_intercept=True, C=1e9, - solver='liblinear') + solver='liblinear', random_state=42) # placeholder to keep track of a list of predictors self.predictors = [] self._eval_metrics_by_split = {} diff --git a/cobra/utils.py b/cobra/utils.py index 8c55b7d..b21e6d9 100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -70,4 +70,5 @@ def clean_predictor_name(predictor: str) -> str: Returns: str: Description """ - return predictor.replace("_enc", "").replace("_bin", "") + return (predictor.replace("_enc", "").replace("_bin", "") + .replace("_processed", "")) From 8d8d5534b04f0adfc9b199d06146b1c17af3b0e6 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 21 Apr 2020 14:05:26 +0200 Subject: [PATCH 92/98] Fix bug in KBinsDiscretizer.set_attributes_from_dict --- cobra/preprocessing/kbins_discretizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 4ad153d..dc5baba 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -161,7 +161,7 @@ def set_attributes_from_dict(self, params: dict): self.set_params(**params) self._bins_by_column = { - key: [tuple(l) for l in value] + key: ([tuple(l) for l in value] if value else None) for key, value in _bins_by_column.items() } From eeddacea6d222d9f24da7038928ca453eea1fdec Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 21 Apr 2020 14:06:29 +0200 Subject: [PATCH 93/98] Modify CategoricalDataProcessor to avoid regrouping of dummy variables --- cobra/preprocessing/categorical_data_processor.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 5b3a4fc..45af0d6 100644 --- 
a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -191,6 +191,13 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, unique_categories = list(X.unique()) + # do not merge categories in case of dummies, i.e. 0 and 1 + # (and possibly "Missings") + if (len(unique_categories) == 2 + or (len(unique_categories) == 3 + and "Missing" in unique_categories)): + return set(unique_categories) + # get small categories and add them to the merged category list small_categories = (CategoricalDataProcessor ._get_small_categories( @@ -420,7 +427,8 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str, @staticmethod def _replace_categories(data: pd.Series, categories: set) -> pd.Series: - """replace categories in set with "Other" + """replace categories in set with "Other" and transform the remaining + categories to strings to avoid type errors later on in the pipeline Parameters ---------- @@ -434,4 +442,4 @@ def _replace_categories(data: pd.Series, categories: set) -> pd.Series: pd.Series Description """ - return data.apply(lambda x: x if x in categories else "Other") + return data.apply(lambda x: str(x) if x in categories else "Other") From 581f7b4daf753051465700c78bc4aca7410d4cd5 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 28 Apr 2020 14:19:22 +0200 Subject: [PATCH 94/98] Add option to save figure to plotting_utils --- cobra/evaluation/evaluator.py | 6 +++--- cobra/evaluation/plotting_utils.py | 17 +++++++++++++++-- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 2a6a657..979753b 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -381,8 +381,8 @@ def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, @staticmethod def _compute_cumulative_gains(y_true: np.ndarray, y_pred: np.ndarray) -> tuple: - """Compute lift of the model per decile, returns x-labels, lifts and - 
the target incidence to create cummulative response curves + """Compute cumulative gains of the model, returns percentages and + gains cummulative gains curves Code from (https://github.com/reiinakano/scikit-plot/blob/ 2dd3e6a76df77edcbd724c4db25575f70abb57cb/ @@ -398,7 +398,7 @@ def _compute_cumulative_gains(y_true: np.ndarray, Returns ------- tuple - x-labels, lifts per decile and target incidence + x-labels, gains """ # make y_true a boolean vector diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py index 8b71ab7..35a56e7 100644 --- a/cobra/evaluation/plotting_utils.py +++ b/cobra/evaluation/plotting_utils.py @@ -45,7 +45,8 @@ def plot_univariate_predictor_quality(df_auc: pd.DataFrame, def plot_correlation_matrix(df_corr: pd.DataFrame, - dim: tuple=(12, 8)): + dim: tuple=(12, 8), + path: str=None): """Plot correlation matrix amongst the predictors Parameters @@ -54,10 +55,16 @@ def plot_correlation_matrix(df_corr: pd.DataFrame, Correlation matrix dim : tuple, optional tuple with width and lentgh of the plot + path : str, optional + path to store the figure """ fig, ax = plt.subplots(figsize=dim) ax = sns.heatmap(df_corr, cmap='Blues') ax.set_title('Correlation Matrix') + + if path is not None: + plt.savefig(path, format="png", dpi=300, bbox_inches="tight") + plt.show() @@ -111,7 +118,8 @@ def plot_performance_curves(model_performance: pd.DataFrame, def plot_variable_importance(df_variable_importance: pd.DataFrame, title: str=None, - dim: tuple=(12, 8)): + dim: tuple=(12, 8), + path: str=None): """Plot variable importance of a given model Parameters @@ -122,6 +130,8 @@ def plot_variable_importance(df_variable_importance: pd.DataFrame, Title of the plot dim : tuple, optional tuple with width and lentgh of the plot + path : str, optional + path to store the figure """ with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) @@ -139,4 +149,7 @@ def plot_variable_importance(df_variable_importance: pd.DataFrame, # 
Remove white lines from the second axis ax.grid(False) + if path is not None: + plt.savefig(path, format="png", dpi=300, bbox_inches="tight") + plt.show() From 0a5a7ffcd6db360312eb628f7227f5c031910b0a Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 28 Apr 2020 17:12:19 +0200 Subject: [PATCH 95/98] hotfix of missing value imputation --- cobra/preprocessing/preprocessor.py | 7 ++ cobra/preprocessing/target_encoder.py | 96 +++++++++++++++++----- tests/preprocessing/test_target_encoder.py | 16 ++-- 3 files changed, 92 insertions(+), 27 deletions(-) diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 8299b8f..5fb9774 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -80,6 +80,7 @@ def from_params(cls, scale_contingency_table: bool=True, forced_categories: dict={}, weight: float=0.0, + imputation_strategy: str="mean", serialization_path: Optional[str]=None): """Constructor to instantiate PreProcessor from all the parameters that can be set in all its required (attribute) classes. @@ -130,6 +131,12 @@ def from_params(cls, parameter, the bigger the contribution of the overall mean. When set to zero, there is no smoothing (e.g. the pure target incidence is used). + imputation_strategy : str, optional + in case there is a particular column which contains new categories, + the encoding will lead to NULL values which should be imputed. + Valid strategies are to replace with the global mean of the train + set or the min (resp. max) incidence of the categories of that + particular variable. 
serialization_path : str, optional path to save the pipeline to diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index ea63e13..9dac042 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -34,22 +34,37 @@ class TargetEncoder(BaseEstimator): Attributes ---------- - columns : list - A list of columns to encode, if None, all string columns will be - encoded. + imputation_strategy : str + in case there is a particular column which contains new categories, + the encoding will lead to NULL values which should be imputed. + Valid strategies are to replace with the global mean of the train + set or the min (resp. max) incidence of the categories of that + particular variable. weight : float Smoothing parameters (non-negative). The higher the value of the parameter, the bigger the contribution of the overall mean. When set to zero, there is no smoothing (e.g. the pure target incidence is used). """ - def __init__(self, weight: float=0.0): + valid_strategies = ("mean", "min", "max") + + def __init__(self, weight: float=0.0, + imputation_strategy: str="mean"): if weight < 0: raise ValueError("The value of weight cannot be smaller than zero") + elif imputation_strategy not in self.valid_strategies: + raise ValueError("Valid options for 'imputation_strategy' are {}." + " Got imputation_strategy={!r} instead" + .format(self.valid_strategies, + imputation_strategy)) self.weight = weight + self.imputation_strategy = imputation_strategy + self._mapping = {} # placeholder for fitted output + # placeholder for the global incidence of the data used for fitting + self._global_mean = None # not implemented yet! 
# randomized: bool=False, sigma=0.05 @@ -72,6 +87,8 @@ def attributes_to_dict(self) -> dict: for key, value in self._mapping.items() } + params["_global_mean"] = self._global_mean + return params def set_attributes_from_dict(self, params: dict): @@ -88,6 +105,14 @@ def set_attributes_from_dict(self, params: dict): if "weight" in params and type(params["weight"]) == float: self.weight = params["weight"] + if ("imputation_strategy" in params and + params["imputation_strategy"] in self.valid_strategies): + + self.imputation_strategy = params["imputation_strategy"] + + if "_global_mean" in params and type(params["_global_mean"]) == float: + self._global_mean = params["_global_mean"] + _mapping = {} if "_mapping" in params and type(params["_mapping"]) == dict: _mapping = params["_mapping"] @@ -121,7 +146,7 @@ def fit(self, data: pd.DataFrame, column_names: list, # compute global mean (target incidence in case of binary target) y = data[target_column] - global_mean = y.sum() / y.count() + self._global_mean = y.sum() / y.count() for column in column_names: if column not in data.columns: @@ -129,11 +154,9 @@ def fit(self, data: pd.DataFrame, column_names: list, "skipped in fitting" .format(column)) continue - self._mapping[column] = self._fit_column(data[column], y, - global_mean) + self._mapping[column] = self._fit_column(data[column], y) - def _fit_column(self, X: pd.Series, y: pd.Series, - global_mean: float) -> pd.Series: + def _fit_column(self, X: pd.Series, y: pd.Series) -> pd.Series: """Summary Parameters @@ -143,8 +166,6 @@ def _fit_column(self, X: pd.Series, y: pd.Series, categorical variable. y : pd.Series series containing the targets for each observation - global_mean : float - Global mean of the target Returns ------- @@ -158,7 +179,9 @@ def _fit_column(self, X: pd.Series, y: pd.Series, # Q: do we need to do this here or during the transform phase??? 
# Note if self.weight = 0, we have the ordinary incidence replacement - numerator = stats["count"]*stats["mean"] + self.weight*global_mean + numerator = (stats["count"]*stats["mean"] + + self.weight * self._global_mean) + denominator = stats["count"] + self.weight return numerator/denominator @@ -187,13 +210,12 @@ def transform(self, data: pd.DataFrame, method """ - if len(self._mapping) == 0: + if (len(self._mapping) == 0) or (self._global_mean is None): msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") raise NotFittedError(msg.format(self.__class__.__name__)) - new_columns = [] for column in column_names: if column not in data.columns: @@ -205,15 +227,47 @@ def transform(self, data: pd.DataFrame, "and will be skipped".format(column)) continue - new_column = TargetEncoder._clean_column_name(column) + data = self._transform_column(data, column) + + return data + + def _transform_column(self, data: pd.DataFrame, + column_name: str) -> pd.DataFrame: + """Replace (e.g. 
encode) categories of each column with its average + incidence which was computed when the fit method was called - # Convert dtype to float because when the original dtype - # is of type "category", the resulting dtype is also of type - # "category" - data[new_column] = (data[column].map(self._mapping[column]) - .astype("float")) + Parameters + ---------- + X : pd.DataFrame + data to encode + column_name : str + Name of the column in data to be encoded - new_columns.append(new_column) + Returns + ------- + pd.DataFrame + transformed data + """ + new_column = TargetEncoder._clean_column_name(column_name) + + # Convert dtype to float because when the original dtype + # is of type "category", the resulting dtype is also of type + # "category" + data[new_column] = (data[column_name].map(self._mapping[column_name]) + .astype("float")) + + # In case of categorical data, it could be that new categories will + # emerge which were not present in the train set, so this will result + # in missing values (which should be replaced) + if data[new_column].isnull().sum() > 0: + if self.imputation_strategy == "mean": + data[new_column].fillna(self._global_mean, inplace=True) + elif self.imputation_strategy == "min": + data[new_column].fillna(data[new_column].min(), + inplace=True) + elif self.imputation_strategy == "max": + data[new_column].fillna(data[new_column].max(), + inplace=True) return data diff --git a/tests/preprocessing/test_target_encoder.py b/tests/preprocessing/test_target_encoder.py index 25b5f3b..2441935 100644 --- a/tests/preprocessing/test_target_encoder.py +++ b/tests/preprocessing/test_target_encoder.py @@ -21,9 +21,13 @@ def test_target_encoder_attributes_to_dict(self): encoder._mapping["variable"] = mapping_data + encoder._global_mean = 0.5 + actual = encoder.attributes_to_dict() expected = {"weight": 0.0, + "imputation_strategy": "mean", + "_global_mean": 0.5, "_mapping": {"variable": { "negative": 0.333333, "neutral": 0.50000, @@ -58,6 +62,7 @@ def 
test_target_encoder_set_attributes_from_dict(self): encoder = TargetEncoder() data = {"weight": 0.0, + "_global_mean": 0.5, "_mapping": {"variable": { "negative": 0.333333, "neutral": 0.50000, @@ -85,8 +90,8 @@ def test_target_encoder_fit_column(self): 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) encoder = TargetEncoder() - actual = encoder._fit_column(X=df.variable, y=df.target, - global_mean=0.0) + encoder._global_mean = 0.0 + actual = encoder._fit_column(X=df.variable, y=df.target) expected = pd.Series(data=[0.333333, 0.50000, 0.666667], index=["negative", "neutral", "positive"]) @@ -103,11 +108,10 @@ def test_target_encoder_fit_column_global_mean(self): 'neutral'], 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - global_mean = df.target.sum() / df.target.count() # is 0.5 - encoder = TargetEncoder(weight=1) - actual = encoder._fit_column(X=df.variable, y=df.target, - global_mean=global_mean) + encoder._global_mean = df.target.sum() / df.target.count() # is 0.5 + + actual = encoder._fit_column(X=df.variable, y=df.target) expected = pd.Series(data=[0.375, 0.500, 0.625], index=["negative", "neutral", "positive"]) From f2c71edc9922da31f96e18f7b55919e8c6d346a8 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Tue, 5 May 2020 15:49:24 +0200 Subject: [PATCH 96/98] Add additional unittest for TargetEncoder --- cobra/evaluation/plotting_utils.py | 15 ++++++++++- tests/preprocessing/test_target_encoder.py | 29 +++++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py index 35a56e7..83090b2 100644 --- a/cobra/evaluation/plotting_utils.py +++ b/cobra/evaluation/plotting_utils.py @@ -7,7 +7,8 @@ def plot_univariate_predictor_quality(df_auc: pd.DataFrame, - dim: tuple=(12, 8)): + dim: tuple=(12, 8), + path: str=None): """Plot univariate quality of the predictors Parameters @@ -18,6 +19,8 @@ def plot_univariate_predictor_quality(df_auc: pd.DataFrame, criteria dim : tuple, optional 
tuple with width and lentgh of the plot + path : str, optional + path to store the figure """ df = (df_auc[df_auc["preselection"]] @@ -41,6 +44,9 @@ def plot_univariate_predictor_quality(df_auc: pd.DataFrame, # Remove white lines from the second axis ax.grid(False) + if path is not None: + plt.savefig(path, format="png", dpi=300, bbox_inches="tight") + plt.show() @@ -70,6 +76,7 @@ def plot_correlation_matrix(df_corr: pd.DataFrame, def plot_performance_curves(model_performance: pd.DataFrame, dim: tuple=(12, 8), + path: str=None, colors: dict={"train": "#0099bf", "selection": "#ff9500", "validation": "#8064a2"}): @@ -83,6 +90,8 @@ def plot_performance_curves(model_performance: pd.DataFrame, in the forward feature selection dim : tuple, optional tuple with width and lentgh of the plot + path : str, optional + path to store the figure """ highest_auc = np.round(max(max(model_performance['train_performance']), max(model_performance['selection_performance']), @@ -113,6 +122,10 @@ def plot_performance_curves(model_performance: pd.DataFrame, fig.suptitle('Performance curves - forward feature selection', fontsize=20) plt.ylabel('Model performance') + + if path is not None: + plt.savefig(path, format="png", dpi=300, bbox_inches="tight") + plt.show() diff --git a/tests/preprocessing/test_target_encoder.py b/tests/preprocessing/test_target_encoder.py index 2441935..b924bb6 100644 --- a/tests/preprocessing/test_target_encoder.py +++ b/tests/preprocessing/test_target_encoder.py @@ -90,7 +90,7 @@ def test_target_encoder_fit_column(self): 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) encoder = TargetEncoder() - encoder._global_mean = 0.0 + encoder._global_mean = 0.5 actual = encoder._fit_column(X=df.variable, y=df.target) expected = pd.Series(data=[0.333333, 0.50000, 0.666667], @@ -164,6 +164,33 @@ def test_target_encoder_transform(self): pd.testing.assert_frame_equal(actual, expected, check_less_precise=5) + def test_target_encoder_transform_new_category(self): + + df = 
pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + df_appended = df.append({"variable": "new", "target": 1}, + ignore_index=True) + + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + df_appended["variable"] = df_appended["variable"].astype("category") + + expected = df_appended.copy() + expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000, + 0.333333, 0.666667, 0.333333, 0.50000, + 0.50000, 0.50000, 0.333333] + + encoder = TargetEncoder(imputation_strategy="min") + encoder.fit(data=df, column_names=["variable"], target_column="target") + actual = encoder.transform(data=df_appended, column_names=["variable"]) + + pd.testing.assert_frame_equal(actual, expected, + check_less_precise=5) + # Tests for _clean_column_name def test_target_encoder_clean_column_name(self): From 3865b36bbf9961fdc5b8682e93aa4b5c48fa3b9a Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Thu, 18 Jun 2020 16:10:45 +0200 Subject: [PATCH 97/98] Add matthews correlation coeff as evaluation metric --- cobra/evaluation/evaluator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 979753b..819f2f5 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -11,6 +11,7 @@ from sklearn.metrics import roc_curve from sklearn.metrics import confusion_matrix from sklearn.metrics import roc_auc_score +from sklearn.metrics import matthews_corrcoef from sklearn.exceptions import NotFittedError @@ -109,6 +110,7 @@ def compute_scalar_metrics(y_true: np.ndarray, "precision": precision_score(y_true, y_pred_b), "recall": recall_score(y_true, y_pred_b), "F1": f1_score(y_true, y_pred_b, average=None)[1], + "matthews_corrcoef": matthews_corrcoef(y_true, y_pred_b), "lift at {}".format(lift_at): 
np.round(Evaluator ._compute_lift( y_true=y_true, From f980a7ddd47e20abf5987bcbbb83291ab885edd2 Mon Sep 17 00:00:00 2001 From: Matthias Roels Date: Fri, 26 Jun 2020 10:23:31 +0200 Subject: [PATCH 98/98] Bug fix in setup.py --- README.md | 8 +++++--- setup.py | 10 +++------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 2a517b7..e15b01b 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,8 @@ or using conda conda install requirements.txt ``` +__Note__: if you want to install cobra with e.g. pip, you don't have to install all of these requirements as these are automatically installed with cobra itself. + ### Installation As this package is an internal package that is not open-sourced, it is not available through `pip` or `conda`. As a result, the package has to be installed manually using the following steps: @@ -110,9 +112,9 @@ Once the preprocessing pipeline is fitted and applied to your data, we are ready from cobra.evaluation import generate_pig_tables pig_tables = generate_pig_tables(basetable[basetable["split"] == "selection"], - id_column_name, - target_column_name - preprocessed_predictors) + id_column_name=id_column_name, + target_column_name=target_column_name, + preprocessed_predictors=preprocessed_predictors) ``` Once these PIG tables are computed, we can start with the _univariate preselection_: diff --git a/setup.py b/setup.py index 5944027..e92bba1 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,11 @@ -from distutils.core import setup +from setuptools import setup, find_packages setup( name="cobra", version="1.0.0", description="Python Prediction's methodology for predictive analytics", - packages=["cobra", - "cobra.preprocessing", - "cobra.model_building", - "cobra.evaluation"], + packages=find_packages(include=['cobra', 'cobra.*']), url="https://github.com/PythonPredictions", install_requires=[ "numpy>=1.17.2", @@ -16,6 +13,5 @@ "scipy>=1.2.0", "scikit_learn>=0.22.1", "matplotlib>=3.0.2", - 
"seaborn>=0.9.0"], - python_requires=">=3.6", + "seaborn>=0.9.0"] )