Ml29 adv validation #42

Merged (6 commits) on Nov 14, 2019
21 changes: 14 additions & 7 deletions flows/flows.py
@@ -24,13 +24,14 @@
from prediction.model_predictor import model_prediction
from preprocessing.data_clean import drop_corr_columns, drop_const_columns
from preprocessing.data_explorer import explore_data
from preprocessing.data_science_help_functions import detect_id_target_problem
from preprocessing.data_transformer import encode_categorical_features, standard_scale_numeric_features, clean_categorical_features
from preprocessing.data_explorer import outliers_detector
from preprocessing.data_science_help_functions import detect_id_target_problem, adversarial_validation
from preprocessing.data_transformer import encode_categorical_features, standard_scale_numeric_features, \
clean_categorical_features
from preprocessing.data_type_detector import detect_columns_types_summary
from preprocessing.json_preprocessor import flat_json
from preprocessing.utils import check_if_target_columns_are_imbalanced
from preprocessing.utils import read_data
from preprocessing.data_explorer import outliers_detector
from training.training import model_training
from visualization.visualization import compare_statistics

@@ -185,10 +186,14 @@ def load_data(self, path: str, files_list: list, rows_amount: int = 0) -> Tuple[
print(term.green_on_black("average_score, is_outlier = flow.outliers_extractor"
"(dataframe_dict: dict, key_i: str)"))

_, _, possible_problems = detect_id_target_problem(dataframes_dict)
possible_ids, possible_target, possible_problems = detect_id_target_problem(dataframes_dict)

_ = check_if_target_columns_are_imbalanced(dataframes_dict, possible_problems, self.kl_div_threshold)

# Check adversarial validation
ignore_columns = list(set(possible_ids).union(set(possible_target)))
_ = adversarial_validation(dataframes_dict, ignore_columns)

try:
self.guidance(self.flow_steps[function_id])
except Exception as e:
@@ -234,7 +239,7 @@ def encode_categorical_feature(self, dataframes_dict: dict, ignore_columns: list

# clean_categorical_values in dataframes
if clean_categorical_values:
dataframes_dict = clean_categorical_features(dataframes_dict, string_columns, False)
dataframes_dict = clean_categorical_features(dataframes_dict, string_columns, True)

# Feature encoding
dataframes_dict_encoded = encode_categorical_features(dataframes_dict, string_columns, print_results)
@@ -251,8 +256,10 @@ def encode_categorical_feature(self, dataframes_dict: dict, ignore_columns: list

return dataframes_dict_encoded, self.columns_set

def scale_data(self, dataframes_dict: dict, ignore_columns: list,
_reference: Union[bool, str] = False) -> Tuple[Dict, List]:
def scale_data(self, dataframes_dict: dict,
ignore_columns: list,
_reference: Union[bool, str] = False,
) -> Tuple[Dict, List]:
""" Feature scaling

This function scales features that contain continuous numeric values.
189 changes: 188 additions & 1 deletion preprocessing/data_science_help_functions.py
@@ -1,7 +1,13 @@
import logging
import operator
import os
from typing import Tuple, List, Dict, Set, Union
from collections import Counter
from typing import Tuple, List, Dict, Set, Union

import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

logger = logging.getLogger(__name__)
formatting = (
@@ -189,3 +195,184 @@ def detect_id_target_problem(dataframes_dict: dict, threshold: float = 0.1) -> T
print(f"The possible possible_target are:\n {possible_target}")
print(f"The type of the problem that should be solved:\n {possible_problems}")
return possible_ids, possible_target, possible_problems


# result printing
def form_template(result, train_label, test_label, adversarial_validation_result, threshold) -> str:
""" Validation template former

This function put together all results of adversarial validation

:param Result related values: train_label, test_label, adversarial_validation_result, threshold

:return:
- A formed string
"""

return f"{result} significant difference between {train_label} and {test_label} datasets\n" \
f"in terms of feature distribution. Validation score: {adversarial_validation_result}, threshold: {threshold}"


def adversarial_validation(dataframe_dict: dict,
ignore_columns: list,
max_dataframe_length: int = 100000,
threshold: float = 0.7) -> Union[float, None]:
""" Make adversarial validation checking

Training a probabilistic classifier to distinguish train/test examples.
See more info here: http://fastml.com/adversarial-validation-part-one/
This function checks whether test and train data coming from the same data distribution.

:param dataframe_dict:
D, target, etc...)
:param float threshold: A value larger than 0 and less than 1. If the conclusion of calculation is greater than threshold - there is sugnificant difference between train and test data

:return:
- adversarial_validation_result: Adversarial validation score.
"""
print('Applying adversarial validation technique to check whether test and train data are coming from the same data distribution...')
# Check that exactly two dataframes were provided
if len(dataframe_dict) != 2:
# do nothing and return None
print("Can't apply adversarial_validation because count of dataframes is not equal to 2")
return
Contributor:
Thanks a lot for your great refactoring. You have a "return" statement here; I think you forgot to remove it. I tried to test your code in one of the flows, but it did not work. I would suggest testing your code in all flows and checking that it works without any problems. Even better, I would strongly recommend writing unit tests for the functions that you created.


# TODO: support > 2 dataframes ISSUE#44
# if 2 dataframes are provided, they will be considered as `train` and `test`

# TODO: replace to take_first_n ISSUE 45 from https://docs.python.org/3.8/library/itertools.html#itertools-recipes
label_iter = iter(dataframe_dict.keys())
train_label = next(label_iter)
test_label = next(label_iter)

train = dataframe_dict[train_label]
test = dataframe_dict[test_label]

df_joined = join_dataframe_for_validation(ignore_columns, max_dataframe_length, test, train)

# a new target
y = df_joined['istrain']
df_joined.drop('istrain', axis=1, inplace=True)

# train classifier
adversarial_validation_result, clf = get_adv_validation_score(df_joined, y)

# Process conclusion:
if adversarial_validation_result < threshold:
conclusion = 'There is no'
print(form_template(conclusion, train_label, test_label, adversarial_validation_result, threshold))
else:
conclusion = 'WARNING!!!! There is'
print(form_template(conclusion, train_label, test_label, adversarial_validation_result, threshold))
print(f"Top features are: {xgb_important_features(clf)}\n")
return adversarial_validation_result
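
For illustration, a minimal usage sketch of this check (the dataframes and column names here are hypothetical, not part of the PR). A score close to 0.5 means the classifier cannot tell train from test rows apart, while a score above the threshold signals a distribution shift:

# Hypothetical usage sketch, assuming the two dataframes share the same columns
import pandas as pd

train_df = pd.DataFrame({"id": [1, 2, 3, 4], "feature": [1.0, 2.0, 3.0, 4.0]})
test_df = pd.DataFrame({"id": [5, 6, 7, 8], "feature": [2.0, 3.0, 4.0, 5.0]})

score = adversarial_validation({"train": train_df, "test": test_df}, ignore_columns=["id"])
# score < 0.7 (default threshold): no significant difference is reported
# score >= 0.7: a warning is printed together with the top features driving the split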


def join_dataframe_for_validation(ignore_columns: list,
max_dataframe_length: int,
test: pd.DataFrame,
train: pd.DataFrame) -> pd.DataFrame:
"""Join dataframe for validation

:param DataFrame test: Test dataframe
:param DataFrame train: Train dataframe
:param list ignore_columns: List of columns to ignore (ID, target, etc...)
:param int max_dataframe_length: Maximum number of rows kept from each dataframe before joining
:return:
- df_joined: Joined dataframe.
"""

if len(ignore_columns) > 0:
columns_to_use = [x for x in list(test.columns) if x not in ignore_columns]
train = train[columns_to_use]
test = test[columns_to_use]

# limit each dataframe to max_dataframe_length rows
if len(train) > max_dataframe_length:
train = train.head(max_dataframe_length)
if len(test) > max_dataframe_length:
test = test.head(max_dataframe_length)

# add identifier and combine
train['istrain'] = 1
test['istrain'] = 0
df_joined = pd.concat([train, test], axis=0)
# convert non-numerical columns to integers
df_numeric = df_joined.select_dtypes(exclude=['object', 'datetime'])
df_obj = df_joined.select_dtypes(include=['object', 'datetime']).copy()
for c in df_obj:
df_obj[c] = pd.factorize(df_obj[c])[0]
df_joined = pd.concat([df_numeric, df_obj], axis=1)
return df_joined
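
For illustration (a tiny hypothetical input, not part of the PR), the joined frame carries an istrain flag and integer codes for object columns:

# Hypothetical sketch of the joined output
import pandas as pd

train = pd.DataFrame({"city": ["a", "b"], "price": [1.0, 2.0]})
test = pd.DataFrame({"city": ["b", "c"], "price": [3.0, 4.0]})
joined = join_dataframe_for_validation([], 100000, test, train)
# joined has columns ['price', 'istrain', 'city']: istrain is 1 for train rows and 0 for test rows,
# and 'city' holds the integer codes produced by pd.factorize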


def get_adv_validation_score(df_joined: pd.DataFrame,
y: pd.Series) -> Tuple[float, xgb.sklearn.XGBClassifier]:
""" Advisarial validation score estimator

Calculate advisarial validation score based on dataframes and XGBClassifier

:param DataFrame df_joined: Feature dataframe
:param Series y: Target series

:return:
- clf: Trained model
- mean of KFold validation results (ROC-AUC scores)
"""

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=44)
Contributor:
Here is a very rigid parameter definition. I don't know if this works for all cases. I would suggest printing out the xgb_params to the user, or informing him/her about the configuration and how to change it. I would suggest extracting this to an external JSON file, pointing to it, and loading it with json.load(filename).

Contributor Author:
Simplified it a bit.

Contributor:
Let us discuss this. I would still write it to an external file.

xgb_params = {}
clf = xgb.XGBClassifier(**xgb_params, seed=10)
results = []
logger.info('Adversarial validation checking:')
for fold, (train_index, test_index) in enumerate(skf.split(df_joined, y)):
Contributor:
I would suggest extracting this into a new function if possible.

fold_xtrain, fold_xval = df_joined.iloc[train_index], df_joined.iloc[test_index]
fold_ytrain, fold_yval = y.iloc[train_index], y.iloc[test_index]
clf.fit(fold_xtrain, fold_ytrain, eval_set=[(fold_xval, fold_yval)],
eval_metric='logloss', verbose=False, early_stopping_rounds=10)
fold_ypred = clf.predict_proba(fold_xval)[:, 1]
fold_score = roc_auc_score(fold_yval, fold_ypred)
results.append(fold_score)
logger.info(f"Fold: {fold + 1} shape: {fold_xtrain.shape} score: {fold_score}")

return round(sum(results)/len(results), 2), clf
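
A hedged sketch of the two review suggestions above, loading the XGBoost configuration from an external file and extracting the per-fold work into a helper. The file name xgb_params.json, the helper names, and their placement are hypothetical, not part of this PR:

# Sketch only: hypothetical helpers illustrating the reviewers' suggestions
import json

import xgboost as xgb
from sklearn.metrics import roc_auc_score


def load_xgb_params(path: str = "xgb_params.json") -> dict:
    # Read the classifier configuration from an external JSON file instead of hard-coding it
    with open(path) as config_file:
        return json.load(config_file)


def score_fold(clf, fold_xtrain, fold_ytrain, fold_xval, fold_yval) -> float:
    # Fit the classifier on one fold and return its ROC-AUC on the validation part
    clf.fit(fold_xtrain, fold_ytrain,
            eval_set=[(fold_xval, fold_yval)],
            eval_metric='logloss', verbose=False, early_stopping_rounds=10)
    fold_ypred = clf.predict_proba(fold_xval)[:, 1]
    return roc_auc_score(fold_yval, fold_ypred)

With these helpers, get_adv_validation_score could build the classifier as xgb.XGBClassifier(**load_xgb_params(), seed=10) and append score_fold(clf, fold_xtrain, fold_ytrain, fold_xval, fold_yval) inside the fold loop.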


def xgb_important_features(model: xgb.sklearn.XGBClassifier,
top_features: int = 5) -> str:
""" Important features extractor

Get the top most important features from a trained model

:param XGBClassifier model: A trained model
:param int top_features: Maximum number of features to return

:return:
- A string with a list of the most important features and their importances
"""

# get features
feat_imp = model.get_booster().get_score(importance_type='gain')

sorted_x = round_and_sort_dict(feat_imp)

return str(sorted_x[:top_features])


def round_and_sort_dict(feat_imp: dict) -> list:
""" Round and sort a dictionary

:param dict feat_imp: A dictionary

:return:
- A sorted list
"""

# round importances
for dict_key in feat_imp:
feat_imp[dict_key] = round(feat_imp[dict_key])

# sort by importances
sorted_x = sorted(feat_imp.items(), key=operator.itemgetter(1))
sorted_x.reverse()

return sorted_x
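
For illustration (hypothetical feature names), round_and_sort_dict turns a gain-importance mapping into a descending list of (feature, rounded importance) tuples:

# Hypothetical example of the rounding and sorting behaviour
feat_imp = {"age": 12.34, "income": 3.21, "height": 7.89}
print(round_and_sort_dict(feat_imp))
# [('age', 12), ('height', 8), ('income', 3)]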
18 changes: 11 additions & 7 deletions preprocessing/data_transformer.py
@@ -257,16 +257,20 @@ def clean_categorical_features(dataframe_dict: object,
return dataframe_dict

# if 2 dataframes are provided, they will be considered as `train` and `test`
else:
train = dataframe_dict[list(dataframe_dict.keys())[0]]
test = dataframe_dict[list(dataframe_dict.keys())[1]]
# TODO: replace to take_first_n ISSUE 45 from https://docs.python.org/3.8/library/itertools.html#itertools-recipes
label_iter = iter(dataframe_dict.keys())
train_label = next(label_iter)
test_label = next(label_iter)

train = dataframe_dict[train_label]
test = dataframe_dict[test_label]

# TODO implement support of > 2 dataframes (validation, etc) Here validation set (if any) will be merged to a training data.
# TODO: support > 2 dataframes ISSUE#44

# Checking cycle
if print_results:
print('*' * 10)
print(f"Checking the difference in categorical columns in {dataframe_dict.keys()[0]} and {dataframe_dict.keys()[1]} datasets: ")
print(f"Checking the difference in categorical columns in {train_label} and {test_label} datasets: ")

for column in columns_list:
if set(train[column].unique()) != set(test[column].unique()):
@@ -284,8 +288,8 @@ def encode_categorical_feature(self, dataframes_dict: dict, ignore_columns: list

# replace dataframes
if not inform_only:
dataframe_dict[list(dataframe_dict.keys())[0]] = train
dataframe_dict[list(dataframe_dict.keys())[1]] = test
dataframe_dict[train_label] = train
dataframe_dict[test_label] = test
print(f"The difference in categories was cleaned")
else:
print(f"The difference in categories was not cleaned")