Ml29 adv validation #42

Merged (6 commits) on Nov 14, 2019
21 changes: 14 additions & 7 deletions flows/flows.py
@@ -24,13 +24,14 @@
from prediction.model_predictor import model_prediction
from preprocessing.data_clean import drop_corr_columns, drop_const_columns
from preprocessing.data_explorer import explore_data
from preprocessing.data_science_help_functions import detect_id_target_problem
from preprocessing.data_transformer import encode_categorical_features, standard_scale_numeric_features, clean_categorical_features
from preprocessing.data_explorer import outliers_detector
from preprocessing.data_science_help_functions import detect_id_target_problem, adversarial_validation
from preprocessing.data_transformer import encode_categorical_features, standard_scale_numeric_features, \
clean_categorical_features
from preprocessing.data_type_detector import detect_columns_types_summary
from preprocessing.json_preprocessor import flat_json
from preprocessing.utils import check_if_target_columns_are_imbalanced
from preprocessing.utils import read_data
from preprocessing.data_explorer import outliers_detector
from training.training import model_training
from visualization.visualization import compare_statistics

@@ -185,10 +186,14 @@ def load_data(self, path: str, files_list: list, rows_amount: int = 0) -> Tuple[
print(term.green_on_black("average_score, is_outlier = flow.outliers_extractor"
"(dataframe_dict: dict, key_i: str)"))

_, _, possible_problems = detect_id_target_problem(dataframes_dict)
possible_ids, possible_target, possible_problems = detect_id_target_problem(dataframes_dict)

_ = check_if_target_columns_are_imbalanced(dataframes_dict, possible_problems, self.kl_div_threshold)

# Check adversarial validation
ignore_columns = list(set(possible_ids).union(set(possible_target)))
_ = adversarial_validation(dataframes_dict, ignore_columns)

try:
self.guidance(self.flow_steps[function_id])
except Exception as e:
@@ -234,7 +239,7 @@ def encode_categorical_feature(self, dataframes_dict: dict, ignore_columns: list

# clean_categorical_values in dataframes
if clean_categorical_values:
dataframes_dict = clean_categorical_features(dataframes_dict, string_columns, False)
dataframes_dict = clean_categorical_features(dataframes_dict, string_columns, True)

# Feature encoding
dataframes_dict_encoded = encode_categorical_features(dataframes_dict, string_columns, print_results)
@@ -251,8 +256,10 @@ def encode_categorical_feature(self, dataframes_dict: dict, ignore_columns: list

return dataframes_dict_encoded, self.columns_set

def scale_data(self, dataframes_dict: dict, ignore_columns: list,
_reference: Union[bool, str] = False) -> Tuple[Dict, List]:
def scale_data(self, dataframes_dict: dict,
ignore_columns: list,
_reference: Union[bool, str] = False,
) -> Tuple[Dict, List]:
""" Feature scaling

This function scales features that contain continuous numeric values.
189 changes: 188 additions & 1 deletion preprocessing/data_science_help_functions.py
@@ -1,7 +1,13 @@
import logging
import operator
import os
from typing import Tuple, List, Dict, Set, Union
from collections import Counter
from typing import Tuple, List, Dict, Set, Union

import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

logger = logging.getLogger(__name__)
formatting = (
@@ -189,3 +195,184 @@ def detect_id_target_problem(dataframes_dict: dict, threshold: float = 0.1) -> T
print(f"The possible possible_target are:\n {possible_target}")
print(f"The type of the problem that should be solved:\n {possible_problems}")
return possible_ids, possible_target, possible_problems


# result printing
def form_template(result, train_label, test_label, adversarial_validation_result, threshold) -> str:
""" Validation template former

This function put together all results of adversarial validation

:param Result related values: train_label, test_label, adversarial_validation_result, threshold

:return:
- A formed string
"""

return f"{result} significant difference between {train_label} and {test_label} datasets\n" \
f"in terms of feature distribution. Validation score: {adversarial_validation_result}, threshold: {threshold}"


def adversarial_validation(dataframe_dict: dict,
ignore_columns: list,
max_dataframe_length: int = 100000,
threshold: float = 0.7) -> Union[float, None]:
""" Make adversarial validation checking

Training a probabilistic classifier to distinguish train/test examples.
See more info here: http://fastml.com/adversarial-validation-part-one/
This function checks whether test and train data coming from the same data distribution.

:param dataframe_dict:
D, target, etc...)
:param float threshold: A value larger than 0 and less than 1. If the conclusion of calculation is greater than threshold - there is sugnificant difference between train and test data

:return:
- adversarial_validation_result: Adversarial validation score.
"""
print('Applying adversarial validation technique to check whether test and train data are coming from the same data distribution...')
# Check that exactly two dataframes were provided
if len(dataframe_dict) != 2:
# do nothing and return None
print("Can't apply adversarial_validation because count of dataframes is not equal to 2")
return
Contributor:
Thanks a lot for your great refactoring. You have a "return" statement here; I think you forgot to remove it. I tried to test your code in one of the flows, but it did not work. I would suggest testing your code in all flows and checking that it works without any problems. Even better, I would strongly recommend writing unit tests for the functions that you created.


# TODO: support > 2 dataframes ISSUE#44
# if 2 dataframes are provided, they will be considered as `train` and `test`

# TODO: replace to take_first_n ISSUE 45 from https://docs.python.org/3.8/library/itertools.html#itertools-recipes
label_iter = iter(dataframe_dict.keys())
train_label = next(label_iter)
test_label = next(label_iter)

train = dataframe_dict[train_label]
test = dataframe_dict[test_label]

df_joined = join_dataframe_for_validation(ignore_columns, max_dataframe_length, test, train)

# a new target
y = df_joined['istrain']
df_joined.drop('istrain', axis=1, inplace=True)

# train classifier
adversarial_validation_result, clf = get_adv_validation_score(df_joined, y)

# Process conclusion:
if adversarial_validation_result < threshold:
conclusion = 'There is no'
print(form_template(conclusion, train_label, test_label, adversarial_validation_result, threshold))
else:
conclusion = 'WARNING!!!! There is'
print(form_template(conclusion, train_label, test_label, adversarial_validation_result, threshold))
print(f"Top features are: {xgb_important_features(clf)}\n")
return adversarial_validation_result
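
For illustration, a minimal usage sketch of this check (the dataframes and column names here are hypothetical, not part of the PR). A score close to 0.5 means the classifier cannot tell train from test rows apart, while a score above the threshold signals a distribution shift:

# Hypothetical usage sketch, assuming the two dataframes share the same columns
import pandas as pd

train_df = pd.DataFrame({"id": [1, 2, 3, 4], "feature": [1.0, 2.0, 3.0, 4.0]})
test_df = pd.DataFrame({"id": [5, 6, 7, 8], "feature": [2.0, 3.0, 4.0, 5.0]})

score = adversarial_validation({"train": train_df, "test": test_df}, ignore_columns=["id"])
# score < 0.7 (default threshold): no significant difference is reported
# score >= 0.7: a warning is printed together with the top features driving the split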


def join_dataframe_for_validation(ignore_columns: list,
max_dataframe_length: int,
test: pd.DataFrame,
train: pd.DataFrame) -> pd.DataFrame:
"""Join dataframe for validation

:param DataFrame test: Test dataframe
:param DataFrame train: Train dataframe
:param list ignore_columns: List of columns to ignore (ID, target, etc...)
:param int max_dataframe_length: Maximum number of rows kept from each dataframe before joining
:return:
- df_joined: Joined dataframe.
"""

if len(ignore_columns) > 0:
columns_to_use = [x for x in list(test.columns) if x not in ignore_columns]
train = train[columns_to_use]
test = test[columns_to_use]

# limit each dataframe to max_dataframe_length rows
if len(train) > max_dataframe_length:
train = train.head(max_dataframe_length)
if len(test) > max_dataframe_length:
test = test.head(max_dataframe_length)

# add identifier and combine
train['istrain'] = 1
test['istrain'] = 0
df_joined = pd.concat([train, test], axis=0)
# convert non-numerical columns to integers
df_numeric = df_joined.select_dtypes(exclude=['object', 'datetime'])
df_obj = df_joined.select_dtypes(include=['object', 'datetime']).copy()
for c in df_obj:
df_obj[c] = pd.factorize(df_obj[c])[0]
df_joined = pd.concat([df_numeric, df_obj], axis=1)
return df_joined
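
For illustration (a tiny hypothetical input, not part of the PR), the joined frame carries an istrain flag and integer codes for object columns:

# Hypothetical sketch of the joined output
import pandas as pd

train = pd.DataFrame({"city": ["a", "b"], "price": [1.0, 2.0]})
test = pd.DataFrame({"city": ["b", "c"], "price": [3.0, 4.0]})
joined = join_dataframe_for_validation([], 100000, test, train)
# joined has columns ['price', 'istrain', 'city']: istrain is 1 for train rows and 0 for test rows,
# and 'city' holds the integer codes produced by pd.factorize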


def get_adv_validation_score(df_joined: pd.DataFrame,
y: pd.Series) -> Tuple[float, xgb.sklearn.XGBClassifier]:
""" Advisarial validation score estimator

Calculate advisarial validation score based on dataframes and XGBClassifier

:param DataFrame df_joined: Feature dataframe
:param Series y: Target series

:return:
- clf: Trained model
- mean of KFold validation results (ROC-AUC scores)
"""

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=44)
Contributor:
Here is a very rigid parameter definition. I don't know if this works for all cases. I would suggest printing out the xgb_params to the user, or informing him/her about the configuration and how to change it. I would suggest extracting this to an external JSON file, pointing to it, and loading it with json.load(filename).

Contributor Author:
Simplified it a bit.

Contributor:
Let us discuss this. I would still write it to an external file.

xgb_params = {}
clf = xgb.XGBClassifier(**xgb_params, seed=10)
results = []
logger.info('Adversarial validation checking:')
for fold, (train_index, test_index) in enumerate(skf.split(df_joined, y)):
Contributor:
I would suggest extracting this into a new function if possible.

fold_xtrain, fold_xval = df_joined.iloc[train_index], df_joined.iloc[test_index]
fold_ytrain, fold_yval = y.iloc[train_index], y.iloc[test_index]
clf.fit(fold_xtrain, fold_ytrain, eval_set=[(fold_xval, fold_yval)],
eval_metric='logloss', verbose=False, early_stopping_rounds=10)
fold_ypred = clf.predict_proba(fold_xval)[:, 1]
fold_score = roc_auc_score(fold_yval, fold_ypred)
results.append(fold_score)
logger.info(f"Fold: {fold + 1} shape: {fold_xtrain.shape} score: {fold_score}")

return round(sum(results)/len(results), 2), clf
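
A hedged sketch of the two review suggestions above, loading the XGBoost configuration from an external file and extracting the per-fold work into a helper. The file name xgb_params.json, the helper names, and their placement are hypothetical, not part of this PR:

# Sketch only: hypothetical helpers illustrating the reviewers' suggestions
import json

import xgboost as xgb
from sklearn.metrics import roc_auc_score


def load_xgb_params(path: str = "xgb_params.json") -> dict:
    # Read the classifier configuration from an external JSON file instead of hard-coding it
    with open(path) as config_file:
        return json.load(config_file)


def score_fold(clf, fold_xtrain, fold_ytrain, fold_xval, fold_yval) -> float:
    # Fit the classifier on one fold and return its ROC-AUC on the validation part
    clf.fit(fold_xtrain, fold_ytrain,
            eval_set=[(fold_xval, fold_yval)],
            eval_metric='logloss', verbose=False, early_stopping_rounds=10)
    fold_ypred = clf.predict_proba(fold_xval)[:, 1]
    return roc_auc_score(fold_yval, fold_ypred)

With these helpers, get_adv_validation_score could build the classifier as xgb.XGBClassifier(**load_xgb_params(), seed=10) and append score_fold(clf, fold_xtrain, fold_ytrain, fold_xval, fold_yval) inside the fold loop.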


def xgb_important_features(model: xgb.sklearn.XGBClassifier,
top_features: int = 5) -> str:
""" Important features extractor

Get the top most important features from a trained model

:param XGBClassifier model: A trained model
:param int top_features: Maximum number of features to return

:return:
- A string with a list of the most important features and their importances
"""

# get features
feat_imp = model.get_booster().get_score(importance_type='gain')

sorted_x = round_and_sort_dict(feat_imp)

return str(sorted_x[:top_features])


def round_and_sort_dict(feat_imp: dict) -> list:
""" Round and sort a dictionary

:param dict feat_imp: A dictionary

:return:
- A sorted list
"""

# round importances
for dict_key in feat_imp:
feat_imp[dict_key] = round(feat_imp[dict_key])

# sort by importances
sorted_x = sorted(feat_imp.items(), key=operator.itemgetter(1))
sorted_x.reverse()

return sorted_x
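
For illustration (hypothetical feature names), round_and_sort_dict turns a gain-importance mapping into a descending list of (feature, rounded importance) tuples:

# Hypothetical example of the rounding and sorting behaviour
feat_imp = {"age": 12.34, "income": 3.21, "height": 7.89}
print(round_and_sort_dict(feat_imp))
# [('age', 12), ('height', 8), ('income', 3)]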
18 changes: 11 additions & 7 deletions preprocessing/data_transformer.py
@@ -257,16 +257,20 @@ def clean_categorical_features(dataframe_dict: object,
return dataframe_dict

# if 2 dataframes are provided, they will be considered as `train` and `test`
else:
train = dataframe_dict[list(dataframe_dict.keys())[0]]
test = dataframe_dict[list(dataframe_dict.keys())[1]]
# TODO: replace to take_first_n ISSUE 45 from https://docs.python.org/3.8/library/itertools.html#itertools-recipes
label_iter = iter(dataframe_dict.keys())
train_label = next(label_iter)
test_label = next(label_iter)

train = dataframe_dict[train_label]
test = dataframe_dict[test_label]

# TODO implement support of > 2 dataframes (validation, etc) Here validation set (if any) will be merged to a training data.
# TODO: support > 2 dataframes ISSUE#44

# Checking cycle
if print_results:
print('*' * 10)
print(f"Checking the difference in categorical columns in {dataframe_dict.keys()[0]} and {dataframe_dict.keys()[1]} datasets: ")
print(f"Checking the difference in categorical columns in {train_label} and {test_label} datasets: ")

for column in columns_list:
if set(train[column].unique()) != set(test[column].unique()):
@@ -284,8 +288,8 @@ def encode_categorical_feature(self, dataframes_dict: dict, ignore_columns: list

# replace dataframes
if not inform_only:
dataframe_dict[list(dataframe_dict.keys())[0]] = train
dataframe_dict[list(dataframe_dict.keys())[1]] = test
dataframe_dict[train_label] = train
dataframe_dict[test_label] = test
print(f"The difference in categories was cleaned")
else:
print(f"The difference in categories was not cleaned")