imputation for categorical values.

erdogant · Oct 19, 2024 · fcbad3e · fcbad3e
1 parent 3091a36
commit fcbad3e
Show file tree

Hide file tree

Showing 2 changed files with 113 additions and 8 deletions.
diff --git a/bnlearn/examples.py b/bnlearn/examples.py
@@ -1,3 +1,37 @@
+# %% Issue 81
+# It implements MICE using the function mice_imputer function that performs Multiple Imputation by Chained Equations (MICE) on numeric columns while handling string/categorical columns.
+# The code is based on the already implemented code by Erdogan on imputation using knn.
+
+# Key features include:
+
+# Supports MICE imputation for numeric columns.
+# String/categorical columns are encoded before imputation and restored post-imputation.
+# Includes options to specify the imputation estimator, number of iterations (max_iter), and verbosity level for logging.
+# Numeric columns are auto-identified and converted for imputation where necessary.
+# This enhancement improves missing data handling and supports mixed-type datasets.
+
+# Key changes include:
+
+# Created a new file impute.py for imputation related functions
+# Moved the existing code for imputation and renamed it to knn_imputer
+# Implemented the MICE function
+# Updated the impute.rst file to include examples of both types of imputation
+
+import bnlearn as bn
+import pandas as pd
+import numpy as np
+from impute import knn_imputer, mice_imputer
+
+# Load the dataset
+# df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original', delim_whitespace=True, header=None, names=['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name'])
+# imputed_df1 = bn.knn_imputer(df, n_neighbors=3, weights="distance", string_columns=['car name'])
+# imputed_df2 = bn.knn_imputer(df, n_neighbors=3, weights="distance")
+
+df = pd.DataFrame({'age': [25, np.nan, 27], 'income': [50000, 60000, np.nan], 'city': ['New York', np.nan, 'Los Angeles']})
+knn_imputer(df, n_neighbors=3, weights='distance', string_columns='city')
+mice_imputer(df, max_iter=5, string_columns='city')
+
+
 # %%
 import bnlearn as bn
 # Load example mixed dataset

diff --git a/bnlearn/impute.py b/bnlearn/impute.py
@@ -1,9 +1,11 @@
 import numpy as np
 import pandas as pd
 from sklearn.impute import KNNImputer
+from sklearn.neighbors import NearestNeighbors
 from sklearn.experimental import enable_iterative_imputer
 from sklearn.impute import IterativeImputer
 
+
 # %% Impute
 def knn_imputer(df, n_neighbors=2, weights="uniform", metric='nan_euclidean', string_columns=None, verbose=3):
     """Impute missing values.
@@ -44,16 +46,18 @@ def knn_imputer(df, n_neighbors=2, weights="uniform", metric='nan_euclidean', st
 
     Examples
     --------
+    >>> from impute import knn_imputer, mice_imputer
     >>> df = pd.DataFrame({
     ...    'age': [25, np.nan, 27],
     ...    'income': [50000, 60000, np.nan],
     ...    'city': ['New York', np.nan, 'Los Angeles']
     ... })
-    >>> bn.knn_imputer(df, n_neighbors=3, weights='distance', string_columns='city')
+    >>> knn_imputer(df, n_neighbors=3, weights='distance', string_columns='city')
          age   income          city
     0  25.0  50000.0      New York
-    1  26.0  60000.0      New York
+    1  26.0  60000.0      Los Angeles
     2  27.0  55000.0  Los Angeles
+
     """
     # Convert string columns to categorical and then encode them
     if string_columns is not None:
@@ -91,8 +95,13 @@ def knn_imputer(df, n_neighbors=2, weights="uniform", metric='nan_euclidean', st
         for col in string_columns:
             df_imputed[col] = df[col]
 
+    # Impute categorical columns with the most frequent value
+    df_imputed = impute_catagorical_knn(df_imputed, string_columns, numeric_cols, n_neighbors)
+
+    # Return
     return df_imputed
 
+
 def mice_imputer(df, max_iter=10, estimator=None, string_columns=None, verbose=3):
     """Impute missing values using Multiple Imputation by Chained Equations (MICE).
 
@@ -128,16 +137,18 @@ def mice_imputer(df, max_iter=10, estimator=None, string_columns=None, verbose=3
 
     Examples
     --------
+    >>> from impute import knn_imputer, mice_imputer
     >>> df = pd.DataFrame({
     ...    'age': [25, np.nan, 27],
     ...    'income': [50000, 60000, np.nan],
     ...    'city': ['New York', np.nan, 'Los Angeles']
     ... })
-    >>> bn.mice_imputer(df, max_iter=5, string_columns='city')
+    >>> mice_imputer(df, max_iter=5, string_columns='city')
          age   income          city
     0  25.0  50000.0      New York
-    1  26.2  60000.0      New York
+    1  26.2  60000.0      Los Angeles
     2  27.0  55123.7  Los Angeles
+
     """
     # Convert string columns to categorical and then encode them
     if string_columns is not None:
@@ -170,9 +181,69 @@ def mice_imputer(df, max_iter=10, estimator=None, string_columns=None, verbose=3
     # Create a new DataFrame for imputed numeric values
     df_imputed = pd.DataFrame(imputed_values, columns=numeric_cols)
 
-    # Add the original string columns back to the imputed DataFrame if any
+    # Impute categorical columns with the most frequent value
+    df_imputed = impute_catagorical_knn(df_imputed, string_columns, numeric_cols, 3)
+
+    # Return
+    return df_imputed
+
+
+def impute_catagorical_knn(df, string_columns, numeric_cols, n_neighbors):
+    """
+    Impute missing values in categorical columns using K-Nearest Neighbors (KNN) based on numeric columns.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        The DataFrame containing both categorical and numeric columns.
+    string_columns : list of str
+        List of column names in `df` that are categorical and need imputation for missing values.
+    numeric_cols : list of str
+        List of column names in `df` that are numeric and will be used for distance calculation in KNN.
+    n_neighbors : int
+        The number of nearest neighbors to consider for imputation.
+
+    Returns
+    -------
+    pandas.DataFrame
+        The input DataFrame `df` with missing values in the categorical columns imputed.
+
+    Notes
+    -----
+    - The function uses K-Nearest Neighbors to find the nearest data points based on the numeric columns
+      and imputes missing values in the categorical columns with the most frequent value (mode) among the neighbors.
+    - The function ensures that missing categorical values are replaced by the mode of their nearest neighbors
+      based on numeric data.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> df = pd.DataFrame({
+    ...     'age': [25, 25, 27, 29],
+    ...     'income': [50000, 60000, 65000, 7000],
+    ...     'city': ['New York', np.nan, 'Los Angeles', 'San Francisco']
+    ... })
+    >>> impute_catagorical_knn(df, string_columns=['city'], numeric_cols=['age', 'income'], n_neighbors=3)
+          age  income           city
+    0   25   50000        New York
+    1   25   60000        New York
+    2   27   65000     Los Angeles
+    3   29    7000  San Francisco
+
+    """
+    # Impute categorical columns with the most frequent value
     if string_columns is not None:
+        model = NearestNeighbors(n_neighbors=np.minimum(n_neighbors + 1, df.shape[0])).fit(df[numeric_cols])
         for col in string_columns:
-            df_imputed[col] = df[col]
-
-    return df_imputed
+            # Get all missing indexes
+            missing_index = np.where(df[col].isna())[0]
+            # For each missing catagory, find its nearest neighbors and impute the mode
+            for row in missing_index:
+                distances, indices = model.kneighbors([df[numeric_cols].loc[row]])
+                most_frequent = df[col][indices[0]].mode()[0]
+                # Impute
+                df[col][missing_index] = most_frequent
+
+    # Return
+    return df