Skip to content

Commit

Permalink
imputation for categorical values.
Browse files Browse the repository at this point in the history
  • Loading branch information
erdogant committed Oct 19, 2024
1 parent 3091a36 commit fcbad3e
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 8 deletions.
34 changes: 34 additions & 0 deletions bnlearn/examples.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,37 @@
# %% Issue 81
# It implements MICE using the function mice_imputer function that performs Multiple Imputation by Chained Equations (MICE) on numeric columns while handling string/categorical columns.
# The code is based on the already implemented code by Erdogan on imputation using knn.

# Key features include:

# Supports MICE imputation for numeric columns.
# String/categorical columns are encoded before imputation and restored post-imputation.
# Includes options to specify the imputation estimator, number of iterations (max_iter), and verbosity level for logging.
# Numeric columns are auto-identified and converted for imputation where necessary.
# This enhancement improves missing data handling and supports mixed-type datasets.

# Key changes include:

# Created a new file impute.py for imputation related functions
# Moved the existing code for imputation and renamed it to knn_imputer
# Implemented the MICE function
# Updated the impute.rst file to include examples of both types of imputation

import bnlearn as bn
import pandas as pd
import numpy as np
from impute import knn_imputer, mice_imputer

# Load the dataset
# df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original', delim_whitespace=True, header=None, names=['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name'])
# imputed_df1 = bn.knn_imputer(df, n_neighbors=3, weights="distance", string_columns=['car name'])
# imputed_df2 = bn.knn_imputer(df, n_neighbors=3, weights="distance")

df = pd.DataFrame({'age': [25, np.nan, 27], 'income': [50000, 60000, np.nan], 'city': ['New York', np.nan, 'Los Angeles']})
knn_imputer(df, n_neighbors=3, weights='distance', string_columns='city')
mice_imputer(df, max_iter=5, string_columns='city')


# %%
import bnlearn as bn
# Load example mixed dataset
Expand Down
87 changes: 79 additions & 8 deletions bnlearn/impute.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


# %% Impute
def knn_imputer(df, n_neighbors=2, weights="uniform", metric='nan_euclidean', string_columns=None, verbose=3):
"""Impute missing values.
Expand Down Expand Up @@ -44,16 +46,18 @@ def knn_imputer(df, n_neighbors=2, weights="uniform", metric='nan_euclidean', st
Examples
--------
>>> from impute import knn_imputer, mice_imputer
>>> df = pd.DataFrame({
... 'age': [25, np.nan, 27],
... 'income': [50000, 60000, np.nan],
... 'city': ['New York', np.nan, 'Los Angeles']
... })
>>> bn.knn_imputer(df, n_neighbors=3, weights='distance', string_columns='city')
>>> knn_imputer(df, n_neighbors=3, weights='distance', string_columns='city')
age income city
0 25.0 50000.0 New York
1 26.0 60000.0 New York
1 26.0 60000.0 Los Angeles
2 27.0 55000.0 Los Angeles
"""
# Convert string columns to categorical and then encode them
if string_columns is not None:
Expand Down Expand Up @@ -91,8 +95,13 @@ def knn_imputer(df, n_neighbors=2, weights="uniform", metric='nan_euclidean', st
for col in string_columns:
df_imputed[col] = df[col]

# Impute categorical columns with the most frequent value
df_imputed = impute_catagorical_knn(df_imputed, string_columns, numeric_cols, n_neighbors)

# Return
return df_imputed


def mice_imputer(df, max_iter=10, estimator=None, string_columns=None, verbose=3):
"""Impute missing values using Multiple Imputation by Chained Equations (MICE).
Expand Down Expand Up @@ -128,16 +137,18 @@ def mice_imputer(df, max_iter=10, estimator=None, string_columns=None, verbose=3
Examples
--------
>>> from impute import knn_imputer, mice_imputer
>>> df = pd.DataFrame({
... 'age': [25, np.nan, 27],
... 'income': [50000, 60000, np.nan],
... 'city': ['New York', np.nan, 'Los Angeles']
... })
>>> bn.mice_imputer(df, max_iter=5, string_columns='city')
>>> mice_imputer(df, max_iter=5, string_columns='city')
age income city
0 25.0 50000.0 New York
1 26.2 60000.0 New York
1 26.2 60000.0 Los Angeles
2 27.0 55123.7 Los Angeles
"""
# Convert string columns to categorical and then encode them
if string_columns is not None:
Expand Down Expand Up @@ -170,9 +181,69 @@ def mice_imputer(df, max_iter=10, estimator=None, string_columns=None, verbose=3
# Create a new DataFrame for imputed numeric values
df_imputed = pd.DataFrame(imputed_values, columns=numeric_cols)

# Add the original string columns back to the imputed DataFrame if any
# Impute categorical columns with the most frequent value
df_imputed = impute_catagorical_knn(df_imputed, string_columns, numeric_cols, 3)

# Return
return df_imputed


def impute_catagorical_knn(df, string_columns, numeric_cols, n_neighbors):
"""
Impute missing values in categorical columns using K-Nearest Neighbors (KNN) based on numeric columns.
Parameters
----------
df : pandas.DataFrame
The DataFrame containing both categorical and numeric columns.
string_columns : list of str
List of column names in `df` that are categorical and need imputation for missing values.
numeric_cols : list of str
List of column names in `df` that are numeric and will be used for distance calculation in KNN.
n_neighbors : int
The number of nearest neighbors to consider for imputation.
Returns
-------
pandas.DataFrame
The input DataFrame `df` with missing values in the categorical columns imputed.
Notes
-----
- The function uses K-Nearest Neighbors to find the nearest data points based on the numeric columns
and imputes missing values in the categorical columns with the most frequent value (mode) among the neighbors.
- The function ensures that missing categorical values are replaced by the mode of their nearest neighbors
based on numeric data.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> df = pd.DataFrame({
... 'age': [25, 25, 27, 29],
... 'income': [50000, 60000, 65000, 7000],
... 'city': ['New York', np.nan, 'Los Angeles', 'San Francisco']
... })
>>> impute_catagorical_knn(df, string_columns=['city'], numeric_cols=['age', 'income'], n_neighbors=3)
age income city
0 25 50000 New York
1 25 60000 New York
2 27 65000 Los Angeles
3 29 7000 San Francisco
"""
# Impute categorical columns with the most frequent value
if string_columns is not None:
model = NearestNeighbors(n_neighbors=np.minimum(n_neighbors + 1, df.shape[0])).fit(df[numeric_cols])
for col in string_columns:
df_imputed[col] = df[col]

return df_imputed
# Get all missing indexes
missing_index = np.where(df[col].isna())[0]
# For each missing catagory, find its nearest neighbors and impute the mode
for row in missing_index:
distances, indices = model.kneighbors([df[numeric_cols].loc[row]])
most_frequent = df[col][indices[0]].mode()[0]
# Impute
df[col][missing_index] = most_frequent

# Return
return df

0 comments on commit fcbad3e

Please sign in to comment.