From 3ebd2394d1352e6eae4ec8dd42be7979ee6354c7 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 18 Jan 2024 10:44:46 -0800 Subject: [PATCH] ensuring boolean columns get cast to float for statsmodels --- ISLP/models/model_spec.py | 12 ++++++++---- tests/models/test_boolean_columns.py | 23 +++++++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 tests/models/test_boolean_columns.py diff --git a/ISLP/models/model_spec.py b/ISLP/models/model_spec.py index 07a8b88..983a85d 100644 --- a/ISLP/models/model_spec.py +++ b/ISLP/models/model_spec.py @@ -107,7 +107,6 @@ def fit(self, X, y=None): cats = self.encoder_.categories_[0] column_names = [str(n) for n in cats] - if isinstance(X, pd.DataFrame): # expecting a column, we take .iloc[:,0] X = X.iloc[:,0] @@ -635,18 +634,23 @@ def build_model(column_info, if isinstance(X, (pd.Series, pd.DataFrame)): df = pd.concat(dfs, axis=1) df.index = X.index - return df else: - return np.column_stack(dfs) + return np.column_stack(dfs).astype(float) else: # return a 0 design zero = np.zeros(X.shape[0]) if isinstance(X, (pd.Series, pd.DataFrame)): df = pd.DataFrame({'zero': zero}) df.index = X.index - return df else: return zero + # if we reach here, we will be returning a DataFrame + + for col in df.columns: + if df[col].dtype == bool: + df[col] = df[col].astype(float) + return df + def derived_feature(variables, encoder=None, name=None, use_transform=True): """ Create a Feature, optionally diff --git a/tests/models/test_boolean_columns.py b/tests/models/test_boolean_columns.py new file mode 100644 index 0000000..7b5a429 --- /dev/null +++ b/tests/models/test_boolean_columns.py @@ -0,0 +1,23 @@ +import pandas as pd +import statsmodels.api as sm +import numpy as np +from itertools import combinations + +from ISLP.models import ModelSpec as MS + +rng = np.random.default_rng(0) + +df = pd.DataFrame({'A':rng.standard_normal(10), + 'B':np.array([1,2,3,2,1,1,1,3,2,1], int), + 'C':np.array([True,False,False,True,True]*2, bool), + 'D':rng.standard_normal(10)}) +Y = rng.standard_normal(10) + +def test_all(): + + for i in range(1, 5): + for comb in combinations(['A','B','C','D'], i): + + X = MS(comb).fit_transform(df) + sm.OLS(Y, X).fit() +