Skip to content

Commit

Permalink
changed default_encoders to categorical_encoders
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-taylor committed Jun 4, 2024
1 parent 8f730ac commit dc3cd75
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 34 deletions.
17 changes: 9 additions & 8 deletions ISLP/models/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError


class Column(NamedTuple):

"""
Expand Down Expand Up @@ -125,10 +124,7 @@ def _get_column_info(X,
columns,
is_categorical,
is_ordinal,
default_encoders={
'ordinal': OrdinalEncoder(),
'categorical': OneHotEncoder()
}
categorical_encoders={}
):


Expand All @@ -151,12 +147,18 @@ def _get_column_info(X,
if is_categorical[i]:
if is_ordinal[i]:
Xcol = _get_column(col, X)
encoder = clone(default_encoders['ordinal'])
if col not in categorical_encoders:
encoder = clone(categorical_encoders['ordinal'])
else:
encoder = categorical_encoders[col]
encoder.fit(Xcol)
columns = ['{0}'.format(col)]
else:
Xcol = _get_column(col, X)
encoder = clone(default_encoders['categorical'])
if col not in categorical_encoders:
encoder = clone(categorical_encoders['categorical'])
else:
encoder = categorical_encoders[col]
cols = encoder.fit_transform(Xcol)
if hasattr(encoder, 'columns_'):
columns_ = encoder.columns_
Expand All @@ -181,7 +183,6 @@ def _get_column_info(X,
# https://github.com/scikit-learn/scikit-learn/blob/2beed55847ee70d363bdbfe14ee4401438fba057/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
# max_bins is ignored


def _check_categories(categorical_features, X):
"""Check and validate categorical features in X
Expand Down
33 changes: 21 additions & 12 deletions ISLP/models/model_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ def fit(self, X, y=None):
raise ValueError('method must be one of ["drop", "sum", None] or a callable' +
'that returns a contrast matrix and column names given the number' +
' of levels')

return self

def transform(self, X):
Expand All @@ -171,22 +172,23 @@ def transform(self, X):

class ModelSpec(TransformerMixin, BaseEstimator):

'''
Parameters
'''Parameters
----------
terms : sequence (optional)
Sequence of sets whose
elements are columns of *X* when fit.
For :py:class:`pd.DataFrame` these can be column
names.
intercept : bool (optional)
Include a column for intercept?
categorical_features : array-like of {bool, int} of shape (n_features)
or shape (n_categorical_features,), default=None.
Indicates the categorical features. Will be ignored if *X* is a :py:class:`pd.DataFrame`
or :py:class:`pd.Series`.
Expand All @@ -195,25 +197,31 @@ class ModelSpec(TransformerMixin, BaseEstimator):
- integer array-like : integer indices indicating categorical
features.
default_encoders : dict
Dictionary whose keys are elements of *terms* and values
are transforms to be applied to the associate columns in the model matrix
by running the *fit_transform* method when *fit* is called and overwriting
these values in the dictionary.
categorical_encoders : dict
Dictionary whose keys are elements of *terms* that represent
**categorical variables**. Its values are transforms to be
applied to the associated columns in the model matrix. When
*fit* is called, the *fit_transform* method of each transform
is run, overwriting these values in the dictionary.
'''

def __init__(self,
             terms=[],
             intercept=True,
             categorical_features=None,
             categorical_encoders={}
             ):
    """
    Store the constructor arguments and build the encoder lookup table.

    Parameters
    ----------
    terms : sequence (optional)
        Terms of the model; stored unchanged as ``self.terms``.
    intercept : bool (optional)
        Stored unchanged as ``self.intercept``.
    categorical_features : array-like, default=None
        Stored unchanged as ``self.categorical_features``.
    categorical_encoders : dict
        Encoders supplied by the caller; stored unchanged as
        ``self.categorical_encoders``. Its entries are layered on top
        of the built-in ``'ordinal'`` and ``'categorical'`` encoders to
        form ``self.categorical_encoders_``.

    Notes
    -----
    The mutable defaults (``[]`` and ``{}``) are kept for interface
    compatibility; this method only reads them and never mutates them.
    """

    self.intercept = intercept
    self.terms = terms
    self.categorical_features = categorical_features
    self.categorical_encoders = categorical_encoders

    # Built-in encoders, looked up under the keys 'ordinal' and
    # 'categorical'; caller-supplied entries are added or override them.
    self.categorical_encoders_ = {'ordinal': OrdinalEncoder(),
                                  'categorical': Contrast(method='drop')}

    # Pass the mapping positionally: ``update(**mapping)`` raises
    # TypeError for any non-string key (e.g. integer column indices),
    # whereas ``update(mapping)`` accepts every hashable key.
    self.categorical_encoders_.update(categorical_encoders)

def fit(self, X, y=None):

Expand Down Expand Up @@ -261,7 +269,8 @@ def fit(self, X, y=None):
self.columns_,
np.asarray(self.is_categorical_),
np.asarray(self.is_ordinal_),
default_encoders=self.default_encoders)
categorical_encoders=self.categorical_encoders_)

# include each column as a Feature
# so that their columns are built if needed

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ doc = ['Sphinx>=3.0']
[build-system]
requires = ["setuptools>=42",
"wheel",
"versioneer[toml]"
"versioneer[toml]",
"Sphinx>=1.0"
]
build-backend = "setuptools.build_meta"

Expand Down
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

from setuptools import setup


# Define extensions
EXTS = []

Expand Down
5 changes: 4 additions & 1 deletion tests/models/test_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from pandas.api.types import CategoricalDtype
from ISLP.models.columns import _get_column_info
from ISLP.models.model_spec import Contrast

def test_column_info():

Expand All @@ -15,5 +16,7 @@ def test_column_info():
print(_get_column_info(df,
df.columns,
[False]*4+[True],
[False]*5))
[False]*5,
categorical_encoders={'categorical':Contrast(method='drop')}))


22 changes: 11 additions & 11 deletions tests/models/test_model_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def test_ndarray():
X = rng.standard_normal((50,5))

M = ModelSpec(terms=[1, (3,2)],
default_encoders=default_encoders)
categorical_encoders=default_encoders)
M.fit(X)
MX = M.transform(X)

Expand All @@ -51,7 +51,7 @@ def test_dataframe1():
D = pd.DataFrame(X, columns=['A','B','C','D','E'])

M = ModelSpec(terms=['A','D',('D','E')],
default_encoders=default_encoders)
categorical_encoders=default_encoders)
clone(M)
MX = np.asarray(M.fit_transform(D))

Expand All @@ -66,7 +66,7 @@ def test_dataframe2():
D = pd.DataFrame(X, columns=['V','B','A','D','E'])

M = ModelSpec(terms=['A', 'D', 'B', ('D','E'), 'V'],
default_encoders=default_encoders)
categorical_encoders=default_encoders)
clone(M)

MX = M.fit_transform(D)
Expand All @@ -83,7 +83,7 @@ def test_dataframe3():
D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True))

M = ModelSpec(terms=['A', 'E', ('D','E')],
default_encoders=default_encoders)
categorical_encoders=default_encoders)
MX = np.asarray(M.fit_transform(D))
M2 = clone(M)

Expand All @@ -105,7 +105,7 @@ def test_dataframe4():
D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True))

M = ModelSpec(terms=['A', 'E', ('D','E'), 'D'],
default_encoders=default_encoders)
categorical_encoders=default_encoders)
MX = np.asarray(M.fit_transform(D))

DE = pd.get_dummies(D['E'])
Expand All @@ -129,7 +129,7 @@ def test_dataframe5():
D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True))

M = ModelSpec(terms=['A', 'E', ('D','E')],
default_encoders=default_encoders)
categorical_encoders=default_encoders)
MX = np.asarray(M.fit_transform(D))

# check they agree on copy of dataframe
Expand All @@ -148,7 +148,7 @@ def test_dataframe6():
D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True))

M = ModelSpec(terms=['A',W,(W,'D',)],
default_encoders=default_encoders)
categorical_encoders=default_encoders)
MX = M.fit_transform(D)

MX = np.asarray(MX)
Expand All @@ -162,7 +162,7 @@ def test_dataframe7():
D['Eee'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True))

M = ModelSpec(terms=D.columns.drop(['Y','C']),
default_encoders=default_encoders)
categorical_encoders=default_encoders)
MX = M.fit_transform(D)
print(MX.columns)
MX = np.asarray(MX)
Expand All @@ -179,7 +179,7 @@ def test_dataframe8():
# raises a ValueError because poly will have been already fit -- need new instance of Poly
W = Feature(('A',), 'poly(A)', poly)
M = ModelSpec(terms=list(D.columns.drop(['Y','C'])) + [(W,'E')],
default_encoders=default_encoders)
categorical_encoders=default_encoders)
MX = M.fit_transform(D)

print(MX.columns)
Expand All @@ -198,7 +198,7 @@ def test_dataframe9():
W = Feature(('A',), 'poly(A)', poly)
U = Feature(('B',), 'poly(B)', clone(poly))
M = ModelSpec(terms=list(D.columns.drop(['Y','C'])) + [W,U],
default_encoders=default_encoders)
categorical_encoders=default_encoders)
MX = M.fit_transform(D)

print(MX.columns)
Expand All @@ -215,7 +215,7 @@ def test_dataframe10():
D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True))

M = ModelSpec(terms=['A', 'E', 'C', W, (W, 'D',), U],
default_encoders=default_encoders)
categorical_encoders=default_encoders)
MX = M.fit_transform(D)
print(MX.columns)
MX = np.asarray(MX)
Expand Down

0 comments on commit dc3cd75

Please sign in to comment.