
Commit

Merge pull request #38 from artefactory/corrections
ADD: Descriptions & cleaning of dataset loaders
ADD: Expedia dataset
ADD: More flexible structure to create ChoiceDataset from a wide DataFrame
VincentAuriau authored Mar 11, 2024
2 parents f16897d + 306187f commit f8e5ddd
Showing 12 changed files with 657 additions and 216 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -136,3 +136,6 @@ secrets/*

# Mac OS
.DS_Store

# Specific data
choice_learn/datasets/data/expedia.csv
10 changes: 5 additions & 5 deletions README.md
@@ -43,17 +43,17 @@ If you are new to choice modelling, you can check this [resource](https://www.pu
### Data
- Generic dataset handling with the ChoiceDataset class [[Example]](https://github.com/artefactory/choice-learn-private/blob/main/notebooks/choice_learn_introduction_data.ipynb)
- Ready-To-Use datasets:
- [SwissMetro](./choice_learn/datasets/data/swissmetro.csv.gz) from Bierlaire et al. (2001) [[2]](#citation)
- [ModeCanada](./choice_learn/datasets/data/ModeCanada.csv.gz) from Koppelman et al. (1993) [[3]](#citation)
- The [Train](./choice_learn/datasets/data/train_data.csv.gz) dataset from Ben Akiva et al. (1993) [[5]](#citation)
- [SwissMetro](./choice_learn/datasets/data/swissmetro.csv.gz) [[2]](#citation)
- [ModeCanada](./choice_learn/datasets/data/ModeCanada.csv.gz) [[3]](#citation)
- The [Train](./choice_learn/datasets/data/train_data.csv.gz) dataset [[5]](#citation)
- The [Heating](./choice_learn/datasets/data/heating_data.csv.gz) & [Electricity](./choice_learn/datasets/data/electricity.csv.gz) datasets from Kenneth Train described [here](https://rdrr.io/cran/mlogit/man/Electricity.html) and [here](https://rdrr.io/cran/mlogit/man/Heating.html)
- The [TaFeng](./choice_learn/datasets/data/ta_feng.csv.zip) dataset from [Kaggle](https://www.kaggle.com/datasets/chiranjivdas09/ta-feng-grocery-dataset)

### Models
- Ready-to-use models:
- Conditional MultiNomialLogit, Train, K.; McFadden, D.; Ben-Akiva, M. (1987) [[4]](#citation)[[Example]](https://github.com/artefactory/choice-learn-private/blob/main/notebooks/choice_learn_introduction_clogit.ipynb)
- Conditional MultiNomialLogit [[4]](#citation)[[Example]](https://github.com/artefactory/choice-learn-private/blob/main/notebooks/choice_learn_introduction_clogit.ipynb)
- Latent Class MultiNomialLogit [[Example]](https://github.com/artefactory/choice-learn-private/blob/main/notebooks/latent_class_model.ipynb)
- RUMnet, Aouad A.; Désir A. (2022) [[1]](#citation)[[Example]](https://github.com/artefactory/choice-learn-private/blob/main/notebooks/rumnet_example.ipynb)
- RUMnet [[1]](#citation)[[Example]](https://github.com/artefactory/choice-learn-private/blob/main/notebooks/rumnet_example.ipynb)
- Models to be implemented:
- Nested MultiNomialLogit
- [TasteNet](https://arxiv.org/abs/2002.00922)
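A minimal quick-start sketch of how the pieces listed above fit together. The loader name comes from choice_learn/datasets/__init__.py further down this diff; the no-argument call and the attribute access are assumptions, not confirmed signatures.

```python
# Hypothetical quick-start; load_swissmetro is exported by
# choice_learn.datasets (see the __init__.py diff below), but the
# bare call and the .choices attribute are assumptions.
from choice_learn.datasets import load_swissmetro

dataset = load_swissmetro()  # a ChoiceDataset, per the README's Data section
print(dataset.choices[:5])   # assumed attribute holding the recorded choices
```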
112 changes: 102 additions & 10 deletions choice_learn/data/choice_dataset.py
@@ -374,6 +374,14 @@ def _build_features_by_ids(self):
raise ValueError(
"No features_names given, match with fiven features_by_ids impossible."
)
if (
self.fixed_items_features_names == (None,)
and self.contexts_features_names == (None,)
and self.contexts_items_features_names == (None,)
):
raise ValueError(
"No features_names given, match with fiven features_by_ids impossible."
)

fixed_items_features_map = {}
contexts_features_map = {}
@@ -388,6 +396,7 @@ def _build_features_by_ids(self):
index_dict = fixed_items_features_map.get(i, {})
index_dict[j] = feature_by_id
fixed_items_features_map[i] = index_dict
print("Feature by ID found:", feature_by_id.name)

if self.contexts_features_names is not None:
for i, feature in enumerate(self.contexts_features_names):
@@ -398,6 +407,7 @@
index_dict = contexts_features_map.get(i, {})
index_dict[j] = feature_by_id
contexts_features_map[i] = index_dict
print("Feature by ID found:", feature_by_id.name)

if self.contexts_items_features_names is not None:
for i, feature in enumerate(self.contexts_items_features_names):
@@ -409,10 +419,12 @@
index_dict[k] = feature_by_id
contexts_items_features_map[i] = index_dict
# contexts_items_features_map.append(((i, k), feature_by_id))
print("Feature by ID found:", feature_by_id.name)

if len(fixed_items_features_map) + len(contexts_features_map) + sum(
[len(c.keys()) for c in contexts_items_features_map.values()]
) != len(self.features_by_ids):
num_fif_maps = sum([len(val) for val in fixed_items_features_map.values()])
num_cf_maps = sum([len(val) for val in contexts_features_map.values()])
num_cif_maps = sum([len(val) for val in contexts_items_features_map.values()])
if num_fif_maps + num_cf_maps + num_cif_maps != len(self.features_by_ids):
raise ValueError("Some features_by_ids were not matched with features_names.")

return fixed_items_features_map, contexts_features_map, contexts_items_features_map
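To make the revised consistency check concrete, here is a standalone sketch of the counting it performs, with toy dictionaries standing in for the real feature maps (all names illustrative):

```python
# Each map has the shape {feature_position: {index: feature_by_id}}.
# The updated check sums the matched entries across all three maps
# and compares the total with the number of features_by_ids given.
fixed_items_features_map = {0: {1: "ids_feature_a"}}
contexts_features_map = {2: {0: "ids_feature_b", 3: "ids_feature_c"}}
contexts_items_features_map = {}

num_matched = (
    sum(len(val) for val in fixed_items_features_map.values())
    + sum(len(val) for val in contexts_features_map.values())
    + sum(len(val) for val in contexts_items_features_map.values())
)
assert num_matched == 3  # == len(self.features_by_ids) in the method
```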
@@ -718,9 +730,13 @@ def from_single_wide_df(
df,
items_id,
fixed_items_suffixes=None,
fixed_items_prefixes=None,
contexts_features_columns=None,
contexts_items_features_suffixes=None,
contexts_items_features_prefixes=None,
contexts_items_availabilities_suffix=None,
contexts_items_availabilities_prefix=None,
delimiter="_",
choices_column="choice",
choice_mode="items_id",
):
@@ -734,12 +750,21 @@
List of items ids
fixed_items_suffixes : list
Suffixes of the columns of the dataframe that are item features, default is None
fixed_items_prefixes : list
Prefixes of the columns of the dataframe that are item features, default is None
        contexts_features_columns : list
            Columns of the dataframe that are contexts features, default is None
        contexts_items_features_suffixes : list
            Suffixes of the columns of the dataframe that are context-item features, default is None
        contexts_items_features_prefixes : list
            Prefixes of the columns of the dataframe that are context-item features, default is None
contexts_items_availabilities_prefix: list
Prefix of the columns of the dataframe that are context-item availabilities,
contexts_items_availabilities_suffix: list
Suffixes of the columns of the dataframe that are context-item availabilities,
Suffix of the columns of the dataframe that are context-item availabilities,
delimiter: str, optional
Delimiter used to separate the given prefix or suffixes and the features names,
default is "_"
        choices_column: str, optional
Name of the column containing the choices, default is "choice"
choice_mode: str, optional
@@ -751,12 +776,46 @@
ChoiceDataset
corresponding ChoiceDataset
"""
if fixed_items_prefixes is not None and fixed_items_suffixes is not None:
raise ValueError("You cannot give both fixed_items_prefixes and fixed_items_suffixes")
if (
contexts_items_features_prefixes is not None
and contexts_items_features_suffixes is not None
):
raise ValueError(
"You cannot give both contexts_items_features_prefixes and\
contexts_items_features_suffixes"
)
if (
contexts_items_availabilities_prefix is not None
and contexts_items_availabilities_suffix is not None
):
raise ValueError(
"You cannot give both contexts_items_availabilities_prefix and\
contexts_items_availabilities_suffix"
)

if fixed_items_suffixes is not None:
fixed_items_features = {"item_id": []}
for item in items_id:
fixed_items_features["item_id"].append(item)
for feature in fixed_items_suffixes:
feature_value = df[f"{feature}_{item}"].unique()
feature_value = df[f"{item}{delimiter}{feature}"].unique()
if len(feature_value) > 1:
raise ValueError(
f"More than one value for feature {feature} for item {item}"
)
                    fixed_items_features[feature] = fixed_items_features.get(
                        feature, []
                    ) + [feature_value]
fixed_items_features = pd.DataFrame(fixed_items_features)
elif fixed_items_prefixes is not None:
fixed_items_features = {"item_id": []}
for item in items_id:
fixed_items_features["item_id"].append(item)
                for feature in fixed_items_prefixes:
feature_value = df[f"{feature}{delimiter}{item}"].unique()
if len(feature_value) > 1:
raise ValueError(
f"More than one value for feature {feature} for item {item}"
@@ -777,7 +836,24 @@
if contexts_items_features_suffixes is not None:
contexts_items_features = []
for item in items_id:
columns = [f"{item}_{feature}" for feature in contexts_items_features_suffixes]
columns = [
f"{item}{delimiter}{feature}" for feature in contexts_items_features_suffixes
]
for col in columns:
if col not in df.columns:
print(
f"Column {col} was not in DataFrame,\
dummy creation of the feature with zeros."
)
df[col] = 0
contexts_items_features.append(df[columns].to_numpy())
contexts_items_features = np.stack(contexts_items_features, axis=1)
elif contexts_items_features_prefixes is not None:
contexts_items_features = []
for item in items_id:
columns = [
f"{feature}{delimiter}{item}" for feature in contexts_items_features_suffixes
]
for col in columns:
if col not in df.columns:
print(
@@ -795,19 +871,35 @@
if not len(contexts_items_availabilities_suffix) == len(items_id):
raise ValueError(
"You have given a list of columns for availabilities."
"We consider that it is one for each item but lenght do not match"
"We consider that it is one for each item however lenghts do not match"
)
print("You have given a list of columns for availabilities.")
print("We consider that it is one for each item")
print("Each column will be matched to an item, given their order")
contexts_items_availabilities = df[contexts_items_availabilities_suffix].to_numpy()
else:
columns = [f"{item}_{contexts_items_availabilities_suffix}" for item in items_id]
columns = [
f"{item}{delimiter}{contexts_items_availabilities_suffix}" for item in items_id
]
contexts_items_availabilities = df[columns].to_numpy()
elif contexts_items_availabilities_prefix is not None:
if isinstance(contexts_items_availabilities_prefix, list):
if not len(contexts_items_availabilities_prefix) == len(items_id):
raise ValueError(
"You have given a list of columns for availabilities."
"We consider that it is one for each item however lenghts do not match"
)
print("You have given a list of columns for availabilities.")
print("Each column will be matched to an item, given their order")
contexts_items_availabilities = df[contexts_items_availabilities_prefix].to_numpy()
else:
columns = [
f"{contexts_items_availabilities_prefix}{delimiter}{item}" for item in items_id
]
contexts_items_availabilities = df[columns].to_numpy()
else:
contexts_items_availabilities = None

choices = df[choices_column].to_numpy()
print("choice", choices)
if choice_mode == "items_id":
if items_id is None:
raise ValueError("items_id must be given to use choice_mode 'items_id'")
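A hedged usage sketch of the new prefix/suffix/delimiter handling in from_single_wide_df. The DataFrame, the column names, and the import path are invented for illustration; the keyword names follow the signature shown above.

```python
import pandas as pd

from choice_learn.data import ChoiceDataset  # assumed import path

# Wide toy data: one row per choice, with "<item>_<feature>" columns
# matching suffix mode under the default "_" delimiter.
df = pd.DataFrame(
    {
        "car_price": [1.0, 1.2],
        "bus_price": [0.5, 0.6],
        "car_avail": [1, 1],
        "bus_avail": [1, 1],
        "choice": ["car", "bus"],
    }
)

dataset = ChoiceDataset.from_single_wide_df(
    df,
    items_id=["car", "bus"],
    contexts_items_features_suffixes=["price"],    # -> car_price, bus_price
    contexts_items_availabilities_suffix="avail",  # -> car_avail, bus_avail
    delimiter="_",
    choices_column="choice",
    choice_mode="items_id",
)
```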
14 changes: 7 additions & 7 deletions choice_learn/data/indexer.py
@@ -129,12 +129,12 @@ def __getitem__(self, sequence_keys):
np.ndarray
OneHot reconstructed vectors corresponding to sequence_keys
"""
if isinstance(sequence_keys, list):
if isinstance(sequence_keys, list) or isinstance(sequence_keys, np.ndarray):
# Construction of the OneHot vector from the index of the 1 value
one_hot = np.zeros((len(sequence_keys), self.shape[1]))
for i, j in enumerate(sequence_keys):
one_hot[i, self.storage.storage[j]] = 1
return one_hot.astype(self.dtype)
one_hot = []
for j in sequence_keys:
one_hot.append(self[j])
return np.stack(one_hot).astype(self.dtype)
if isinstance(sequence_keys, slice):
            return self[list(range(*sequence_keys.indices(self.shape[0])))]
# else:
@@ -375,7 +375,7 @@ def __getitem__(self, choices_indexes):
].batch[contexts_items_features[tuple_index][:, :, feature_index]]
)
feat_ind_min = feature_index + 1
unstacked_feat.append(contexts_features[tuple_index][:, :, feat_ind_min:])
unstacked_feat.append(contexts_items_features[tuple_index][:, :, feat_ind_min:])
mapped_features.append(np.concatenate(unstacked_feat, axis=2))

contexts_items_features = mapped_features
@@ -559,4 +559,4 @@ def __getitem__(self, choices_indexes):
choice,
)
print(f"Type{type(choices_indexes)} not handled")
raise NotImplementedError
        raise NotImplementedError(f"Type {type(choices_indexes)} not handled")
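A self-contained sketch of the behaviour the OneHot indexer change enables, with a plain dict standing in for the storage object (all names invented for illustration):

```python
import numpy as np

# Mirrors the updated __getitem__: list and np.ndarray keys are both
# accepted and yield stacked one-hot rows.
storage = {"mon": 0, "tue": 1, "wed": 2}  # key -> column index of the 1
width = 3

def get_one_hot(key, dtype=np.int32):
    if isinstance(key, (list, np.ndarray)):
        return np.stack([get_one_hot(k) for k in key]).astype(dtype)
    row = np.zeros(width)
    row[storage[key]] = 1
    return row.astype(dtype)

print(get_one_hot(np.array(["mon", "wed"])))
# [[1 0 0]
#  [0 0 1]]
```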
14 changes: 12 additions & 2 deletions choice_learn/datasets/__init__.py
@@ -1,5 +1,15 @@
"""Init file for datasets module."""

from .base import load_electricity, load_heating, load_modecanada, load_swissmetro
from .base import load_electricity, load_heating, load_modecanada, load_swissmetro, load_train
from .examples import load_tafeng
from .expedia import load_expedia

__all__ = ["load_modecanada", "load_swissmetro", "load_electricity", "load_heating"]
__all__ = [
"load_modecanada",
"load_swissmetro",
"load_electricity",
"load_heating",
"load_train",
"load_tafeng",
"load_expedia",
]
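With the expanded __all__, the new loaders import directly from the package. A short sketch; the return types are assumptions, and load_expedia presumably needs the csv from the .gitignore entry above (choice_learn/datasets/data/expedia.csv) to be present locally.

```python
from choice_learn.datasets import load_expedia, load_tafeng, load_train

train_dataset = load_train()    # assumed to return a ChoiceDataset
tafeng_dataset = load_tafeng()  # same assumption
```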