Merge pull request #66 from artefactory/car_dataset

ADD: Car Preferences Dataset + few fixes & improvements
artefactory · Apr 17, 2024 · 2ce0c24 · 2ce0c24
2 parents 274ca89 + f93920e
commit 2ce0c24
Show file tree

Hide file tree

Showing 8 changed files with 130 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -44,8 +44,9 @@ If you are new to choice modelling, you can check this [resource](https://www.pu
 - Ready-To-Use datasets:
   - [SwissMetro](./choice_learn/datasets/data/swissmetro.csv.gz) [[2]](#citation)
   - [ModeCanada](./choice_learn/datasets/data/ModeCanada.csv.gz) [[3]](#citation)
-  - The [Train](./choice_learn/datasets/data/train_data.csv.gz) [[5]](#citation)
+  - The [Train](./choice_learn/datasets/data/train_data.csv.gz) dataset [[5]](#citation)
   - The [Heating](./choice_learn/datasets/data/heating_data.csv.gz) & [Electricity](./choice_learn/datasets/data/electricity.csv.gz) datasets from Kenneth Train described [here](https://rdrr.io/cran/mlogit/man/Electricity.html) and [here](https://rdrr.io/cran/mlogit/man/Heating.html)
+  - [Stated car preferences](./choice_learn/datasets/data/car.csv.gz) [[9]](#citation)
   - The [TaFeng](./choice_learn/datasets/data/ta_feng.csv.zip) dataset from [Kaggle](https://www.kaggle.com/datasets/chiranjivdas09/ta-feng-grocery-dataset)
   - The ICDM-2013 [Expedia](./choice_learn/datasets/expedia.py) dataset from [Kaggle](https://www.kaggle.com/c/expedia-personalized-sort) [[6]](#citation)
 
@@ -179,7 +180,8 @@ The use of this software is under the MIT license, with no limitation of usage,
 [5] [Estimation of Travel Choice Models with Randomly Distributed Values of Time](https://ideas.repec.org/p/fth/lavaen/9303.html), Ben-Akiva, M.; Bolduc, D.; Bradley, M. (1993)\
 [6] [Personalize Expedia Hotel Searches - ICDM 2013](https://www.kaggle.com/c/expedia-personalized-sort), Ben Hamner, A.; Friedman, D.; SSA_Expedia. (2013)\
 [7] [A Neural-embedded Discrete Choice Model: Learning Taste Representation with Strengthened Interpretability](https://arxiv.org/abs/2002.00922), Han, Y.; Calara Oereuran F.; Ben-Akiva, M.; Zegras, C. (2020)\
-[8] [A branch-and-cut algorithm for the latent-class logit assortment problem](https://www.sciencedirect.com/science/article/pii/S0166218X12001072), Méndez-Díaz, I.; Miranda-Bront, J. J.; Vulcano, G.; Zabala, P. (2014)
+[8] [A branch-and-cut algorithm for the latent-class logit assortment problem](https://www.sciencedirect.com/science/article/pii/S0166218X12001072), Méndez-Díaz, I.; Miranda-Bront, J. J.; Vulcano, G.; Zabala, P. (2014)\
+[9] [Stated Preferences for Car Choice in Mixed MNL models for discrete response](https://www.jstor.org/stable/2678603), McFadden, D.; Train, K. (2000)
 
 ### Code and Repositories
 - [1][RUMnet](https://github.com/antoinedesir/rumnet)

diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py
@@ -789,7 +789,7 @@ def from_single_wide_df(
         choice_column: str, optional
             Name of the column containing the choices, default is "choice"
         choice_format: str, optional
-            How choice is indicated in df, either "items_name" or "items_index",
+            How choice is indicated in df, either "items_id" or "items_index",
             default is "items_id"
 
         Returns
@@ -807,7 +807,7 @@ def from_single_wide_df(
                 "You cannot give both available_items_prefix and\
                     available_items_suffix."
             )
-        if choice_format not in ["items_index", "items_name"]:
+        if choice_format not in ["items_index", "items_id"]:
             logging.warning("choice_format not undersood, defaulting to 'items_index'")
 
         if shared_features_columns is not None:
@@ -881,9 +881,8 @@ def from_single_wide_df(
             if items_id is None:
                 raise ValueError("items_id must be given to use choice_format='items_id'")
             items_id = np.array(items_id)
-
             choices = np.squeeze([np.where(items_id == c)[0] for c in choices])
-            if choices.shape[0] == 0:
+            if choices.size == 0:
                 raise ValueError("No choice found in the items_id list")
 
         return ChoiceDataset(

diff --git a/choice_learn/datasets/__init__.py b/choice_learn/datasets/__init__.py
@@ -1,6 +1,13 @@
 """Init file for datasets module."""
 
-from .base import load_electricity, load_heating, load_modecanada, load_swissmetro, load_train
+from .base import (
+    load_car_preferences,
+    load_electricity,
+    load_heating,
+    load_modecanada,
+    load_swissmetro,
+    load_train,
+)
 from .expedia import load_expedia
 from .tafeng import load_tafeng
 
@@ -12,4 +19,5 @@
     "load_train",
     "load_tafeng",
     "load_expedia",
+    "load_car_preferences",
 ]
diff --git a/choice_learn/datasets/base.py b/choice_learn/datasets/base.py
@@ -515,7 +515,7 @@ def load_modecanada(
         Whether to split features by type in different dataframes, by default False.
     to_wide : bool, optional
         Whether to return the dataset in wide format,
-        by default False (an thus retuned in long format).
+        by default False (and thus returned in long format).
     preprocessing : str, optional
         Preprocessing to apply to the dataset, by default None
 
@@ -704,7 +704,7 @@ def load_heating(
         Whether to return the description, by default False.
     to_wide : bool, optional
         Whether to return the dataset in wide format,
-        by default False (an thus retuned in long format).
+        by default False (and thus returned in long format).
 
     Returns
     -------
@@ -765,7 +765,7 @@ def load_electricity(
         by default False.
     to_wide : bool, optional
         Whether to return the dataset in wide format,
-        by default False (an thus retuned in long format).
+        by default False (and thus returned in long format).
     return_desc : bool, optional
         Whether to return the description, by default False.
 
@@ -836,7 +836,7 @@ def load_train(
         by default False.
     to_wide : bool, optional
         Whether to return the dataset in wide format,
-        by default False (an thus retuned in long format).
+        by default False (and thus returned in long format).
     return_desc : bool, optional
         Whether to return the description, by default False.
 
@@ -873,3 +873,68 @@ def load_train(
         choices_column="choice",
         choice_format="items_id",
     )
+
+
+def load_car_preferences(
+    as_frame=False,
+    return_desc=False,
+):
+    """Load and return the Car dataset from McFadden, Daniel and Kenneth Train (2000).
+
+    “Mixed MNL models for discrete response”, Journal of Applied Econometrics, 15(5), 447–470.
+
+    Parameters
+    ----------
+    as_frame : bool, optional
+        Whether to return the dataset as pd.DataFrame. If not, returned as ChoiceDataset,
+        by default False.
+    return_desc : bool, optional
+        Whether to return the description instead of the data, by default False.
+
+    Returns
+    -------
+    str, pd.DataFrame or ChoiceDataset
+        Description if return_desc, raw dataframe if as_frame, else the loaded Car dataset.
+    """
+    desc = "Stated Preferences for Car Choice.\n"
+    desc += """McFadden, Daniel and Kenneth Train (2000)
+    “Mixed MNL models for discrete response”, Journal of Applied Econometrics, 15(5), 447–470."""
+
+    # The description does not depend on the data: return it before reading the file.
+    if return_desc:
+        return desc
+
+    full_path = get_path("car.csv.gz", module=DATA_MODULE)
+    cars_df = pd.read_csv(full_path)
+
+    if as_frame:
+        return cars_df
+
+    # Keep only the last character of each raw choice value so that it can be
+    # matched against the items_id defined below.
+    cars_df["choice"] = cars_df["choice"].str[-1]
+    shared_features = ["college", "hsg2", "coml5"]
+    items_features = [
+        "type",
+        "fuel",
+        "price",
+        "range",
+        "acc",
+        "speed",
+        "pollution",
+        "size",
+        "space",
+        "cost",
+        "station",
+    ]
+    items_id = [f"{i}" for i in range(1, 7)]
+
+    return ChoiceDataset.from_single_wide_df(
+        df=cars_df,
+        items_id=items_id,
+        shared_features_columns=shared_features,
+        items_features_prefixes=items_features,
+        delimiter="",
+        choices_column="choice",
+        choice_format="items_id",
+    )
diff --git a/choice_learn/datasets/data/car.csv.gz b/choice_learn/datasets/data/car.csv.gz
diff --git a/docs/index.md b/docs/index.md
@@ -10,6 +10,40 @@ Choice-Learn uses NumPy and pandas as data backend engines and TensorFlow for mo
 
 In this documentation you will find examples to be quickly getting started as well as some more in-depth example.
 
+## What's in there?
+
+Here is a quick overview of the different functionalities offered by Choice-Learn. Further details are given in the rest of the documentation.
+
+### Data
+- [Custom data handling](./references/data/references_choice_dataset.md) for choice datasets with possible memory usage optimizations
+- Some open-source, ready-to-use datasets are included within the datasets module:
+  - [SwissMetro](./references/dataset/references_base.md)
+  - [ModeCanada](./references/dataset/references_base.md)
+  - The [Train](./references/dataset/references_base.md) dataset
+  - The [Heating](./references/dataset/references_base.md) & [Electricity](./references/dataset/references_base.md) datasets from Kenneth Train
+  - [Stated car preferences](./references/dataset/references_base.md)
+  - The [TaFeng](./references/dataset/references_tafeng.md) dataset from Kaggle
+  - The ICDM-2013 [Expedia](./references/dataset/references_expedia.md) dataset from Kaggle
+
+### Models
+- [Custom modelling](./notebooks/introduction/4_model_customization.md)
+- Ready to be used models:
+    - *Linear Models:*
+        - [Multinomial Logit](./references/models/references_simple_mnl.md)
+        - [Conditional Logit](./references/models/references_clogit.md)
+        - [Latent class MNL](./references/models/references_latent_class_mnl.md)
+    - *Non-Linear Models:*
+        - [RUMnet](./references/models/references_rumnet.md)
+        - [TasteNet](./references/models/references_tastenet.md)
+
+### Tools
+- [Assortment Optimization](./references/toolbox/references_assortment_optimizer.md)
+- [Assortment and Pricing](./references/toolbox/references_assortment_optimizer.md)
+
+### Examples
+
+Various examples are provided in the How-To section; give it a look!
+
 ## Introduction - Discrete Choice Modelling
 
 Discrete choice models aim at explaining or predicting choices over a set of alternatives. Well known use-cases include analyzing people's choice of mean of transport or products purchases in stores.

diff --git a/docs/references/models/references_tastenet.md b/docs/references/models/references_tastenet.md
diff --git a/tests/unit_tests/test_os_datasets.py b/tests/unit_tests/test_os_datasets.py
@@ -3,6 +3,7 @@
 
 from choice_learn.data import ChoiceDataset
 from choice_learn.datasets import (
+    load_car_preferences,
     load_electricity,
     load_heating,
     load_modecanada,
@@ -72,3 +73,13 @@ def test_heating_loader():
 
     heating = load_heating()
     assert isinstance(heating, ChoiceDataset)
+
+
+def test_car_preferences_loader():
+    """Check both return types of the car preferences loader."""
+    raw_frame = load_car_preferences(as_frame=True)
+    assert isinstance(raw_frame, pd.DataFrame)
+    assert (4654, 71) == raw_frame.shape
+
+    choice_dataset = load_car_preferences()
+    assert isinstance(choice_dataset, ChoiceDataset)