Merge pull request #39 from artefactory/corrections
ADD: tests on dataset loading when possible
ADD: complete requirements.txt
ADD: example of RUMnet fitting on Expedia
FIX: GPURUMnet, wrong tests on (None,), CD.from_wide_df
ENH: diverse minor style enhancements (README, docstrings, etc.)
VincentAuriau authored Mar 14, 2024
2 parents f8e5ddd + 73601b5 commit 8864d44
Showing 13 changed files with 416 additions and 81 deletions.
28 changes: 21 additions & 7 deletions README.md
@@ -48,6 +48,7 @@ If you are new to choice modelling, you can check this [resource](https://www.pu
- The [Train](./choice_learn/datasets/data/train_data.csv.gz) [[5]](#citation)
- The [Heating](./choice_learn/datasets/data/heating_data.csv.gz) & [Electricity](./choice_learn/datasets/data/electricity.csv.gz) datasets from Kenneth Train described [here](https://rdrr.io/cran/mlogit/man/Electricity.html) and [here](https://rdrr.io/cran/mlogit/man/Heating.html)
- The [TaFeng](./choice_learn/datasets/data/ta_feng.csv.zip) dataset from [Kaggle](https://www.kaggle.com/datasets/chiranjivdas09/ta-feng-grocery-dataset)
- The IDCM-2013 [Expedia](./choice_learn/datasets/expedia.py) dataset from [Kaggle](https://www.kaggle.com/c/expedia-personalized-sort) [[6]](#citation)

### Models
- Ready-to-use models:
@@ -124,17 +125,29 @@ model = ConditionalMNL(optimizer="lbfgs")

# add_coefficients adds one coefficient for each specified item index.
# Intercept and income coefficients are added for each item except the first,
# whose coefficients are zeroed for identification.
model.add_coefficients(coefficient_name="beta_inter", feature_name="intercept", items_indexes=[1, 2, 3])
model.add_coefficients(coefficient_name="beta_income", feature_name="income", items_indexes=[1, 2, 3])
model.add_coefficients(coefficient_name="beta_inter",
feature_name="intercept",
items_indexes=[1, 2, 3])
model.add_coefficients(coefficient_name="beta_income",
feature_name="income",
items_indexes=[1, 2, 3])

# ivt is added for each item:
model.add_coefficients(coefficient_name="beta_ivt", feature_name="ivt", items_indexes=[0, 1, 2, 3])
model.add_coefficients(coefficient_name="beta_ivt",
feature_name="ivt",
items_indexes=[0, 1, 2, 3])

# add_shared_coefficient adds a single coefficient used by all items specified in items_indexes:
# here, the cost, freq and ovt coefficients are shared by all items
model.add_shared_coefficient(coefficient_name="beta_cost", feature_name="cost", items_indexes=[0, 1, 2, 3])
model.add_shared_coefficient(coefficient_name="beta_freq", feature_name="freq", items_indexes=[0, 1, 2, 3])
model.add_shared_coefficient(coefficient_name="beta_ovt", feature_name="ovt", items_indexes=[0, 1, 2, 3])
model.add_shared_coefficient(coefficient_name="beta_cost",
feature_name="cost",
items_indexes=[0, 1, 2, 3])
model.add_shared_coefficient(coefficient_name="beta_freq",
feature_name="freq",
items_indexes=[0, 1, 2, 3])
model.add_shared_coefficient(coefficient_name="beta_ovt",
feature_name="ovt",
items_indexes=[0, 1, 2, 3])

history = model.fit(dataset, epochs=1000, get_report=True)
print("The average neg-loglikelihood is:", model.evaluate(dataset).numpy())
@@ -157,7 +170,8 @@ A detailed documentation of this project is available [here](https://artefactory
[2] [The Acceptance of Modal Innovation: The Case of Swissmetro](https://www.researchgate.net/publication/37456549_The_acceptance_of_modal_innovation_The_case_of_Swissmetro), Bierlaire, M.; Axhausen, K. W.; Abay, G. (2001)\
[3] [Applications and Interpretation of Nested Logit Models of Intercity Mode Choice](https://trid.trb.org/view/385097), Forinash, C. V.; Koppelman, F. S. (1993)\
[4] [The Demand for Local Telephone Service: A Fully Discrete Model of Residential Calling Patterns and Service Choices](https://www.jstor.org/stable/2555538), Train, K. E.; McFadden, D. L.; Ben-Akiva, M. (1987)\
[5] [Estimation of Travel Choice Models with Randomly Distributed Values of Time](https://ideas.repec.org/p/fth/lavaen/9303.html), Ben-Akiva, M.; Bolduc, D.; Bradley, M. (1993)\
[6] [Personalize Expedia Hotel Searches - ICDM 2013](https://www.kaggle.com/c/expedia-personalized-sort), Ben Hamner, A.; Friedman, D.; SSA_Expedia (2013)

### Code and Repositories
- [1][RUMnet](https://github.com/antoinedesir/rumnet)
23 changes: 12 additions & 11 deletions choice_learn/data/choice_dataset.py
@@ -375,9 +375,12 @@ def _build_features_by_ids(self):
"No features_names given, match with fiven features_by_ids impossible."
)
if (
self.fixed_items_features_names == (None,)
and self.contexts_features_names == (None,)
and self.contexts_items_features_names == (None,)
isinstance(self.fixed_items_features_names, tuple)
and self.fixed_items_features_names[0] is None
and isinstance(self.contexts_features_names, tuple)
and self.contexts_features_names[0] is None
and isinstance(self.contexts_items_features_names, tuple)
and self.contexts_items_features_names[0] is None
):
raise ValueError(
"No features_names given, match with fiven features_by_ids impossible."
@@ -805,10 +808,9 @@ def from_single_wide_df(
raise ValueError(
f"More than one value for feature {feature} for item {item}"
)
fixed_items_features[feature] = (
fixed_items_features.get(feature, []),
+[feature_value],
)
fixed_items_features[feature] = fixed_items_features.get(feature, []) + [
feature_value[0]
]
fixed_items_features = pd.DataFrame(fixed_items_features)
elif fixed_items_prefixes is not None:
fixed_items_features = {"item_id": []}
@@ -820,10 +822,9 @@
raise ValueError(
f"More than one value for feature {feature} for item {item}"
)
fixed_items_features[feature] = (
fixed_items_features.get(feature, []),
+[feature_value],
)
fixed_items_features[feature] = fixed_items_features.get(feature, []) + [
feature_value[0]
]
fixed_items_features = pd.DataFrame(fixed_items_features)
else:
fixed_items_features = None
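# A minimal sketch of the fixed accumulation pattern above, with hypothetical
# items and values. dict.get(feature, []) + [value] appends one scalar per
# item, where the previous code built a nested tuple and applied unary "+" to
# a list (a TypeError at runtime).
features = {}
for item, value in [("car", 3.0), ("bus", 1.5)]:
    features["cost"] = features.get("cost", []) + [value]
# features == {"cost": [3.0, 1.5]}, ready for pd.DataFrame(features)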
100 changes: 67 additions & 33 deletions choice_learn/datasets/base.py
@@ -11,6 +11,31 @@
DATA_MODULE = "choice_learn.datasets.data"


def get_path(data_file_name, module=DATA_MODULE):
    """Get the path to a data file.

    Specifically used to handle the differences between Python 3.8 and 3.9+
    in importlib.resources.

    Parameters:
    -----------
    data_file_name : str
        name of the csv file to load
    module : str, optional
        path to the directory containing the data file, by default DATA_MODULE

    Returns:
    --------
    Path
        path to the data file
    """
    import sys

    # sys.version is a free-form string; version_info compares reliably ("3.10" > "3.9")
    if sys.version_info >= (3, 9):
        return resources.files(module) / data_file_name

    with resources.path(module, data_file_name) as path:
        return path
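# Typical usage, mirroring the loaders below:
#   full_path = get_path("swissmetro.csv.gz", module=DATA_MODULE)
#   df = pd.read_csv(full_path)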


def load_csv(data_file_name, data_module=DATA_MODULE, encoding="utf-8"):
"""Base function to load csv files.
@@ -123,12 +148,14 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False,
Ascona, Switzerland."""

data_file_name = "swissmetro.csv.gz"
names, data = load_gzip(data_file_name)
data = data.astype(int)
full_path = get_path(data_file_name, module=DATA_MODULE)
swiss_df = pd.read_csv(full_path)
swiss_df["CAR_HE"] = 0.0
# names, data = load_gzip(data_file_name)
# data = data.astype(int)

items = ["TRAIN", "SM", "CAR"]
items_features_names = []
session_features_names = [
contexts_features_names = [
"GROUP",
"PURPOSE",
"FIRST",
@@ -142,20 +169,21 @@
"ORIGIN",
"DEST",
]
sessions_items_features_names = ["TT", "CO", "HE"]
sessions_items_features_names = [
[f"{item}_{feature}" for feature in sessions_items_features_names] for item in items
]
sessions_items_availabilities = ["TRAIN_AV", "SM_AV", "CAR_AV"]
contexts_items_features_names = ["CO", "TT", "HE", "SEATS"]
choice_column = "CHOICE"
availabilities_column = "AV"

if add_items_one_hot:
items_features = np.eye(len(items), dtype=np.float64)
items_features_names = [f"oh_{item}" for item in items]
for item in items:
for item2 in items:
if item == item2:
swiss_df[f"{item}_oh_{item}"] = 1
else:
swiss_df[f"{item2}_oh_{item}"] = 0
else:
items_features = None
items_features_names = None
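# The loop above writes wide one-hot columns named f"{item2}_oh_{item}", e.g.
# TRAIN_oh_TRAIN=1, SM_oh_TRAIN=0, CAR_oh_TRAIN=0, which from_single_wide_df
# can pick up later through the "oh_{item}" suffixes in items_features_names.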

"""
# Adding dummy CAR_HE feature as 0 for consistency
names.append("CAR_HE")
data = np.hstack([data, np.zeros((data.shape[0], 1))])
@@ -177,15 +205,16 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False,
# choices renormalization
choices = choices - 1
"""

if return_desc:
return description

if as_frame:
return pd.DataFrame(data, columns=names)
return swiss_df

if preprocessing == "tutorial":
swiss_df = pd.DataFrame(data, columns=names)
# swiss_df = pd.DataFrame(data, columns=names)
# Removing unknown choices
swiss_df = swiss_df.loc[swiss_df.CHOICE != 0]
# Keep only commute and business trips
@@ -249,7 +278,7 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False,
choices=choices,
)
if preprocessing == "rumnet":
swiss_df = pd.DataFrame(data, columns=names)
# swiss_df = pd.DataFrame(data, columns=names)
swiss_df = swiss_df.loc[swiss_df.CHOICE != 0]
choices = swiss_df.CHOICE.to_numpy() - 1
contexts_items_availabilities = swiss_df[["TRAIN_AV", "SM_AV", "CAR_AV"]].to_numpy()
@@ -326,15 +355,15 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False,
choices=choices,
)

return ChoiceDataset(
fixed_items_features=items_features,
contexts_features=session_features,
contexts_items_features=sessions_items_features,
contexts_items_availabilities=sessions_items_availabilities,
choices=choices,
fixed_items_features_names=items_features_names,
contexts_features_names=session_features_names,
contexts_items_features_names=sessions_items_features_names,
return ChoiceDataset.from_single_wide_df(
df=swiss_df,
items_id=items,
fixed_items_suffixes=items_features_names,
contexts_features_columns=contexts_features_names,
contexts_items_features_suffixes=contexts_items_features_names,
contexts_items_availabilities_suffix=availabilities_column,
choices_column=choice_column,
choice_mode="item_index",
)
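# How we read the wide-format matching (an assumption inferred from the
# removed per-item column names above): each suffix in
# contexts_items_features_suffixes, e.g. "CO" or "TT", is matched against
# columns named f"{item}_{suffix}" such as "TRAIN_CO" or "SM_TT", and
# contexts_items_availabilities_suffix="AV" matches "TRAIN_AV", "SM_AV", "CAR_AV".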


@@ -389,9 +418,12 @@ def load_modecanada(
nested logit models of intercity mode choice,” Transportation Research Record 1413, 98-106. """
_ = to_wide
data_file_name = "ModeCanada.csv.gz"
names, data = load_gzip(data_file_name)
names = [name.replace('"', "") for name in names]
canada_df = pd.DataFrame(data[:, 1:], index=data[:, 0].astype(int), columns=names[1:])
# names, data = load_gzip(data_file_name)
# names = [name.replace('"', "") for name in names]
# canada_df = pd.DataFrame(data[:, 1:], index=data[:, 0].astype(int), columns=names[1:])

full_path = get_path(data_file_name, module=DATA_MODULE)
canada_df = pd.read_csv(full_path)
canada_df["alt"] = canada_df.apply(lambda row: row.alt.replace('"', ""), axis=1)
# Just some type casting
canada_df.income = canada_df.income.astype("float32")
@@ -578,9 +610,9 @@ def load_heating(
Train, K.E. (2003) Discrete Choice Methods with Simulation. Cambridge University Press."""
_ = to_wide
data_file_name = "heating_data.csv.gz"
names, data = load_gzip(data_file_name)

heating_df = pd.read_csv(resources.files(DATA_MODULE) / "heating_data.csv.gz")
full_path = get_path(data_file_name, module=DATA_MODULE)
heating_df = pd.read_csv(full_path)

if return_desc:
return desc
@@ -632,7 +664,7 @@ def load_electricity(
"""
_ = to_wide
data_file_name = "electricity.csv.gz"
names, data = load_gzip(data_file_name)
# names, data = load_gzip(data_file_name)

description = """A sample of 2308 households in the United States.
- choice: the choice of the individual, one of 1, 2, 3, 4,
@@ -657,7 +689,8 @@
Train, K.E. (2003) Discrete Choice Methods with Simulation. Cambridge University Press.
"""

elec_df = pd.read_csv(resources.files(DATA_MODULE) / data_file_name)
full_path = get_path(data_file_name, module=DATA_MODULE)
elec_df = pd.read_csv(full_path)
elec_df.choice = elec_df.choice.astype(int)
elec_df[["pf", "cl", "loc", "wk", "tod", "seas"]] = elec_df[
["pf", "cl", "loc", "wk", "tod", "seas"]
@@ -706,9 +739,10 @@ def load_train(
”Papers 9303, Laval-Recherche en Energie. https://ideas.repec.org/p/fth/lavaen/9303.html."""
_ = to_wide
data_file_name = "train_data.csv.gz"
names, data = load_gzip(data_file_name)
# names, data = load_gzip(data_file_name)

train_df = pd.read_csv(resources.files(DATA_MODULE) / data_file_name)
full_path = get_path(data_file_name, module=DATA_MODULE)
train_df = pd.read_csv(full_path)

if return_desc:
return desc
1 change: 1 addition & 0 deletions choice_learn/datasets/data/__init__.py
@@ -0,0 +1 @@
"""Directory to store datasets as zipped .csv files."""
7 changes: 4 additions & 3 deletions choice_learn/datasets/examples.py
@@ -1,10 +1,10 @@
"""Some datasets used for personal examples."""
from importlib import resources

import numpy as np
import pandas as pd

from choice_learn.data.choice_dataset import ChoiceDataset
from choice_learn.datasets.base import get_path

DATA_MODULE = "choice_learn.datasets.data"

@@ -30,7 +30,8 @@ def load_tafeng(as_frame=False, return_desc=False, preprocessing=None):
TaFeng Grocery Dataset.
"""
filename = "ta_feng.csv.zip"
filepath = resources.files(DATA_MODULE) / filename

filepath = get_path(filename, module=DATA_MODULE)
# url = "https://www.kaggle.com/datasets/chiranjivdas09/ta-feng-grocery-dataset/download?datasetVersionNumber=1"
# if not os.path.exists(filepath):
# with urllib.request.urlopen(url) as f:
@@ -125,4 +126,4 @@ def load_tafeng(as_frame=False, return_desc=False, preprocessing=None):
contexts_items_availabilities=np.ones((len(choices), 25)).astype("float32"),
)

return tafeng_df
return load_tafeng(as_frame=False, preprocessing="assort_example")
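# Usage sketch: per the change above, the fallback return now yields the
# "assort_example" ChoiceDataset instead of the raw frame, so with default
# arguments both calls below are equivalent:
#   dataset = load_tafeng()
#   dataset = load_tafeng(as_frame=False, preprocessing="assort_example")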
29 changes: 20 additions & 9 deletions choice_learn/datasets/expedia.py
@@ -1,32 +1,29 @@
"""ICDM 2013 Expedia dataset."""
import os
from importlib import resources
from pathlib import Path

import numpy as np
import pandas as pd

from choice_learn.data.choice_dataset import ChoiceDataset
from choice_learn.data.storage import OneHotStorage
from choice_learn.datasets.base import get_path

DATA_MODULE = "choice_learn.datasets.data"


def load_expedia(as_frame=False, preprocessing="rumnet"):
"""Load the Expedia dataset."""
filename = "expedia.csv"
data_path = resources.files(DATA_MODULE)
if not Path.exists((data_path / filename)):
data_path = get_path(filename, module=DATA_MODULE)
if not Path.exists(data_path):
print("In order to use the Expedia dataset, please download it from:")
print("https://www.kaggle.com/c/expedia-personalized-sort")
print("and save it in the following location:")
print(os.path.join(DATA_MODULE, filename))
print(data_path)
print("The downloaded train.csv file should be named 'expedia.csv'")
raise FileNotFoundError(
f"File {filename} not found in {os.path.join(DATA_MODULE, filename)}"
)
raise FileNotFoundError(f"File {filename} not found in {data_path}")

expedia_df = pd.read_csv((data_path / filename))
expedia_df = pd.read_csv(data_path)
if as_frame:
return expedia_df

@@ -35,6 +32,20 @@ def load_expedia(as_frame=False, preprocessing="rumnet"):
expedia_df.loc[:, "day_of_week"] = expedia_df.loc[:, "date_time"].dt.dayofweek
expedia_df.loc[:, "month"] = expedia_df.loc[:, "date_time"].dt.month
expedia_df.loc[:, "hour"] = expedia_df.loc[:, "date_time"].dt.hour

for id_col in [
"site_id",
"visitor_location_country_id",
"prop_country_id",
"srch_destination_id",
]:
value_counts = expedia_df[["srch_id", id_col]].drop_duplicates()[id_col].value_counts()
kept_ids = value_counts.index[value_counts.gt(1000)]
for id_ in expedia_df[id_col].unique():
if id_ not in kept_ids:
expedia_df.loc[expedia_df[id_col] == id_, id_col] = -1
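# An equivalent vectorized form of the bucketing above (a sketch, assuming the
# same threshold of 1000 distinct searches per id):
#   counts = expedia_df[["srch_id", id_col]].drop_duplicates()[id_col].value_counts()
#   rare_ids = counts.index[counts.le(1000)]
#   expedia_df.loc[expedia_df[id_col].isin(rare_ids), id_col] = -1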

# Filtering
expedia_df = expedia_df[expedia_df.price_usd <= 1000]
expedia_df = expedia_df[expedia_df.price_usd >= 10]
expedia_df["log_price"] = expedia_df.price_usd.apply(np.log)