Merge pull request #39 from artefactory/corrections
ADD: tests on dataset loading when possible
ADD: complete requirements.txt
ADD: example of RUMnet fitting on Expedia
FIX: GPURUMnet, wrong tests on (None,), CD.from_wide_df
ENH: diverse minor style enhancements (README, docstrings, etc.)
VincentAuriau authored Mar 14, 2024
2 parents f8e5ddd + 73601b5 commit 8864d44
Showing 13 changed files with 416 additions and 81 deletions.
28 changes: 21 additions & 7 deletions README.md
@@ -48,6 +48,7 @@ If you are new to choice modelling, you can check this [resource](https://www.pu
- The [Train](./choice_learn/datasets/data/train_data.csv.gz) [[5]](#citation)
- The [Heating](./choice_learn/datasets/data/heating_data.csv.gz) & [Electricity](./choice_learn/datasets/data/electricity.csv.gz) datasets from Kenneth Train described [here](https://rdrr.io/cran/mlogit/man/Electricity.html) and [here](https://rdrr.io/cran/mlogit/man/Heating.html)
- The [TaFeng](./choice_learn/datasets/data/ta_feng.csv.zip) dataset from [Kaggle](https://www.kaggle.com/datasets/chiranjivdas09/ta-feng-grocery-dataset)
- The IDCM-2013 [Expedia](./choice_learn/datasets/expedia.py) dataset from [Kaggle](https://www.kaggle.com/c/expedia-personalized-sort) [[6]](#citation)

### Models
- Ready-to-use models:
@@ -124,17 +125,29 @@ model = ConditionalMNL(optimizer="lbfgs")

# add_coefficients adds one coefficient for each specified item index.
# Intercept and income coefficients are added for each item except the first,
# whose coefficients are zeroed for identification.
model.add_coefficients(coefficient_name="beta_inter", feature_name="intercept", items_indexes=[1, 2, 3])
model.add_coefficients(coefficient_name="beta_income", feature_name="income", items_indexes=[1, 2, 3])
model.add_coefficients(coefficient_name="beta_inter",
feature_name="intercept",
items_indexes=[1, 2, 3])
model.add_coefficients(coefficient_name="beta_income",
feature_name="income",
items_indexes=[1, 2, 3])

# ivt is added for each item:
model.add_coefficients(coefficient_name="beta_ivt", feature_name="ivt", items_indexes=[0, 1, 2, 3])
model.add_coefficients(coefficient_name="beta_ivt",
feature_name="ivt",
items_indexes=[0, 1, 2, 3])

# add_shared_coefficient adds a single coefficient used by all items specified in items_indexes:
# here, the cost, freq and ovt coefficients are shared by all items
model.add_shared_coefficient(coefficient_name="beta_cost", feature_name="cost", items_indexes=[0, 1, 2, 3])
model.add_shared_coefficient(coefficient_name="beta_freq", feature_name="freq", items_indexes=[0, 1, 2, 3])
model.add_shared_coefficient(coefficient_name="beta_ovt", feature_name="ovt", items_indexes=[0, 1, 2, 3])
model.add_shared_coefficient(coefficient_name="beta_cost",
feature_name="cost",
items_indexes=[0, 1, 2, 3])
model.add_shared_coefficient(coefficient_name="beta_freq",
feature_name="freq",
items_indexes=[0, 1, 2, 3])
model.add_shared_coefficient(coefficient_name="beta_ovt",
feature_name="ovt",
items_indexes=[0, 1, 2, 3])

history = model.fit(dataset, epochs=1000, get_report=True)
print("The average neg-loglikelihood is:", model.evaluate(dataset).numpy())
@@ -157,7 +170,8 @@ A detailed documentation of this project is available [here](https://artefactory
[2] [The Acceptance of Modal Innovation: The Case of Swissmetro](https://www.researchgate.net/publication/37456549_The_acceptance_of_modal_innovation_The_case_of_Swissmetro), Bierlaire, M.; Axhausen, K. W.; Abay, G. (2001)\
[3] [Applications and Interpretation of Nested Logit Models of Intercity Mode Choice](https://trid.trb.org/view/385097), Forinash, C. V.; Koppelman, F. S. (1993)\
[4] [The Demand for Local Telephone Service: A Fully Discrete Model of Residential Calling Patterns and Service Choices](https://www.jstor.org/stable/2555538), Train, K. E.; McFadden, D. L.; Ben-Akiva, M. (1987)\
[5] [Estimation of Travel Choice Models with Randomly Distributed Values of Time](https://ideas.repec.org/p/fth/lavaen/9303.html), Ben-Akiva, M.; Bolduc, D.; Bradley, M. (1993)\
[6] [Personalize Expedia Hotel Searches - ICDM 2013](https://www.kaggle.com/c/expedia-personalized-sort), Ben Hamner, A.; Friedman, D.; SSA_Expedia (2013)

### Code and Repositories
- [1][RUMnet](https://github.com/antoinedesir/rumnet)
23 changes: 12 additions & 11 deletions choice_learn/data/choice_dataset.py
@@ -375,9 +375,12 @@ def _build_features_by_ids(self):
"No features_names given, match with fiven features_by_ids impossible."
)
if (
self.fixed_items_features_names == (None,)
and self.contexts_features_names == (None,)
and self.contexts_items_features_names == (None,)
isinstance(self.fixed_items_features_names, tuple)
and self.fixed_items_features_names[0] is None
and isinstance(self.contexts_features_names, tuple)
and self.contexts_features_names[0] is None
and isinstance(self.contexts_items_features_names, tuple)
and self.contexts_items_features_names[0] is None
):
raise ValueError(
"No features_names given, match with fiven features_by_ids impossible."
@@ -805,10 +808,9 @@ def from_single_wide_df(
raise ValueError(
f"More than one value for feature {feature} for item {item}"
)
fixed_items_features[feature] = (
fixed_items_features.get(feature, []),
+[feature_value],
)
fixed_items_features[feature] = fixed_items_features.get(feature, []) + [
feature_value[0]
]
fixed_items_features = pd.DataFrame(fixed_items_features)
elif fixed_items_prefixes is not None:
fixed_items_features = {"item_id": []}
@@ -820,10 +822,9 @@
raise ValueError(
f"More than one value for feature {feature} for item {item}"
)
fixed_items_features[feature] = (
fixed_items_features.get(feature, []),
+[feature_value],
)
fixed_items_features[feature] = fixed_items_features.get(feature, []) + [
feature_value[0]
]
fixed_items_features = pd.DataFrame(fixed_items_features)
else:
fixed_items_features = None
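# A minimal sketch of the fixed accumulation pattern above, with hypothetical
# items and values. dict.get(feature, []) + [value] appends one scalar per
# item, where the previous code built a nested tuple and applied unary "+" to
# a list (a TypeError at runtime).
features = {}
for item, value in [("car", 3.0), ("bus", 1.5)]:
    features["cost"] = features.get("cost", []) + [value]
# features == {"cost": [3.0, 1.5]}, ready for pd.DataFrame(features)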
100 changes: 67 additions & 33 deletions choice_learn/datasets/base.py
@@ -11,6 +11,31 @@
DATA_MODULE = "choice_learn.datasets.data"


def get_path(data_file_name, module=DATA_MODULE):
    """Get the path to a data file.

    Specifically used to handle the differences between Python 3.8 and 3.9+
    in importlib.resources.

    Parameters:
    -----------
    data_file_name : str
        name of the csv file to load
    module : str, optional
        path to the directory containing the data file, by default DATA_MODULE

    Returns:
    --------
    Path
        path to the data file
    """
    import sys

    # sys.version is a free-form string; version_info compares reliably ("3.10" > "3.9")
    if sys.version_info >= (3, 9):
        return resources.files(module) / data_file_name

    with resources.path(module, data_file_name) as path:
        return path
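# Typical usage, mirroring the loaders below:
#   full_path = get_path("swissmetro.csv.gz", module=DATA_MODULE)
#   df = pd.read_csv(full_path)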


def load_csv(data_file_name, data_module=DATA_MODULE, encoding="utf-8"):
"""Base function to load csv files.
@@ -123,12 +148,14 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False,
Ascona, Switzerland."""

data_file_name = "swissmetro.csv.gz"
names, data = load_gzip(data_file_name)
data = data.astype(int)
full_path = get_path(data_file_name, module=DATA_MODULE)
swiss_df = pd.read_csv(full_path)
swiss_df["CAR_HE"] = 0.0
# names, data = load_gzip(data_file_name)
# data = data.astype(int)

items = ["TRAIN", "SM", "CAR"]
items_features_names = []
session_features_names = [
contexts_features_names = [
"GROUP",
"PURPOSE",
"FIRST",
@@ -142,20 +169,21 @@
"ORIGIN",
"DEST",
]
sessions_items_features_names = ["TT", "CO", "HE"]
sessions_items_features_names = [
[f"{item}_{feature}" for feature in sessions_items_features_names] for item in items
]
sessions_items_availabilities = ["TRAIN_AV", "SM_AV", "CAR_AV"]
contexts_items_features_names = ["CO", "TT", "HE", "SEATS"]
choice_column = "CHOICE"
availabilities_column = "AV"

if add_items_one_hot:
items_features = np.eye(len(items), dtype=np.float64)
items_features_names = [f"oh_{item}" for item in items]
for item in items:
for item2 in items:
if item == item2:
swiss_df[f"{item}_oh_{item}"] = 1
else:
swiss_df[f"{item2}_oh_{item}"] = 0
else:
items_features = None
items_features_names = None
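# The loop above writes wide one-hot columns named f"{item2}_oh_{item}", e.g.
# TRAIN_oh_TRAIN=1, SM_oh_TRAIN=0, CAR_oh_TRAIN=0, which from_single_wide_df
# can pick up later through the "oh_{item}" suffixes in items_features_names.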

"""
# Adding dummy CAR_HE feature as 0 for consistency
names.append("CAR_HE")
data = np.hstack([data, np.zeros((data.shape[0], 1))])
@@ -177,15 +205,16 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False,
# choices renormalization
choices = choices - 1
"""

if return_desc:
return description

if as_frame:
return pd.DataFrame(data, columns=names)
return swiss_df

if preprocessing == "tutorial":
swiss_df = pd.DataFrame(data, columns=names)
# swiss_df = pd.DataFrame(data, columns=names)
# Removing unknown choices
swiss_df = swiss_df.loc[swiss_df.CHOICE != 0]
# Keep only commute and business trips
@@ -249,7 +278,7 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False,
choices=choices,
)
if preprocessing == "rumnet":
swiss_df = pd.DataFrame(data, columns=names)
# swiss_df = pd.DataFrame(data, columns=names)
swiss_df = swiss_df.loc[swiss_df.CHOICE != 0]
choices = swiss_df.CHOICE.to_numpy() - 1
contexts_items_availabilities = swiss_df[["TRAIN_AV", "SM_AV", "CAR_AV"]].to_numpy()
@@ -326,15 +355,15 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False,
choices=choices,
)

return ChoiceDataset(
fixed_items_features=items_features,
contexts_features=session_features,
contexts_items_features=sessions_items_features,
contexts_items_availabilities=sessions_items_availabilities,
choices=choices,
fixed_items_features_names=items_features_names,
contexts_features_names=session_features_names,
contexts_items_features_names=sessions_items_features_names,
return ChoiceDataset.from_single_wide_df(
df=swiss_df,
items_id=items,
fixed_items_suffixes=items_features_names,
contexts_features_columns=contexts_features_names,
contexts_items_features_suffixes=contexts_items_features_names,
contexts_items_availabilities_suffix=availabilities_column,
choices_column=choice_column,
choice_mode="item_index",
)
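# How we read the wide-format matching (an assumption inferred from the
# removed per-item column names above): each suffix in
# contexts_items_features_suffixes, e.g. "CO" or "TT", is matched against
# columns named f"{item}_{suffix}" such as "TRAIN_CO" or "SM_TT", and
# contexts_items_availabilities_suffix="AV" matches "TRAIN_AV", "SM_AV", "CAR_AV".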


@@ -389,9 +418,12 @@ def load_modecanada(
nested logit models of intercity mode choice,” Transportation Research Record 1413, 98-106. """
_ = to_wide
data_file_name = "ModeCanada.csv.gz"
names, data = load_gzip(data_file_name)
names = [name.replace('"', "") for name in names]
canada_df = pd.DataFrame(data[:, 1:], index=data[:, 0].astype(int), columns=names[1:])
# names, data = load_gzip(data_file_name)
# names = [name.replace('"', "") for name in names]
# canada_df = pd.DataFrame(data[:, 1:], index=data[:, 0].astype(int), columns=names[1:])

full_path = get_path(data_file_name, module=DATA_MODULE)
canada_df = pd.read_csv(full_path)
canada_df["alt"] = canada_df.apply(lambda row: row.alt.replace('"', ""), axis=1)
# Just some type casting
canada_df.income = canada_df.income.astype("float32")
@@ -578,9 +610,9 @@ def load_heating(
Train, K.E. (2003) Discrete Choice Methods with Simulation. Cambridge University Press."""
_ = to_wide
data_file_name = "heating_data.csv.gz"
names, data = load_gzip(data_file_name)

heating_df = pd.read_csv(resources.files(DATA_MODULE) / "heating_data.csv.gz")
full_path = get_path(data_file_name, module=DATA_MODULE)
heating_df = pd.read_csv(full_path)

if return_desc:
return desc
@@ -632,7 +664,7 @@ def load_electricity(
"""
_ = to_wide
data_file_name = "electricity.csv.gz"
names, data = load_gzip(data_file_name)
# names, data = load_gzip(data_file_name)

description = """A sample of 2308 households in the United States.
- choice: the choice of the individual, one of 1, 2, 3, 4,
@@ -657,7 +689,8 @@
Train, K.E. (2003) Discrete Choice Methods with Simulation. Cambridge University Press.
"""

elec_df = pd.read_csv(resources.files(DATA_MODULE) / data_file_name)
full_path = get_path(data_file_name, module=DATA_MODULE)
elec_df = pd.read_csv(full_path)
elec_df.choice = elec_df.choice.astype(int)
elec_df[["pf", "cl", "loc", "wk", "tod", "seas"]] = elec_df[
["pf", "cl", "loc", "wk", "tod", "seas"]
@@ -706,9 +739,10 @@ def load_train(
”Papers 9303, Laval-Recherche en Energie. https://ideas.repec.org/p/fth/lavaen/9303.html."""
_ = to_wide
data_file_name = "train_data.csv.gz"
names, data = load_gzip(data_file_name)
# names, data = load_gzip(data_file_name)

train_df = pd.read_csv(resources.files(DATA_MODULE) / data_file_name)
full_path = get_path(data_file_name, module=DATA_MODULE)
train_df = pd.read_csv(full_path)

if return_desc:
return desc
1 change: 1 addition & 0 deletions choice_learn/datasets/data/__init__.py
@@ -0,0 +1 @@
"""Directory to store datasets as zipped .csv files."""
7 changes: 4 additions & 3 deletions choice_learn/datasets/examples.py
@@ -1,10 +1,10 @@
"""Some datasets used for personal examples."""
from importlib import resources

import numpy as np
import pandas as pd

from choice_learn.data.choice_dataset import ChoiceDataset
from choice_learn.datasets.base import get_path

DATA_MODULE = "choice_learn.datasets.data"

@@ -30,7 +30,8 @@ def load_tafeng(as_frame=False, return_desc=False, preprocessing=None):
TaFeng Grocery Dataset.
"""
filename = "ta_feng.csv.zip"
filepath = resources.files(DATA_MODULE) / filename

filepath = get_path(filename, module=DATA_MODULE)
# url = "https://www.kaggle.com/datasets/chiranjivdas09/ta-feng-grocery-dataset/download?datasetVersionNumber=1"
# if not os.path.exists(filepath):
# with urllib.request.urlopen(url) as f:
@@ -125,4 +126,4 @@ def load_tafeng(as_frame=False, return_desc=False, preprocessing=None):
contexts_items_availabilities=np.ones((len(choices), 25)).astype("float32"),
)

return tafeng_df
return load_tafeng(as_frame=False, preprocessing="assort_example")
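# Usage sketch: per the change above, the fallback return now yields the
# "assort_example" ChoiceDataset instead of the raw frame, so with default
# arguments both calls below are equivalent:
#   dataset = load_tafeng()
#   dataset = load_tafeng(as_frame=False, preprocessing="assort_example")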
29 changes: 20 additions & 9 deletions choice_learn/datasets/expedia.py
@@ -1,32 +1,29 @@
"""ICDM 2013 Expedia dataset."""
import os
from importlib import resources
from pathlib import Path

import numpy as np
import pandas as pd

from choice_learn.data.choice_dataset import ChoiceDataset
from choice_learn.data.storage import OneHotStorage
from choice_learn.datasets.base import get_path

DATA_MODULE = "choice_learn.datasets.data"


def load_expedia(as_frame=False, preprocessing="rumnet"):
"""Load the Expedia dataset."""
filename = "expedia.csv"
data_path = resources.files(DATA_MODULE)
if not Path.exists((data_path / filename)):
data_path = get_path(filename, module=DATA_MODULE)
if not Path.exists(data_path):
print("In order to use the Expedia dataset, please download it from:")
print("https://www.kaggle.com/c/expedia-personalized-sort")
print("and save it in the following location:")
print(os.path.join(DATA_MODULE, filename))
print(data_path)
print("The downloaded train.csv file should be named 'expedia.csv'")
raise FileNotFoundError(
f"File {filename} not found in {os.path.join(DATA_MODULE, filename)}"
)
raise FileNotFoundError(f"File {filename} not found in {data_path}")

expedia_df = pd.read_csv((data_path / filename))
expedia_df = pd.read_csv(data_path)
if as_frame:
return expedia_df

@@ -35,6 +32,20 @@ def load_expedia(as_frame=False, preprocessing="rumnet"):
expedia_df.loc[:, "day_of_week"] = expedia_df.loc[:, "date_time"].dt.dayofweek
expedia_df.loc[:, "month"] = expedia_df.loc[:, "date_time"].dt.month
expedia_df.loc[:, "hour"] = expedia_df.loc[:, "date_time"].dt.hour

for id_col in [
"site_id",
"visitor_location_country_id",
"prop_country_id",
"srch_destination_id",
]:
value_counts = expedia_df[["srch_id", id_col]].drop_duplicates()[id_col].value_counts()
kept_ids = value_counts.index[value_counts.gt(1000)]
for id_ in expedia_df[id_col].unique():
if id_ not in kept_ids:
expedia_df.loc[expedia_df[id_col] == id_, id_col] = -1
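# An equivalent vectorized form of the bucketing above (a sketch, assuming the
# same threshold of 1000 distinct searches per id):
#   counts = expedia_df[["srch_id", id_col]].drop_duplicates()[id_col].value_counts()
#   rare_ids = counts.index[counts.le(1000)]
#   expedia_df.loc[expedia_df[id_col].isin(rare_ids), id_col] = -1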

# Filtering
expedia_df = expedia_df[expedia_df.price_usd <= 1000]
expedia_df = expedia_df[expedia_df.price_usd >= 10]
expedia_df["log_price"] = expedia_df.price_usd.apply(np.log)