diff --git a/README.md b/README.md
index 6232be2c..ef732481 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,7 @@ If you are new to choice modelling, you can check this [resource](https://www.pu
   - The [Train](./choice_learn/datasets/data/train_data.csv.gz) [[5]](#citation)
   - The [Heating](./choice_learn/datasets/data/heating_data.csv.gz) & [Electricity](./choice_learn/datasets/data/electricity.csv.gz) datasets from Kenneth Train described [here](https://rdrr.io/cran/mlogit/man/Electricity.html) and [here](https://rdrr.io/cran/mlogit/man/Heating.html)
   - The [TaFeng](./choice_learn/datasets/data/ta_feng.csv.zip) dataset from [Kaggle](https://www.kaggle.com/datasets/chiranjivdas09/ta-feng-grocery-dataset)
+  - The ICDM-2013 [Expedia](./choice_learn/datasets/expedia.py) dataset from [Kaggle](https://www.kaggle.com/c/expedia-personalized-sort) [[6]](#citation)

 ### Models
 - Ready-to-use models:
@@ -124,17 +125,29 @@ model = ConditionalMNL(optimizer="lbfgs")

 # add_coefficients adds one coefficient for each specified item_index
 # intercept, and income are added for each item except the first one that needs to be zeroed
-model.add_coefficients(coefficient_name="beta_inter", feature_name="intercept", items_indexes=[1, 2, 3])
-model.add_coefficients(coefficient_name="beta_income", feature_name="income", items_indexes=[1, 2, 3])
+model.add_coefficients(coefficient_name="beta_inter",
+                       feature_name="intercept",
+                       items_indexes=[1, 2, 3])
+model.add_coefficients(coefficient_name="beta_income",
+                       feature_name="income",
+                       items_indexes=[1, 2, 3])

 # ivt is added for each item:
-model.add_coefficients(coefficient_name="beta_ivt", feature_name="ivt", items_indexes=[0, 1, 2, 3])
+model.add_coefficients(coefficient_name="beta_ivt",
+                       feature_name="ivt",
+                       items_indexes=[0, 1, 2, 3])

 # shared_coefficient add one coefficient that is used for all items specified in the items_indexes:
 # Here, cost, freq and ovt coefficients are shared between all items
-model.add_shared_coefficient(coefficient_name="beta_cost", feature_name="cost", items_indexes=[0, 1, 2, 3])
-model.add_shared_coefficient(coefficient_name="beta_freq", feature_name="freq", items_indexes=[0, 1, 2, 3])
-model.add_shared_coefficient(coefficient_name="beta_ovt", feature_name="ovt", items_indexes=[0, 1, 2, 3])
+model.add_shared_coefficient(coefficient_name="beta_cost",
+                             feature_name="cost",
+                             items_indexes=[0, 1, 2, 3])
+model.add_shared_coefficient(coefficient_name="beta_freq",
+                             feature_name="freq",
+                             items_indexes=[0, 1, 2, 3])
+model.add_shared_coefficient(coefficient_name="beta_ovt",
+                             feature_name="ovt",
+                             items_indexes=[0, 1, 2, 3])

 history = model.fit(dataset, epochs=1000, get_report=True)
 print("The average neg-loglikelihood is:", model.evaluate(dataset).numpy())
@@ -157,7 +170,8 @@ A detailed documentation of this project is available [here](https://artefactory
 [2][The Acceptance of Model Innovation: The Case of Swissmetro](https://www.researchgate.net/publication/37456549_The_acceptance_of_modal_innovation_The_case_of_Swissmetro), Bierlaire, M.; Axhausen, K., W.; Abay, G. (2001)\
 [3][Applications and Interpretation of Nested Logit Models of Intercity Mode Choice](https://trid.trb.org/view/385097), Forinash, C., V.; Koppelman, F., S. (1993)\
 [4][The Demand for Local Telephone Service: A Fully Discrete Model of Residential Calling Patterns and Service Choices](https://www.jstor.org/stable/2555538), Train K., E.; McFadden, D., L.; Moshe, B. (1987)\
-[5] [Estimation of Travel Choice Models with Randomly Distributed Values of Time](https://ideas.repec.org/p/fth/lavaen/9303.html), Ben-Akiva M; Bolduc D; Bradley M(1993)
+[5] [Estimation of Travel Choice Models with Randomly Distributed Values of Time](https://ideas.repec.org/p/fth/lavaen/9303.html), Ben-Akiva, M.; Bolduc, D.; Bradley, M. (1993)\
+[6] [Personalize Expedia Hotel Searches - ICDM 2013](https://www.kaggle.com/c/expedia-personalized-sort), Ben Hamner, A.; Friedman, D.; SSA_Expedia (2013)

 ### Code and Repositories
 - [1][RUMnet](https://github.com/antoinedesir/rumnet)

diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py
index 5ba813ad..96b40a1c 100644
--- a/choice_learn/data/choice_dataset.py
+++ b/choice_learn/data/choice_dataset.py
@@ -375,9 +375,12 @@ def _build_features_by_ids(self):
                 "No features_names given, match with fiven features_by_ids impossible."
             )
         if (
-            self.fixed_items_features_names == (None,)
-            and self.contexts_features_names == (None,)
-            and self.contexts_items_features_names == (None,)
+            isinstance(self.fixed_items_features_names, tuple)
+            and self.fixed_items_features_names[0] is None
+            and isinstance(self.contexts_features_names, tuple)
+            and self.contexts_features_names[0] is None
+            and isinstance(self.contexts_items_features_names, tuple)
+            and self.contexts_items_features_names[0] is None
         ):
             raise ValueError(
                 "No features_names given, match with fiven features_by_ids impossible."
             )
@@ -805,10 +808,9 @@ def from_single_wide_df(
                         raise ValueError(
                             f"More than one value for feature {feature} for item {item}"
                         )
-                    fixed_items_features[feature] = (
-                        fixed_items_features.get(feature, []),
-                        +[feature_value],
-                    )
+                    fixed_items_features[feature] = fixed_items_features.get(feature, []) + [
+                        feature_value[0]
+                    ]
                 fixed_items_features = pd.DataFrame(fixed_items_features)
             elif fixed_items_prefixes is not None:
                 fixed_items_features = {"item_id": []}
@@ -820,10 +822,9 @@
                         raise ValueError(
                             f"More than one value for feature {feature} for item {item}"
                         )
-                    fixed_items_features[feature] = (
-                        fixed_items_features.get(feature, []),
-                        +[feature_value],
-                    )
+                    fixed_items_features[feature] = fixed_items_features.get(feature, []) + [
+                        feature_value[0]
+                    ]
                 fixed_items_features = pd.DataFrame(fixed_items_features)
             else:
                 fixed_items_features = None

diff --git a/choice_learn/datasets/base.py b/choice_learn/datasets/base.py
index 77b91959..e4ec0a39 100644
--- a/choice_learn/datasets/base.py
+++ b/choice_learn/datasets/base.py
@@ -11,6 +11,31 @@
 DATA_MODULE = "choice_learn.datasets.data"


+def get_path(data_file_name, module=DATA_MODULE):
+    """Get the path to a data file shipped with the package.
+
+    Specifically used to handle the differences between Python 3.8 and 3.9+
+    in importlib.resources handling.
+
+    Parameters
+    ----------
+    data_file_name : str
+        name of the csv file to load
+    module : str, optional
+        path to the directory containing the data file, by default DATA_MODULE
+
+    Returns
+    -------
+    Path
+        path to the data file
+    """
+    import sys
+
+    # sys.version is a free-form string ("3.10" would compare lower than "3.9"),
+    # so the version check must use sys.version_info
+    if sys.version_info >= (3, 9):
+        return resources.files(module) / data_file_name
+
+    with resources.path(module, data_file_name) as path:
+        return path
+
+
 def load_csv(data_file_name, data_module=DATA_MODULE, encoding="utf-8"):
     """Base function to load csv files.
@@ -123,12 +148,14 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False,
     Ascona, Switzerland."""
     data_file_name = "swissmetro.csv.gz"

-    names, data = load_gzip(data_file_name)
-    data = data.astype(int)
+    full_path = get_path(data_file_name, module=DATA_MODULE)
+    swiss_df = pd.read_csv(full_path)
+    # Adding dummy CAR_HE feature as 0 for consistency
+    swiss_df["CAR_HE"] = 0.0

     items = ["TRAIN", "SM", "CAR"]
-    items_features_names = []
-    session_features_names = [
+    contexts_features_names = [
         "GROUP",
         "PURPOSE",
         "FIRST",
@@ -142,20 +169,21 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False,
         "ORIGIN",
         "DEST",
     ]
-    sessions_items_features_names = ["TT", "CO", "HE"]
-    sessions_items_features_names = [
-        [f"{item}_{feature}" for feature in sessions_items_features_names] for item in items
-    ]
-    sessions_items_availabilities = ["TRAIN_AV", "SM_AV", "CAR_AV"]
+    contexts_items_features_names = ["CO", "TT", "HE", "SEATS"]
     choice_column = "CHOICE"
+    availabilities_column = "AV"

     if add_items_one_hot:
-        items_features = np.eye(len(items), dtype=np.float64)
         items_features_names = [f"oh_{item}" for item in items]
+        # One-hot encoding of the items, stored as wide-format columns
+        for item in items:
+            for item2 in items:
+                swiss_df[f"{item2}_oh_{item}"] = 1 if item == item2 else 0
     else:
-        items_features = None
         items_features_names = None
-
+    """
     # Adding dummy CAR_HE feature as 0 for consistency
     names.append("CAR_HE")
     data = np.hstack([data, np.zeros((data.shape[0], 1))])
@@ -177,15 +205,16 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False,
     # choices renormalization
     choices = choices - 1
+    """

     if return_desc:
         return description

     if as_frame:
-        return pd.DataFrame(data, columns=names)
+        return swiss_df

     if preprocessing == "tutorial":
-        swiss_df = pd.DataFrame(data, columns=names)
         # Removing unknown choices
         swiss_df = swiss_df.loc[swiss_df.CHOICE != 0]
         # Keep only commute an dbusiness trips
@@ -249,7 +278,7 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False,
         choices=choices,
     )
     if preprocessing == "rumnet":
-        swiss_df = pd.DataFrame(data, columns=names)
         swiss_df = swiss_df.loc[swiss_df.CHOICE != 0]
         choices = swiss_df.CHOICE.to_numpy() - 1
         contexts_items_availabilities = swiss_df[["TRAIN_AV", "SM_AV", "CAR_AV"]].to_numpy()
@@ -326,15 +355,15 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False,
         choices=choices,
     )

-    return ChoiceDataset(
-        fixed_items_features=items_features,
-        contexts_features=session_features,
-        contexts_items_features=sessions_items_features,
-        contexts_items_availabilities=sessions_items_availabilities,
-        choices=choices,
-        fixed_items_features_names=items_features_names,
-        contexts_features_names=session_features_names,
-        contexts_items_features_names=sessions_items_features_names,
+    return ChoiceDataset.from_single_wide_df(
+        df=swiss_df,
+        items_id=items,
+        fixed_items_suffixes=items_features_names,
+        contexts_features_columns=contexts_features_names,
+        contexts_items_features_suffixes=contexts_items_features_names,
+        contexts_items_availabilities_suffix=availabilities_column,
+        choices_column=choice_column,
+        choice_mode="item_index",
     )


@@ -389,9 +418,12 @@ def load_modecanada(
     nested logit models of intercity mode choice,” Transportation Research Record 1413, 98-106.
     """
     _ = to_wide
     data_file_name = "ModeCanada.csv.gz"
-    names, data = load_gzip(data_file_name)
-    names = [name.replace('"', "") for name in names]
-    canada_df = pd.DataFrame(data[:, 1:], index=data[:, 0].astype(int), columns=names[1:])
+    full_path = get_path(data_file_name, module=DATA_MODULE)
+    canada_df = pd.read_csv(full_path)
     canada_df["alt"] = canada_df.apply(lambda row: row.alt.replace('"', ""), axis=1)
     # Just some typing
     canada_df.income = canada_df.income.astype("float32")
@@ -578,9 +610,9 @@ def load_heating(
     Train, K.E. (2003) Discrete Choice Methods with Simulation. Cambridge University Press."""
     _ = to_wide
     data_file_name = "heating_data.csv.gz"
-    names, data = load_gzip(data_file_name)
-    heating_df = pd.read_csv(resources.files(DATA_MODULE) / "heating_data.csv.gz")
+    full_path = get_path(data_file_name, module=DATA_MODULE)
+    heating_df = pd.read_csv(full_path)

     if return_desc:
         return desc
@@ -632,7 +664,7 @@ def load_electricity(
     """
     _ = to_wide
     data_file_name = "electricity.csv.gz"
-    names, data = load_gzip(data_file_name)

     description = """A sample of 2308 households in the United States.
     - choice: the choice of the individual, one of 1, 2, 3, 4,
@@ -657,7 +689,8 @@ def load_electricity(
     Train, K.E. (2003) Discrete Choice Methods with Simulation. Cambridge University Press.
     """
-    elec_df = pd.read_csv(resources.files(DATA_MODULE) / data_file_name)
+    full_path = get_path(data_file_name, module=DATA_MODULE)
+    elec_df = pd.read_csv(full_path)
     elec_df.choice = elec_df.choice.astype(int)
     elec_df[["pf", "cl", "loc", "wk", "tod", "seas"]] = elec_df[
         ["pf", "cl", "loc", "wk", "tod", "seas"]
@@ -706,9 +739,10 @@ def load_train(
     ”Papers 9303, Laval-Recherche en Energie. https://ideas.repec.org/p/fth/lavaen/9303.html."""
     _ = to_wide
     data_file_name = "train_data.csv.gz"
-    names, data = load_gzip(data_file_name)

-    train_df = pd.read_csv(resources.files(DATA_MODULE) / data_file_name)
+    full_path = get_path(data_file_name, module=DATA_MODULE)
+    train_df = pd.read_csv(full_path)

     if return_desc:
         return desc

diff --git a/choice_learn/datasets/data/__init__.py b/choice_learn/datasets/data/__init__.py
new file mode 100644
index 00000000..c24dcc41
--- /dev/null
+++ b/choice_learn/datasets/data/__init__.py
@@ -0,0 +1 @@
+"""Directory to store datasets as zipped .csv files."""

diff --git a/choice_learn/datasets/examples.py b/choice_learn/datasets/examples.py
index d25b7cc8..a5761f74 100644
--- a/choice_learn/datasets/examples.py
+++ b/choice_learn/datasets/examples.py
@@ -1,10 +1,10 @@
 """Some datasets used for personal examples."""
-from importlib import resources

 import numpy as np
 import pandas as pd

 from choice_learn.data.choice_dataset import ChoiceDataset
+from choice_learn.datasets.base import get_path

 DATA_MODULE = "choice_learn.datasets.data"

@@ -30,7 +30,8 @@ def load_tafeng(as_frame=False, return_desc=False, preprocessing=None):
     TaFeng Grocery Dataset.
     """
     filename = "ta_feng.csv.zip"
-    filepath = resources.files(DATA_MODULE) / filename
+    filepath = get_path(filename, module=DATA_MODULE)
     # url = "https://www.kaggle.com/datasets/chiranjivdas09/ta-feng-grocery-dataset/download?datasetVersionNumber=1"
     # if not os.path.exists(filepath):
     #     with urllib.request.urlopen(url) as f:
@@ -125,4 +126,4 @@
         contexts_items_availabilities=np.ones((len(choices), 25)).astype("float32"),
     )

-    return tafeng_df
+    return load_tafeng(as_frame=False, preprocessing="assort_example")

diff --git a/choice_learn/datasets/expedia.py b/choice_learn/datasets/expedia.py
index 2b8f53a3..94838389 100644
--- a/choice_learn/datasets/expedia.py
+++ b/choice_learn/datasets/expedia.py
@@ -1,6 +1,4 @@
 """ICDM 2013 Expedia dataset."""
-import os
-from importlib import resources
 from pathlib import Path

 import numpy as np
@@ -8,6 +6,7 @@

 from choice_learn.data.choice_dataset import ChoiceDataset
 from choice_learn.data.storage import OneHotStorage
+from choice_learn.datasets.base import get_path

 DATA_MODULE = "choice_learn.datasets.data"

@@ -15,18 +14,16 @@ def load_expedia(as_frame=False, preprocessing="rumnet"):
     """Load the Expedia dataset."""
     filename = "expedia.csv"
-    data_path = resources.files(DATA_MODULE)
-    if not Path.exists((data_path / filename)):
+    data_path = get_path(filename, module=DATA_MODULE)
+    if not Path(data_path).exists():
         print("In order to use the Expedia dataset, please download it from:")
         print("https://www.kaggle.com/c/expedia-personalized-sort")
         print("and save it in the following location:")
-        print(os.path.join(DATA_MODULE, filename))
+        print(data_path)
         print("The downloaded train.csv file should be named 'expedia.csv'")
-        raise FileNotFoundError(
-            f"File {filename} not found in {os.path.join(DATA_MODULE, filename)}"
-        )
+        raise FileNotFoundError(f"File {filename} not found in {data_path}")

-    expedia_df = pd.read_csv((data_path / filename))
+    expedia_df = pd.read_csv(data_path)
     if as_frame:
         return expedia_df

     expedia_df.loc[:, "day_of_week"] = expedia_df.loc[:, "date_time"].dt.dayofweek
     expedia_df.loc[:, "month"] = expedia_df.loc[:, "date_time"].dt.month
     expedia_df.loc[:, "hour"] = expedia_df.loc[:, "date_time"].dt.hour
+
+    # Keep only the id values that appear in more than 1000 searches,
+    # all rarer ids are relabelled as -1
+    for id_col in [
+        "site_id",
+        "visitor_location_country_id",
+        "prop_country_id",
+        "srch_destination_id",
+    ]:
+        value_counts = expedia_df[["srch_id", id_col]].drop_duplicates()[id_col].value_counts()
+        kept_ids = value_counts.index[value_counts.gt(1000)]
+        expedia_df.loc[~expedia_df[id_col].isin(kept_ids), id_col] = -1
+
+    # Filtering
 expedia_df = expedia_df[expedia_df.price_usd <= 1000]
 expedia_df = expedia_df[expedia_df.price_usd >= 10]
 expedia_df["log_price"] = expedia_df.price_usd.apply(np.log)

diff --git a/choice_learn/models/rumnet.py b/choice_learn/models/rumnet.py
index 2872bce7..3efd36ec 100644
--- a/choice_learn/models/rumnet.py
+++ b/choice_learn/models/rumnet.py
@@ -443,6 +443,17 @@ class PaperRUMnet(ChoiceModel):
     Representing Random Utility Choice Models with Neural Networks from Ali Aouad and Antoine Désir
     https://arxiv.org/abs/2207.12877

+    --- Attention ---
+    Note that the model uses two types of features that are treated differently:
+        - customer features
+        - product features
+    In this implementation, please make sure that the features are correctly formatted:
+        - customer features: shape (n_contexts, n_features), given as 'contexts_features' in the
+          ChoiceDataset used to fit the model
+        - product features: shape (n_contexts, n_items, n_features), given as
+          'contexts_items_features' in the ChoiceDataset used to fit the model
+    ---
+
     Inherits from base_model.ChoiceModel
     TODO: Verify that all parameters are implemented.
     """
@@ -782,12 +793,19 @@ def batch_predict(
             probabilities, tf.reduce_sum(probabilities, axis=1, keepdims=True) + 1e-5
         )

-        batch_nll = self.loss(
-            y_pred=probabilities,
-            y_true=tf.one_hot(choices, depth=probabilities.shape[1]),
-            sample_weight=sample_weight,
-        )
-        return batch_nll, probabilities
+        batch_loss = {
+            "optimized_loss": self.loss(
+                y_pred=probabilities,
+                y_true=tf.one_hot(choices, depth=probabilities.shape[1]),
+                sample_weight=sample_weight,
+            ),
+            "NegativeLogLikelihood": tf.keras.losses.CategoricalCrossentropy()(
+                y_pred=probabilities,
+                y_true=tf.one_hot(choices, depth=probabilities.shape[1]),
+                sample_weight=sample_weight,
+            ),
+        }
+        return batch_loss, probabilities


 class CPURUMnet(PaperRUMnet):
@@ -831,9 +849,30 @@ def compute_batch_utility(
         """
         (_, _) = contexts_items_availabilities, choices
         ### Restacking of the item features
-        stacked_fixed_items_features = tf.concat([*fixed_items_features], axis=-1)
-        stacked_contexts_features = tf.concat([*contexts_features], axis=-1)
-        stacked_contexts_items_features = tf.concat([*contexts_items_features], axis=-1)
+        if fixed_items_features is not None and fixed_items_features[0] is not None:
+            stacked_fixed_items_features = tf.cast(
+                tf.concat([*fixed_items_features], axis=-1), tf.float32
+            )
+        else:
+            if contexts_items_features is None or contexts_items_features[0] is None:
+                raise ValueError("No item features provided")
+            stacked_fixed_items_features = tf.zeros((contexts_items_features[0].shape[1], 0))
+        if contexts_features is not None and contexts_features[0] is not None:
+            stacked_contexts_features = tf.cast(
+                tf.concat([*contexts_features], axis=-1), tf.float32
+            )
+        else:
+            raise ValueError("No customer features provided")
+        if contexts_items_features is not None and contexts_items_features[0] is not None:
+            stacked_contexts_items_features = tf.cast(
+                tf.concat([*contexts_items_features], axis=-1), tf.float32
+            )
+        else:
+            if fixed_items_features is None or fixed_items_features[0] is None:
+                raise ValueError("No item features provided")
+            # Empty placeholder of shape (n_contexts, n_items, 0)
+            stacked_contexts_items_features = tf.zeros(
+                (stacked_contexts_features.shape[0], fixed_items_features[0].shape[0], 0)
+            )

         full_item_features = tf.stack(
             [stacked_fixed_items_features] * stacked_contexts_items_features.shape[0], axis=0
@@ -950,9 +989,24 @@
         (_, _) = contexts_items_availabilities, choices

         ### Restacking of the item features
-        stacked_fixed_items_features = tf.concat([*fixed_items_features], axis=-1)
-        stacked_contexts_features = tf.concat([*contexts_features], axis=-1)
-        stacked_contexts_items_features = tf.concat([*contexts_items_features], axis=-1)
+        if fixed_items_features is not None and fixed_items_features[0] is not None:
+            stacked_fixed_items_features = tf.concat([*fixed_items_features], axis=-1)
+        else:
+            if contexts_items_features is None or contexts_items_features[0] is None:
+                raise ValueError("No item features provided")
+            stacked_fixed_items_features = tf.zeros((contexts_items_features[0].shape[1], 0))
+        if contexts_features is not None and contexts_features[0] is not None:
+            stacked_contexts_features = tf.concat([*contexts_features], axis=-1)
+        else:
+            raise ValueError("No customer features provided")
+        if contexts_items_features is not None and contexts_items_features[0] is not None:
+            stacked_contexts_items_features = tf.concat([*contexts_items_features], axis=-1)
+        else:
+            if fixed_items_features is None or fixed_items_features[0] is None:
+                raise ValueError("No item features provided")
+            # Empty placeholder of shape (n_contexts, n_items, 0)
+            stacked_contexts_items_features = tf.zeros(
+                (stacked_contexts_features.shape[0], fixed_items_features[0].shape[0], 0)
+            )

         # Reshaping
         # Beware if contexts_items_features is None...!

diff --git a/choice_learn/toolbox/__init__.py b/choice_learn/toolbox/__init__.py
new file mode 100644
index 00000000..ef1f3eb0
--- /dev/null
+++ b/choice_learn/toolbox/__init__.py
@@ -0,0 +1 @@
+"""Different tools to help with choice models manipulation."""

diff --git a/notebooks/latent_class_model.ipynb b/notebooks/latent_class_model.ipynb
index 88cd8f10..59f4de11 100644
--- a/notebooks/latent_class_model.ipynb
+++ b/notebooks/latent_class_model.ipynb
@@ -22,7 +22,6 @@
     "\n",
     "sys.path.append(\"../\")\n",
     "\n",
-    "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
     "import pandas as pd\n",
     "\n",

diff --git a/notebooks/rumnet_example.ipynb b/notebooks/rumnet_example.ipynb
index 06034272..8f1f6284 100644
--- a/notebooks/rumnet_example.ipynb
+++ b/notebooks/rumnet_example.ipynb
@@ -271,19 +271,64 @@
     "print(\"Average LogLikeliHood on test:\", np.mean(test_eval))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## A larger and more complex dataset: Expedia ICDM 2013\n",
+    "The RUMnet paper benchmarks the model on a second dataset. To use it, download the data from [Kaggle](https://www.kaggle.com/c/expedia-personalized-sort) and save the train.csv file as expedia.csv in the folder choice_learn/datasets/data."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "test_eval"
+    "from choice_learn.datasets import load_expedia\n",
+    "\n",
+    "expedia_dataset = load_expedia(preprocessing=\"rumnet\")"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
-   "source": []
+   "outputs": [],
+   "source": [
+    "test_dataset = expedia_dataset[int(len(expedia_dataset)*0.8):]\n",
+    "train_dataset = expedia_dataset[:int(len(expedia_dataset)*0.8)]\n",
+    "\n",
+    "model_args = {\n",
+    "    \"num_products_features\": 46,\n",
+    "    \"num_customer_features\": 84,\n",
+    "    \"width_eps_x\": 10,\n",
+    "    \"depth_eps_x\": 3,\n",
+    "    \"heterogeneity_x\": 5,\n",
+    "    \"width_eps_z\": 10,\n",
+    "    \"depth_eps_z\": 3,\n",
+    "    \"heterogeneity_z\": 5,\n",
+    "    \"width_u\": 10,\n",
+    "    \"depth_u\": 3,\n",
+    "    \"optimizer\": \"Adam\",\n",
+    "    \"lr\": 0.001,\n",
+    "    \"logmin\": 1e-10,\n",
+    "    \"label_smoothing\": 0.02,\n",
+    "    \"callbacks\": [],\n",
+    "    \"epochs\": 15,\n",
+    "    \"batch_size\": 128,\n",
+    "    \"tol\": 1e-5,\n",
+    "}\n",
+    "model = RUMnet(**model_args)\n",
+    "model.instantiate()\n",
+    "\n",
+    "losses = model.fit(train_dataset, val_dataset=test_dataset)\n",
+    "probas = model.predict_probas(test_dataset)\n",
+    "test_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)(y_pred=model.predict_probas(test_dataset), y_true=tf.one_hot(test_dataset.choices, 39))\n",
+    "\n",
+    "print(test_loss)"
+   ]
   }
  ],
 "metadata": {

diff --git a/notebooks/simple_mnl_mlogit.ipynb b/notebooks/simple_mnl_mlogit.ipynb
index e298dc3f..8d246cc5 100644
--- a/notebooks/simple_mnl_mlogit.ipynb
+++ b/notebooks/simple_mnl_mlogit.ipynb
@@ -22,7 +22,6 @@
     "\n",
     "sys.path.append(\"../\")\n",
     "\n",
-    "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
     "\n",
     "from choice_learn.models.simple_mnl import SimpleMNL\n",
@@ -36,7 +35,7 @@
    "source": [
     "Let's recreate this [tutorial](https://cran.r-project.org/web/packages/mlogit/vignettes/e1mlogit.html) by Yves Croissant for the mlogit R package.\n",
     "\n",
-    "It uses the Heating dataset, where we try to predict which heating harware a houseold will chose. The dataset is integrated in the package, you can find information [here]."
+    "It uses the Heating dataset, available in choice_learn.datasets, where we try to predict which heating hardware a household will choose."
    ]
   },

diff --git a/requirements-complete.txt b/requirements-complete.txt
new file mode 100644
index 00000000..4d18e3a2
--- /dev/null
+++ b/requirements-complete.txt
@@ -0,0 +1,101 @@
+absl-py==1.4.0
+aiohttp==3.9.3
+aiosignal==1.2.0
+appnope==0.1.4
+asttokens==2.4.1
+astunparse==1.6.3
+async-timeout==4.0.3
+attrs==23.1.0
+backcall==0.2.0
+blinker==1.6.2
+Bottleneck==1.3.7
+Brotli==1.0.9
+cachetools==4.2.2
+certifi==2024.2.2
+cffi==1.16.0
+charset-normalizer==2.0.4
+click==8.1.7
+cloudpickle==2.2.1
+comm==0.2.2
+cryptography==41.0.3
+debugpy==1.6.7
+decorator==5.1.1
+dm-tree==0.1.7
+executing==2.0.1
+flatbuffers==2.0
+frozenlist==1.4.0
+gast==0.4.0
+google-auth==2.6.0
+google-auth-oauthlib==0.4.4
+google-pasta==0.2.0
+grpcio==1.42.0
+h5py==3.9.0
+idna==3.4
+importlib_metadata==7.0.2
+ipykernel==6.29.3
+ipython==8.12.0
+jax==0.3.25
+jaxlib==0.3.25
+jedi==0.19.1
+jupyter_client==8.6.1
+jupyter_core==5.7.2
+keras==2.11.0
+Keras-Preprocessing==1.1.2
+Markdown==3.4.1
+MarkupSafe==2.1.3
+matplotlib-inline==0.1.6
+multidict==6.0.4
+nest_asyncio==1.6.0
+numexpr==2.8.4
+numpy==1.24.3
+oauthlib==3.2.2
+opt-einsum==3.3.0
+packaging==24.0
+pandas==2.0.3
+parso==0.8.3
+pexpect==4.9.0
+pickleshare==0.7.5
+pip==23.3.1
+platformdirs==4.2.0
+pooch==1.7.0
+prompt-toolkit==3.0.42
+protobuf==3.20.3
+psutil==5.9.8
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+pycparser==2.21
+Pygments==2.17.2
+PyJWT==2.4.0
+pyOpenSSL==23.2.0
+PySocks==1.7.1
+python-dateutil==2.8.2
+pytz==2023.3.post1
+pyzmq==24.0.1
+requests==2.31.0
+requests-oauthlib==1.3.0
+rsa==4.7.2
+scipy==1.10.1
+setuptools==68.2.2
+six==1.16.0
+stack-data==0.6.2
+tensorboard==2.11.0
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.6.0
+tensorflow==2.11.0
+tensorflow-estimator==2.11.0
+tensorflow-probability==0.19.0
+termcolor==2.1.0
+tornado==6.4
+tqdm==4.65.0
+traitlets==5.14.2
+typing_extensions==4.10.0
+tzdata==2023.3
+urllib3==2.1.0
+wcwidth==0.2.13
+Werkzeug==2.3.8
+wheel==0.35.1
+wrapt==1.14.1
+yarl==1.9.3
+zipp==3.17.0

diff --git a/tests/unit_tests/test_os_datasets.py b/tests/unit_tests/test_os_datasets.py
new file mode 100644
index 00000000..8349c18a
--- /dev/null
+++ b/tests/unit_tests/test_os_datasets.py
@@ -0,0 +1,74 @@
+"""Unit testing for included Open Source datasets loaders."""
+import pandas as pd
+
+from choice_learn.data import ChoiceDataset
+from choice_learn.datasets import (
+    load_electricity,
+    load_heating,
+    load_modecanada,
+    load_swissmetro,
+    load_tafeng,
+    load_train,
+)
+
+
+def test_swissmetro_loader():
+    """Test loading the Swissmetro dataset."""
+    swissmetro = load_swissmetro(as_frame=True)
+    assert isinstance(swissmetro, pd.DataFrame)
+    assert swissmetro.shape == (10728, 29)
+
+    swissmetro = load_swissmetro()
+    assert isinstance(swissmetro, ChoiceDataset)
+    swissmetro = load_swissmetro(add_items_one_hot=True)
+    assert isinstance(swissmetro, ChoiceDataset)
+
+
+def test_modecanada_loader():
+ """Test loading the Canada dataset.""" + canada = load_modecanada(as_frame=True) + assert isinstance(canada, pd.DataFrame) + assert canada.shape == (15520, 12) + + canada = load_modecanada() + assert isinstance(canada, ChoiceDataset) + + +def test_electricity_loader(): + """Test loading the Electricity dataset.""" + electricity = load_electricity(as_frame=True) + assert isinstance(electricity, pd.DataFrame) + assert electricity.shape == (17232, 10) + + electricity = load_electricity() + assert isinstance(electricity, ChoiceDataset) + + +def test_train_loader(): + """Test loading the Train dataset.""" + train = load_train(as_frame=True) + assert isinstance(train, pd.DataFrame) + assert train.shape == (2929, 11) + + train = load_train() + assert isinstance(train, ChoiceDataset) + + +def test_tafeng_loader(): + """Test loading the TaFeng dataset.""" + tafeng = load_tafeng(as_frame=True) + assert isinstance(tafeng, pd.DataFrame) + assert tafeng.shape == (817741, 9) + + tafeng = load_tafeng() + assert isinstance(tafeng, ChoiceDataset) + + +def test_heating_loader(): + """Test loading the heating dataset.""" + heating = load_heating(as_frame=True) + assert isinstance(heating, pd.DataFrame) + assert heating.shape == (900, 16) + + heating = load_heating() + assert isinstance(heating, ChoiceDataset)