diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py
index 2de18566..744ba37e 100644
--- a/choice_learn/data/choice_dataset.py
+++ b/choice_learn/data/choice_dataset.py
@@ -1222,17 +1222,22 @@ def get_choices_batch(self, choices_indexes, features=None):
                         for feature_index in np.sort(
                             list(self.shared_features_by_choice_map[tuple_index].keys())
                         ):
-                            unstacked_feat.append(
-                                shared_features_by_choice[tuple_index][
-                                    :, feat_ind_min:feature_index
-                                ]
-                            )
+                            if feat_ind_min != feature_index:
+                                unstacked_feat.append(
+                                    shared_features_by_choice[tuple_index][
+                                        :, feat_ind_min:feature_index
+                                    ]
+                                )
                             unstacked_feat.append(
                                 self.shared_features_by_choice_map[tuple_index][
                                     feature_index
                                 ].batch[shared_features_by_choice[tuple_index][:, feature_index]]
                             )
                             feat_ind_min = feature_index + 1
+                        if feat_ind_min != shared_features_by_choice[tuple_index].shape[1]:
+                            unstacked_feat.append(
+                                shared_features_by_choice[tuple_index][:, feat_ind_min:]
+                            )
                         mapped_features.append(np.concatenate(unstacked_feat, axis=1))
                     else:
                         mapped_features.append(shared_features_by_choice[tuple_index])
@@ -1255,11 +1260,12 @@ def get_choices_batch(self, choices_indexes, features=None):
                         for feature_index in np.sort(
                             list(self.items_features_by_choice_map[tuple_index].keys())
                         ):
-                            unstacked_feat.append(
-                                items_features_by_choice[tuple_index][
-                                    :, :, feat_ind_min:feature_index
-                                ]
-                            )
+                            if feat_ind_min != feature_index:
+                                unstacked_feat.append(
+                                    items_features_by_choice[tuple_index][
+                                        :, :, feat_ind_min:feature_index
+                                    ]
+                                )
                             unstacked_feat.append(
                                 self.items_features_by_choice_map[tuple_index][
                                     feature_index
@@ -1268,6 +1274,10 @@ def get_choices_batch(self, choices_indexes, features=None):
                                 ]
                             )
                             feat_ind_min = feature_index + 1
+                        if feat_ind_min != items_features_by_choice[tuple_index].shape[2]:
+                            unstacked_feat.append(
+                                items_features_by_choice[tuple_index][:, :, feat_ind_min:]
+                            )
                         mapped_features.append(np.concatenate(unstacked_feat, axis=2))
                     else:
                         mapped_features.append(items_features_by_choice[tuple_index])
@@ -1398,7 +1408,7 @@ def __getitem__(self, choices_indexes):
 
         try:
             if isinstance(self.available_items_by_choice, tuple):
-                available_items_by_choice = self.available_items_by_choice[1]
+                available_items_by_choice = self.available_items_by_choice[1][choices_indexes]
             else:
                 available_items_by_choice = self.available_items_by_choice[choices_indexes]
         except TypeError:
diff --git a/choice_learn/data/indexer.py b/choice_learn/data/indexer.py
index 5d477c02..45a605bb 100644
--- a/choice_learn/data/indexer.py
+++ b/choice_learn/data/indexer.py
@@ -162,6 +162,15 @@ def __getitem__(self, sequence_keys):
         """
         if isinstance(sequence_keys, list) or isinstance(sequence_keys, np.ndarray):
             # Construction of the OneHot vector from the index of the 1 value
+
+            if np.array(sequence_keys).ndim == 1:
+                one_hot = []
+                for j in sequence_keys:
+                    # one_hot.append(self[j])
+                    one_hot.append(self.storage.storage[j])
+                matrix = np.zeros((len(one_hot), self.shape[1]))
+                matrix[np.arange(len(one_hot)), one_hot] = 1
+                return matrix.astype(self.dtype)
             one_hot = []
             for j in sequence_keys:
                 one_hot.append(self[j])
@@ -256,9 +265,10 @@ def _get_shared_features_by_choice(self, choices_indexes):
             if hasattr(shared_feature, "batch"):
                 shared_features_by_choice.append(shared_feature.batch[choices_indexes])
             else:
-                shared_features_by_choice.append(
-                    np.stack(shared_feature[choices_indexes], axis=0)
-                )
+                # shared_features_by_choice.append(
+                #     np.stack(shared_feature[choices_indexes], axis=0)
+                # )
+                shared_features_by_choice.append(shared_feature[choices_indexes])
         return shared_features_by_choice
 
     def _get_items_features_by_choice(self, choices_indexes):
@@ -281,7 +291,8 @@ def _get_items_features_by_choice(self, choices_indexes):
             if hasattr(items_feature, "batch"):
                 items_features_by_choice.append(items_feature.batch[choices_indexes])
             else:
-                items_features_by_choice.append(np.stack(items_feature[choices_indexes], axis=0))
+                # items_features_by_choice.append(np.stack(items_feature[choices_indexes], axis=0))
+                items_features_by_choice.append(items_feature[choices_indexes])
         return items_features_by_choice
 
     def __getitem__(self, choices_indexes):
@@ -352,18 +363,23 @@ def __getitem__(self, choices_indexes):
                                 ].keys()
                             )
                         ):
-                            unstacked_feat.append(
-                                shared_features_by_choice[tuple_index][
-                                    :, feat_ind_min:feature_index
-                                ]
-                            )
+                            if feat_ind_min != feature_index:
+                                unstacked_feat.append(
+                                    shared_features_by_choice[tuple_index][
+                                        :, feat_ind_min:feature_index
+                                    ]
+                                )
                             unstacked_feat.append(
                                 self.choice_dataset.shared_features_by_choice_map[tuple_index][
                                     feature_index
                                 ].batch[shared_features_by_choice[tuple_index][:, feature_index]]
                             )
                             feat_ind_min = feature_index + 1
-                        mapped_features.append(np.concatenate(unstacked_feat, axis=1))
+                        if feat_ind_min != shared_features_by_choice[tuple_index].shape[1]:
+                            unstacked_feat.append(
+                                shared_features_by_choice[tuple_index][:, feat_ind_min:]
+                            )
+                        mapped_features.append(np.hstack(unstacked_feat))
                     else:
                         mapped_features.append(shared_features_by_choice[tuple_index])
 
@@ -389,11 +405,12 @@ def __getitem__(self, choices_indexes):
                                 ].keys()
                             )
                         ):
-                            unstacked_feat.append(
-                                items_features_by_choice[tuple_index][
-                                    :, :, feat_ind_min:feature_index
-                                ]
-                            )
+                            if feat_ind_min != feature_index:
+                                unstacked_feat.append(
+                                    items_features_by_choice[tuple_index][
+                                        :, :, feat_ind_min:feature_index
+                                    ]
+                                )
                             unstacked_feat.append(
                                 self.choice_dataset.items_features_by_choice_map[tuple_index][
                                     feature_index
@@ -402,6 +419,10 @@ def __getitem__(self, choices_indexes):
                                 ]
                             )
                             feat_ind_min = feature_index + 1
+                        if feat_ind_min != items_features_by_choice[tuple_index].shape[2]:
+                            unstacked_feat.append(
+                                items_features_by_choice[tuple_index][:, :, feat_ind_min:]
+                            )
                         mapped_features.append(np.concatenate(unstacked_feat, axis=2))
                     else:
                         mapped_features.append(items_features_by_choice[tuple_index])
@@ -429,7 +450,6 @@ def __getitem__(self, choices_indexes):
                 items_features_by_choice = items_features_by_choice[0]
             else:
                 items_features_by_choice = tuple(items_features_by_choice)
-
             return (
                 shared_features_by_choice,
                 items_features_by_choice,
diff --git a/tests/integration_tests/data/test_dataset_indexer.py b/tests/integration_tests/data/test_dataset_indexer.py
index c1bf1f9b..6dcb2b5f 100644
--- a/tests/integration_tests/data/test_dataset_indexer.py
+++ b/tests/integration_tests/data/test_dataset_indexer.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from choice_learn.data import ChoiceDataset, OneHotStorage
+from choice_learn.data import ChoiceDataset, FeaturesStorage, OneHotStorage
 
 
 def test_batch():
@@ -93,3 +93,100 @@ def test_batch():
     assert (batch[2] == np.array([[1.0, 1.0], [1.0, 1.0]])).all()
 
     assert (batch[3] == np.array([1, 1])).all()
+
+
+def test_batch_2():
+    """Test specific usecase of batching that was failing."""
+    storage = OneHotStorage(ids=[0, 1, 2, 3], name="id")
+    shared_storage = OneHotStorage(ids=[0, 1, 2, 3], name="shared_id")
+    mixed_shared_storage = FeaturesStorage(
+        ids=[0, 1, 2, 3], values=[[10], [20], [30], [40]], name="mixed_shared_id"
+    )
+
+    items_features = np.array(
+        [
+            [
+                [2, 2, 2, 2],
+                [2, 2, 2, 3],
+            ],
+            [
+                [2, 2, 3, 2],
+                [3, 2, 2, 2],
+            ],
+            [[3, 2, 2, 2], [2, 3, 2, 2]],
+        ]
+    )
+
+    items_features_ids = np.array(
+        [
+            [[0], [1]],
+            [[3], [2]],
+            [[0], [1]],
+        ]
+    )
+
+    shared_features = np.array([[2, 3, 1], [3, 2, 4], [9, 1, 4]])
+    shared_features_ids = np.array([[0], [1], [2]])
+
+    choices = np.array([0, 1, 1])
+
+    dataset = ChoiceDataset(
+        shared_features_by_choice=(shared_features, shared_features_ids),
+        shared_features_by_choice_names=(
+            ["shared_a", "mixed_shared_id", "shared_b"],
+            ["shared_id"],
+        ),
+        items_features_by_choice=(items_features, items_features_ids),
+        items_features_by_choice_names=(["a", "b", "c", "d"], ["id"]),
+        choices=choices,
+        features_by_ids=[storage, shared_storage, mixed_shared_storage],
+    )
+
+    batch = dataset.get_choices_batch(0)
+    print(batch)
+    assert (batch[0][0] == np.array([2, 40, 1])).all()
+    assert (batch[0][1] == np.array([1, 0, 0, 0])).all()
+
+    assert (batch[1][0] == np.array([[2, 2, 2, 2], [2, 2, 2, 3]])).all()
+    assert (batch[1][1] == np.array([[1, 0, 0, 0], [0, 1, 0, 0]])).all()
+
+    assert (batch[2] == np.array([1.0, 1.0])).all()
+    assert batch[3] == 0
+
+    batch = dataset.batch[0]
+    assert (batch[0][0] == np.array([2, 40, 1])).all()
+    assert (batch[0][1] == np.array([1, 0, 0, 0])).all()
+
+    assert (batch[1][0] == np.array([[2, 2, 2, 2], [2, 2, 2, 3]])).all()
+    assert (batch[1][1] == np.array([[1, 0, 0, 0], [0, 1, 0, 0]])).all()
+
+    assert (batch[2] == np.array([1.0, 1.0])).all()
+    assert batch[3] == 0
+
+    batch = dataset.get_choices_batch([1, 2])
+    assert (batch[0][0] == np.array([[3, 30, 4], [9, 20, 4]])).all()
+    assert (batch[0][1] == np.array([[0, 1, 0, 0], [0, 0, 1, 0]])).all()
+
+    assert (
+        batch[1][0] == np.array([[[2, 2, 3, 2], [3, 2, 2, 2]], [[3, 2, 2, 2], [2, 3, 2, 2]]])
+    ).all()
+    assert (
+        batch[1][1] == np.array([[[0, 0, 0, 1], [0, 0, 1, 0]], [[1, 0, 0, 0], [0, 1, 0, 0]]])
+    ).all()
+
+    assert (batch[2] == np.array([[1.0, 1.0], [1.0, 1.0]])).all()
+    assert (batch[3] == np.array([1, 1])).all()
+
+    batch = dataset.batch[[1, 2]]
+    assert (batch[0][0] == np.array([[3, 30, 4], [9, 20, 4]])).all()
+    assert (batch[0][1] == np.array([[0, 1, 0, 0], [0, 0, 1, 0]])).all()
+
+    assert (
+        batch[1][0] == np.array([[[2, 2, 3, 2], [3, 2, 2, 2]], [[3, 2, 2, 2], [2, 3, 2, 2]]])
+    ).all()
+    assert (
+        batch[1][1] == np.array([[[0, 0, 0, 1], [0, 0, 1, 0]], [[1, 0, 0, 0], [0, 1, 0, 0]]])
+    ).all()
+
+    assert (batch[2] == np.array([[1.0, 1.0], [1.0, 1.0]])).all()
+    assert (batch[3] == np.array([1, 1])).all()
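
Below is a minimal sketch, not part of the patch, of the scenario the new trailing-column handling and test_batch_2 exercise: a FeaturesStorage-mapped column sitting between plain columns of shared_features_by_choice. Names and values mirror the test above; the single-array ChoiceDataset construction is an assumption kept for brevity, and the expected row is inferred from the corresponding assert in test_batch_2.

import numpy as np

from choice_learn.data import ChoiceDataset, FeaturesStorage

# The "mixed_shared_id" column holds ids resolved through a FeaturesStorage.
mixed_shared_storage = FeaturesStorage(
    ids=[0, 1, 2, 3], values=[[10], [20], [30], [40]], name="mixed_shared_id"
)

# Column 1 ("mixed_shared_id") is mapped; columns 0 and 2 stay as plain values.
dataset = ChoiceDataset(
    shared_features_by_choice=np.array([[2, 3, 1], [3, 2, 4], [9, 1, 4]]),
    shared_features_by_choice_names=["shared_a", "mixed_shared_id", "shared_b"],
    choices=np.array([0, 1, 1]),
    features_by_ids=[mixed_shared_storage],
)

# With the trailing slice added in this patch, the plain "shared_b" column is
# kept after the mapped column is expanded: id 3 -> [40], so the first batch's
# shared features read [2, 40, 1] (see the matching assert in test_batch_2).
shared, items, available, choice = dataset.batch[0]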