Hotfix/storage (#117)
FIX:
- ChoiceDataset batching with FeaturesStorage-backed availabilities
- A FeaturesStorage placed between other features was dropping the trailing features (see the sketch after this message)

ENH:
- Faster OneHotStorage batching

ADD:
- Tests corresponding to fixes
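
For the second FIX above, a minimal sketch of the failing layout, mirroring the new `test_batch_2` added below; it assumes the single-array form of the `ChoiceDataset` constructor accepts the same feature names as the tuple form used in the test:

```python
import numpy as np

from choice_learn.data import ChoiceDataset, FeaturesStorage

# A FeaturesStorage column ("mixed_shared_id") sits *between* two plain
# columns ("shared_a", "shared_b"); before this fix, every column after
# the storage column was dropped when batching.
mixed_shared_storage = FeaturesStorage(
    ids=[0, 1, 2, 3], values=[[10], [20], [30], [40]], name="mixed_shared_id"
)
shared_features = np.array([[2, 3, 1], [3, 2, 4], [9, 1, 4]])

dataset = ChoiceDataset(
    shared_features_by_choice=shared_features,
    shared_features_by_choice_names=["shared_a", "mixed_shared_id", "shared_b"],
    choices=np.array([0, 1, 1]),
    features_by_ids=[mixed_shared_storage],
)

batch = dataset.batch[0]
# Expected shared features for choice 0: the middle id 3 resolves through
# the storage to [40], and the trailing "shared_b" value is now kept:
# batch[0] -> array([2, 40, 1])
```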
VincentAuriau authored Jul 3, 2024
1 parent 610165a commit b43e6be
Showing 3 changed files with 155 additions and 28 deletions.
32 changes: 21 additions & 11 deletions choice_learn/data/choice_dataset.py
@@ -1222,17 +1222,22 @@ def get_choices_batch(self, choices_indexes, features=None):
                     for feature_index in np.sort(
                         list(self.shared_features_by_choice_map[tuple_index].keys())
                     ):
-                        unstacked_feat.append(
-                            shared_features_by_choice[tuple_index][
-                                :, feat_ind_min:feature_index
-                            ]
-                        )
+                        if feat_ind_min != feature_index:
+                            unstacked_feat.append(
+                                shared_features_by_choice[tuple_index][
+                                    :, feat_ind_min:feature_index
+                                ]
+                            )
                         unstacked_feat.append(
                             self.shared_features_by_choice_map[tuple_index][
                                 feature_index
                             ].batch[shared_features_by_choice[tuple_index][:, feature_index]]
                         )
                         feat_ind_min = feature_index + 1
+                    if feat_ind_min != shared_features_by_choice[tuple_index].shape[1]:
+                        unstacked_feat.append(
+                            shared_features_by_choice[tuple_index][:, feat_ind_min:]
+                        )
                     mapped_features.append(np.concatenate(unstacked_feat, axis=1))
                 else:
                     mapped_features.append(shared_features_by_choice[tuple_index])
@@ -1255,11 +1260,12 @@ def get_choices_batch(self, choices_indexes, features=None):
                     for feature_index in np.sort(
                         list(self.items_features_by_choice_map[tuple_index].keys())
                     ):
-                        unstacked_feat.append(
-                            items_features_by_choice[tuple_index][
-                                :, :, feat_ind_min:feature_index
-                            ]
-                        )
+                        if feat_ind_min != feature_index:
+                            unstacked_feat.append(
+                                items_features_by_choice[tuple_index][
+                                    :, :, feat_ind_min:feature_index
+                                ]
+                            )
                         unstacked_feat.append(
                             self.items_features_by_choice_map[tuple_index][
                                 feature_index
@@ -1268,6 +1274,10 @@
                             ]
                         )
                         feat_ind_min = feature_index + 1
+                    if feat_ind_min != items_features_by_choice[tuple_index].shape[2]:
+                        unstacked_feat.append(
+                            items_features_by_choice[tuple_index][:, :, feat_ind_min:]
+                        )
                     mapped_features.append(np.concatenate(unstacked_feat, axis=2))
                 else:
                     mapped_features.append(items_features_by_choice[tuple_index])
@@ -1398,7 +1408,7 @@ def __getitem__(self, choices_indexes):
 
         try:
             if isinstance(self.available_items_by_choice, tuple):
-                available_items_by_choice = self.available_items_by_choice[1]
+                available_items_by_choice = self.available_items_by_choice[1][choices_indexes]
             else:
                 available_items_by_choice = self.available_items_by_choice[choices_indexes]
         except TypeError:
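
For context on the availability fix above: when availabilities are backed by a FeaturesStorage, `available_items_by_choice` holds a tuple whose second element is the per-choice id sequence, and the old code returned the whole sequence instead of the requested slice. A standalone sketch of the before/after behavior (the names below are illustrative stand-ins, not the library's internals):

```python
import numpy as np

ids_by_choice = np.array([0, 1, 1])  # one availability-profile id per choice
available_items_by_choice = (None, ids_by_choice)  # (storage, sequence) stand-in

choices_indexes = [1, 2]
before = available_items_by_choice[1]                  # whole sequence: [0, 1, 1]
after = available_items_by_choice[1][choices_indexes]  # requested batch only: [1, 1]
assert (after == np.array([1, 1])).all()
```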
52 changes: 36 additions & 16 deletions choice_learn/data/indexer.py
@@ -162,6 +162,15 @@ def __getitem__(self, sequence_keys):
         """
         if isinstance(sequence_keys, list) or isinstance(sequence_keys, np.ndarray):
             # Construction of the OneHot vector from the index of the 1 value
+
+            if np.array(sequence_keys).ndim == 1:
+                one_hot = []
+                for j in sequence_keys:
+                    # one_hot.append(self[j])
+                    one_hot.append(self.storage.storage[j])
+                matrix = np.zeros((len(one_hot), self.shape[1]))
+                matrix[np.arange(len(one_hot)), one_hot] = 1
+                return matrix.astype(self.dtype)
             one_hot = []
             for j in sequence_keys:
                 one_hot.append(self[j])
@@ -256,9 +265,10 @@ def _get_shared_features_by_choice(self, choices_indexes):
             if hasattr(shared_feature, "batch"):
                 shared_features_by_choice.append(shared_feature.batch[choices_indexes])
             else:
-                shared_features_by_choice.append(
-                    np.stack(shared_feature[choices_indexes], axis=0)
-                )
+                # shared_features_by_choice.append(
+                #     np.stack(shared_feature[choices_indexes], axis=0)
+                # )
+                shared_features_by_choice.append(shared_feature[choices_indexes])
         return shared_features_by_choice
 
     def _get_items_features_by_choice(self, choices_indexes):
@@ -281,7 +291,8 @@ def _get_items_features_by_choice(self, choices_indexes):
             if hasattr(items_feature, "batch"):
                 items_features_by_choice.append(items_feature.batch[choices_indexes])
             else:
-                items_features_by_choice.append(np.stack(items_feature[choices_indexes], axis=0))
+                # items_features_by_choice.append(np.stack(items_feature[choices_indexes], axis=0))
+                items_features_by_choice.append(items_feature[choices_indexes])
         return items_features_by_choice
 
     def __getitem__(self, choices_indexes):
@@ -352,18 +363,23 @@ def __getitem__(self, choices_indexes):
                             ].keys()
                         )
                     ):
-                        unstacked_feat.append(
-                            shared_features_by_choice[tuple_index][
-                                :, feat_ind_min:feature_index
-                            ]
-                        )
+                        if feat_ind_min != feature_index:
+                            unstacked_feat.append(
+                                shared_features_by_choice[tuple_index][
+                                    :, feat_ind_min:feature_index
+                                ]
+                            )
                         unstacked_feat.append(
                             self.choice_dataset.shared_features_by_choice_map[tuple_index][
                                 feature_index
                             ].batch[shared_features_by_choice[tuple_index][:, feature_index]]
                         )
                         feat_ind_min = feature_index + 1
-                    mapped_features.append(np.concatenate(unstacked_feat, axis=1))
+                    if feat_ind_min != shared_features_by_choice[tuple_index].shape[1]:
+                        unstacked_feat.append(
+                            shared_features_by_choice[tuple_index][:, feat_ind_min:]
+                        )
+                    mapped_features.append(np.hstack(unstacked_feat))
                 else:
                     mapped_features.append(shared_features_by_choice[tuple_index])
 
@@ -389,11 +405,12 @@ def __getitem__(self, choices_indexes):
                             ].keys()
                         )
                     ):
-                        unstacked_feat.append(
-                            items_features_by_choice[tuple_index][
-                                :, :, feat_ind_min:feature_index
-                            ]
-                        )
+                        if feat_ind_min != feature_index:
+                            unstacked_feat.append(
+                                items_features_by_choice[tuple_index][
+                                    :, :, feat_ind_min:feature_index
+                                ]
+                            )
                         unstacked_feat.append(
                             self.choice_dataset.items_features_by_choice_map[tuple_index][
                                 feature_index
@@ -402,6 +419,10 @@
                             ]
                         )
                         feat_ind_min = feature_index + 1
+                    if feat_ind_min != items_features_by_choice[tuple_index].shape[2]:
+                        unstacked_feat.append(
+                            items_features_by_choice[tuple_index][:, :, feat_ind_min:]
+                        )
                     mapped_features.append(np.concatenate(unstacked_feat, axis=2))
                 else:
                     mapped_features.append(items_features_by_choice[tuple_index])
@@ -429,7 +450,6 @@ def __getitem__(self, choices_indexes):
                 items_features_by_choice = items_features_by_choice[0]
             else:
                 items_features_by_choice = tuple(items_features_by_choice)
-
         return (
             shared_features_by_choice,
             items_features_by_choice,
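
The OneHotStorage ENH above replaces per-key construction of one-hot vectors with a single scatter into a zero matrix. A self-contained sketch of the same idea; `one_hot_batch` and its `depth` argument are illustrative, not library API:

```python
import numpy as np

def one_hot_batch(hot_indexes, depth, dtype=np.float32):
    """Build a (batch, depth) one-hot matrix from the index of each 1."""
    matrix = np.zeros((len(hot_indexes), depth))
    matrix[np.arange(len(hot_indexes)), hot_indexes] = 1  # vectorized scatter
    return matrix.astype(dtype)

print(one_hot_batch([0, 3, 1], depth=4))
# [[1. 0. 0. 0.]
#  [0. 0. 0. 1.]
#  [0. 1. 0. 0.]]
```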
99 changes: 98 additions & 1 deletion tests/integration_tests/data/test_dataset_indexer.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from choice_learn.data import ChoiceDataset, OneHotStorage
+from choice_learn.data import ChoiceDataset, FeaturesStorage, OneHotStorage
 
 
 def test_batch():
@@ -93,3 +93,100 @@ def test_batch():
 
     assert (batch[2] == np.array([[1.0, 1.0], [1.0, 1.0]])).all()
     assert (batch[3] == np.array([1, 1])).all()
+
+
+def test_batch_2():
+    """Test specific use case of batching that was failing."""
+    storage = OneHotStorage(ids=[0, 1, 2, 3], name="id")
+    shared_storage = OneHotStorage(ids=[0, 1, 2, 3], name="shared_id")
+    mixed_shared_storage = FeaturesStorage(
+        ids=[0, 1, 2, 3], values=[[10], [20], [30], [40]], name="mixed_shared_id"
+    )
+
+    items_features = np.array(
+        [
+            [
+                [2, 2, 2, 2],
+                [2, 2, 2, 3],
+            ],
+            [
+                [2, 2, 3, 2],
+                [3, 2, 2, 2],
+            ],
+            [[3, 2, 2, 2], [2, 3, 2, 2]],
+        ]
+    )
+
+    items_features_ids = np.array(
+        [
+            [[0], [1]],
+            [[3], [2]],
+            [[0], [1]],
+        ]
+    )
+
+    shared_features = np.array([[2, 3, 1], [3, 2, 4], [9, 1, 4]])
+    shared_features_ids = np.array([[0], [1], [2]])
+
+    choices = np.array([0, 1, 1])
+
+    dataset = ChoiceDataset(
+        shared_features_by_choice=(shared_features, shared_features_ids),
+        shared_features_by_choice_names=(
+            ["shared_a", "mixed_shared_id", "shared_b"],
+            ["shared_id"],
+        ),
+        items_features_by_choice=(items_features, items_features_ids),
+        items_features_by_choice_names=(["a", "b", "c", "d"], ["id"]),
+        choices=choices,
+        features_by_ids=[storage, shared_storage, mixed_shared_storage],
+    )
+
+    batch = dataset.get_choices_batch(0)
+    print(batch)
+    assert (batch[0][0] == np.array([2, 40, 1])).all()
+    assert (batch[0][1] == np.array([1, 0, 0, 0])).all()
+
+    assert (batch[1][0] == np.array([[2, 2, 2, 2], [2, 2, 2, 3]])).all()
+    assert (batch[1][1] == np.array([[1, 0, 0, 0], [0, 1, 0, 0]])).all()
+
+    assert (batch[2] == np.array([1.0, 1.0])).all()
+    assert batch[3] == 0
+
+    batch = dataset.batch[0]
+    assert (batch[0][0] == np.array([2, 40, 1])).all()
+    assert (batch[0][1] == np.array([1, 0, 0, 0])).all()
+
+    assert (batch[1][0] == np.array([[2, 2, 2, 2], [2, 2, 2, 3]])).all()
+    assert (batch[1][1] == np.array([[1, 0, 0, 0], [0, 1, 0, 0]])).all()
+
+    assert (batch[2] == np.array([1.0, 1.0])).all()
+    assert batch[3] == 0
+
+    batch = dataset.get_choices_batch([1, 2])
+    assert (batch[0][0] == np.array([[3, 30, 4], [9, 20, 4]])).all()
+    assert (batch[0][1] == np.array([[0, 1, 0, 0], [0, 0, 1, 0]])).all()
+
+    assert (
+        batch[1][0] == np.array([[[2, 2, 3, 2], [3, 2, 2, 2]], [[3, 2, 2, 2], [2, 3, 2, 2]]])
+    ).all()
+    assert (
+        batch[1][1] == np.array([[[0, 0, 0, 1], [0, 0, 1, 0]], [[1, 0, 0, 0], [0, 1, 0, 0]]])
+    ).all()
+
+    assert (batch[2] == np.array([[1.0, 1.0], [1.0, 1.0]])).all()
+    assert (batch[3] == np.array([1, 1])).all()
+
+    batch = dataset.batch[[1, 2]]
+    assert (batch[0][0] == np.array([[3, 30, 4], [9, 20, 4]])).all()
+    assert (batch[0][1] == np.array([[0, 1, 0, 0], [0, 0, 1, 0]])).all()
+
+    assert (
+        batch[1][0] == np.array([[[2, 2, 3, 2], [3, 2, 2, 2]], [[3, 2, 2, 2], [2, 3, 2, 2]]])
+    ).all()
+    assert (
+        batch[1][1] == np.array([[[0, 0, 0, 1], [0, 0, 1, 0]], [[1, 0, 0, 0], [0, 1, 0, 0]]])
+    ).all()
+
+    assert (batch[2] == np.array([[1.0, 1.0], [1.0, 1.0]])).all()
+    assert (batch[3] == np.array([1, 1])).all()
