Hotfix/storage (#117)
FIX:
- ChoiceDataset batching with FeaturesStorage-backed availabilities
- A FeaturesStorage placed between other features was dropping the trailing features (see the sketch after this message)

ENH:
- Faster OneHotStorage batching

ADD:
- Tests corresponding to fixes
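
For the second FIX above, a minimal sketch of the failing layout, mirroring the new `test_batch_2` added below; it assumes the single-array form of the `ChoiceDataset` constructor accepts the same feature names as the tuple form used in the test:

```python
import numpy as np

from choice_learn.data import ChoiceDataset, FeaturesStorage

# A FeaturesStorage column ("mixed_shared_id") sits *between* two plain
# columns ("shared_a", "shared_b"); before this fix, every column after
# the storage column was dropped when batching.
mixed_shared_storage = FeaturesStorage(
    ids=[0, 1, 2, 3], values=[[10], [20], [30], [40]], name="mixed_shared_id"
)
shared_features = np.array([[2, 3, 1], [3, 2, 4], [9, 1, 4]])

dataset = ChoiceDataset(
    shared_features_by_choice=shared_features,
    shared_features_by_choice_names=["shared_a", "mixed_shared_id", "shared_b"],
    choices=np.array([0, 1, 1]),
    features_by_ids=[mixed_shared_storage],
)

batch = dataset.batch[0]
# Expected shared features for choice 0: the middle id 3 resolves through
# the storage to [40], and the trailing "shared_b" value is now kept:
# batch[0] -> array([2, 40, 1])
```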
VincentAuriau authored Jul 3, 2024
1 parent 610165a commit b43e6be
Showing 3 changed files with 155 additions and 28 deletions.
32 changes: 21 additions & 11 deletions choice_learn/data/choice_dataset.py
@@ -1222,17 +1222,22 @@ def get_choices_batch(self, choices_indexes, features=None):
                     for feature_index in np.sort(
                         list(self.shared_features_by_choice_map[tuple_index].keys())
                     ):
-                        unstacked_feat.append(
-                            shared_features_by_choice[tuple_index][
-                                :, feat_ind_min:feature_index
-                            ]
-                        )
+                        if feat_ind_min != feature_index:
+                            unstacked_feat.append(
+                                shared_features_by_choice[tuple_index][
+                                    :, feat_ind_min:feature_index
+                                ]
+                            )
                         unstacked_feat.append(
                             self.shared_features_by_choice_map[tuple_index][
                                 feature_index
                             ].batch[shared_features_by_choice[tuple_index][:, feature_index]]
                         )
                         feat_ind_min = feature_index + 1
+                    if feat_ind_min != shared_features_by_choice[tuple_index].shape[1]:
+                        unstacked_feat.append(
+                            shared_features_by_choice[tuple_index][:, feat_ind_min:]
+                        )
                     mapped_features.append(np.concatenate(unstacked_feat, axis=1))
                 else:
                     mapped_features.append(shared_features_by_choice[tuple_index])
@@ -1255,11 +1260,12 @@ def get_choices_batch(self, choices_indexes, features=None):
                     for feature_index in np.sort(
                         list(self.items_features_by_choice_map[tuple_index].keys())
                     ):
-                        unstacked_feat.append(
-                            items_features_by_choice[tuple_index][
-                                :, :, feat_ind_min:feature_index
-                            ]
-                        )
+                        if feat_ind_min != feature_index:
+                            unstacked_feat.append(
+                                items_features_by_choice[tuple_index][
+                                    :, :, feat_ind_min:feature_index
+                                ]
+                            )
                         unstacked_feat.append(
                             self.items_features_by_choice_map[tuple_index][
                                 feature_index
@@ -1268,6 +1274,10 @@
                             ]
                         )
                         feat_ind_min = feature_index + 1
+                    if feat_ind_min != items_features_by_choice[tuple_index].shape[2]:
+                        unstacked_feat.append(
+                            items_features_by_choice[tuple_index][:, :, feat_ind_min:]
+                        )
                     mapped_features.append(np.concatenate(unstacked_feat, axis=2))
                 else:
                     mapped_features.append(items_features_by_choice[tuple_index])
@@ -1398,7 +1408,7 @@ def __getitem__(self, choices_indexes):
 
         try:
             if isinstance(self.available_items_by_choice, tuple):
-                available_items_by_choice = self.available_items_by_choice[1]
+                available_items_by_choice = self.available_items_by_choice[1][choices_indexes]
             else:
                 available_items_by_choice = self.available_items_by_choice[choices_indexes]
         except TypeError:
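
For context on the availability fix above: when availabilities are backed by a FeaturesStorage, `available_items_by_choice` holds a tuple whose second element is the per-choice id sequence, and the old code returned the whole sequence instead of the requested slice. A standalone sketch of the before/after behavior (the names below are illustrative stand-ins, not the library's internals):

```python
import numpy as np

ids_by_choice = np.array([0, 1, 1])  # one availability-profile id per choice
available_items_by_choice = (None, ids_by_choice)  # (storage, sequence) stand-in

choices_indexes = [1, 2]
before = available_items_by_choice[1]                  # whole sequence: [0, 1, 1]
after = available_items_by_choice[1][choices_indexes]  # requested batch only: [1, 1]
assert (after == np.array([1, 1])).all()
```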
52 changes: 36 additions & 16 deletions choice_learn/data/indexer.py
@@ -162,6 +162,15 @@ def __getitem__(self, sequence_keys):
         """
         if isinstance(sequence_keys, list) or isinstance(sequence_keys, np.ndarray):
             # Construction of the OneHot vector from the index of the 1 value
+
+            if np.array(sequence_keys).ndim == 1:
+                one_hot = []
+                for j in sequence_keys:
+                    # one_hot.append(self[j])
+                    one_hot.append(self.storage.storage[j])
+                matrix = np.zeros((len(one_hot), self.shape[1]))
+                matrix[np.arange(len(one_hot)), one_hot] = 1
+                return matrix.astype(self.dtype)
             one_hot = []
             for j in sequence_keys:
                 one_hot.append(self[j])
@@ -256,9 +265,10 @@ def _get_shared_features_by_choice(self, choices_indexes):
             if hasattr(shared_feature, "batch"):
                 shared_features_by_choice.append(shared_feature.batch[choices_indexes])
             else:
-                shared_features_by_choice.append(
-                    np.stack(shared_feature[choices_indexes], axis=0)
-                )
+                # shared_features_by_choice.append(
+                #     np.stack(shared_feature[choices_indexes], axis=0)
+                # )
+                shared_features_by_choice.append(shared_feature[choices_indexes])
         return shared_features_by_choice
 
     def _get_items_features_by_choice(self, choices_indexes):
@@ -281,7 +291,8 @@ def _get_items_features_by_choice(self, choices_indexes):
             if hasattr(items_feature, "batch"):
                 items_features_by_choice.append(items_feature.batch[choices_indexes])
             else:
-                items_features_by_choice.append(np.stack(items_feature[choices_indexes], axis=0))
+                # items_features_by_choice.append(np.stack(items_feature[choices_indexes], axis=0))
+                items_features_by_choice.append(items_feature[choices_indexes])
         return items_features_by_choice
 
     def __getitem__(self, choices_indexes):
@@ -352,18 +363,23 @@ def __getitem__(self, choices_indexes):
                             ].keys()
                         )
                     ):
-                        unstacked_feat.append(
-                            shared_features_by_choice[tuple_index][
-                                :, feat_ind_min:feature_index
-                            ]
-                        )
+                        if feat_ind_min != feature_index:
+                            unstacked_feat.append(
+                                shared_features_by_choice[tuple_index][
+                                    :, feat_ind_min:feature_index
+                                ]
+                            )
                         unstacked_feat.append(
                             self.choice_dataset.shared_features_by_choice_map[tuple_index][
                                 feature_index
                             ].batch[shared_features_by_choice[tuple_index][:, feature_index]]
                         )
                         feat_ind_min = feature_index + 1
-                    mapped_features.append(np.concatenate(unstacked_feat, axis=1))
+                    if feat_ind_min != shared_features_by_choice[tuple_index].shape[1]:
+                        unstacked_feat.append(
+                            shared_features_by_choice[tuple_index][:, feat_ind_min:]
+                        )
+                    mapped_features.append(np.hstack(unstacked_feat))
                 else:
                     mapped_features.append(shared_features_by_choice[tuple_index])
 
@@ -389,11 +405,12 @@ def __getitem__(self, choices_indexes):
                             ].keys()
                         )
                     ):
-                        unstacked_feat.append(
-                            items_features_by_choice[tuple_index][
-                                :, :, feat_ind_min:feature_index
-                            ]
-                        )
+                        if feat_ind_min != feature_index:
+                            unstacked_feat.append(
+                                items_features_by_choice[tuple_index][
+                                    :, :, feat_ind_min:feature_index
+                                ]
+                            )
                         unstacked_feat.append(
                             self.choice_dataset.items_features_by_choice_map[tuple_index][
                                 feature_index
@@ -402,6 +419,10 @@
                             ]
                         )
                         feat_ind_min = feature_index + 1
+                    if feat_ind_min != items_features_by_choice[tuple_index].shape[2]:
+                        unstacked_feat.append(
+                            items_features_by_choice[tuple_index][:, :, feat_ind_min:]
+                        )
                     mapped_features.append(np.concatenate(unstacked_feat, axis=2))
                 else:
                     mapped_features.append(items_features_by_choice[tuple_index])
@@ -429,7 +450,6 @@ def __getitem__(self, choices_indexes):
                 items_features_by_choice = items_features_by_choice[0]
             else:
                 items_features_by_choice = tuple(items_features_by_choice)
-
         return (
             shared_features_by_choice,
             items_features_by_choice,
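
The OneHotStorage ENH above replaces per-key construction of one-hot vectors with a single scatter into a zero matrix. A self-contained sketch of the same idea; `one_hot_batch` and its `depth` argument are illustrative, not library API:

```python
import numpy as np

def one_hot_batch(hot_indexes, depth, dtype=np.float32):
    """Build a (batch, depth) one-hot matrix from the index of each 1."""
    matrix = np.zeros((len(hot_indexes), depth))
    matrix[np.arange(len(hot_indexes)), hot_indexes] = 1  # vectorized scatter
    return matrix.astype(dtype)

print(one_hot_batch([0, 3, 1], depth=4))
# [[1. 0. 0. 0.]
#  [0. 0. 0. 1.]
#  [0. 1. 0. 0.]]
```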
99 changes: 98 additions & 1 deletion tests/integration_tests/data/test_dataset_indexer.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from choice_learn.data import ChoiceDataset, OneHotStorage
+from choice_learn.data import ChoiceDataset, FeaturesStorage, OneHotStorage
 
 
 def test_batch():
@@ -93,3 +93,100 @@ def test_batch():
 
     assert (batch[2] == np.array([[1.0, 1.0], [1.0, 1.0]])).all()
     assert (batch[3] == np.array([1, 1])).all()
+
+
+def test_batch_2():
+    """Test specific use case of batching that was failing."""
+    storage = OneHotStorage(ids=[0, 1, 2, 3], name="id")
+    shared_storage = OneHotStorage(ids=[0, 1, 2, 3], name="shared_id")
+    mixed_shared_storage = FeaturesStorage(
+        ids=[0, 1, 2, 3], values=[[10], [20], [30], [40]], name="mixed_shared_id"
+    )
+
+    items_features = np.array(
+        [
+            [
+                [2, 2, 2, 2],
+                [2, 2, 2, 3],
+            ],
+            [
+                [2, 2, 3, 2],
+                [3, 2, 2, 2],
+            ],
+            [[3, 2, 2, 2], [2, 3, 2, 2]],
+        ]
+    )
+
+    items_features_ids = np.array(
+        [
+            [[0], [1]],
+            [[3], [2]],
+            [[0], [1]],
+        ]
+    )
+
+    shared_features = np.array([[2, 3, 1], [3, 2, 4], [9, 1, 4]])
+    shared_features_ids = np.array([[0], [1], [2]])
+
+    choices = np.array([0, 1, 1])
+
+    dataset = ChoiceDataset(
+        shared_features_by_choice=(shared_features, shared_features_ids),
+        shared_features_by_choice_names=(
+            ["shared_a", "mixed_shared_id", "shared_b"],
+            ["shared_id"],
+        ),
+        items_features_by_choice=(items_features, items_features_ids),
+        items_features_by_choice_names=(["a", "b", "c", "d"], ["id"]),
+        choices=choices,
+        features_by_ids=[storage, shared_storage, mixed_shared_storage],
+    )
+
+    batch = dataset.get_choices_batch(0)
+    print(batch)
+    assert (batch[0][0] == np.array([2, 40, 1])).all()
+    assert (batch[0][1] == np.array([1, 0, 0, 0])).all()
+
+    assert (batch[1][0] == np.array([[2, 2, 2, 2], [2, 2, 2, 3]])).all()
+    assert (batch[1][1] == np.array([[1, 0, 0, 0], [0, 1, 0, 0]])).all()
+
+    assert (batch[2] == np.array([1.0, 1.0])).all()
+    assert batch[3] == 0
+
+    batch = dataset.batch[0]
+    assert (batch[0][0] == np.array([2, 40, 1])).all()
+    assert (batch[0][1] == np.array([1, 0, 0, 0])).all()
+
+    assert (batch[1][0] == np.array([[2, 2, 2, 2], [2, 2, 2, 3]])).all()
+    assert (batch[1][1] == np.array([[1, 0, 0, 0], [0, 1, 0, 0]])).all()
+
+    assert (batch[2] == np.array([1.0, 1.0])).all()
+    assert batch[3] == 0
+
+    batch = dataset.get_choices_batch([1, 2])
+    assert (batch[0][0] == np.array([[3, 30, 4], [9, 20, 4]])).all()
+    assert (batch[0][1] == np.array([[0, 1, 0, 0], [0, 0, 1, 0]])).all()
+
+    assert (
+        batch[1][0] == np.array([[[2, 2, 3, 2], [3, 2, 2, 2]], [[3, 2, 2, 2], [2, 3, 2, 2]]])
+    ).all()
+    assert (
+        batch[1][1] == np.array([[[0, 0, 0, 1], [0, 0, 1, 0]], [[1, 0, 0, 0], [0, 1, 0, 0]]])
+    ).all()
+
+    assert (batch[2] == np.array([[1.0, 1.0], [1.0, 1.0]])).all()
+    assert (batch[3] == np.array([1, 1])).all()
+
+    batch = dataset.batch[[1, 2]]
+    assert (batch[0][0] == np.array([[3, 30, 4], [9, 20, 4]])).all()
+    assert (batch[0][1] == np.array([[0, 1, 0, 0], [0, 0, 1, 0]])).all()
+
+    assert (
+        batch[1][0] == np.array([[[2, 2, 3, 2], [3, 2, 2, 2]], [[3, 2, 2, 2], [2, 3, 2, 2]]])
+    ).all()
+    assert (
+        batch[1][1] == np.array([[[0, 0, 0, 1], [0, 0, 1, 0]], [[1, 0, 0, 0], [0, 1, 0, 0]]])
+    ).all()
+
+    assert (batch[2] == np.array([[1.0, 1.0], [1.0, 1.0]])).all()
+    assert (batch[3] == np.array([1, 1])).all()
