Skip to content

Commit

Permalink
save
Browse files Browse the repository at this point in the history
  • Loading branch information
brilee committed Jan 31, 2024
1 parent bbb14ac commit adb494c
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 0 deletions.
4 changes: 4 additions & 0 deletions lilac/data/dataset_duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -1873,6 +1873,7 @@ def select_groups(
if not leaf.categorical and (leaf_is_float or leaf_is_integer):
if named_bins is None:
# Auto-bin.
print(stats)
named_bins = _auto_bins(stats, NUM_AUTO_BINS)

sql_bounds = []
Expand Down Expand Up @@ -3915,6 +3916,9 @@ def _normalize_bins(bins: Optional[Union[Sequence[Bin], Sequence[float]]]) -> Op


def _auto_bins(stats: StatsResult, num_bins: int) -> list[Bin]:
if stats.min_val is None or stats.max_val is None:
return [('0', None, None)]

min_val = cast(float, stats.min_val)
max_val = cast(float, stats.max_val)
value_range = max_val - min_val
Expand Down
12 changes: 12 additions & 0 deletions lilac/data/dataset_select_groups_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,18 @@ def test_auto_bins_for_float(make_test_data: TestDataMaker) -> None:
assert res.bins


def test_auto_bins_for_missing_float(make_test_data: TestDataMaker) -> None:
  """Check `select_groups` on a float feature whose remaining rows are all NaN.

  After the seed row is deleted, the result is expected to hold one `None` group containing all
  five rows and a single placeholder bin with no bounds.
  """
  # The lone 1.0 row only exists to get the right type inference going for the column; it is
  # deleted right after the dataset is built so that just the NaN rows remain.
  seed_row: Item = {'feature': 1.0}
  nan_rows: list[Item] = [{'feature': float('nan')}] * 5
  items: list[Item] = [seed_row] + nan_rows

  dataset = make_test_data(items)
  dataset.delete_rows(filters=[('feature', 'equals', 1.0)])

  groups = dataset.select_groups('feature')

  # All five NaN rows are reported under one `None` group.
  assert groups.counts == [(None, 5)]
  assert groups.too_many_distinct is False
  # Bins are a single placeholder entry with neither a lower nor an upper bound.
  assert groups.bins == [('0', None, None)]


def test_map_dtype(make_test_data: TestDataMaker) -> None:
items = [
{'column': {'a': 1.0, 'b': 2.0}},
Expand Down

0 comments on commit adb494c

Please sign in to comment.