diff --git a/lilac/data/dataset_duckdb.py b/lilac/data/dataset_duckdb.py
index c1d85c71..48bbed67 100644
--- a/lilac/data/dataset_duckdb.py
+++ b/lilac/data/dataset_duckdb.py
@@ -3915,6 +3915,11 @@ def _normalize_bins(bins: Optional[Union[Sequence[Bin], Sequence[float]]]) -> Op
 
 
 def _auto_bins(stats: StatsResult, num_bins: int) -> list[Bin]:
+  # With no min/max available (e.g. the column holds only NaNs) the range arithmetic below would
+  # fail on None, so fall back to a single unbounded bin.
+  if stats.min_val is None or stats.max_val is None:
+    return [('0', None, None)]
+
   min_val = cast(float, stats.min_val)
   max_val = cast(float, stats.max_val)
   value_range = max_val - min_val
diff --git a/lilac/data/dataset_select_groups_test.py b/lilac/data/dataset_select_groups_test.py
index 7f8adabd..66eb56e5 100644
--- a/lilac/data/dataset_select_groups_test.py
+++ b/lilac/data/dataset_select_groups_test.py
@@ -263,6 +263,18 @@ def test_auto_bins_for_float(make_test_data: TestDataMaker) -> None:
   assert res.bins
 
 
+def test_auto_bins_for_missing_float(make_test_data: TestDataMaker) -> None:
+  items: list[Item] = [{'feature': 1.0}] + [{'feature': float('nan')}] * 5
+  dataset = make_test_data(items)
+  # The 1.0 row was just to get the right type inference going; desired dataset is a bunch of NaNs.
+  dataset.delete_rows(filters=[('feature', 'equals', 1.0)])
+
+  res = dataset.select_groups('feature')
+  assert res.counts == [(None, 5)]
+  assert res.too_many_distinct is False
+  assert res.bins == [('0', None, None)]
+
+
 def test_map_dtype(make_test_data: TestDataMaker) -> None:
   items = [
     {'column': {'a': 1.0, 'b': 2.0}},