lenskit · mdekstrand · Jul 31, 2024 · Jul 30, 2024 · Jul 30, 2024 · Jul 30, 2024
diff --git a/docs/data.rst b/docs/data.rst
@@ -73,9 +73,9 @@ abstract class with implementations covering various scenarios.
 Creating Datasets
 ~~~~~~~~~~~~~~~~~
 
-Several functions create :class:`Dataset`s from different input data sources.
+Several functions can create a :class:`Dataset` from different input data sources.
 
-.. autofunction:: from_interaction_df
+.. autofunction:: from_interactions_df
 
 Loading Common Datasets
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -89,20 +89,40 @@ LensKit uses *vocabularies* to record user/item IDs, tags, terms, etc. in a way
 that facilitates easy mapping to 0-based contiguous indexes for use in matrix
 and tensor data structures.
 
-.. module:: lenskit.data.vocab
+.. module:: lenskit.data
 
 .. autoclass:: Vocabulary
 
-Dataset implementations
+
+Item Lists
+~~~~~~~~~~
+
+LensKit uses *item lists* to represent collections of items that may be scored,
+ranked, etc.
+
+.. autoclass:: ItemList
+
+User-Item Data Tables
+~~~~~~~~~~~~~~~~~~~~~
+
+.. module:: lenskit.data.tables
+
+.. autoclass:: NumpyUserItemTable
+.. autoclass:: TorchUserItemTable
+
+Dataset Implementations
 ~~~~~~~~~~~~~~~~~~~~~~~
 
+.. module:: lenskit.data.dataset
+
 Matrix Dataset
 --------------
 
 The :class:`MatrixDataset` provides an in-memory dataset implementation backed
 by a ratings matrix or implicit-feedback matrix.
 
 .. autoclass:: MatrixDataset
+    :no-members:
 
 Lazy Dataset
 ------------
@@ -111,11 +131,4 @@ The lazy data set takes a function that loads a data set (of any type), and
 lazily uses that function to load an underlying data set when needed.
 
 .. autoclass:: LazyDataset
-
-User-Item Data Tables
-~~~~~~~~~~~~~~~~~~~~~
-
-.. module:: lenskit.data.tables
-
-.. autoclass:: NumpyUserItemTable
-.. autoclass:: TorchUserItemTable
+    :members: delegate
diff --git a/docs/releases/2024.rst b/docs/releases/2024.rst
@@ -24,13 +24,6 @@ Significant Changes
 
 2024.1 brings substantial changes to LensKit.
 
-*   **PyTorch**. LensKit now uses PyTorch to implement most of its algorithms,
-    instead of Numba-accelerated NumPy code.  Algorithms using PyTorch are:
-
-    * :py:class:`~lenskit.algorithms.knn.ItemItem`
-    * :py:class:`~lenskit.algorithms.als.ImplicitMF`
-    * :py:class:`~lenskit.algorithms.als.BiasedMF`
-
 *   :class:`~lenskit.data.Dataset`.  LensKit now provides an abstraction for
     training data instead of working with Pandas data frames directly, that
     allows components to reduce code duplication and recomputation, access data
@@ -39,6 +32,22 @@ Significant Changes
     supersedes the old bespoke dataset loading support, with functions like
     :func:`~lenskit.data.load_movielens` to load standard datasets.
 
+*   New classes like :class:`~lenskit.data.ItemList` for routing item data
+    instead of using Pandas data frames and series.  This makes component return
+    types more self-documenting (rather than requiring developers to remember
+    what is on the index, what the column names are, etc.), and facilitates more
+    efficient data transfer between components that do not use Pandas (e.g. data
+    passed between components using PyTorch can leave the data in tensors
+    without round-tripping through Pandas and NumPy, and keep this transparent
+    to client code).
+
+*   **PyTorch**. LensKit now uses PyTorch to implement most of its algorithms,
+    instead of Numba-accelerated NumPy code.  Algorithms using PyTorch are:
+
+    * :py:class:`~lenskit.algorithms.knn.ItemItem`
+    * :py:class:`~lenskit.algorithms.als.ImplicitMF`
+    * :py:class:`~lenskit.algorithms.als.BiasedMF`
+
 *   Many LensKit components (batch running, model training, etc.) now report progress with
     :py:mod:`progress_api`, and can be connected to TQDM or Enlighten.
 

diff --git a/lenskit/lenskit/data/__init__.py b/lenskit/lenskit/data/__init__.py
@@ -12,4 +12,6 @@
 "Types of feedback supported."
 
 from .dataset import Dataset, from_interactions_df  # noqa: F401, E402
+from .items import ItemList  # noqa: F401, E402
 from .movielens import load_movielens  # noqa: F401, E402
+from .mtarray import MTArray, MTFloatArray, MTGenericArray, MTIntArray  # noqa: F401, E402
diff --git a/lenskit/lenskit/data/checks.py b/lenskit/lenskit/data/checks.py
@@ -0,0 +1,150 @@
+# This file is part of LensKit.
+# Copyright (C) 2018-2023 Boise State University
+# Copyright (C) 2023-2024 Drexel University
+# Licensed under the MIT license, see LICENSE.md for details.
+# SPDX-License-Identifier: MIT
+
+"Data check functions for LensKit."
+
+# pyright: strict
+from __future__ import annotations
+
+from typing import Any, Literal, Protocol, TypeVar, overload
+
+import numpy as np
+from numpy.typing import NDArray
+
+
+class HasShape(Protocol):  # structural type: any object exposing a ``shape`` property
+    @property
+    def shape(self) -> tuple[int, ...]: ...  # tuple of ints; its length is used as the rank
+
+
+A = TypeVar("A", bound=HasShape)  # shaped-array type that check_1d returns unchanged
+NPT = TypeVar("NPT", bound=np.generic)  # NumPy scalar type accepted by check_type
+
+
+@overload
+def check_1d(
+    arr: A,
+    size: int | None = None,
+    *,
+    label: str = "array",
+    error: Literal["raise"] = "raise",
+) -> A: ...
+@overload
+def check_1d(
+    arr: HasShape,
+    size: int | None = None,
+    *,
+    error: Literal["return"],
+) -> bool: ...
+def check_1d(
+    arr: A,
+    size: int | None = None,
+    *,
+    label: str = "array",
+    error: Literal["raise", "return"] = "raise",
+) -> bool | A:
+    """
+    Check that an array is one-dimensional, optionally checking that it has the
+    expected length.  Zero-dimensional (scalar) arrays are rejected.
+
+    This check function has 2 modes:
+
+    *   If ``error="raise"`` (the default), it will raise a :class:`TypeError`
+        if the array shape is incorrect, and return the array otherwise.
+    *   If ``error="return"``, it will return ``True`` or ``False`` depending on
+        whether the shape is correct.
+
+    Args:
+        arr:
+            The array to check.
+        size:
+            The expected length of the array.  If unspecified, only the number
+            of dimensions is checked, not the size of that dimension.
+        label:
+            A label to use in the exception message.
+        error:
+            The behavior when an array fails the test.
+
+    Returns:
+        The array, if ``error="raise"`` and the array passes the check, or a
+        boolean indicating whether it passes the check.
+
+    Raises:
+        TypeError: if ``error="raise"`` and the array fails the check.
+    """
+    # Require exactly one axis: a 0-d (scalar) array is not 1D either.
+    if size is None and len(arr.shape) != 1:
+        if error == "raise":
+            raise TypeError(f"{label} must be 1D (has shape {arr.shape})")
+        else:
+            return False
+    elif size is not None and arr.shape != (size,):
+        if error == "raise":
+            raise TypeError(f"{label} has incorrect shape (found {arr.shape}, expected {size})")
+        else:
+            return False
+
+    if error == "raise":
+        return arr
+    else:
+        return True
+
+
+@overload
+def check_type(
+    arr: NDArray[Any],
+    *types: type[NPT],
+    label: str = "array",
+    error: Literal["raise"] = "raise",
+) -> NDArray[NPT]: ...
+@overload
+def check_type(
+    arr: NDArray[Any],
+    *types: type[NPT],
+    error: Literal["return"],
+) -> bool: ...
+def check_type(
+    arr: NDArray[Any],
+    *types: type[NPT],
+    label: str = "array",
+    error: Literal["raise", "return"] = "raise",
+) -> bool | NDArray[Any]:
+    """
+    Check that an array's element type is one of the acceptable types.
+
+    This check function has 2 modes:
+
+    *   If ``error="raise"`` (the default), it will raise a :class:`TypeError`
+        if the array type is incorrect, and return the array otherwise.
+    *   If ``error="return"``, it will return ``True`` or ``False`` depending on
+        whether the type is correct.
+
+    Args:
+        arr:
+            The array to check.
+        types:
+            The acceptable types; ``arr.dtype.type`` must subclass one of them.
+        label:
+            A label to use in the exception message.
+        error:
+            The behavior when an array fails the test.
+
+    Returns:
+        The array, if ``error="raise"`` and the array passes the check, or a
+        boolean indicating whether it passes the check.
+
+    Raises:
+        TypeError: if ``error="raise"`` and the array fails the check.
+    """
+    matches = issubclass(arr.dtype.type, types)
+    if error != "raise":
+        # Report-only mode: hand the outcome back to the caller.
+        return matches
+
+    # Raising mode: fail loudly on a mismatch, otherwise pass the array through.
+    if not matches:
+        raise TypeError(f"{label} has incorrect type {arr.dtype} (allowed: {types})")
+    return arr