Merge pull request #465 from mdekstrand/feature/type-docs

Simplify Vocabulary, add types module, and improve type documentation
lenskit · Aug 6, 2024 · 967655d · 967655d
2 parents 19ce50b + 4cf3134
commit 967655d
Show file tree

Hide file tree

Showing 20 changed files with 189 additions and 111 deletions.
diff --git a/docs/conf.py b/docs/conf.py
@@ -83,8 +83,8 @@
 autodoc_default_options = {"members": True, "member-order": "bysource", "show-inheritance": True}
 autodoc_typehints = "description"
 autodoc_type_aliases = {
-    "Iterable": "Iterable",
-    "ArrayLike": "ArrayLike",
+    "ArrayLike": "numpy.typing.ArrayLike",
+    "RandomSeed": "lenskit.types.RandomSeed",
 }
 
 todo_include_todos = True
@@ -95,7 +95,6 @@
     "pandas": ("http://pandas.pydata.org/pandas-docs/stable/", None),
     "numpy": ("https://numpy.org/doc/stable/", None),
     "scipy": ("https://docs.scipy.org/doc/scipy/", None),
-    "scikit": ("https://scikit-learn.org/stable/", None),
     "sklearn": ("https://scikit-learn.org/stable/", None),
     "seedbank": ("https://seedbank.lenskit.org/en/latest/", None),
     "progress_api": ("https://progress-api.readthedocs.io/en/latest/", None),

diff --git a/docs/data.rst b/docs/data.rst
@@ -56,7 +56,7 @@ Identifiers and numbers can be mapped to each other with the user and item
 *vocabularies* (:attr:`~Dataset.users` and :attr:`~Dataset.items`, see the
 :class:`~lenskit.data.vocab.Vocabulary` class).
 
-.. autodata:: lenskit.data.vocab.EntityId
+.. autodata:: EntityId
 
 .. _dataset:
 
@@ -89,13 +89,16 @@ LensKit uses *vocabularies* to record user/item IDs, tags, terms, etc. in a way
 that facilitates easy mapping to 0-based contiguous indexes for use in matrix
 and tensor data structures.
 
-.. module:: lenskit.data
-
 .. autoclass:: Vocabulary
 
+User and Item Data
+~~~~~~~~~~~~~~~~~~
+
+The :mod:`lenskit.data` package also provides various classes for representing
+user and item data.
 
 Item Lists
-~~~~~~~~~~
+----------
 
 LensKit uses *item lists* to represent collections of items that may be scored,
 ranked, etc.
@@ -131,4 +134,5 @@ The lazy data set takes a function that loads a data set (of any type), and
 lazily uses that function to load an underlying data set when needed.
 
 .. autoclass:: LazyDataset
+    :no-members:
     :members: delegate
diff --git a/docs/internals.rst b/docs/internals.rst
@@ -4,3 +4,19 @@ LensKit Internals
 These modules are primarily for internal infrastructural support in LensKit.
 Neither LensKit users nor algorithm developers are likely to need to use this
 code directly.
+
+.. class:: lenskit.types.RandomSeed
+
+    Random seed values for LensKit models and components.  Can be any valid
+    input to :func:`seedbank.numpy_rng`, including:
+
+    * Any :data:`seedbank.SeedLike`
+    * A :class:`numpy.random.Generator`
+    * A :class:`numpy.random.RandomState` (deprecated)
+
+    .. note::
+
+        This is a type alias, not a class; it is documented as a class to work
+        around limitations in Sphinx.
+
+.. autoclass:: lenskit.types.UITuple
diff --git a/lenskit-implicit/lenskit/implicit.py b/lenskit-implicit/lenskit/implicit.py
@@ -17,7 +17,7 @@
 
 from lenskit.algorithms import Predictor, Recommender
 from lenskit.data.dataset import Dataset
-from lenskit.data.vocab import EntityId, Vocabulary
+from lenskit.data.vocab import Vocabulary
 
 _logger = logging.getLogger(__name__)
 
@@ -59,11 +59,11 @@ class BaseRec(Recommender, Predictor):
     """
     The user-item rating matrix from training.
     """
-    users_: Vocabulary[EntityId]
+    users_: Vocabulary
     """
     The user ID mapping from training.
     """
-    items_: Vocabulary[EntityId]
+    items_: Vocabulary
     """
     The item ID mapping from training.
     """

diff --git a/lenskit/lenskit/algorithms/als/common.py b/lenskit/lenskit/algorithms/als/common.py
@@ -18,8 +18,7 @@
 
 from lenskit import util
 from lenskit.algorithms.mf_common import MFPredictor
-from lenskit.data.dataset import Dataset
-from lenskit.data.vocab import EntityId, Vocabulary
+from lenskit.data import Dataset, Vocabulary
 from lenskit.parallel.config import ensure_parallel_init
 
 
@@ -55,9 +54,9 @@ class TrainingData(NamedTuple):
     Data for training the ALS model.
     """
 
-    users: Vocabulary[EntityId]
+    users: Vocabulary
     "User ID mapping."
-    items: Vocabulary[EntityId]
+    items: Vocabulary
     "Item ID mapping."
     ui_rates: torch.Tensor
     "User-item rating matrix."
@@ -73,9 +72,7 @@ def n_items(self):
         return len(self.items)
 
     @classmethod
-    def create(
-        cls, users: Vocabulary[EntityId], items: Vocabulary[EntityId], ratings: torch.Tensor
-    ) -> TrainingData:
+    def create(cls, users: Vocabulary, items: Vocabulary, ratings: torch.Tensor) -> TrainingData:
         assert ratings.shape == (len(users), len(items))
 
         transposed = ratings.transpose(0, 1).to_sparse_csr()

diff --git a/lenskit/lenskit/algorithms/knn/item.py b/lenskit/lenskit/algorithms/knn/item.py
@@ -24,7 +24,7 @@
 from lenskit.data import FeedbackType
 from lenskit.data.dataset import Dataset
 from lenskit.data.matrix import normalize_sparse_rows, safe_spmv
-from lenskit.data.vocab import EntityId, Vocabulary
+from lenskit.data.vocab import Vocabulary
 from lenskit.diagnostics import ConfigWarning, DataWarning
 from lenskit.parallel import ensure_parallel_init
 from lenskit.util.logging import pbh_update, progress_handle
@@ -111,15 +111,15 @@ class ItemItem(Predictor):
     aggregate: str
     use_ratings: bool
 
-    items_: Vocabulary[EntityId]
+    items_: Vocabulary
     "Vocabulary of item IDs."
     item_means_: torch.Tensor | None
     "Mean rating for each known item."
     item_counts_: torch.Tensor
     "Number of saved neighbors for each item."
     sim_matrix_: torch.Tensor
     "Similarity matrix (sparse CSR tensor)."
-    users_: Vocabulary[EntityId]
+    users_: Vocabulary
     "Vocabulary of user IDs."
     rating_matrix_: torch.Tensor
     "Normalized rating matrix to look up user ratings at prediction time."

diff --git a/lenskit/lenskit/algorithms/knn/user.py b/lenskit/lenskit/algorithms/knn/user.py
@@ -23,7 +23,7 @@
 from lenskit.data import FeedbackType
 from lenskit.data.dataset import Dataset
 from lenskit.data.matrix import normalize_sparse_rows, safe_spmv
-from lenskit.data.vocab import EntityId, Vocabulary
+from lenskit.data.vocab import Vocabulary
 from lenskit.diagnostics import DataWarning
 from lenskit.parallel.config import ensure_parallel_init
 
@@ -83,9 +83,9 @@ class UserUser(Predictor):
     aggregate: str
     use_ratings: bool
 
-    users_: Vocabulary[EntityId]
+    users_: Vocabulary
     "The index of user IDs."
-    items_: Vocabulary[EntityId]
+    items_: Vocabulary
     "The index of item IDs."
     user_means_: torch.Tensor | None
     "Mean rating for each known user."

diff --git a/lenskit/lenskit/algorithms/svd.py b/lenskit/lenskit/algorithms/svd.py
@@ -13,7 +13,7 @@
 from typing_extensions import Literal, override
 
 from lenskit.data.dataset import Dataset
-from lenskit.data.vocab import EntityId, Vocabulary
+from lenskit.data.vocab import Vocabulary
 
 try:
     from sklearn.decomposition import TruncatedSVD
@@ -42,8 +42,8 @@ class BiasedSVD(Predictor):
 
     bias: Bias
     factorization: TruncatedSVD
-    users_: Vocabulary[EntityId]
-    items_: Vocabulary[EntityId]
+    users_: Vocabulary
+    items_: Vocabulary
 
     def __init__(
         self,

diff --git a/lenskit/lenskit/data/__init__.py b/lenskit/lenskit/data/__init__.py
@@ -3,10 +3,11 @@
 # Copyright (C) 2023-2024 Drexel University
 # Licensed under the MIT license, see LICENSE.md for details.
 # SPDX-License-Identifier: MIT
+from __future__ import annotations
 
-from typing import Literal, TypeAlias
+from typing_extensions import Literal, TypeAlias
 
-from .vocab import EntityId, Vocabulary  # noqa: F401, E402
+from lenskit.types import EntityId, NPEntityId  # noqa: F401
 
 FeedbackType: TypeAlias = Literal["explicit", "implicit"]
 "Types of feedback supported."
@@ -15,3 +16,4 @@
 from .items import ItemList  # noqa: F401, E402
 from .movielens import load_movielens  # noqa: F401, E402
 from .mtarray import MTArray, MTFloatArray, MTGenericArray, MTIntArray  # noqa: F401, E402
+from .vocab import Vocabulary  # noqa: F401, E402
diff --git a/lenskit/lenskit/data/dataset.py b/lenskit/lenskit/data/dataset.py
@@ -32,12 +32,12 @@
     override,
 )
 
-from lenskit.data.items import ItemList
-from lenskit.data.matrix import CSRStructure, InteractionMatrix
-from lenskit.data.vocab import Vocabulary
+from lenskit.types import EntityId
 
-from . import EntityId
+from .items import ItemList
+from .matrix import CSRStructure, InteractionMatrix
 from .tables import NumpyUserItemTable, TorchUserItemTable
+from .vocab import Vocabulary
 
 DF_FORMAT: TypeAlias = Literal["numpy", "pandas", "torch"]
 MAT_FORMAT: TypeAlias = Literal["scipy", "torch", "pandas", "structure"]
@@ -84,15 +84,15 @@ class Dataset(ABC):
 
     @property
     @abstractmethod
-    def items(self) -> Vocabulary[EntityId]:
+    def items(self) -> Vocabulary:
         """
         The items known by this dataset.
         """
         raise NotImplementedError()
 
     @property
     @abstractmethod
-    def users(self) -> Vocabulary[EntityId]:
+    def users(self) -> Vocabulary:
         """
         The users known by this dataset.
         """
@@ -504,9 +504,9 @@ class MatrixDataset(Dataset):
         :mod:`lenskit.data`.
     """
 
-    _users: Vocabulary[EntityId]
+    _users: Vocabulary
     "User ID vocabulary, to map between IDs and row numbers."
-    _items: Vocabulary[EntityId]
+    _items: Vocabulary
     "Item ID vocabulary, to map between IDs and column or row numbers."
     _matrix: InteractionMatrix
 
@@ -546,12 +546,12 @@ def _init_structures(self, df: pd.DataFrame):
 
     @property
     @override
-    def items(self) -> Vocabulary[EntityId]:
+    def items(self) -> Vocabulary:
         return self._items
 
     @property
     @override
-    def users(self) -> Vocabulary[EntityId]:
+    def users(self) -> Vocabulary:
         return self._users
 
     @override
@@ -795,12 +795,12 @@ def delegate(self) -> Dataset:
 
     @property
     @override
-    def items(self) -> Vocabulary[EntityId]:
+    def items(self) -> Vocabulary:
         return self.delegate().items
 
     @property
     @override
-    def users(self) -> Vocabulary[EntityId]:
+    def users(self) -> Vocabulary:
         return self.delegate().users
 
     @override

diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py
@@ -20,17 +20,17 @@
     LiteralString,
     Sequence,
     TypeAlias,
-    TypeVar,
     cast,
     overload,
 )
 
-from lenskit.data.checks import check_1d
-from lenskit.data.mtarray import MTArray, MTGenericArray
-from lenskit.data.vocab import EntityId, NPEntityId, Vocabulary
+from lenskit.types import EntityId, NPEntityId
+
+from .checks import check_1d
+from .mtarray import MTArray, MTGenericArray
+from .vocab import Vocabulary
 
 Backend: TypeAlias = Literal["numpy", "torch"]
-EID = TypeVar("EID", bound=EntityId)
 
 
 class ItemList:
@@ -110,7 +110,7 @@ class is doing somewhat double-duty, representing a list of items along
     _len: int
     _ids: np.ndarray[int, np.dtype[NPEntityId]] | None = None
     _numbers: MTArray[np.int32] | None = None
-    _vocab: Vocabulary[EntityId] | None = None
+    _vocab: Vocabulary | None = None
     _ranks: MTArray[np.int32] | None = None
     _fields: dict[str, MTGenericArray]
 
@@ -119,7 +119,7 @@ def __init__(
         *,
         item_ids: NDArray[NPEntityId] | pd.Series[EntityId] | Sequence[EntityId] | None = None,
         item_nums: NDArray[np.int32] | pd.Series[int] | Sequence[int] | ArrayLike | None = None,
-        vocabulary: Vocabulary[EID] | None = None,
+        vocabulary: Vocabulary | None = None,
         ordered: bool = False,
         scores: NDArray[np.generic] | torch.Tensor | ArrayLike | None = None,
         **fields: NDArray[np.generic] | torch.Tensor | ArrayLike,
@@ -167,7 +167,7 @@ def __init__(
 
     @classmethod
     def from_df(
-        cls, df: pd.DataFrame, *, vocabulary=Vocabulary[EntityId], keep_user: bool = False
+        cls, df: pd.DataFrame, *, vocabulary=Vocabulary, keep_user: bool = False
     ) -> ItemList:
         """
         Create an item list from a Pandas data frame.  The frame should have
@@ -223,24 +223,24 @@ def ids(self) -> NDArray[NPEntityId]:
             if self._vocab is None:
                 raise RuntimeError("item IDs not available (no IDs or vocabulary provided)")
             assert self._numbers is not None
-            self._ids = self._vocab.ids(self._numbers.numpy())
+            self._ids = cast(NDArray[NPEntityId], self._vocab.ids(self._numbers.numpy()))
 
         return self._ids
 
     @overload
     def numbers(
-        self, format: Literal["numpy"] = "numpy", *, vocabulary: Vocabulary[EID] | None = None
+        self, format: Literal["numpy"] = "numpy", *, vocabulary: Vocabulary | None = None
     ) -> NDArray[np.int32]: ...
     @overload
     def numbers(
-        self, format: Literal["torch"], *, vocabulary: Vocabulary[EID] | None = None
+        self, format: Literal["torch"], *, vocabulary: Vocabulary | None = None
     ) -> torch.Tensor: ...
     @overload
     def numbers(
-        self, format: LiteralString = "numpy", *, vocabulary: Vocabulary[EID] | None = None
+        self, format: LiteralString = "numpy", *, vocabulary: Vocabulary | None = None
     ) -> ArrayLike: ...
     def numbers(
-        self, format: LiteralString = "numpy", *, vocabulary: Vocabulary[EID] | None = None
+        self, format: LiteralString = "numpy", *, vocabulary: Vocabulary | None = None
     ) -> ArrayLike:
         """
         Get the item numbers.