Skip to content

Commit

Permalink
Merge pull request #465 from mdekstrand/feature/type-docs
Browse files Browse the repository at this point in the history
Simplify Vocabulary, add types module, and improve type documentation
  • Loading branch information
mdekstrand authored Aug 6, 2024
2 parents 19ce50b + 4cf3134 commit 967655d
Show file tree
Hide file tree
Showing 20 changed files with 189 additions and 111 deletions.
5 changes: 2 additions & 3 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@
autodoc_default_options = {"members": True, "member-order": "bysource", "show-inheritance": True}
autodoc_typehints = "description"
autodoc_type_aliases = {
"Iterable": "Iterable",
"ArrayLike": "ArrayLike",
"ArrayLike": "numpy.typing.ArrayLike",
"RandomSeed": "lenskit.types.RandomSeed",
}

todo_include_todos = True
Expand All @@ -95,7 +95,6 @@
"pandas": ("http://pandas.pydata.org/pandas-docs/stable/", None),
"numpy": ("https://numpy.org/doc/stable/", None),
"scipy": ("https://docs.scipy.org/doc/scipy/", None),
"scikit": ("https://scikit-learn.org/stable/", None),
"sklearn": ("https://scikit-learn.org/stable/", None),
"seedbank": ("https://seedbank.lenskit.org/en/latest/", None),
"progress_api": ("https://progress-api.readthedocs.io/en/latest/", None),
Expand Down
12 changes: 8 additions & 4 deletions docs/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ Identifiers and numbers can be mapped to each other with the user and item
*vocabularies* (:attr:`~Dataset.users` and :attr:`~Dataset.items`, see the
:class:`~lenskit.data.vocab.Vocabulary` class).

.. autodata:: lenskit.data.vocab.EntityId
.. autodata:: EntityId

.. _dataset:

Expand Down Expand Up @@ -89,13 +89,16 @@ LensKit uses *vocabularies* to record user/item IDs, tags, terms, etc. in a way
that facilitates easy mapping to 0-based contiguous indexes for use in matrix
and tensor data structures.

.. module:: lenskit.data

.. autoclass:: Vocabulary

User and Item Data
~~~~~~~~~~~~~~~~~~

The :mod:`lenskit.data` package also provides various classes for representing
user and item data.

Item Lists
~~~~~~~~~~
----------

LensKit uses *item lists* to represent collections of items that may be scored,
ranked, etc.
Expand Down Expand Up @@ -131,4 +134,5 @@ The lazy data set takes a function that loads a data set (of any type), and
lazily uses that function to load an underlying data set when needed.

.. autoclass:: LazyDataset
:no-members:
:members: delegate
16 changes: 16 additions & 0 deletions docs/internals.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,19 @@ LensKit Internals
These modules are primarily for internal infrastructural support in LensKit.
Neither LensKit users nor algorithm developers are likely to need to use this
code directly.

.. class:: lenskit.types.RandomSeed

Random seed values for LensKit models and components. Can be any valid
input to :func:`seedbank.numpy_rng`, including:

* Any :data:`seedbank.SeedLike`
* A :class:`numpy.random.Generator`
* A :class:`numpy.random.RandomState` (deprecated)

.. note::

This is a type alias, not a class; it is documented as a class to work
around limitations in Sphinx.

.. autoclass:: lenskit.types.UITuple
6 changes: 3 additions & 3 deletions lenskit-implicit/lenskit/implicit.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from lenskit.algorithms import Predictor, Recommender
from lenskit.data.dataset import Dataset
from lenskit.data.vocab import EntityId, Vocabulary
from lenskit.data.vocab import Vocabulary

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -59,11 +59,11 @@ class BaseRec(Recommender, Predictor):
"""
The user-item rating matrix from training.
"""
users_: Vocabulary[EntityId]
users_: Vocabulary
"""
The user ID mapping from training.
"""
items_: Vocabulary[EntityId]
items_: Vocabulary
"""
The item ID mapping from training.
"""
Expand Down
11 changes: 4 additions & 7 deletions lenskit/lenskit/algorithms/als/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@

from lenskit import util
from lenskit.algorithms.mf_common import MFPredictor
from lenskit.data.dataset import Dataset
from lenskit.data.vocab import EntityId, Vocabulary
from lenskit.data import Dataset, Vocabulary
from lenskit.parallel.config import ensure_parallel_init


Expand Down Expand Up @@ -55,9 +54,9 @@ class TrainingData(NamedTuple):
Data for training the ALS model.
"""

users: Vocabulary[EntityId]
users: Vocabulary
"User ID mapping."
items: Vocabulary[EntityId]
items: Vocabulary
"Item ID mapping."
ui_rates: torch.Tensor
"User-item rating matrix."
Expand All @@ -73,9 +72,7 @@ def n_items(self):
return len(self.items)

@classmethod
def create(
cls, users: Vocabulary[EntityId], items: Vocabulary[EntityId], ratings: torch.Tensor
) -> TrainingData:
def create(cls, users: Vocabulary, items: Vocabulary, ratings: torch.Tensor) -> TrainingData:
assert ratings.shape == (len(users), len(items))

transposed = ratings.transpose(0, 1).to_sparse_csr()
Expand Down
6 changes: 3 additions & 3 deletions lenskit/lenskit/algorithms/knn/item.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from lenskit.data import FeedbackType
from lenskit.data.dataset import Dataset
from lenskit.data.matrix import normalize_sparse_rows, safe_spmv
from lenskit.data.vocab import EntityId, Vocabulary
from lenskit.data.vocab import Vocabulary
from lenskit.diagnostics import ConfigWarning, DataWarning
from lenskit.parallel import ensure_parallel_init
from lenskit.util.logging import pbh_update, progress_handle
Expand Down Expand Up @@ -111,15 +111,15 @@ class ItemItem(Predictor):
aggregate: str
use_ratings: bool

items_: Vocabulary[EntityId]
items_: Vocabulary
"Vocabulary of item IDs."
item_means_: torch.Tensor | None
"Mean rating for each known item."
item_counts_: torch.Tensor
"Number of saved neighbors for each item."
sim_matrix_: torch.Tensor
"Similarity matrix (sparse CSR tensor)."
users_: Vocabulary[EntityId]
users_: Vocabulary
"Vocabulary of user IDs."
rating_matrix_: torch.Tensor
"Normalized rating matrix to look up user ratings at prediction time."
Expand Down
6 changes: 3 additions & 3 deletions lenskit/lenskit/algorithms/knn/user.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from lenskit.data import FeedbackType
from lenskit.data.dataset import Dataset
from lenskit.data.matrix import normalize_sparse_rows, safe_spmv
from lenskit.data.vocab import EntityId, Vocabulary
from lenskit.data.vocab import Vocabulary
from lenskit.diagnostics import DataWarning
from lenskit.parallel.config import ensure_parallel_init

Expand Down Expand Up @@ -83,9 +83,9 @@ class UserUser(Predictor):
aggregate: str
use_ratings: bool

users_: Vocabulary[EntityId]
users_: Vocabulary
"The index of user IDs."
items_: Vocabulary[EntityId]
items_: Vocabulary
"The index of item IDs."
user_means_: torch.Tensor | None
"Mean rating for each known user."
Expand Down
6 changes: 3 additions & 3 deletions lenskit/lenskit/algorithms/svd.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from typing_extensions import Literal, override

from lenskit.data.dataset import Dataset
from lenskit.data.vocab import EntityId, Vocabulary
from lenskit.data.vocab import Vocabulary

try:
from sklearn.decomposition import TruncatedSVD
Expand Down Expand Up @@ -42,8 +42,8 @@ class BiasedSVD(Predictor):

bias: Bias
factorization: TruncatedSVD
users_: Vocabulary[EntityId]
items_: Vocabulary[EntityId]
users_: Vocabulary
items_: Vocabulary

def __init__(
self,
Expand Down
6 changes: 4 additions & 2 deletions lenskit/lenskit/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
# Copyright (C) 2023-2024 Drexel University
# Licensed under the MIT license, see LICENSE.md for details.
# SPDX-License-Identifier: MIT
from __future__ import annotations

from typing import Literal, TypeAlias
from typing_extensions import Literal, TypeAlias

from .vocab import EntityId, Vocabulary # noqa: F401, E402
from lenskit.types import EntityId, NPEntityId # noqa: F401

FeedbackType: TypeAlias = Literal["explicit", "implicit"]
"Types of feedback supported."
Expand All @@ -15,3 +16,4 @@
from .items import ItemList # noqa: F401, E402
from .movielens import load_movielens # noqa: F401, E402
from .mtarray import MTArray, MTFloatArray, MTGenericArray, MTIntArray # noqa: F401, E402
from .vocab import Vocabulary # noqa: F401, E402
24 changes: 12 additions & 12 deletions lenskit/lenskit/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,12 @@
override,
)

from lenskit.data.items import ItemList
from lenskit.data.matrix import CSRStructure, InteractionMatrix
from lenskit.data.vocab import Vocabulary
from lenskit.types import EntityId

from . import EntityId
from .items import ItemList
from .matrix import CSRStructure, InteractionMatrix
from .tables import NumpyUserItemTable, TorchUserItemTable
from .vocab import Vocabulary

DF_FORMAT: TypeAlias = Literal["numpy", "pandas", "torch"]
MAT_FORMAT: TypeAlias = Literal["scipy", "torch", "pandas", "structure"]
Expand Down Expand Up @@ -84,15 +84,15 @@ class Dataset(ABC):

@property
@abstractmethod
def items(self) -> Vocabulary[EntityId]:
def items(self) -> Vocabulary:
"""
The items known by this dataset.
"""
raise NotImplementedError()

@property
@abstractmethod
def users(self) -> Vocabulary[EntityId]:
def users(self) -> Vocabulary:
"""
The users known by this dataset.
"""
Expand Down Expand Up @@ -504,9 +504,9 @@ class MatrixDataset(Dataset):
:mod:`lenskit.data`.
"""

_users: Vocabulary[EntityId]
_users: Vocabulary
"User ID vocabulary, to map between IDs and row numbers."
_items: Vocabulary[EntityId]
_items: Vocabulary
"Item ID vocabulary, to map between IDs and column or row numbers."
_matrix: InteractionMatrix

Expand Down Expand Up @@ -546,12 +546,12 @@ def _init_structures(self, df: pd.DataFrame):

@property
@override
def items(self) -> Vocabulary[EntityId]:
def items(self) -> Vocabulary:
return self._items

@property
@override
def users(self) -> Vocabulary[EntityId]:
def users(self) -> Vocabulary:
return self._users

@override
Expand Down Expand Up @@ -795,12 +795,12 @@ def delegate(self) -> Dataset:

@property
@override
def items(self) -> Vocabulary[EntityId]:
def items(self) -> Vocabulary:
return self.delegate().items

@property
@override
def users(self) -> Vocabulary[EntityId]:
def users(self) -> Vocabulary:
return self.delegate().users

@override
Expand Down
26 changes: 13 additions & 13 deletions lenskit/lenskit/data/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,17 @@
LiteralString,
Sequence,
TypeAlias,
TypeVar,
cast,
overload,
)

from lenskit.data.checks import check_1d
from lenskit.data.mtarray import MTArray, MTGenericArray
from lenskit.data.vocab import EntityId, NPEntityId, Vocabulary
from lenskit.types import EntityId, NPEntityId

from .checks import check_1d
from .mtarray import MTArray, MTGenericArray
from .vocab import Vocabulary

Backend: TypeAlias = Literal["numpy", "torch"]
EID = TypeVar("EID", bound=EntityId)


class ItemList:
Expand Down Expand Up @@ -110,7 +110,7 @@ class is doing somewhat double-duty, representing a list of items along
_len: int
_ids: np.ndarray[int, np.dtype[NPEntityId]] | None = None
_numbers: MTArray[np.int32] | None = None
_vocab: Vocabulary[EntityId] | None = None
_vocab: Vocabulary | None = None
_ranks: MTArray[np.int32] | None = None
_fields: dict[str, MTGenericArray]

Expand All @@ -119,7 +119,7 @@ def __init__(
*,
item_ids: NDArray[NPEntityId] | pd.Series[EntityId] | Sequence[EntityId] | None = None,
item_nums: NDArray[np.int32] | pd.Series[int] | Sequence[int] | ArrayLike | None = None,
vocabulary: Vocabulary[EID] | None = None,
vocabulary: Vocabulary | None = None,
ordered: bool = False,
scores: NDArray[np.generic] | torch.Tensor | ArrayLike | None = None,
**fields: NDArray[np.generic] | torch.Tensor | ArrayLike,
Expand Down Expand Up @@ -167,7 +167,7 @@ def __init__(

@classmethod
def from_df(
cls, df: pd.DataFrame, *, vocabulary=Vocabulary[EntityId], keep_user: bool = False
cls, df: pd.DataFrame, *, vocabulary=Vocabulary, keep_user: bool = False
) -> ItemList:
"""
Create an item list from a Pandas data frame. The frame should have
Expand Down Expand Up @@ -223,24 +223,24 @@ def ids(self) -> NDArray[NPEntityId]:
if self._vocab is None:
raise RuntimeError("item IDs not available (no IDs or vocabulary provided)")
assert self._numbers is not None
self._ids = self._vocab.ids(self._numbers.numpy())
self._ids = cast(NDArray[NPEntityId], self._vocab.ids(self._numbers.numpy()))

return self._ids

@overload
def numbers(
self, format: Literal["numpy"] = "numpy", *, vocabulary: Vocabulary[EID] | None = None
self, format: Literal["numpy"] = "numpy", *, vocabulary: Vocabulary | None = None
) -> NDArray[np.int32]: ...
@overload
def numbers(
self, format: Literal["torch"], *, vocabulary: Vocabulary[EID] | None = None
self, format: Literal["torch"], *, vocabulary: Vocabulary | None = None
) -> torch.Tensor: ...
@overload
def numbers(
self, format: LiteralString = "numpy", *, vocabulary: Vocabulary[EID] | None = None
self, format: LiteralString = "numpy", *, vocabulary: Vocabulary | None = None
) -> ArrayLike: ...
def numbers(
self, format: LiteralString = "numpy", *, vocabulary: Vocabulary[EID] | None = None
self, format: LiteralString = "numpy", *, vocabulary: Vocabulary | None = None
) -> ArrayLike:
"""
Get the item numbers.
Expand Down
Loading

0 comments on commit 967655d

Please sign in to comment.