Merge branch 'tweak/remove-sparse-ratings'

lenskit · Jul 25, 2024 · 40df7b1
2 parents 4454412 + 910eb93
commit 40df7b1
Show file tree

Hide file tree

Showing 5 changed files with 25 additions and 361 deletions.
diff --git a/docs/data.rst b/docs/data.rst
@@ -119,12 +119,3 @@ User-Item Data Tables
 
 .. autoclass:: NumpyUserItemTable
 .. autoclass:: TorchUserItemTable
-
-Building Ratings Matrices
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. module:: lenskit.data.matrix
-
-.. autofunction:: sparse_ratings
-.. autoclass:: RatingMatrix
-.. autoclass:: CSRStructure
diff --git a/lenskit/lenskit/data/__init__.py b/lenskit/lenskit/data/__init__.py
@@ -12,5 +12,4 @@
 "Types of feedback supported."
 
 from .dataset import Dataset, from_interactions_df  # noqa: F401, E402
-from .matrix import RatingMatrix, sparse_ratings  # noqa: F401, E402
 from .movielens import load_movielens  # noqa: F401, E402
diff --git a/lenskit/lenskit/data/matrix.py b/lenskit/lenskit/data/matrix.py
@@ -15,11 +15,10 @@
 import platform
 
 import numpy as np
-import pandas as pd
 import scipy.sparse as sps
 import torch
 from numpy.typing import ArrayLike
-from typing_extensions import Any, Generic, Literal, NamedTuple, Optional, TypeVar, overload
+from typing_extensions import Literal, NamedTuple, Optional, TypeVar, overload
 
 _log = logging.getLogger(__name__)
 
@@ -114,192 +113,6 @@ def shape(self) -> tuple[int, int]:
         return (self.n_users, self.n_items)
 
 
-class RatingMatrix(NamedTuple, Generic[M]):
-    """
-    A rating matrix with associated indices.
-    """
-
-    matrix: M
-    "The rating matrix, with users on rows and items on columns."
-    users: pd.Index[Any]
-    "Mapping from user IDs to row numbers."
-    items: pd.Index[Any]
-    "Mapping from item IDs to column numbers."
-
-
-class DimStats(NamedTuple):
-    """
-    The statistics for a matrix along a dimension (e.g. rows or columns).
-    """
-
-    "The size along this dimension."
-    n: int
-    "The other dimension of the matrix."
-    n_other: int
-    "The number of stored entries for each element."
-    counts: t.Tensor
-    "The sum of entries for each element."
-    sums: t.Tensor
-    "The mean of stored entries for each element."
-    means: t.Tensor
-
-
-@overload
-def sparse_ratings(
-    ratings: pd.DataFrame,
-    *,
-    type: Literal["scipy"] = "scipy",
-    layout: Literal["csr"] = "csr",
-    users: Optional[pd.Index[Any]] = None,
-    items: Optional[pd.Index[Any]] = None,
-) -> RatingMatrix[sps.csr_array]: ...
-@overload
-def sparse_ratings(
-    ratings: pd.DataFrame,
-    *,
-    type: Literal["scipy"] = "scipy",
-    layout: Literal["coo"] = "coo",
-    users: Optional[pd.Index[Any]] = None,
-    items: Optional[pd.Index[Any]] = None,
-) -> RatingMatrix[sps.coo_array]: ...
-@overload
-def sparse_ratings(
-    ratings: pd.DataFrame,
-    *,
-    type: Literal["spmatrix"] = "spmatrix",
-    layout: Literal["csr"] = "csr",
-    users: Optional[pd.Index[Any]] = None,
-    items: Optional[pd.Index[Any]] = None,
-) -> RatingMatrix[sps.csr_matrix]: ...
-@overload
-def sparse_ratings(
-    ratings: pd.DataFrame,
-    *,
-    type: Literal["spmatrix"] = "spmatrix",
-    layout: Literal["coo"] = "coo",
-    users: Optional[pd.Index[Any]] = None,
-    items: Optional[pd.Index[Any]] = None,
-) -> RatingMatrix[sps.coo_matrix]: ...
-@overload
-def sparse_ratings(
-    ratings: pd.DataFrame,
-    *,
-    type: Literal["torch"],
-    layout: Literal["coo", "csr"] = "csr",
-    users: Optional[pd.Index[Any]] = None,
-    items: Optional[pd.Index[Any]] = None,
-) -> RatingMatrix[t.Tensor]: ...
-@overload
-def sparse_ratings(
-    ratings: pd.DataFrame,
-    *,
-    type: Literal["structure"] = "structure",
-    layout: Literal["csr"] = "csr",
-    users: Optional[pd.Index[Any]] = None,
-    items: Optional[pd.Index[Any]] = None,
-) -> RatingMatrix[CSRStructure]: ...
-def sparse_ratings(
-    ratings: pd.DataFrame,
-    *,
-    type: Literal["scipy", "spmatrix", "torch", "structure"] = "scipy",
-    layout: Literal["csr", "coo"] = "csr",
-    users: Optional[pd.Index[Any]] = None,
-    items: Optional[pd.Index[Any]] = None,
-) -> RatingMatrix[Any]:
-    """
-    Convert a rating table to a sparse matrix of ratings.
-
-    Args:
-        ratings:
-            A data table of (user, item, rating) triples.
-        type:
-            The type of matrix to create.  Can be any of the following:
-
-            * ``scipy`` creates a SciPy sparse array (see :mod:`scipy.sparse`)
-            * ``torch`` creates a sparse tensor (see :mod:`torch.sparse`)
-            * ``spmatrix`` creates a legacy SciPy :class:`~scipy.sparse.spmatrix`
-        layout:
-            The matrix layout to use.
-        users:
-            An index of user IDs.
-        items:
-            An index of items IDs.
-
-    Returns:
-        RatingMatrix:
-            a named tuple containing the sparse matrix, user index, and item
-            index.
-    """
-    if users is None:
-        users = pd.Index(np.unique(ratings.user), name="user")
-
-    if items is None:
-        items = pd.Index(np.unique(ratings.item), name="item")
-
-    n = len(ratings)
-    ni = len(items)
-    nu = len(users)
-
-    _log.debug("creating matrix with %d ratings for %d items by %d users", n, ni, nu)
-
-    row_ind = users.get_indexer(ratings.user).astype(np.intc)
-    if np.any(row_ind < 0):
-        raise ValueError("provided user index does not cover all users")
-    col_ind = items.get_indexer(ratings.item).astype(np.intc)
-    if np.any(col_ind < 0):
-        raise ValueError("provided item index does not cover all users")
-
-    if type == "torch":
-        if "rating" in ratings.columns:
-            vals = t.from_numpy(ratings["rating"].values).to(t.float32)
-        else:
-            vals = t.ones((len(ratings),), dtype=t.float32)
-        indices = t.stack([t.from_numpy(row_ind), t.from_numpy(col_ind)], dim=0)
-        matrix = t.sparse_coo_tensor(indices, vals, size=(nu, ni))
-        if layout == "csr":
-            matrix = matrix.to_sparse_csr()
-    elif type == "scipy" or type == "spmatrix":
-        if "rating" in ratings.columns:
-            vals = ratings["rating"].values
-        else:
-            vals = np.ones((len(ratings),), dtype=np.float32)
-        if type == "spmatrix":
-            matrix = sps.coo_matrix((vals, (row_ind, col_ind)), shape=(nu, ni))
-        else:
-            matrix = sps.coo_array((vals, (row_ind, col_ind)), shape=(nu, ni))
-        if layout == "csr":
-            matrix = matrix.tocsr()
-    elif type == "structure":
-        if layout != "csr":
-            raise ValueError("only CSR is supported for structure matrices")
-
-        df = pd.DataFrame({"row": row_ind, "col": col_ind})
-        df.sort_values(["row", "col"], inplace=True, ignore_index=True)
-        counts = df["row"].value_counts(sort=False)
-        rps = np.zeros(nu + 1, dtype=np.int32)
-        rps[counts.index + 1] = counts.values
-        rps = np.cumsum(rps)
-        matrix = CSRStructure(rps, df["col"].values, (nu, ni))
-    else:
-        raise ValueError(f"unknown type {type}")
-
-    return RatingMatrix(matrix, users, items)
-
-
-def sparse_row_stats(matrix: t.Tensor) -> DimStats:
-    if not matrix.is_sparse_csr:
-        raise TypeError("only sparse CSR matrice supported")
-
-    n, n_other = matrix.shape
-    counts = matrix.crow_indices().diff()
-    assert counts.shape == (n,), f"count shape {counts.shape} != {n}"
-    sums = matrix.sum(dim=1, keepdim=True).to_dense().reshape(n)
-    assert sums.shape == (n,), f"sum shape {sums.shape} != {n}"
-    means = sums / counts
-
-    return DimStats(n, n_other, counts, sums, means)
-
-
 @overload
 def normalize_sparse_rows(
     matrix: t.Tensor, method: Literal["center"], inplace: bool = False
@@ -324,13 +137,17 @@ def normalize_sparse_rows(
 
 
 def _nsr_mean_center(matrix: t.Tensor) -> tuple[t.Tensor, t.Tensor]:
-    stats = sparse_row_stats(matrix)
+    nr, _nc = matrix.shape
+    sums = matrix.sum(dim=1, keepdim=True).to_dense().reshape(nr)
+    counts = torch.diff(matrix.crow_indices())
+    assert sums.shape == counts.shape
+    means = torch.nan_to_num(sums / counts, 0)
     return t.sparse_csr_tensor(
         crow_indices=matrix.crow_indices(),
         col_indices=matrix.col_indices(),
-        values=matrix.values() - t.repeat_interleave(stats.means, stats.counts),
+        values=matrix.values() - t.repeat_interleave(means, counts),
         size=matrix.shape,
-    ), stats.means
+    ), means
 
 
 def _nsr_unit(matrix: t.Tensor) -> tuple[t.Tensor, t.Tensor]: