Skip to content

Commit

Permalink
Merge branch 'tweak/remove-sparse-ratings'
Browse files Browse the repository at this point in the history
  • Loading branch information
mdekstrand committed Jul 25, 2024
2 parents 4454412 + 910eb93 commit 40df7b1
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 361 deletions.
9 changes: 0 additions & 9 deletions docs/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,3 @@ User-Item Data Tables

.. autoclass:: NumpyUserItemTable
.. autoclass:: TorchUserItemTable

Building Ratings Matrices
~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: lenskit.data.matrix

.. autofunction:: sparse_ratings
.. autoclass:: RatingMatrix
.. autoclass:: CSRStructure
1 change: 0 additions & 1 deletion lenskit/lenskit/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,4 @@
"Types of feedback supported."

from .dataset import Dataset, from_interactions_df # noqa: F401, E402
from .matrix import RatingMatrix, sparse_ratings # noqa: F401, E402
from .movielens import load_movielens # noqa: F401, E402
199 changes: 8 additions & 191 deletions lenskit/lenskit/data/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,10 @@
import platform

import numpy as np
import pandas as pd
import scipy.sparse as sps
import torch
from numpy.typing import ArrayLike
from typing_extensions import Any, Generic, Literal, NamedTuple, Optional, TypeVar, overload
from typing_extensions import Literal, NamedTuple, Optional, TypeVar, overload

_log = logging.getLogger(__name__)

Expand Down Expand Up @@ -114,192 +113,6 @@ def shape(self) -> tuple[int, int]:
return (self.n_users, self.n_items)


class RatingMatrix(NamedTuple, Generic[M]):
    """
    A rating matrix bundled with the indices that label its rows and columns.

    The tuple is generic over the matrix type ``M``, so the same container
    can carry any of the supported matrix representations.
    """

    matrix: M
    "The rating matrix; rows correspond to users and columns to items."
    users: pd.Index[Any]
    "Index mapping user IDs to row numbers of :attr:`matrix`."
    items: pd.Index[Any]
    "Index mapping item IDs to column numbers of :attr:`matrix`."


class DimStats(NamedTuple):
    """
    The statistics for a matrix along a dimension (e.g. rows or columns).
    """

    # NOTE: each attribute docstring must *follow* the field it describes;
    # documentation tools attach a bare string to the preceding assignment.
    n: int
    "The size along this dimension."
    n_other: int
    "The other dimension of the matrix."
    counts: t.Tensor
    "The number of stored entries for each element."
    sums: t.Tensor
    "The sum of entries for each element."
    means: t.Tensor
    "The mean of stored entries for each element."


@overload
def sparse_ratings(
    ratings: pd.DataFrame,
    *,
    type: Literal["scipy"] = "scipy",
    layout: Literal["csr"] = "csr",
    users: Optional[pd.Index[Any]] = None,
    items: Optional[pd.Index[Any]] = None,
) -> RatingMatrix[sps.csr_array]: ...
@overload
def sparse_ratings(
    ratings: pd.DataFrame,
    *,
    type: Literal["scipy"] = "scipy",
    layout: Literal["coo"] = "coo",
    users: Optional[pd.Index[Any]] = None,
    items: Optional[pd.Index[Any]] = None,
) -> RatingMatrix[sps.coo_array]: ...
@overload
def sparse_ratings(
    ratings: pd.DataFrame,
    *,
    type: Literal["spmatrix"] = "spmatrix",
    layout: Literal["csr"] = "csr",
    users: Optional[pd.Index[Any]] = None,
    items: Optional[pd.Index[Any]] = None,
) -> RatingMatrix[sps.csr_matrix]: ...
@overload
def sparse_ratings(
    ratings: pd.DataFrame,
    *,
    type: Literal["spmatrix"] = "spmatrix",
    layout: Literal["coo"] = "coo",
    users: Optional[pd.Index[Any]] = None,
    items: Optional[pd.Index[Any]] = None,
) -> RatingMatrix[sps.coo_matrix]: ...
@overload
def sparse_ratings(
    ratings: pd.DataFrame,
    *,
    type: Literal["torch"],
    layout: Literal["coo", "csr"] = "csr",
    users: Optional[pd.Index[Any]] = None,
    items: Optional[pd.Index[Any]] = None,
) -> RatingMatrix[t.Tensor]: ...
@overload
def sparse_ratings(
    ratings: pd.DataFrame,
    *,
    type: Literal["structure"] = "structure",
    layout: Literal["csr"] = "csr",
    users: Optional[pd.Index[Any]] = None,
    items: Optional[pd.Index[Any]] = None,
) -> RatingMatrix[CSRStructure]: ...
def sparse_ratings(
    ratings: pd.DataFrame,
    *,
    type: Literal["scipy", "spmatrix", "torch", "structure"] = "scipy",
    layout: Literal["csr", "coo"] = "csr",
    users: Optional[pd.Index[Any]] = None,
    items: Optional[pd.Index[Any]] = None,
) -> RatingMatrix[Any]:
    """
    Convert a rating table to a sparse matrix of ratings.

    Args:
        ratings:
            A data table of (user, item, rating) triples.  The ``user`` and
            ``item`` columns are required; if there is no ``rating`` column,
            every stored entry gets the value 1.
        type:
            The type of matrix to create. Can be any of the following:

            * ``scipy`` creates a SciPy sparse array (see :mod:`scipy.sparse`)
            * ``torch`` creates a sparse tensor (see :mod:`torch.sparse`)
            * ``spmatrix`` creates a legacy SciPy :class:`~scipy.sparse.spmatrix`
            * ``structure`` creates a :class:`CSRStructure` holding only the
              row pointers and column indices (no values; CSR layout only)
        layout:
            The matrix layout to use.
        users:
            An index of user IDs.  If omitted, it is built from the unique
            values of ``ratings.user``.
        items:
            An index of item IDs.  If omitted, it is built from the unique
            values of ``ratings.item``.

    Returns:
        RatingMatrix:
            a named tuple containing the sparse matrix, user index, and item
            index.

    Raises:
        ValueError:
            if a supplied index does not contain every user (or item) in
            ``ratings``, if ``type`` is not recognized, or if a
            ``structure`` matrix is requested with a non-CSR layout.
    """
    if users is None:
        users = pd.Index(np.unique(ratings.user), name="user")

    if items is None:
        items = pd.Index(np.unique(ratings.item), name="item")

    n = len(ratings)
    ni = len(items)
    nu = len(users)

    _log.debug("creating matrix with %d ratings for %d items by %d users", n, ni, nu)

    # get_indexer reports IDs missing from the index as -1
    row_ind = users.get_indexer(ratings.user).astype(np.intc)
    if np.any(row_ind < 0):
        raise ValueError("provided user index does not cover all users")
    col_ind = items.get_indexer(ratings.item).astype(np.intc)
    if np.any(col_ind < 0):
        raise ValueError("provided item index does not cover all items")

    if type == "torch":
        if "rating" in ratings.columns:
            vals = t.from_numpy(ratings["rating"].values).to(t.float32)
        else:
            vals = t.ones((len(ratings),), dtype=t.float32)
        indices = t.stack([t.from_numpy(row_ind), t.from_numpy(col_ind)], dim=0)
        matrix = t.sparse_coo_tensor(indices, vals, size=(nu, ni))
        if layout == "csr":
            matrix = matrix.to_sparse_csr()
    elif type == "scipy" or type == "spmatrix":
        if "rating" in ratings.columns:
            vals = ratings["rating"].values
        else:
            vals = np.ones((len(ratings),), dtype=np.float32)
        if type == "spmatrix":
            matrix = sps.coo_matrix((vals, (row_ind, col_ind)), shape=(nu, ni))
        else:
            matrix = sps.coo_array((vals, (row_ind, col_ind)), shape=(nu, ni))
        if layout == "csr":
            matrix = matrix.tocsr()
    elif type == "structure":
        if layout != "csr":
            raise ValueError("only CSR is supported for structure matrices")

        # Build the CSR row-pointer array by hand: sort the coordinates by
        # row, count the entries in each row, and take the cumulative sum.
        df = pd.DataFrame({"row": row_ind, "col": col_ind})
        df.sort_values(["row", "col"], inplace=True, ignore_index=True)
        counts = df["row"].value_counts(sort=False)
        rps = np.zeros(nu + 1, dtype=np.int32)
        rps[counts.index + 1] = counts.values
        rps = np.cumsum(rps)
        matrix = CSRStructure(rps, df["col"].values, (nu, ni))
    else:
        raise ValueError(f"unknown type {type}")

    return RatingMatrix(matrix, users, items)


def sparse_row_stats(matrix: t.Tensor) -> DimStats:
    """
    Compute per-row statistics of a sparse CSR tensor.

    Args:
        matrix:
            A 2-dimensional tensor in sparse CSR layout.

    Returns:
        DimStats:
            the number of rows, the number of columns, and for each row the
            number of stored entries, their sum, and their mean.  A row with
            no stored entries has a count of zero, so its mean is the result
            of a division by zero (NaN for a zero sum) rather than 0.

    Raises:
        TypeError:
            if ``matrix`` is not in sparse CSR layout.
    """
    if not matrix.is_sparse_csr:
        raise TypeError("only sparse CSR matrices supported")

    n, n_other = matrix.shape
    # consecutive differences of the row pointers give per-row entry counts
    counts = matrix.crow_indices().diff()
    assert counts.shape == (n,), f"count shape {counts.shape} != {n}"
    sums = matrix.sum(dim=1, keepdim=True).to_dense().reshape(n)
    assert sums.shape == (n,), f"sum shape {sums.shape} != {n}"
    means = sums / counts

    return DimStats(n, n_other, counts, sums, means)


@overload
def normalize_sparse_rows(
matrix: t.Tensor, method: Literal["center"], inplace: bool = False
Expand All @@ -324,13 +137,17 @@ def normalize_sparse_rows(


def _nsr_mean_center(matrix: t.Tensor) -> tuple[t.Tensor, t.Tensor]:
stats = sparse_row_stats(matrix)
nr, _nc = matrix.shape
sums = matrix.sum(dim=1, keepdim=True).to_dense().reshape(nr)
counts = torch.diff(matrix.crow_indices())
assert sums.shape == counts.shape
means = torch.nan_to_num(sums / counts, 0)
return t.sparse_csr_tensor(
crow_indices=matrix.crow_indices(),
col_indices=matrix.col_indices(),
values=matrix.values() - t.repeat_interleave(stats.means, stats.counts),
values=matrix.values() - t.repeat_interleave(means, counts),
size=matrix.shape,
), stats.means
), means


def _nsr_unit(matrix: t.Tensor) -> tuple[t.Tensor, t.Tensor]:
Expand Down
Loading

0 comments on commit 40df7b1

Please sign in to comment.