From 070b35e32a010e3a3b55f36e5c44ad2768cb885c Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Thu, 25 Jul 2024 11:18:56 -0400 Subject: [PATCH 1/6] remove sparse_ratings function --- docs/data.rst | 9 -- lenskit/lenskit/data/__init__.py | 1 - lenskit/lenskit/data/matrix.py | 158 +------------------------------ lenskit/tests/test_matrix.py | 140 +-------------------------- 4 files changed, 4 insertions(+), 304 deletions(-) diff --git a/docs/data.rst b/docs/data.rst index 76d8f0ac4..76aea5c72 100644 --- a/docs/data.rst +++ b/docs/data.rst @@ -119,12 +119,3 @@ User-Item Data Tables .. autoclass:: NumpyUserItemTable .. autoclass:: TorchUserItemTable - -Building Ratings Matrices -~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. module:: lenskit.data.matrix - -.. autofunction:: sparse_ratings -.. autoclass:: RatingMatrix -.. autoclass:: CSRStructure diff --git a/lenskit/lenskit/data/__init__.py b/lenskit/lenskit/data/__init__.py index 5d6d5ecd6..6ddefed5d 100644 --- a/lenskit/lenskit/data/__init__.py +++ b/lenskit/lenskit/data/__init__.py @@ -12,5 +12,4 @@ "Types of feedback supported." from .dataset import Dataset, from_interactions_df # noqa: F401, E402 -from .matrix import RatingMatrix, sparse_ratings # noqa: F401, E402 from .movielens import load_movielens # noqa: F401, E402 diff --git a/lenskit/lenskit/data/matrix.py b/lenskit/lenskit/data/matrix.py index 3be88eb55..b48d7c241 100644 --- a/lenskit/lenskit/data/matrix.py +++ b/lenskit/lenskit/data/matrix.py @@ -15,11 +15,10 @@ import platform import numpy as np -import pandas as pd import scipy.sparse as sps import torch from numpy.typing import ArrayLike -from typing_extensions import Any, Generic, Literal, NamedTuple, Optional, TypeVar, overload +from typing_extensions import Literal, NamedTuple, Optional, TypeVar, overload _log = logging.getLogger(__name__) @@ -114,19 +113,6 @@ def shape(self) -> tuple[int, int]: return (self.n_users, self.n_items) -class RatingMatrix(NamedTuple, Generic[M]): - """ - A rating matrix with associated indices. - """ - - matrix: M - "The rating matrix, with users on rows and items on columns." - users: pd.Index[Any] - "Mapping from user IDs to row numbers." - items: pd.Index[Any] - "Mapping from item IDs to column numbers." - - class DimStats(NamedTuple): """ The statistics for a matrix along a dimension (e.g. rows or columns). @@ -144,148 +130,6 @@ class DimStats(NamedTuple): means: t.Tensor -@overload -def sparse_ratings( - ratings: pd.DataFrame, - *, - type: Literal["scipy"] = "scipy", - layout: Literal["csr"] = "csr", - users: Optional[pd.Index[Any]] = None, - items: Optional[pd.Index[Any]] = None, -) -> RatingMatrix[sps.csr_array]: ... -@overload -def sparse_ratings( - ratings: pd.DataFrame, - *, - type: Literal["scipy"] = "scipy", - layout: Literal["coo"] = "coo", - users: Optional[pd.Index[Any]] = None, - items: Optional[pd.Index[Any]] = None, -) -> RatingMatrix[sps.coo_array]: ... -@overload -def sparse_ratings( - ratings: pd.DataFrame, - *, - type: Literal["spmatrix"] = "spmatrix", - layout: Literal["csr"] = "csr", - users: Optional[pd.Index[Any]] = None, - items: Optional[pd.Index[Any]] = None, -) -> RatingMatrix[sps.csr_matrix]: ... -@overload -def sparse_ratings( - ratings: pd.DataFrame, - *, - type: Literal["spmatrix"] = "spmatrix", - layout: Literal["coo"] = "coo", - users: Optional[pd.Index[Any]] = None, - items: Optional[pd.Index[Any]] = None, -) -> RatingMatrix[sps.coo_matrix]: ... -@overload -def sparse_ratings( - ratings: pd.DataFrame, - *, - type: Literal["torch"], - layout: Literal["coo", "csr"] = "csr", - users: Optional[pd.Index[Any]] = None, - items: Optional[pd.Index[Any]] = None, -) -> RatingMatrix[t.Tensor]: ... -@overload -def sparse_ratings( - ratings: pd.DataFrame, - *, - type: Literal["structure"] = "structure", - layout: Literal["csr"] = "csr", - users: Optional[pd.Index[Any]] = None, - items: Optional[pd.Index[Any]] = None, -) -> RatingMatrix[CSRStructure]: ... -def sparse_ratings( - ratings: pd.DataFrame, - *, - type: Literal["scipy", "spmatrix", "torch", "structure"] = "scipy", - layout: Literal["csr", "coo"] = "csr", - users: Optional[pd.Index[Any]] = None, - items: Optional[pd.Index[Any]] = None, -) -> RatingMatrix[Any]: - """ - Convert a rating table to a sparse matrix of ratings. - - Args: - ratings: - A data table of (user, item, rating) triples. - type: - The type of matrix to create. Can be any of the following: - - * ``scipy`` creates a SciPy sparse array (see :mod:`scipy.sparse`) - * ``torch`` creates a sparse tensor (see :mod:`torch.sparse`) - * ``spmatrix`` creates a legacy SciPy :class:`~scipy.sparse.spmatrix` - layout: - The matrix layout to use. - users: - An index of user IDs. - items: - An index of items IDs. - - Returns: - RatingMatrix: - a named tuple containing the sparse matrix, user index, and item - index. - """ - if users is None: - users = pd.Index(np.unique(ratings.user), name="user") - - if items is None: - items = pd.Index(np.unique(ratings.item), name="item") - - n = len(ratings) - ni = len(items) - nu = len(users) - - _log.debug("creating matrix with %d ratings for %d items by %d users", n, ni, nu) - - row_ind = users.get_indexer(ratings.user).astype(np.intc) - if np.any(row_ind < 0): - raise ValueError("provided user index does not cover all users") - col_ind = items.get_indexer(ratings.item).astype(np.intc) - if np.any(col_ind < 0): - raise ValueError("provided item index does not cover all users") - - if type == "torch": - if "rating" in ratings.columns: - vals = t.from_numpy(ratings["rating"].values).to(t.float32) - else: - vals = t.ones((len(ratings),), dtype=t.float32) - indices = t.stack([t.from_numpy(row_ind), t.from_numpy(col_ind)], dim=0) - matrix = t.sparse_coo_tensor(indices, vals, size=(nu, ni)) - if layout == "csr": - matrix = matrix.to_sparse_csr() - elif type == "scipy" or type == "spmatrix": - if "rating" in ratings.columns: - vals = ratings["rating"].values - else: - vals = np.ones((len(ratings),), dtype=np.float32) - if type == "spmatrix": - matrix = sps.coo_matrix((vals, (row_ind, col_ind)), shape=(nu, ni)) - else: - matrix = sps.coo_array((vals, (row_ind, col_ind)), shape=(nu, ni)) - if layout == "csr": - matrix = matrix.tocsr() - elif type == "structure": - if layout != "csr": - raise ValueError("only CSR is supported for structure matrices") - - df = pd.DataFrame({"row": row_ind, "col": col_ind}) - df.sort_values(["row", "col"], inplace=True, ignore_index=True) - counts = df["row"].value_counts(sort=False) - rps = np.zeros(nu + 1, dtype=np.int32) - rps[counts.index + 1] = counts.values - rps = np.cumsum(rps) - matrix = CSRStructure(rps, df["col"].values, (nu, ni)) - else: - raise ValueError(f"unknown type {type}") - - return RatingMatrix(matrix, users, items) - - def sparse_row_stats(matrix: t.Tensor) -> DimStats: if not matrix.is_sparse_csr: raise TypeError("only sparse CSR matrice supported") diff --git a/lenskit/tests/test_matrix.py b/lenskit/tests/test_matrix.py index 845ad1b60..e7cba873e 100644 --- a/lenskit/tests/test_matrix.py +++ b/lenskit/tests/test_matrix.py @@ -4,151 +4,17 @@ # Licensed under the MIT license, see LICENSE.md for details. # SPDX-License-Identifier: MIT -import logging import numpy as np -import pandas as pd -import scipy.sparse as sps import torch import hypothesis.extra.numpy as nph import hypothesis.strategies as st from hypothesis import HealthCheck, assume, given, settings -from pytest import approx, mark +from pytest import approx -from lenskit.data import sparse_ratings -from lenskit.data.matrix import CSRStructure, safe_spmv, torch_sparse_from_scipy -from lenskit.util.test import coo_arrays, ml_test - -_log = logging.getLogger(__name__) - - -def test_sparse_ratings(rng): - ratings = ml_test.ratings - mat, uidx, iidx = sparse_ratings(ratings) - - assert mat.shape[0] == len(uidx) - assert mat.shape[0] == ratings.user.nunique() - assert mat.shape[1] == len(iidx) - assert mat.shape[1] == ratings.item.nunique() - - # user indicators should correspond to user item counts - ucounts = ratings.groupby("user").item.count() - ucounts = ucounts.loc[uidx].cumsum() - assert all(mat.indptr[1:] == ucounts.values) - - # verify rating values - ratings = ratings.set_index(["user", "item"]) - for u in rng.choice(uidx, size=50): - ui = uidx.get_loc(u) - r = mat[[ui], :] - vs = pd.Series(r.data, iidx[r.indices]) - rates = ratings.loc[u]["rating"] - print(f"values:\n{vs}") - print(f"ratings:\n{rates}") - vs, rates = vs.align(rates) - assert not any(vs.isna()) - assert not any(rates.isna()) - assert all(vs == rates) - - -def test_sparse_ratings_implicit(): - ratings = ml_test.ratings - ratings = ratings.loc[:, ["user", "item"]] - mat, uidx, iidx = sparse_ratings(ratings) - - assert mat.shape[0] == len(uidx) - assert mat.shape[0] == ratings.user.nunique() - assert mat.shape[1] == len(iidx) - assert mat.shape[1] == ratings.item.nunique() - # assert mat.values is None - - -@mark.parametrize( - "format, sps_fmt_checker", - [ - ("csr", lambda a: isinstance(a, sps.csr_array)), - ("coo", lambda a: isinstance(a, sps.coo_array)), - ], -) -def test_sparse_ratings_scipy(format, sps_fmt_checker): - ratings = ml_test.ratings - mat, uidx, iidx = sparse_ratings(ratings, layout=format) - - assert sps.issparse(mat) - assert sps_fmt_checker(mat) - assert len(uidx) == ratings.user.nunique() - assert len(iidx) == ratings.item.nunique() - - # user indicators should correspond to user item counts - ucounts = ratings.groupby("user").item.count() - ucounts = ucounts.loc[uidx].cumsum() - m2 = mat.tocsr() - assert np.all(m2.indptr[1:] == ucounts.values) - - -def test_sparse_ratings_scipy_implicit(): - ratings = ml_test.ratings - ratings = ratings.loc[:, ["user", "item"]] - mat, uidx, iidx = sparse_ratings(ratings) - - assert sps.issparse(mat) - assert isinstance(mat, sps.csr_array) - assert len(uidx) == ratings.user.nunique() - assert len(iidx) == ratings.item.nunique() - - assert all(mat.data == 1.0) - - -def test_sparse_ratings_structure(): - ratings = ml_test.ratings - ratings = ratings.loc[:, ["user", "item"]] - mat, uidx, iidx = sparse_ratings(ratings, type="structure") - spmat, _uidx, _iidx = sparse_ratings(ratings, users=uidx, items=iidx) - - assert isinstance(mat, CSRStructure) - assert mat.nrows == ratings.user.nunique() - assert mat.ncols == ratings.item.nunique() - assert mat.nnz == len(ratings) - assert mat.rowptrs[mat.nrows] == mat.nnz - assert np.all(mat.rowptrs == spmat.indptr) - assert np.all(mat.colinds == spmat.indices) - - -def test_sparse_ratings_torch(): - ratings = ml_test.ratings - mat: torch.Tensor - mat, uidx, iidx = sparse_ratings(ratings, type="torch") - - assert torch.is_tensor(mat) - assert mat.is_sparse_csr - assert len(uidx) == ratings.user.nunique() - assert len(iidx) == ratings.item.nunique() - - -def test_sparse_ratings_indexes(rng): - ratings = ml_test.ratings - uidx = pd.Index(rng.permutation(ratings["user"].unique())) - iidx = pd.Index(rng.permutation(ratings["item"].unique())) - - mat, _uidx, _iidx = sparse_ratings(ratings, users=uidx, items=iidx) - - assert _uidx is uidx - assert _iidx is iidx - assert len(_uidx) == ratings.user.nunique() - assert len(_iidx) == ratings.item.nunique() - - # verify rating values - ratings = ratings.set_index(["user", "item"]) - for u in rng.choice(_uidx, size=50): - ui = _uidx.get_loc(u) - r = mat[[ui], :] - vs = pd.Series(r.data, _iidx[r.indices]) - rates = ratings.loc[u]["rating"] - vs, rates = vs.align(rates) - assert not any(vs.isna()) - assert not any(rates.isna()) - assert all(vs == rates) +from lenskit.data.matrix import safe_spmv, torch_sparse_from_scipy +from lenskit.util.test import coo_arrays @settings(deadline=1000, suppress_health_check=[HealthCheck.too_slow]) From 6502152659012e629b84b60c06cc2f35672936a6 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Thu, 25 Jul 2024 12:01:56 -0400 Subject: [PATCH 2/6] remove sparse_row_stats function --- lenskit/lenskit/data/matrix.py | 41 ++++++------------------------- lenskit/tests/test_matrix_rows.py | 35 +++++++++----------------- 2 files changed, 19 insertions(+), 57 deletions(-) diff --git a/lenskit/lenskit/data/matrix.py b/lenskit/lenskit/data/matrix.py index b48d7c241..1e7e1f75d 100644 --- a/lenskit/lenskit/data/matrix.py +++ b/lenskit/lenskit/data/matrix.py @@ -113,37 +113,6 @@ def shape(self) -> tuple[int, int]: return (self.n_users, self.n_items) -class DimStats(NamedTuple): - """ - The statistics for a matrix along a dimension (e.g. rows or columns). - """ - - "The size along this dimension." - n: int - "The other dimension of the matrix." - n_other: int - "The number of stored entries for each element." - counts: t.Tensor - "The sum of entries for each element." - sums: t.Tensor - "The mean of stored entries for each element." - means: t.Tensor - - -def sparse_row_stats(matrix: t.Tensor) -> DimStats: - if not matrix.is_sparse_csr: - raise TypeError("only sparse CSR matrice supported") - - n, n_other = matrix.shape - counts = matrix.crow_indices().diff() - assert counts.shape == (n,), f"count shape {counts.shape} != {n}" - sums = matrix.sum(dim=1, keepdim=True).to_dense().reshape(n) - assert sums.shape == (n,), f"sum shape {sums.shape} != {n}" - means = sums / counts - - return DimStats(n, n_other, counts, sums, means) - - @overload def normalize_sparse_rows( matrix: t.Tensor, method: Literal["center"], inplace: bool = False @@ -168,13 +137,17 @@ def normalize_sparse_rows( def _nsr_mean_center(matrix: t.Tensor) -> tuple[t.Tensor, t.Tensor]: - stats = sparse_row_stats(matrix) + nr, _nc = matrix.shape + sums = matrix.sum(dim=1, keepdim=True).to_dense().reshape(nr) + counts = torch.diff(matrix.crow_indices()) + assert sums.shape == counts.shape + means = torch.nan_to_num(sums / counts, 0) return t.sparse_csr_tensor( crow_indices=matrix.crow_indices(), col_indices=matrix.col_indices(), - values=matrix.values() - t.repeat_interleave(stats.means, stats.counts), + values=matrix.values() - t.repeat_interleave(means, counts), size=matrix.shape, - ), stats.means + ), means def _nsr_unit(matrix: t.Tensor) -> tuple[t.Tensor, t.Tensor]: diff --git a/lenskit/tests/test_matrix_rows.py b/lenskit/tests/test_matrix_rows.py index ec62fcd92..6cdd1cfb5 100644 --- a/lenskit/tests/test_matrix_rows.py +++ b/lenskit/tests/test_matrix_rows.py @@ -16,40 +16,29 @@ from hypothesis import HealthCheck, given, settings from pytest import approx -from lenskit.data.matrix import normalize_sparse_rows, sparse_row_stats +from lenskit.data.matrix import normalize_sparse_rows from lenskit.util.test import sparse_tensors _log = logging.getLogger(__name__) -@settings(suppress_health_check=[HealthCheck.too_slow]) -@given(sparse_tensors()) -def test_sparse_stats(tensor): - nr, nc = tensor.shape - _log.debug("tensor: %d x %d", nr, nc) - - stats = sparse_row_stats(tensor) - assert stats.means.shape == (nr,) - assert stats.counts.shape == (nr,) - - assert np.sum(stats.counts.numpy()) == tensor.values().shape[0] - - sums = tensor.sum(dim=1, keepdim=True) - sums = sums.to_dense().reshape(-1) - tots = stats.means * stats.counts - mask = stats.counts.numpy() > 0 - assert tots.numpy()[mask] == approx(sums.numpy()[mask]) - - @settings(deadline=1000, suppress_health_check=[HealthCheck.too_slow]) @given(sparse_tensors()) -def test_sparse_mean_center(tensor): +def test_sparse_mean_center(tensor: torch.Tensor): nr, nc = tensor.shape - stats = sparse_row_stats(tensor) + coo = tensor.to_sparse_coo() + rows = coo.indices()[0, :].numpy() + counts = np.zeros(nr, dtype=np.int32) + sums = np.zeros(nr, dtype=np.float32) + np.add.at(counts, rows, 1) + np.add.at(sums, rows, coo.values().numpy()) + tgt_means = sums / counts + tgt_means = np.nan_to_num(tgt_means, nan=0) nt, means = normalize_sparse_rows(tensor, "center") + assert means.shape == torch.Size([nr]) - assert means.numpy() == approx(stats.means.numpy(), nan_ok=True) + assert means.numpy() == approx(tgt_means, nan_ok=True) for i in range(nr): tr = tensor[i].values().numpy() From 6b2f2f841020add4e870f962df0e5c184cacd8af Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Thu, 25 Jul 2024 12:19:41 -0400 Subject: [PATCH 3/6] tweak matrix test sensitivity --- lenskit/tests/test_matrix_rows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lenskit/tests/test_matrix_rows.py b/lenskit/tests/test_matrix_rows.py index 6cdd1cfb5..ff8b0d98a 100644 --- a/lenskit/tests/test_matrix_rows.py +++ b/lenskit/tests/test_matrix_rows.py @@ -30,7 +30,7 @@ def test_sparse_mean_center(tensor: torch.Tensor): coo = tensor.to_sparse_coo() rows = coo.indices()[0, :].numpy() counts = np.zeros(nr, dtype=np.int32) - sums = np.zeros(nr, dtype=np.float32) + sums = np.zeros(nr, dtype=np.float64) np.add.at(counts, rows, 1) np.add.at(sums, rows, coo.values().numpy()) tgt_means = sums / counts @@ -38,7 +38,7 @@ def test_sparse_mean_center(tensor: torch.Tensor): nt, means = normalize_sparse_rows(tensor, "center") assert means.shape == torch.Size([nr]) - assert means.numpy() == approx(tgt_means, nan_ok=True) + assert means.numpy() == approx(tgt_means, nan_ok=True, rel=1.0e-5) for i in range(nr): tr = tensor[i].values().numpy() From ee4beeea20456026196692e25d4044376ec02a9e Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Thu, 25 Jul 2024 14:31:04 -0400 Subject: [PATCH 4/6] relax test tolerance further --- lenskit/tests/test_matrix_rows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lenskit/tests/test_matrix_rows.py b/lenskit/tests/test_matrix_rows.py index ff8b0d98a..14ec04d10 100644 --- a/lenskit/tests/test_matrix_rows.py +++ b/lenskit/tests/test_matrix_rows.py @@ -38,7 +38,7 @@ def test_sparse_mean_center(tensor: torch.Tensor): nt, means = normalize_sparse_rows(tensor, "center") assert means.shape == torch.Size([nr]) - assert means.numpy() == approx(tgt_means, nan_ok=True, rel=1.0e-5) + assert means.numpy() == approx(tgt_means, nan_ok=True, rel=5.0e-5) for i in range(nr): tr = tensor[i].values().numpy() From f75eacea5e7ca571ad68a31c7cd5d4ec3ccf8fd9 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Thu, 25 Jul 2024 14:40:01 -0400 Subject: [PATCH 5/6] make dtype flexible based on tensor type --- lenskit/tests/test_matrix_rows.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lenskit/tests/test_matrix_rows.py b/lenskit/tests/test_matrix_rows.py index 14ec04d10..b6438fef5 100644 --- a/lenskit/tests/test_matrix_rows.py +++ b/lenskit/tests/test_matrix_rows.py @@ -30,15 +30,20 @@ def test_sparse_mean_center(tensor: torch.Tensor): coo = tensor.to_sparse_coo() rows = coo.indices()[0, :].numpy() counts = np.zeros(nr, dtype=np.int32) - sums = np.zeros(nr, dtype=np.float64) + if tensor.dtype == torch.float64: + sums = np.zeros(nr, dtype=np.float64) + else: + sums = np.zeros(nr, dtype=np.float64) + np.add.at(counts, rows, 1) np.add.at(sums, rows, coo.values().numpy()) tgt_means = sums / counts tgt_means = np.nan_to_num(tgt_means, nan=0) + nt, means = normalize_sparse_rows(tensor, "center") assert means.shape == torch.Size([nr]) - assert means.numpy() == approx(tgt_means, nan_ok=True, rel=5.0e-5) + assert means.numpy() == approx(tgt_means, nan_ok=True, rel=1.0e-5) for i in range(nr): tr = tensor[i].values().numpy() From 910eb934cfde583dea873bdc3858a68420af31c1 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Thu, 25 Jul 2024 14:50:38 -0400 Subject: [PATCH 6/6] only test means on float64 --- lenskit/tests/test_matrix_rows.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/lenskit/tests/test_matrix_rows.py b/lenskit/tests/test_matrix_rows.py index b6438fef5..d0e52bd33 100644 --- a/lenskit/tests/test_matrix_rows.py +++ b/lenskit/tests/test_matrix_rows.py @@ -23,17 +23,14 @@ @settings(deadline=1000, suppress_health_check=[HealthCheck.too_slow]) -@given(sparse_tensors()) +@given(sparse_tensors(dtype=np.float64)) def test_sparse_mean_center(tensor: torch.Tensor): nr, nc = tensor.shape coo = tensor.to_sparse_coo() rows = coo.indices()[0, :].numpy() counts = np.zeros(nr, dtype=np.int32) - if tensor.dtype == torch.float64: - sums = np.zeros(nr, dtype=np.float64) - else: - sums = np.zeros(nr, dtype=np.float64) + sums = np.zeros(nr, dtype=np.float64) np.add.at(counts, rows, 1) np.add.at(sums, rows, coo.values().numpy()) @@ -43,7 +40,7 @@ def test_sparse_mean_center(tensor: torch.Tensor): nt, means = normalize_sparse_rows(tensor, "center") assert means.shape == torch.Size([nr]) - assert means.numpy() == approx(tgt_means, nan_ok=True, rel=1.0e-5) + assert means.numpy() == approx(tgt_means, nan_ok=True, rel=1.0e-6) for i in range(nr): tr = tensor[i].values().numpy()