lenskit · mdekstrand · Jul 25, 2024 · Jul 24, 2024 · Jul 24, 2024 · Jul 24, 2024
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -556,7 +556,7 @@ jobs:
           path: |
             data
             !data/*.zip
-          key: test-mldata-000-2ee919d5c0eef34d5a4f40bcf0480c1bf0310417db6921e3a2575c48991f379c2f4ad179f8514390133795614a96fa5b4ece55906c68a90af07c09670b2c3c5b
+          key: test-mldata-001-2ee919d5c0eef34d5a4f40bcf0480c1bf0310417db6921e3a2575c48991f379c2f4ad179f8514390133795614a96fa5b4ece55906c68a90af07c09670b2c3c5b
       - name: Download ML data
         run: |
           python -m lenskit.data.fetch ml-100k ml-20m
@@ -613,7 +613,7 @@ jobs:
           path: |
             data
             !data/*.zip
-          key: test-mldata-000-cd26f1c44a6962b0936346b346a9b418a3ed04b01a2892269fccd24a6387e943dba6d5e64ab2f8feb1823475601d65c2e6ebbeeeca0c2c210f0d37c00aabf2e9
+          key: test-mldata-001-cd26f1c44a6962b0936346b346a9b418a3ed04b01a2892269fccd24a6387e943dba6d5e64ab2f8feb1823475601d65c2e6ebbeeeca0c2c210f0d37c00aabf2e9
       - name: Download ML data
         run: |
           python -m lenskit.data.fetch ml-100k ml-1m ml-10m ml-20m

diff --git a/.vscode/ltex.dictionary.en-US.txt b/.vscode/ltex.dictionary.en-US.txt
@@ -9,3 +9,4 @@ lenskit
 invoker
 CUDA
 subpackages
+recomputation
diff --git a/conftest.py b/conftest.py
@@ -15,6 +15,7 @@
 from pytest import fixture, skip
 
 from lenskit.parallel import ensure_parallel_init
+from lenskit.util.test import ml_100k, ml_ds, ml_ratings  # noqa: F401
 
 logging.getLogger("numba").setLevel(logging.INFO)
 

diff --git a/docs/GettingStarted.ipynb b/docs/GettingStarted.ipynb
diff --git a/docs/datasets.rst b/docs/datasets.rst
diff --git a/docs/index.rst b/docs/index.rst
@@ -39,7 +39,6 @@ Resources
     :caption: Running Experiments
 
     data
-    datasets
     crossfold
     batch
     evaluation/index

diff --git a/docs/releases/2024.rst b/docs/releases/2024.rst
@@ -31,6 +31,14 @@ Significant Changes
     * :py:class:`~lenskit.algorithms.als.ImplicitMF`
     * :py:class:`~lenskit.algorithms.als.BiasedMF`
 
+*   :class:`~lenskit.data.Dataset`.  LensKit now provides an abstraction for
+    training data instead of working with Pandas data frames directly, that
+    allows components to reduce code duplication and recomputation, access data
+    in multiple formats (Pandas, NumPy, and PyTorch), and provides standardized
+    structures like mappings of user or item IDs to array indices.  This also
+    supersedes the old bespoke dataset loading support, with functions like
+    :func:`~lenskit.data.load_movielens` to load standard datasets.
+
 *   Many LensKit components (batch running, model training, etc.) now report progress with
     :py:mod:`progress_api`, and can be connected to TQDM or Enlighten.
 

diff --git a/lenskit-funksvd/tests/test_funksvd.py b/lenskit-funksvd/tests/test_funksvd.py
@@ -13,7 +13,7 @@
 
 from pytest import approx, mark
 
-from lenskit.data.dataset import from_interactions_df
+from lenskit.data.dataset import Dataset, from_interactions_df
 import lenskit.funksvd as svd
 import lenskit.util.test as lktu
 
@@ -139,16 +139,16 @@ def test_fsvd_predict_bad_user():
 
 @lktu.wantjit
 @mark.slow
-def test_fsvd_save_load():
-    ratings = lktu.ml_test.ratings
-
+def test_fsvd_save_load(ml_ds: Dataset):
     original = svd.FunkSVD(20, iterations=20)
-    original.fit(from_interactions_df(ratings))
+    original.fit(ml_ds)
 
     assert original.bias is not None
-    assert original.bias.mean_ == approx(ratings.rating.mean())
-    assert original.item_features_.shape == (ratings.item.nunique(), 20)
-    assert original.user_features_.shape == (ratings.user.nunique(), 20)
+    assert original.bias.mean_ == approx(
+        ml_ds.interaction_matrix("scipy", field="rating").data.mean()
+    )
+    assert original.item_features_.shape == (ml_ds.item_count, 20)
+    assert original.user_features_.shape == (ml_ds.user_count, 20)
 
     mod = pickle.dumps(original)
     _log.info("serialized to %d bytes", len(mod))
@@ -165,8 +165,8 @@ def test_fsvd_save_load():
 
 @lktu.wantjit
 @mark.slow
-def test_fsvd_train_binary():
-    ratings = lktu.ml_test.ratings.drop(columns=["rating", "timestamp"])
+def test_fsvd_train_binary(ml_ratings: pd.DataFrame):
+    ratings = ml_ratings.drop(columns=["rating", "timestamp"])
 
     original = svd.FunkSVD(20, iterations=20, bias=False)
     original.fit(from_interactions_df(ratings))
@@ -178,10 +178,10 @@ def test_fsvd_train_binary():
 
 @lktu.wantjit
 @mark.slow
-def test_fsvd_known_preds():
+def test_fsvd_known_preds(ml_ds: Dataset):
     algo = svd.FunkSVD(15, iterations=125, lrate=0.001)
     _log.info("training %s on ml data", algo)
-    algo.fit(from_interactions_df(lktu.ml_test.ratings))
+    algo.fit(ml_ds)
 
     dir = Path(__file__).parent
     pred_file = dir / "funksvd-preds.csv"
@@ -207,15 +207,12 @@ def test_fsvd_known_preds():
 @lktu.wantjit
 @mark.slow
 @mark.eval
-@mark.skipif(not lktu.ml100k.available, reason="ML100K data not present")
-def test_fsvd_batch_accuracy():
+def test_fsvd_batch_accuracy(ml_100k: pd.DataFrame):
     import lenskit.crossfold as xf
     import lenskit.metrics.predict as pm
     from lenskit import batch
     from lenskit.algorithms import basic, bias
 
-    ratings = lktu.ml100k.ratings
-
     svd_algo = svd.FunkSVD(25, 125, damping=10)
     algo = basic.Fallback(svd_algo, bias.Bias(damping=10))
 
@@ -225,7 +222,7 @@ def eval(train, test):
         _log.info("testing %d users", test.user.nunique())
         return batch.predict(algo, test)
 
-    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
+    folds = xf.partition_users(ml_100k, 5, xf.SampleFrac(0.2))
     preds = pd.concat(eval(train, test) for (train, test) in folds)
     mae = pm.mae(preds.prediction, preds.rating)
     assert mae == approx(0.74, abs=0.025)

diff --git a/lenskit-hpf/tests/test_hpf.py b/lenskit-hpf/tests/test_hpf.py
@@ -20,10 +20,9 @@
 
 
 @mark.slow
-def test_hpf_train_large(tmp_path):
+def test_hpf_train_large(tmp_path, ml_ratings):
     algo = hpf.HPF(20)
-    ratings = lktu.ml_test.ratings
-    ratings = ratings.assign(rating=ratings.rating + 0.5)
+    ratings = ml_ratings.assign(rating=ml_ratings.rating + 0.5)
     ds = from_interactions_df(ratings)
     algo.fit(ds)
 
@@ -51,9 +50,9 @@ def test_hpf_train_large(tmp_path):
 
 
 @mark.slow
-def test_hpf_train_binary(tmp_path):
+def test_hpf_train_binary(tmp_path, ml_ratings):
     algo = hpf.HPF(20)
-    ratings = lktu.ml_test.ratings.drop(columns=["timestamp", "rating"])
+    ratings = ml_ratings.drop(columns=["timestamp", "rating"])
     ds = from_interactions_df(ratings)
     algo.fit(ds)
 

diff --git a/lenskit-implicit/tests/test_implicit.py b/lenskit-implicit/tests/test_implicit.py
@@ -19,13 +19,11 @@
 
 
 @mark.slow
-def test_implicit_als_train_rec():
+def test_implicit_als_train_rec(ml_ds):
     algo = ALS(25)
     assert algo.factors == 25
-    ratings = lktu.ml_test.ratings
-    ds = from_interactions_df(ratings)
 
-    ret = algo.fit(ds)
+    ret = algo.fit(ml_ds)
     assert ret is algo
 
     recs = algo.recommend(100, n=20)
@@ -46,14 +44,11 @@ def test_implicit_als_train_rec():
 
 @mark.slow
 @mark.eval
-@mark.skipif(not lktu.ml100k.available, reason="ML100K not downloaded")
 @mark.parametrize("n_jobs", [1, None])
-def test_implicit_als_batch_accuracy(n_jobs):
+def test_implicit_als_batch_accuracy(ml_100k, n_jobs):
     import lenskit.crossfold as xf
     from lenskit import batch, topn
 
-    ratings = lktu.ml100k.ratings
-
     algo_t = ALS(25)
 
     def eval(train, test):
@@ -66,7 +61,7 @@ def eval(train, test):
         recs = batch.recommend(algo, users, 100, n_jobs=n_jobs)
         return recs
 
-    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
+    folds = list(xf.partition_users(ml_100k, 5, xf.SampleFrac(0.2)))
     test = pd.concat(f.test for f in folds)
 
     recs = pd.concat(eval(train, test) for (train, test) in folds)
@@ -81,12 +76,11 @@ def eval(train, test):
 
 
 @mark.slow
-def test_implicit_bpr_train_rec():
+def test_implicit_bpr_train_rec(ml_ds):
     algo = BPR(25, use_gpu=False)
     assert algo.factors == 25
-    ratings = lktu.ml_test.ratings
 
-    algo.fit(from_interactions_df(ratings))
+    algo.fit(ml_ds)
 
     recs = algo.recommend(100, n=20)
     assert len(recs) == 20

diff --git a/lenskit/lenskit/algorithms/basic.py b/lenskit/lenskit/algorithms/basic.py
@@ -10,6 +10,7 @@
 
 import logging
 from collections.abc import Iterable, Sequence
+from typing import overload
 
 import numpy as np
 import pandas as pd
@@ -112,20 +113,25 @@ class Fallback(Predictor):
     missing values, and so forth.
     """
 
-    def __init__(self, algorithms, *others):
+    algorithms: list[Predictor]
+
+    @overload
+    def __init__(self, algorithms: Iterable[Predictor]): ...
+    @overload
+    def __init__(self, algorithms: Predictor, *others: Predictor): ...
+    def __init__(self, algorithms: Predictor | Iterable[Predictor], *others):
         """
         Args:
             algorithms: a list of component algorithms.  Each one will be trained.
             others:
                 additional algorithms, in which case ``algorithms`` is taken to be
                 a single algorithm.
         """
-        if others:
-            self.algorithms = [algorithms] + list(others)
-        elif isinstance(algorithms, Iterable) or isinstance(algorithms, Sequence):
-            self.algorithms = algorithms
+        if isinstance(algorithms, Iterable) or isinstance(algorithms, Sequence):
+            assert not others
+            self.algorithms = list(algorithms)
         else:
-            self.algorithms = [algorithms]
+            self.algorithms = [algorithms] + list(others)
 
     @override
     def fit(self, data: Dataset, **kwargs):
@@ -172,7 +178,7 @@ def fit(self, data: Dataset, **kwarsg):
 
     @override
     def candidates(self, user, ratings=None):
-        return np.array([], dtype=self.dtype_)
+        return np.array([], dtype=self.dtype_)  # type: ignore
 
 
 class UnratedItemCandidateSelector(CandidateSelector):

diff --git a/lenskit/lenskit/algorithms/bias.py b/lenskit/lenskit/algorithms/bias.py
@@ -92,7 +92,7 @@ def fit(self, data: Dataset, **kwargs):
         """
         _logger.info("building bias model for %d ratings", data.interaction_count)
         ratings = data.interaction_matrix("scipy", layout="coo", field="rating")
-        nrows, ncols = ratings.shape
+        nrows, ncols = ratings.shape  # type: ignore
 
         self.mean_ = float(np.mean(ratings.data))
         _logger.info("global mean: %.3f", self.mean_)
@@ -242,7 +242,7 @@ def inverse_transform_user(self, user, ratings, user_bias=None):
 
     def fit_transform(self, data: Dataset, **kwargs) -> pd.DataFrame:
         """
-        Fit with ratings and return the training data transformed.
+        Fit with ratings and return the training data matrix transformed.
         """
         # FIXME: make this more efficient, don't rename things.
         self.fit(data)

diff --git a/lenskit/lenskit/batch/_predict.py b/lenskit/lenskit/batch/_predict.py
@@ -42,9 +42,9 @@ def predict(algo, pairs, *, n_jobs=None, **kwargs):
 
         >>> from lenskit.algorithms.bias import Bias
         >>> from lenskit.metrics.predict import rmse
-        >>> from lenskit import datasets
         >>> from lenskit.data import from_interactions_df
-        >>> ratings = datasets.MovieLens('data/ml-latest-small').ratings
+        >>> from lenskit.data.movielens import load_movielens_df
+        >>> ratings = load_movielens_df('data/ml-latest-small')
         >>> bias = Bias()
         >>> bias.fit(from_interactions_df(ratings[:-1000]))
         <lenskit.algorithms.bias.Bias object at ...>

diff --git a/lenskit/lenskit/crossfold.py b/lenskit/lenskit/crossfold.py
@@ -67,8 +67,8 @@ def sample_rows(data, partitions, size, disjoint=True, *, rng_spec=None):
 
     We can loop over a sequence of train-test pairs::
 
-        >>> from lenskit import datasets
-        >>> ratings = datasets.MovieLens('data/ml-latest-small').ratings
+        >>> from lenskit.data.movielens import load_movielens_df
+        >>> ratings = load_movielens_df('data/ml-latest-small')
         >>> for train, test in sample_rows(ratings, 5, 1000):
         ...     print(len(test))
         1000
-Original file line number
+Diff line change
@@ Expand Up / @@ -9,3 +9,4 @@ lenskit @@
     invoker
     CUDA
     subpackages
+    recomputation