Merge pull request #572 from lenskit/feature/drop-old-columns

Update load_movielens_df to return new standard field names
lenskit · Dec 25, 2024 · ce74335
2 parents acd6f5d + a3e3b3a
commit ce74335
Show file tree

Hide file tree

Showing 14 changed files with 153 additions and 147 deletions.
diff --git a/lenskit-hpf/tests/test_hpf.py b/lenskit-hpf/tests/test_hpf.py
@@ -20,10 +20,6 @@
 
 _log = logging.getLogger(__name__)
 
-simple_df = pd.DataFrame(
-    {"item": [1, 1, 2, 3], "user": [10, 12, 10, 13], "rating": [4.0, 3.0, 5.0, 2.0]}
-)
-
 
 @mark.slow
 def test_hpf_train_large(tmp_path, ml_ratings):
@@ -32,8 +28,8 @@ def test_hpf_train_large(tmp_path, ml_ratings):
     ds = from_interactions_df(ratings)
     algo.train(ds)
 
-    assert algo.user_features_.shape[0] == ratings.user.nunique()
-    assert algo.item_features_.shape[0] == ratings.item.nunique()
+    assert algo.user_features_.shape[0] == ratings.user_id.nunique()
+    assert algo.item_features_.shape[0] == ratings.item_id.nunique()
 
     mfile = tmp_path / "hpf.dat"
     with mfile.open("wb") as mf:
@@ -48,7 +44,7 @@ def test_hpf_train_large(tmp_path, ml_ratings):
     pipe = topn_pipeline(algo)
     pipe.train(ds, retrain=False)
 
-    for u in np.random.choice(ratings.user.unique(), size=50, replace=False):
+    for u in np.random.choice(ratings.user_id.unique(), size=50, replace=False):
         recs = pipe.run("recommender", query=u, n=50)
         assert isinstance(recs, ItemList)
         assert len(recs) == 50
@@ -61,8 +57,8 @@ def test_hpf_train_binary(tmp_path, ml_ratings):
     ds = from_interactions_df(ratings)
     algo.train(ds)
 
-    assert algo.user_features_.shape[0] == ratings.user.nunique()
-    assert algo.item_features_.shape[0] == ratings.item.nunique()
+    assert algo.user_features_.shape[0] == ratings.user_id.nunique()
+    assert algo.item_features_.shape[0] == ratings.item_id.nunique()
 
     mfile = tmp_path / "hpf.dat"
     with mfile.open("wb") as mf:
@@ -77,7 +73,7 @@ def test_hpf_train_binary(tmp_path, ml_ratings):
     pipe = topn_pipeline(algo)
     pipe.train(ds, retrain=False)
 
-    for u in np.random.choice(ratings.user.unique(), size=50, replace=False):
+    for u in np.random.choice(ratings.user_id.unique(), size=50, replace=False):
         recs = pipe.run("recommender", query=u, n=50)
         assert isinstance(recs, ItemList)
         assert len(recs) == 50
diff --git a/lenskit/lenskit/data/movielens.py b/lenskit/lenskit/data/movielens.py
@@ -8,7 +8,6 @@
 Code to import MovieLens data sets into LensKit.
 """
 
-import logging
 import re
 from dataclasses import dataclass
 from enum import Enum
@@ -18,11 +17,12 @@
 
 import numpy as np
 import pandas as pd
+import structlog
 
 from .convert import from_interactions_df
 from .dataset import Dataset
 
-_log = logging.getLogger(__name__)
+_log = structlog.stdlib.get_logger(__name__)
 
 LOC: TypeAlias = Path | tuple[ZipFile, str]
 
@@ -105,50 +105,52 @@ def _ml_detect_and_open(path: str | Path) -> MLData:
     ds: MLVersion
 
     if loc.is_file() and loc.suffix == ".zip":
-        _log.debug("opening zip file at %s", loc)
+        log = _log.bind(zipfile=str(loc))
+        log.debug("opening zip file")
         zf = ZipFile(loc, "r")
         try:
             infos = zf.infolist()
             first = infos[0]
             if not first.is_dir:
-                _log.error("%s: first entry is not directory")
+                log.error("first entry is not directory")
                 raise RuntimeError("invalid ML zip file")
 
-            _log.debug("%s: base dir filename %s", loc, first.filename)
+            log.debug("base dir filename %s", first.filename)
             dsm = re.match(r"^(ml-(?:\d+[MmKk]|latest|latest-small))", first.filename)
             if not dsm:
-                _log.error("%s: invalid directory name %s", loc, first.filename)
+                log.error("invalid directory name %s", first.filename)
                 raise RuntimeError("invalid ML zip file")
 
             ds = MLVersion(dsm.group(1).lower())
-            _log.debug("%s: found ML data set %s", loc, ds)
+            log.debug("found ML data set %s", ds)
             return MLData(ds, zf, first.filename)
         except Exception as e:  # pragma nocover
             zf.close()
             raise e
     else:
-        _log.debug("loading from directory %s", loc)
+        log = _log.bind(dir=str(loc))
+        log.debug("loading from directory")
         dsm = re.match(r"^(ml-\d+[MmKk])", loc.name)
         if dsm:
             ds = MLVersion(dsm.group(1))
-            _log.debug("%s: inferred data set %s from dir name", loc, ds)
+            _log.debug("inferred data set %s from dir name", ds)
         else:
-            _log.debug("%s: checking contents for data type", loc)
+            _log.debug("checking contents for data type")
             if (loc / "u.data").exists():
-                _log.debug("%s: found u.data, interpreting as 100K")
+                _log.debug("found u.data, interpreting as 100K")
                 ds = MLVersion.ML_100K
             elif (loc / "ratings.dat").exists():
                 if (loc / "tags.dat").exists():
-                    _log.debug("%s: found ratings.dat and tags.dat, interpreting as 10M", loc)
+                    _log.debug("found ratings.dat and tags.dat, interpreting as 10M")
                     ds = MLVersion.ML_10M
                 else:
-                    _log.debug("%s: found ratings.dat but no tags, interpreting as 1M", loc)
+                    _log.debug("found ratings.dat but no tags, interpreting as 1M")
                     ds = MLVersion.ML_1M
             elif (loc / "ratings.csv").exists():
-                _log.debug("%s: found ratings.csv, interpreting as modern (20M and later)", loc)
+                _log.debug("found ratings.csv, interpreting as modern (20M and later)")
                 ds = MLVersion.ML_MODERN
             else:
-                _log.error("%s: could not detect MovieLens data", loc)
+                _log.error("could not detect MovieLens data")
                 raise RuntimeError("invalid ML directory")
 
         return MLData(ds, loc)
@@ -160,10 +162,10 @@ def _load_ml_100k(ml: MLData) -> pd.DataFrame:
             data,
             sep="\t",
             header=None,
-            names=["user", "item", "rating", "timestamp"],
+            names=["user_id", "item_id", "rating", "timestamp"],
             dtype={
-                "user": np.int32,
-                "item": np.int32,
+                "user_id": np.int32,
+                "item_id": np.int32,
                 "rating": np.float32,
                 "timestamp": np.int32,
             },
@@ -176,11 +178,11 @@ def _load_ml_million(ml: MLData) -> pd.DataFrame:
             data,
             sep=":",
             header=None,
-            names=["user", "_ui", "item", "_ir", "rating", "_rt", "timestamp"],
+            names=["user_id", "_ui", "item_id", "_ir", "rating", "_rt", "timestamp"],
             usecols=[0, 2, 4, 6],
             dtype={
-                "user": np.int32,
-                "item": np.int32,
+                "user_id": np.int32,
+                "item_id": np.int32,
                 "rating": np.float32,
                 "timestamp": np.int32,
             },
@@ -197,4 +199,4 @@ def _load_ml_modern(ml: MLData) -> pd.DataFrame:
                 "rating": np.float32,
                 "timestamp": np.int64,
             },
-        ).rename(columns={"userId": "user", "movieId": "item"})
+        ).rename(columns={"userId": "user_id", "movieId": "item_id"})
diff --git a/lenskit/tests/basic/test_bias.py b/lenskit/tests/basic/test_bias.py
@@ -225,7 +225,7 @@ def test_bias_train_ml_ratings(ml_ratings: pd.DataFrame, ml_ds: Dataset):
     ares, data = pd.Series(imeans_algo, index=bm.items.ids()).align(imeans_data)
     assert ares.values == approx(data.values)
 
-    urates = ml_ratings.set_index("user").loc[2].set_index("item").rating
+    urates = ml_ratings.set_index("user_id").loc[2].set_index("item_id").rating
     umean = (urates - imeans_data[urates.index]).mean()
     p = bias(2, ItemList(item_ids=[10, 11, -1]))
     assert len(p) == 3

diff --git a/lenskit/tests/data/test_bulk.py b/lenskit/tests/data/test_bulk.py
@@ -36,7 +36,7 @@ def test_iter_df(ml_ratings: pd.DataFrame):
         counts[key] = len(il)
 
     counts = pd.Series(counts)
-    in_counts = ml_ratings.value_counts("user")
+    in_counts = ml_ratings.value_counts("user_id")
     assert len(counts) == len(in_counts)
     counts, in_counts = counts.align(in_counts, join="outer")
     assert np.all(counts == in_counts)
diff --git a/lenskit/tests/data/test_dataset_ids.py b/lenskit/tests/data/test_dataset_ids.py
@@ -18,21 +18,21 @@
 
 
 def test_from_ratings_default_names(ml_ratings: pd.DataFrame):
-    ratings = ml_ratings.rename(columns={"user": "user_id", "item": "item_id"})
+    ratings = ml_ratings
     ds = from_interactions_df(ratings)
     assert ds.item_count == ratings["item_id"].nunique()
     assert ds.user_count == ratings["user_id"].nunique()
 
 
 def test_from_ratings_nosuffix(ml_ratings: pd.DataFrame):
-    ratings = ml_ratings.rename(columns={"user": "user", "item": "item"})
+    ratings = ml_ratings.rename(columns={"user_id": "user", "item_id": "item"})
     ds = from_interactions_df(ratings)
     assert ds.item_count == ratings["item"].nunique()
     assert ds.user_count == ratings["user"].nunique()
 
 
 def test_from_ratings_names_upper(ml_ratings: pd.DataFrame):
-    ratings = ml_ratings.rename(columns={"user": "USER", "item": "ITEM"})
+    ratings = ml_ratings.rename(columns={"user_id": "USER", "item_id": "ITEM"})
     ds = from_interactions_df(ratings)
     assert ds.item_count == ratings["ITEM"].nunique()
     assert ds.user_count == ratings["USER"].nunique()

diff --git a/lenskit/tests/data/test_dataset_lazy.py b/lenskit/tests/data/test_dataset_lazy.py
@@ -29,15 +29,17 @@ def test_item_stats(ml_ratings: pd.DataFrame):
     assert len(stats) == ml_ds.item_count
     assert np.all(stats.index == ml_ds.items.index)
 
-    assert np.all(stats["count"] == ml_ratings["item"].value_counts().reindex(ml_ds.items))
-    assert np.all(stats["user_count"] == ml_ratings["item"].value_counts().reindex(ml_ds.items))
-    assert np.all(stats["rating_count"] == ml_ratings["item"].value_counts().reindex(ml_ds.items))
+    assert np.all(stats["count"] == ml_ratings["item_id"].value_counts().reindex(ml_ds.items))
+    assert np.all(stats["user_count"] == ml_ratings["item_id"].value_counts().reindex(ml_ds.items))
+    assert np.all(
+        stats["rating_count"] == ml_ratings["item_id"].value_counts().reindex(ml_ds.items)
+    )
 
     assert stats["mean_rating"].values == approx(
-        ml_ratings.groupby("item")["rating"].mean().reindex(ml_ds.items).values
+        ml_ratings.groupby("item_id")["rating"].mean().reindex(ml_ds.items).values
     )
 
-    ts = ml_ratings.groupby("item")["timestamp"].min().reindex(ml_ds.items)
+    ts = ml_ratings.groupby("item_id")["timestamp"].min().reindex(ml_ds.items)
     bad = stats["first_time"] != ts
     nbad = np.sum(bad)
     if nbad:
@@ -54,16 +56,18 @@ def test_user_stats(ml_ratings: pd.DataFrame):
     assert len(stats) == ml_ds.user_count
     assert np.all(stats.index == ml_ds.users.index)
 
-    assert np.all(stats["count"] == ml_ratings["user"].value_counts().reindex(ml_ds.users))
-    assert np.all(stats["item_count"] == ml_ratings["user"].value_counts().reindex(ml_ds.users))
-    assert np.all(stats["rating_count"] == ml_ratings["user"].value_counts().reindex(ml_ds.users))
+    assert np.all(stats["count"] == ml_ratings["user_id"].value_counts().reindex(ml_ds.users))
+    assert np.all(stats["item_count"] == ml_ratings["user_id"].value_counts().reindex(ml_ds.users))
+    assert np.all(
+        stats["rating_count"] == ml_ratings["user_id"].value_counts().reindex(ml_ds.users)
+    )
 
     assert stats["mean_rating"].values == approx(
-        ml_ratings.groupby("user")["rating"].mean().reindex(ml_ds.users).values
+        ml_ratings.groupby("user_id")["rating"].mean().reindex(ml_ds.users).values
     )
     assert np.all(
-        stats["first_time"] == ml_ratings.groupby("user")["timestamp"].min().reindex(ml_ds.users)
+        stats["first_time"] == ml_ratings.groupby("user_id")["timestamp"].min().reindex(ml_ds.users)
     )
     assert np.all(
-        stats["last_time"] == ml_ratings.groupby("user")["timestamp"].max().reindex(ml_ds.users)
+        stats["last_time"] == ml_ratings.groupby("user_id")["timestamp"].max().reindex(ml_ds.users)
     )
diff --git a/lenskit/tests/data/test_dataset_log.py b/lenskit/tests/data/test_dataset_log.py
@@ -31,9 +31,9 @@ def test_pandas_log_defaults(ml_ratings: pd.DataFrame, ml_ds: Dataset):
     uids = ml_ds.users.ids(int_df["user_num"])
     iids = ml_ds.items.ids(int_df["item_num"])
 
-    ml_df = ml_ratings.sort_values(["user", "item"])
-    assert np.all(uids == ml_df["user"])
-    assert np.all(iids == ml_df["item"])
+    ml_df = ml_ratings.sort_values(["user_id", "item_id"])
+    assert np.all(uids == ml_df["user_id"])
+    assert np.all(iids == ml_df["item_id"])
     assert np.all(int_df["rating"] == ml_df["rating"])
     assert np.all(int_df["timestamp"] == ml_df["timestamp"])
 
@@ -54,9 +54,9 @@ def test_pandas_log_ids(ml_ratings: pd.DataFrame, ml_ds: Dataset):
     # the interact
     int_df = int_df.sort_values(["user_id", "item_id"])
 
-    ml_df = ml_ratings.sort_values(["user", "item"])
-    assert np.all(int_df["user_id"] == ml_df["user"])
-    assert np.all(int_df["item_id"] == ml_df["item"])
+    ml_df = ml_ratings.sort_values(["user_id", "item_id"])
+    assert np.all(int_df["user_id"] == ml_df["user_id"])
+    assert np.all(int_df["item_id"] == ml_df["item_id"])
     assert np.all(int_df["rating"] == ml_df["rating"])
     assert np.all(int_df["timestamp"] == ml_df["timestamp"])
 
@@ -78,9 +78,9 @@ def test_pandas_log_no_ts(ml_ratings: pd.DataFrame, ml_ds: Dataset):
     uids = ml_ds.users.ids(int_df["user_num"])
     iids = ml_ds.items.ids(int_df["item_num"])
 
-    ml_df = ml_ratings.sort_values(["user", "item"])
-    assert np.all(uids == ml_df["user"])
-    assert np.all(iids == ml_df["item"])
+    ml_df = ml_ratings.sort_values(["user_id", "item_id"])
+    assert np.all(uids == ml_df["user_id"])
+    assert np.all(iids == ml_df["item_id"])
     assert np.all(int_df["rating"] == ml_df["rating"])
 
     # and the total length