Skip to content

Commit

Permalink
Merge pull request #572 from lenskit/feature/drop-old-columns
Browse files (browse the repository at this point in the history)
Update load_movielens_df to return new standard field names
  • Loading branch information
mdekstrand authored Dec 25, 2024
2 parents acd6f5d + a3e3b3a commit ce74335
Show file tree
Hide file tree
Showing 14 changed files with 153 additions and 147 deletions.
16 changes: 6 additions & 10 deletions lenskit-hpf/tests/test_hpf.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@

_log = logging.getLogger(__name__)

simple_df = pd.DataFrame(
{"item": [1, 1, 2, 3], "user": [10, 12, 10, 13], "rating": [4.0, 3.0, 5.0, 2.0]}
)


@mark.slow
def test_hpf_train_large(tmp_path, ml_ratings):
Expand All @@ -32,8 +28,8 @@ def test_hpf_train_large(tmp_path, ml_ratings):
ds = from_interactions_df(ratings)
algo.train(ds)

assert algo.user_features_.shape[0] == ratings.user.nunique()
assert algo.item_features_.shape[0] == ratings.item.nunique()
assert algo.user_features_.shape[0] == ratings.user_id.nunique()
assert algo.item_features_.shape[0] == ratings.item_id.nunique()

mfile = tmp_path / "hpf.dat"
with mfile.open("wb") as mf:
Expand All @@ -48,7 +44,7 @@ def test_hpf_train_large(tmp_path, ml_ratings):
pipe = topn_pipeline(algo)
pipe.train(ds, retrain=False)

for u in np.random.choice(ratings.user.unique(), size=50, replace=False):
for u in np.random.choice(ratings.user_id.unique(), size=50, replace=False):
recs = pipe.run("recommender", query=u, n=50)
assert isinstance(recs, ItemList)
assert len(recs) == 50
Expand All @@ -61,8 +57,8 @@ def test_hpf_train_binary(tmp_path, ml_ratings):
ds = from_interactions_df(ratings)
algo.train(ds)

assert algo.user_features_.shape[0] == ratings.user.nunique()
assert algo.item_features_.shape[0] == ratings.item.nunique()
assert algo.user_features_.shape[0] == ratings.user_id.nunique()
assert algo.item_features_.shape[0] == ratings.item_id.nunique()

mfile = tmp_path / "hpf.dat"
with mfile.open("wb") as mf:
Expand All @@ -77,7 +73,7 @@ def test_hpf_train_binary(tmp_path, ml_ratings):
pipe = topn_pipeline(algo)
pipe.train(ds, retrain=False)

for u in np.random.choice(ratings.user.unique(), size=50, replace=False):
for u in np.random.choice(ratings.user_id.unique(), size=50, replace=False):
recs = pipe.run("recommender", query=u, n=50)
assert isinstance(recs, ItemList)
assert len(recs) == 50
46 changes: 24 additions & 22 deletions lenskit/lenskit/data/movielens.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
Code to import MovieLens data sets into LensKit.
"""

import logging
import re
from dataclasses import dataclass
from enum import Enum
Expand All @@ -18,11 +17,12 @@

import numpy as np
import pandas as pd
import structlog

from .convert import from_interactions_df
from .dataset import Dataset

_log = logging.getLogger(__name__)
_log = structlog.stdlib.get_logger(__name__)

LOC: TypeAlias = Path | tuple[ZipFile, str]

Expand Down Expand Up @@ -105,50 +105,52 @@ def _ml_detect_and_open(path: str | Path) -> MLData:
ds: MLVersion

if loc.is_file() and loc.suffix == ".zip":
_log.debug("opening zip file at %s", loc)
log = _log.bind(zipfile=str(loc))
log.debug("opening zip file")
zf = ZipFile(loc, "r")
try:
infos = zf.infolist()
first = infos[0]
if not first.is_dir:
_log.error("%s: first entry is not directory")
log.error("first entry is not directory")
raise RuntimeError("invalid ML zip file")

_log.debug("%s: base dir filename %s", loc, first.filename)
log.debug("base dir filename %s", first.filename)
dsm = re.match(r"^(ml-(?:\d+[MmKk]|latest|latest-small))", first.filename)
if not dsm:
_log.error("%s: invalid directory name %s", loc, first.filename)
log.error("invalid directory name %s", first.filename)
raise RuntimeError("invalid ML zip file")

ds = MLVersion(dsm.group(1).lower())
_log.debug("%s: found ML data set %s", loc, ds)
log.debug("found ML data set %s", ds)
return MLData(ds, zf, first.filename)
except Exception as e: # pragma nocover
zf.close()
raise e
else:
_log.debug("loading from directory %s", loc)
log = _log.bind(dir=str(loc))
log.debug("loading from directory")
dsm = re.match(r"^(ml-\d+[MmKk])", loc.name)
if dsm:
ds = MLVersion(dsm.group(1))
_log.debug("%s: inferred data set %s from dir name", loc, ds)
_log.debug("inferred data set %s from dir name", ds)
else:
_log.debug("%s: checking contents for data type", loc)
_log.debug("checking contents for data type")
if (loc / "u.data").exists():
_log.debug("%s: found u.data, interpreting as 100K")
_log.debug("found u.data, interpreting as 100K")
ds = MLVersion.ML_100K
elif (loc / "ratings.dat").exists():
if (loc / "tags.dat").exists():
_log.debug("%s: found ratings.dat and tags.dat, interpreting as 10M", loc)
_log.debug("found ratings.dat and tags.dat, interpreting as 10M")
ds = MLVersion.ML_10M
else:
_log.debug("%s: found ratings.dat but no tags, interpreting as 1M", loc)
_log.debug("found ratings.dat but no tags, interpreting as 1M")
ds = MLVersion.ML_1M
elif (loc / "ratings.csv").exists():
_log.debug("%s: found ratings.csv, interpreting as modern (20M and later)", loc)
_log.debug("found ratings.csv, interpreting as modern (20M and later)")
ds = MLVersion.ML_MODERN
else:
_log.error("%s: could not detect MovieLens data", loc)
_log.error("could not detect MovieLens data")
raise RuntimeError("invalid ML directory")

return MLData(ds, loc)
Expand All @@ -160,10 +162,10 @@ def _load_ml_100k(ml: MLData) -> pd.DataFrame:
data,
sep="\t",
header=None,
names=["user", "item", "rating", "timestamp"],
names=["user_id", "item_id", "rating", "timestamp"],
dtype={
"user": np.int32,
"item": np.int32,
"user_id": np.int32,
"item_id": np.int32,
"rating": np.float32,
"timestamp": np.int32,
},
Expand All @@ -176,11 +178,11 @@ def _load_ml_million(ml: MLData) -> pd.DataFrame:
data,
sep=":",
header=None,
names=["user", "_ui", "item", "_ir", "rating", "_rt", "timestamp"],
names=["user_id", "_ui", "item_id", "_ir", "rating", "_rt", "timestamp"],
usecols=[0, 2, 4, 6],
dtype={
"user": np.int32,
"item": np.int32,
"user_id": np.int32,
"item_id": np.int32,
"rating": np.float32,
"timestamp": np.int32,
},
Expand All @@ -197,4 +199,4 @@ def _load_ml_modern(ml: MLData) -> pd.DataFrame:
"rating": np.float32,
"timestamp": np.int64,
},
).rename(columns={"userId": "user", "movieId": "item"})
).rename(columns={"userId": "user_id", "movieId": "item_id"})
2 changes: 1 addition & 1 deletion lenskit/tests/basic/test_bias.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def test_bias_train_ml_ratings(ml_ratings: pd.DataFrame, ml_ds: Dataset):
ares, data = pd.Series(imeans_algo, index=bm.items.ids()).align(imeans_data)
assert ares.values == approx(data.values)

urates = ml_ratings.set_index("user").loc[2].set_index("item").rating
urates = ml_ratings.set_index("user_id").loc[2].set_index("item_id").rating
umean = (urates - imeans_data[urates.index]).mean()
p = bias(2, ItemList(item_ids=[10, 11, -1]))
assert len(p) == 3
Expand Down
2 changes: 1 addition & 1 deletion lenskit/tests/data/test_bulk.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_iter_df(ml_ratings: pd.DataFrame):
counts[key] = len(il)

counts = pd.Series(counts)
in_counts = ml_ratings.value_counts("user")
in_counts = ml_ratings.value_counts("user_id")
assert len(counts) == len(in_counts)
counts, in_counts = counts.align(in_counts, join="outer")
assert np.all(counts == in_counts)
6 changes: 3 additions & 3 deletions lenskit/tests/data/test_dataset_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,21 @@


def test_from_ratings_default_names(ml_ratings: pd.DataFrame):
ratings = ml_ratings.rename(columns={"user": "user_id", "item": "item_id"})
ratings = ml_ratings
ds = from_interactions_df(ratings)
assert ds.item_count == ratings["item_id"].nunique()
assert ds.user_count == ratings["user_id"].nunique()


def test_from_ratings_nosuffix(ml_ratings: pd.DataFrame):
ratings = ml_ratings.rename(columns={"user": "user", "item": "item"})
ratings = ml_ratings.rename(columns={"user_id": "user", "item_id": "item"})
ds = from_interactions_df(ratings)
assert ds.item_count == ratings["item"].nunique()
assert ds.user_count == ratings["user"].nunique()


def test_from_ratings_names_upper(ml_ratings: pd.DataFrame):
ratings = ml_ratings.rename(columns={"user": "USER", "item": "ITEM"})
ratings = ml_ratings.rename(columns={"user_id": "USER", "item_id": "ITEM"})
ds = from_interactions_df(ratings)
assert ds.item_count == ratings["ITEM"].nunique()
assert ds.user_count == ratings["USER"].nunique()
Expand Down
26 changes: 15 additions & 11 deletions lenskit/tests/data/test_dataset_lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,17 @@ def test_item_stats(ml_ratings: pd.DataFrame):
assert len(stats) == ml_ds.item_count
assert np.all(stats.index == ml_ds.items.index)

assert np.all(stats["count"] == ml_ratings["item"].value_counts().reindex(ml_ds.items))
assert np.all(stats["user_count"] == ml_ratings["item"].value_counts().reindex(ml_ds.items))
assert np.all(stats["rating_count"] == ml_ratings["item"].value_counts().reindex(ml_ds.items))
assert np.all(stats["count"] == ml_ratings["item_id"].value_counts().reindex(ml_ds.items))
assert np.all(stats["user_count"] == ml_ratings["item_id"].value_counts().reindex(ml_ds.items))
assert np.all(
stats["rating_count"] == ml_ratings["item_id"].value_counts().reindex(ml_ds.items)
)

assert stats["mean_rating"].values == approx(
ml_ratings.groupby("item")["rating"].mean().reindex(ml_ds.items).values
ml_ratings.groupby("item_id")["rating"].mean().reindex(ml_ds.items).values
)

ts = ml_ratings.groupby("item")["timestamp"].min().reindex(ml_ds.items)
ts = ml_ratings.groupby("item_id")["timestamp"].min().reindex(ml_ds.items)
bad = stats["first_time"] != ts
nbad = np.sum(bad)
if nbad:
Expand All @@ -54,16 +56,18 @@ def test_user_stats(ml_ratings: pd.DataFrame):
assert len(stats) == ml_ds.user_count
assert np.all(stats.index == ml_ds.users.index)

assert np.all(stats["count"] == ml_ratings["user"].value_counts().reindex(ml_ds.users))
assert np.all(stats["item_count"] == ml_ratings["user"].value_counts().reindex(ml_ds.users))
assert np.all(stats["rating_count"] == ml_ratings["user"].value_counts().reindex(ml_ds.users))
assert np.all(stats["count"] == ml_ratings["user_id"].value_counts().reindex(ml_ds.users))
assert np.all(stats["item_count"] == ml_ratings["user_id"].value_counts().reindex(ml_ds.users))
assert np.all(
stats["rating_count"] == ml_ratings["user_id"].value_counts().reindex(ml_ds.users)
)

assert stats["mean_rating"].values == approx(
ml_ratings.groupby("user")["rating"].mean().reindex(ml_ds.users).values
ml_ratings.groupby("user_id")["rating"].mean().reindex(ml_ds.users).values
)
assert np.all(
stats["first_time"] == ml_ratings.groupby("user")["timestamp"].min().reindex(ml_ds.users)
stats["first_time"] == ml_ratings.groupby("user_id")["timestamp"].min().reindex(ml_ds.users)
)
assert np.all(
stats["last_time"] == ml_ratings.groupby("user")["timestamp"].max().reindex(ml_ds.users)
stats["last_time"] == ml_ratings.groupby("user_id")["timestamp"].max().reindex(ml_ds.users)
)
18 changes: 9 additions & 9 deletions lenskit/tests/data/test_dataset_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ def test_pandas_log_defaults(ml_ratings: pd.DataFrame, ml_ds: Dataset):
uids = ml_ds.users.ids(int_df["user_num"])
iids = ml_ds.items.ids(int_df["item_num"])

ml_df = ml_ratings.sort_values(["user", "item"])
assert np.all(uids == ml_df["user"])
assert np.all(iids == ml_df["item"])
ml_df = ml_ratings.sort_values(["user_id", "item_id"])
assert np.all(uids == ml_df["user_id"])
assert np.all(iids == ml_df["item_id"])
assert np.all(int_df["rating"] == ml_df["rating"])
assert np.all(int_df["timestamp"] == ml_df["timestamp"])

Expand All @@ -54,9 +54,9 @@ def test_pandas_log_ids(ml_ratings: pd.DataFrame, ml_ds: Dataset):
# the interact
int_df = int_df.sort_values(["user_id", "item_id"])

ml_df = ml_ratings.sort_values(["user", "item"])
assert np.all(int_df["user_id"] == ml_df["user"])
assert np.all(int_df["item_id"] == ml_df["item"])
ml_df = ml_ratings.sort_values(["user_id", "item_id"])
assert np.all(int_df["user_id"] == ml_df["user_id"])
assert np.all(int_df["item_id"] == ml_df["item_id"])
assert np.all(int_df["rating"] == ml_df["rating"])
assert np.all(int_df["timestamp"] == ml_df["timestamp"])

Expand All @@ -78,9 +78,9 @@ def test_pandas_log_no_ts(ml_ratings: pd.DataFrame, ml_ds: Dataset):
uids = ml_ds.users.ids(int_df["user_num"])
iids = ml_ds.items.ids(int_df["item_num"])

ml_df = ml_ratings.sort_values(["user", "item"])
assert np.all(uids == ml_df["user"])
assert np.all(iids == ml_df["item"])
ml_df = ml_ratings.sort_values(["user_id", "item_id"])
assert np.all(uids == ml_df["user_id"])
assert np.all(iids == ml_df["item_id"])
assert np.all(int_df["rating"] == ml_df["rating"])

# and the total length
Expand Down
Loading

0 comments on commit ce74335

Please sign in to comment.