Skip to content

Commit

Permalink
Merge pull request #493 from ZiyaoWei/main
Browse files Browse the repository at this point in the history
feat: Add TimeBoundedPopScore for time-bounded popularity
  • Loading branch information
mdekstrand authored Dec 22, 2024
2 parents 3cc9b4a + 5fc77fa commit 1dd80e8
Show file tree
Hide file tree
Showing 2 changed files with 143 additions and 3 deletions.
67 changes: 64 additions & 3 deletions lenskit/lenskit/basic/popularity.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import logging
from datetime import datetime

import numpy as np
import pandas as pd
from typing_extensions import override

Expand Down Expand Up @@ -43,7 +45,11 @@ def train(self, data: Dataset):
_log.info("counting item popularity")
stats = data.item_stats()
scores = stats["count"]
self.item_scores_ = self._train_internal(scores)

return self

def _train_internal(self, scores: pd.Series):
if self.score_method == "rank":
_log.info("ranking %d items", len(scores))
scores = scores.rank().sort_index()
Expand All @@ -59,13 +65,68 @@ def train(self, data: Dataset):
else:
raise ValueError("invalid scoring method " + repr(self.score_method))

self.item_scores_ = scores

return self
return scores

def __call__(self, items: ItemList) -> ItemList:
    """
    Attach the trained popularity scores to a list of items.

    The stored ``item_scores_`` are re-indexed by the IDs of ``items``, and
    a new :py:class:`ItemList` is built from ``items`` with those scores.
    """
    requested_ids = items.ids()
    aligned = self.item_scores_.reindex(requested_ids)
    return ItemList(items, scores=aligned)

def __str__(self):
    """Return a short description that includes the configured scoring method."""
    return f"PopScore({self.score_method})"


class TimeBoundedPopScore(PopScorer):
    """
    Score items by their time-bounded popularity, i.e., their popularity
    among the interactions recorded strictly after a ``cutoff`` time. Use
    with :py:class:`TopN` to get a most-popular-recent-items recommender.

    If the training data carries no timestamps, training logs a warning and
    falls back to the popularity computed by :py:class:`PopScorer`.

    Args:
        cutoff(datetime.datetime):
            Only interactions whose timestamp is strictly greater than
            ``cutoff.timestamp()`` are counted. Note that
            :py:meth:`datetime.datetime.timestamp` interprets a naive
            ``datetime`` as local time.
        score_method(str):
            The method for computing popularity scores. Can be one of the following:

            - ``'quantile'`` (the default)
            - ``'rank'``
            - ``'count'``

    Attributes:
        item_scores_(pandas.Series):
            Time-bounded item popularity scores.
    """

    def __init__(self, cutoff: datetime, score_method="quantile"):
        super().__init__(score_method)

        self.cutoff = cutoff
        # NOTE(review): presumably already set by PopScorer.__init__; kept so
        # this class does not depend on that.
        self.score_method = score_method

    @override
    def train(self, data: Dataset, **kwargs):
        """
        Count each item's interactions after the cutoff and score the counts.

        Args:
            data: The dataset to train on.
            **kwargs: Forwarded to :py:meth:`PopScorer.train` when the data
                has no timestamps; not used otherwise.

        Returns:
            This scorer, with ``item_scores_`` populated.
        """
        _log.info("counting time-bounded item popularity")

        log = data.interaction_log("numpy")

        if log.timestamps is None:
            _log.warning("no timestamps in interaction log; falling back to PopScorer")
            # The parent train() stores its result in ``item_scores_`` itself.
            super().train(data, **kwargs)
            return self

        # NOTE(review): the comparison assumes the log's timestamps use the
        # same unit as datetime.timestamp() (seconds since the epoch) — confirm.
        start_timestamp = self.cutoff.timestamp()
        recent_item_nums = log.item_nums[log.timestamps > start_timestamp]

        # One slot per item, so items with no recent interactions keep a
        # count of 0. np.add.at is unbuffered, so an item number that occurs
        # several times is incremented once per occurrence.
        counts = np.zeros(data.item_count, dtype=np.int32)
        np.add.at(counts, recent_item_nums, 1)

        # _train_internal takes only the raw scores, so training options are
        # deliberately not forwarded to it.
        self.item_scores_ = super()._train_internal(
            pd.Series(counts, index=data.items.index),
        )

        return self

    @override
    def __str__(self):
        return "TimeBoundedPopScore({}, {})".format(self.cutoff, self.score_method)
79 changes: 79 additions & 0 deletions lenskit/tests/basic/test_time_bounded_popular.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# This file is part of LensKit.
# Copyright (C) 2018-2023 Boise State University
# Copyright (C) 2023-2024 Drexel University
# Licensed under the MIT license, see LICENSE.md for details.
# SPDX-License-Identifier: MIT

import pickle
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

from lenskit.basic import popularity
from lenskit.data import from_interactions_df
from lenskit.data.items import ItemList

# Shared fixture: four interactions over three items.  Item 1 is interacted
# with at ``ts``; the other three interactions happen exactly one day earlier.
ts = datetime(year=2024, month=1, day=1)
one_day_ago = ts - timedelta(days=1)
two_days_ago = ts - timedelta(days=2)
simple_df = pd.DataFrame(
    {
        "item": [1, 2, 2, 3],
        "user": [10, 12, 10, 13],
        "rating": [4.0, 3.0, 5.0, 2.0],
        "timestamp": [ts.timestamp()] + [one_day_ago.timestamp()] * 3,
    }
)
simple_ds = from_interactions_df(simple_df)


def test_time_bounded_pop_score_quantile_one_day_window():
    """With the cutoff one day back, only item 1's interaction is newer than it."""
    expected = pd.Series([1.0, 0.0, 0.0], index=[1, 2, 3])

    scorer = popularity.TimeBoundedPopScore(one_day_ago)
    scorer.train(simple_ds)

    assert scorer.item_scores_.equals(expected)


def test_time_bounded_pop_score_quantile_one_day_window_call_interface():
    """Calling the trained scorer on an item list returns one score per item."""
    scorer = popularity.TimeBoundedPopScore(one_day_ago)
    scorer.train(simple_ds)

    scored = scorer(ItemList(item_ids=[1, 2, 3]))

    assert len(scored) == 3
    matches = scored.scores() == [1.0, 0.0, 0.0]
    assert matches.all()


def test_time_bounded_pop_score_quantile_two_day_window():
    """With the cutoff two days back, every interaction in the fixture is counted."""
    expected = pd.Series([0.25, 1.0, 0.5], index=[1, 2, 3])

    scorer = popularity.TimeBoundedPopScore(two_days_ago)
    scorer.train(simple_ds)

    assert scorer.item_scores_.equals(expected)


def test_time_bounded_pop_score_fallbacks_to_pop_score_for_dataset_without_timestamps():
    """Without a timestamp column the cutoff is ignored and all interactions count."""
    untimed_df = simple_df.drop(columns=["timestamp"])
    ds = from_interactions_df(untimed_df)
    expected = pd.Series([0.25, 1.0, 0.5], index=[1, 2, 3])

    scorer = popularity.TimeBoundedPopScore(one_day_ago)
    scorer.train(ds)

    assert scorer.item_scores_.equals(expected)


def test_time_bounded_pop_score_rank():
    """The ``rank`` method gives the two single-interaction items a tied rank."""
    expected = pd.Series([1.5, 3.0, 1.5], index=[1, 2, 3])

    scorer = popularity.TimeBoundedPopScore(two_days_ago, "rank")
    scorer.train(simple_ds)

    assert scorer.item_scores_.equals(expected)


def test_time_bounded_pop_score_counts():
    """The ``count`` method exposes the raw int32 interaction counts."""
    expected = pd.Series([1, 2, 1], index=[1, 2, 3], dtype=np.int32)

    scorer = popularity.TimeBoundedPopScore(two_days_ago, "count")
    scorer.train(simple_ds)

    assert scorer.item_scores_.equals(expected)


def test_time_bounded_pop_score_save_load():
    """A trained scorer keeps its item scores across a pickle round trip."""
    original = popularity.TimeBoundedPopScore(one_day_ago)
    original.train(simple_ds)

    restored = pickle.loads(pickle.dumps(original))

    same = restored.item_scores_ == original.item_scores_
    assert all(same)

0 comments on commit 1dd80e8

Please sign in to comment.