Merge pull request #493 from ZiyaoWei/main

feat: Add TimeBoundedPopScore for time-bounded popularity
lenskit · Dec 22, 2024 · 1dd80e8 · 1dd80e8
2 parents 3cc9b4a + 5fc77fa
commit 1dd80e8
Show file tree

Hide file tree

Showing 2 changed files with 143 additions and 3 deletions.
diff --git a/lenskit/lenskit/basic/popularity.py b/lenskit/lenskit/basic/popularity.py
@@ -1,5 +1,7 @@
 import logging
+from datetime import datetime
 
+import numpy as np
 import pandas as pd
 from typing_extensions import override
 
@@ -43,7 +45,11 @@ def train(self, data: Dataset):
         _log.info("counting item popularity")
         stats = data.item_stats()
         scores = stats["count"]
+        self.item_scores_ = self._train_internal(scores)
 
+        return self
+
+    def _train_internal(self, scores: pd.Series):
         if self.score_method == "rank":
             _log.info("ranking %d items", len(scores))
             scores = scores.rank().sort_index()
@@ -59,13 +65,68 @@ def train(self, data: Dataset):
         else:
             raise ValueError("invalid scoring method " + repr(self.score_method))
 
-        self.item_scores_ = scores
-
-        return self
+        return scores
 
     def __call__(self, items: ItemList) -> ItemList:
+        """
+        Attach trained popularity scores to a list of items.
+
+        Reindexes ``self.item_scores_`` by ``items.ids()`` and returns a new
+        :py:class:`ItemList` built from ``items`` with those scores.
+
+        NOTE(review): assumes ``item_scores_`` is a pandas Series, in which
+        case IDs with no trained score would come back as NaN -- confirm.
+        """
         scores = self.item_scores_.reindex(items.ids())
         return ItemList(items, scores=scores)
 
     def __str__(self):
+        """Return a short label of the form ``PopScore(<score_method>)``."""
         return "PopScore({})".format(self.score_method)
+
+
+class TimeBoundedPopScore(PopScorer):
+    """
+    Score items by their time-bounded popularity, i.e., their popularity
+    among the interactions recorded strictly after a cutoff time.  Use with
+    :py:class:`TopN` to get a most-popular-recent-items recommender.
+
+    Args:
+        cutoff(datetime.datetime):
+            Start of the popularity window.  Only interactions whose timestamp
+            is strictly greater than ``cutoff.timestamp()`` are counted, so an
+            interaction exactly at the cutoff is excluded.  A naive
+            ``datetime`` is interpreted as local time by
+            :py:meth:`datetime.datetime.timestamp`; pass a timezone-aware
+            value if the result must not depend on the machine's time zone.
+        score_method(str):
+            The method for computing popularity scores.  Can be one of the following:
+
+            - ``'quantile'`` (the default)
+            - ``'rank'``
+            - ``'count'``
+
+    Attributes:
+        cutoff(datetime.datetime):
+            The cutoff supplied at construction.
+        score_method(str):
+            The scoring method supplied at construction.
+        item_scores_(pandas.Series):
+            Time-bounded item popularity scores.
+    """
+
+    def __init__(self, cutoff: datetime, score_method="quantile"):
+        super().__init__(score_method)
+
+        self.cutoff = cutoff
+        self.score_method = score_method
+
+    @override
+    def train(self, data: Dataset, **kwargs):
+        """
+        Count the interactions newer than ``cutoff`` and turn them into scores.
+
+        When the interaction log carries no timestamps, a warning is logged
+        and the all-time popularity computed by :py:class:`PopScorer` is used
+        instead.
+
+        Args:
+            data: The dataset whose interaction log is counted.
+            **kwargs: Forwarded unchanged to the parent implementation.
+
+        Returns:
+            This scorer, with ``item_scores_`` populated.
+        """
+        _log.info("counting time-bounded item popularity")
+
+        log = data.interaction_log("numpy")
+
+        if log.timestamps is None:
+            _log.warning("no timestamps in interaction log; falling back to PopScorer")
+            # PopScorer.train stores item_scores_ on this instance itself.
+            super().train(data, **kwargs)
+            return self
+
+        start_timestamp = self.cutoff.timestamp()
+        recent_item_nums = log.item_nums[log.timestamps > start_timestamp]
+
+        # Items with no recent interaction must still appear, with a count of 0.
+        counts = np.zeros(data.item_count, dtype=np.int32)
+        # np.add.at accumulates repeated indices; ``counts[idx] += 1`` would not.
+        np.add.at(counts, recent_item_nums, 1)
+
+        # NOTE(review): any non-empty kwargs are passed straight through here.
+        self.item_scores_ = super()._train_internal(
+            pd.Series(counts, index=data.items.index),
+            **kwargs,
+        )
+
+        return self
+
+    @override
+    def __str__(self):
+        return "TimeBoundedPopScore({}, {})".format(self.cutoff, self.score_method)
diff --git a/lenskit/tests/basic/test_time_bounded_popular.py b/lenskit/tests/basic/test_time_bounded_popular.py
@@ -0,0 +1,79 @@
+# This file is part of LensKit.
+# Copyright (C) 2018-2023 Boise State University
+# Copyright (C) 2023-2024 Drexel University
+# Licensed under the MIT license, see LICENSE.md for details.
+# SPDX-License-Identifier: MIT
+
+import pickle
+from datetime import datetime, timedelta
+
+import numpy as np
+import pandas as pd
+
+from lenskit.basic import popularity
+from lenskit.data import from_interactions_df
+from lenskit.data.items import ItemList
+
+# Shared fixture: a naive reference instant plus the two cutoffs used below.
+ts = datetime(year=2024, month=1, day=1)
+one_day_ago = ts - timedelta(days=1)
+two_days_ago = ts - timedelta(days=2)
+# Four interactions: item 1 once at ``ts``; item 2 twice and item 3 once at
+# ``one_day_ago``.  Timestamps are stored as POSIX seconds via ``.timestamp()``.
+simple_df = pd.DataFrame(
+    {
+        "item": [1, 2, 2, 3],
+        "user": [10, 12, 10, 13],
+        "rating": [4.0, 3.0, 5.0, 2.0],
+        "timestamp": [i.timestamp() for i in [ts, one_day_ago, one_day_ago, one_day_ago]],
+    }
+)
+simple_ds = from_interactions_df(simple_df)
+
+
+def test_time_bounded_pop_score_quantile_one_day_window():
+    """With a one-day cutoff only item 1 has an interaction inside the window."""
+    scorer = popularity.TimeBoundedPopScore(one_day_ago)
+    scorer.train(simple_ds)
+
+    expected = pd.Series([1.0, 0.0, 0.0], index=[1, 2, 3])
+    assert scorer.item_scores_.equals(expected)
+
+
+def test_time_bounded_pop_score_quantile_one_day_window_call_interface():
+    """Calling the trained scorer on an item list yields the same scores."""
+    scorer = popularity.TimeBoundedPopScore(one_day_ago)
+    scorer.train(simple_ds)
+
+    query = ItemList(item_ids=[1, 2, 3])
+    scored = scorer(query)
+
+    assert len(scored) == 3
+    assert (scored.scores() == [1.0, 0.0, 0.0]).all()
+
+
+def test_time_bounded_pop_score_quantile_two_day_window():
+    """With a two-day cutoff every interaction in the fixture is counted."""
+    scorer = popularity.TimeBoundedPopScore(two_days_ago)
+    scorer.train(simple_ds)
+
+    expected = pd.Series([0.25, 1.0, 0.5], index=[1, 2, 3])
+    assert scorer.item_scores_.equals(expected)
+
+
+def test_time_bounded_pop_score_fallbacks_to_pop_score_for_dataset_without_timestamps():
+    """Without a timestamp column the scorer falls back to all-time popularity."""
+    untimed_df = simple_df.drop(columns=["timestamp"])
+    untimed_ds = from_interactions_df(untimed_df)
+
+    scorer = popularity.TimeBoundedPopScore(one_day_ago)
+    scorer.train(untimed_ds)
+
+    expected = pd.Series([0.25, 1.0, 0.5], index=[1, 2, 3])
+    assert scorer.item_scores_.equals(expected)
+
+
+def test_time_bounded_pop_score_rank():
+    """Rank scoring over the two-day window; items 1 and 3 tie."""
+    scorer = popularity.TimeBoundedPopScore(two_days_ago, "rank")
+    scorer.train(simple_ds)
+
+    expected = pd.Series([1.5, 3.0, 1.5], index=[1, 2, 3])
+    assert scorer.item_scores_.equals(expected)
+
+
+def test_time_bounded_pop_score_counts():
+    """Count scoring returns the raw int32 interaction counts in the window."""
+    scorer = popularity.TimeBoundedPopScore(two_days_ago, "count")
+    scorer.train(simple_ds)
+
+    expected = pd.Series([1, 2, 1], index=[1, 2, 3], dtype=np.int32)
+    assert scorer.item_scores_.equals(expected)
+
+
+def test_time_bounded_pop_score_save_load():
+    """A trained scorer keeps its item scores across a pickle round trip."""
+    original = popularity.TimeBoundedPopScore(one_day_ago)
+    original.train(simple_ds)
+
+    restored = pickle.loads(pickle.dumps(original))
+
+    assert all(restored.item_scores_ == original.item_scores_)