diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 14ed6869e..55af5d198 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -556,7 +556,7 @@ jobs: path: | data !data/*.zip - key: test-mldata-000-2ee919d5c0eef34d5a4f40bcf0480c1bf0310417db6921e3a2575c48991f379c2f4ad179f8514390133795614a96fa5b4ece55906c68a90af07c09670b2c3c5b + key: test-mldata-001-2ee919d5c0eef34d5a4f40bcf0480c1bf0310417db6921e3a2575c48991f379c2f4ad179f8514390133795614a96fa5b4ece55906c68a90af07c09670b2c3c5b - name: Download ML data run: | python -m lenskit.data.fetch ml-100k ml-20m @@ -613,7 +613,7 @@ jobs: path: | data !data/*.zip - key: test-mldata-000-cd26f1c44a6962b0936346b346a9b418a3ed04b01a2892269fccd24a6387e943dba6d5e64ab2f8feb1823475601d65c2e6ebbeeeca0c2c210f0d37c00aabf2e9 + key: test-mldata-001-cd26f1c44a6962b0936346b346a9b418a3ed04b01a2892269fccd24a6387e943dba6d5e64ab2f8feb1823475601d65c2e6ebbeeeca0c2c210f0d37c00aabf2e9 - name: Download ML data run: | python -m lenskit.data.fetch ml-100k ml-1m ml-10m ml-20m diff --git a/.vscode/ltex.dictionary.en-US.txt b/.vscode/ltex.dictionary.en-US.txt index b7e0c9ac0..2882dffb6 100644 --- a/.vscode/ltex.dictionary.en-US.txt +++ b/.vscode/ltex.dictionary.en-US.txt @@ -9,3 +9,4 @@ lenskit invoker CUDA subpackages +recomputation diff --git a/conftest.py b/conftest.py index 277df074b..9deb7043d 100644 --- a/conftest.py +++ b/conftest.py @@ -15,6 +15,7 @@ from pytest import fixture, skip from lenskit.parallel import ensure_parallel_init +from lenskit.util.test import ml_100k, ml_ds, ml_ratings # noqa: F401 logging.getLogger("numba").setLevel(logging.INFO) diff --git a/docs/GettingStarted.ipynb b/docs/GettingStarted.ipynb index 9742b32e7..04c933104 100644 --- a/docs/GettingStarted.ipynb +++ b/docs/GettingStarted.ipynb @@ -26,8 +26,8 @@ "metadata": {}, "outputs": [], "source": [ - "from lenskit.datasets import ML100K\n", "from lenskit.data import from_interactions_df\n", + "from lenskit.data.movielens import 
load_movielens_df\n", "from lenskit import batch, topn, util\n", "from lenskit import crossfold as xf\n", "from lenskit.algorithms import Recommender, als, knn\n", @@ -77,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -156,15 +156,14 @@ "4 166 346 1.0 886397596" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ml100k = ML100K(here('data/ml-100k'))\n", - "ratings = ml100k.ratings\n", - "ratings.head()" + "ml100k = load_movielens_df(here('data/ml-100k.zip'))\n", + "ml100k.head()" ] }, { @@ -178,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -210,7 +209,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -235,22 +234,22 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/Users/mde48/LensKit/lkpy/lenskit/lenskit/data/matrix.py:152: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /Users/runner/miniforge3/conda-bld/libtorch_1716578890680/work/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)\n", - " matrix = matrix.to_sparse_csr()\n" + "/Users/mde48/LensKit/lkpy/lenskit/lenskit/data/dataset.py:628: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. 
(Triggered internally at /Users/runner/miniforge3/conda-bld/libtorch_1719361060788/work/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)\n", + " return torch.sparse_csr_tensor(\n" ] } ], "source": [ "all_recs = []\n", "test_data = []\n", - "for train, test in xf.partition_users(ratings[['user', 'item', 'rating']], 5, xf.SampleFrac(0.2)):\n", + "for train, test in xf.partition_users(ml100k[['user', 'item', 'rating']], 5, xf.SampleFrac(0.2)):\n", " test_data.append(test)\n", " all_recs.append(eval('ItemItem', algo_ii, train, test))\n", " all_recs.append(eval('ALS', algo_als, train, test))" @@ -265,7 +264,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -299,32 +298,32 @@ " \n", " \n", " 0\n", - " 1125\n", - " 5.014371\n", + " 1449\n", + " 4.994975\n", " 2\n", " 1\n", " ItemItem\n", " \n", " \n", " 1\n", - " 1449\n", - " 4.967544\n", + " 1398\n", + " 4.866851\n", " 2\n", " 2\n", " ItemItem\n", " \n", " \n", " 2\n", - " 427\n", - " 4.863028\n", + " 511\n", + " 4.845399\n", " 2\n", " 3\n", " ItemItem\n", " \n", " \n", " 3\n", - " 483\n", - " 4.855851\n", + " 1512\n", + " 4.805413\n", " 2\n", " 4\n", " ItemItem\n", @@ -332,7 +331,7 @@ " \n", " 4\n", " 1594\n", - " 4.846334\n", + " 4.788468\n", " 2\n", " 5\n", " ItemItem\n", @@ -343,14 +342,14 @@ ], "text/plain": [ " item score user rank Algorithm\n", - "0 1125 5.014371 2 1 ItemItem\n", - "1 1449 4.967544 2 2 ItemItem\n", - "2 427 4.863028 2 3 ItemItem\n", - "3 483 4.855851 2 4 ItemItem\n", - "4 1594 4.846334 2 5 ItemItem" + "0 1449 4.994975 2 1 ItemItem\n", + "1 1398 4.866851 2 2 ItemItem\n", + "2 511 4.845399 2 3 ItemItem\n", + "3 1512 4.805413 2 4 ItemItem\n", + "4 1594 4.788468 2 5 ItemItem" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -369,7 +368,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -387,7 +386,7 @@ }, { 
"cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -427,27 +426,27 @@ " ItemItem\n", " 2\n", " 100\n", - " 0.085382\n", + " 0.081186\n", " \n", " \n", - " 7\n", + " 6\n", " 100\n", - " 0.223133\n", + " 0.288946\n", " \n", " \n", " 8\n", " 100\n", - " 0.097582\n", + " 0.082112\n", " \n", " \n", - " 9\n", + " 10\n", " 100\n", - " 0.063818\n", + " 0.364167\n", " \n", " \n", - " 10\n", + " 14\n", " 100\n", - " 0.211332\n", + " 0.182636\n", " \n", " \n", "\n", @@ -456,14 +455,14 @@ "text/plain": [ " nrecs ndcg\n", "Algorithm user \n", - "ItemItem 2 100 0.085382\n", - " 7 100 0.223133\n", - " 8 100 0.097582\n", - " 9 100 0.063818\n", - " 10 100 0.211332" + "ItemItem 2 100 0.081186\n", + " 6 100 0.288946\n", + " 8 100 0.082112\n", + " 10 100 0.364167\n", + " 14 100 0.182636" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -484,19 +483,19 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Algorithm\n", - "ALS 0.140061\n", - "ItemItem 0.099664\n", + "ALS 0.132649\n", + "ItemItem 0.096963\n", "Name: ndcg, dtype: float64" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -507,7 +506,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -516,13 +515,13 @@ "" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAiwAAAHhCAYAAABN6eUeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8fJSN1AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAtn0lEQVR4nO3df1SUdd7/8dfwayZL0ERBXUTw3lW8ScvhXm8wPLVrmHZn3mtFv3BPanvTaTNgve809C5tkzbNmyyB1aDWzqbsru2pVjJpSw8GJ1cC6xS33W0i3jYcg4rRLMDh+v7hcb73NGAOGvMBno9zrnO8PvO+rnlfHad5+bl+jM2yLEsAAAAGCwl2AwAAAN+FwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYLywYDdwsXR1denTTz/V0KFDZbPZgt0OAAA4D5Zl6cSJExozZoxCQnqeRxkwgeXTTz9VXFxcsNsAAAC9cPToUf3gBz/o8fUBE1iGDh0q6cwBR0ZGBrkbAABwPtxut+Li4rzf4z0ZMIHl7GmgyMhIAgsAAP3Md13OwUW3AADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADBerwJLUVGREhIS5HA45HQ6VVVV1WOty+XSHXfcoYkTJyokJEQ5OTnn3Pf27dtls9k0f/783rQGAAAGoIADS3l5uXJycpSfn6+6ujqlp6drzpw5ampq6ra+vb1dI0eOVH5+vqZOnXrOfR85ckTLli1Tenp6oG0BAIABLODAsmHDBi1evFhLlixRUlKSCgsLFRcXp+Li4m7rx48fr6eeekoLFy5UVFRUj/v1eDy68847tXr1aiUmJgbaFgAAGMACCiwdHR2qra1VRkaGz3hGRoaqq6svqJE1a9Zo5MiRWrx48XnVt7e3y+12+ywAAGBgCgukuKWlRR6PRzExMT7jMTExam5u7nUTb7/9tkpLS1VfX3/e2xQUFGj16tW9fs+BZPzyncFuAX2o8fEbgt0CAPS5Xl10a7PZfNYty/IbO18nTpzQXXfdpS1btig6Ovq8t1uxYoXa2tq8y9GjR3v1/gAAwHwBzbBER0crNDTUbzbl+PHjfrMu5+vvf/+7GhsbdeONN3rHurq6zjQXFqZDhw5pwoQJftvZ7XbZ7fZevScAAOhfApphiYiIkNPpVGVlpc94ZWWl0tLSetXApEmT9P7776u+vt67zJs3T9dee63q6+sVFxfXq/0CAICBI6AZFknKy8tTVlaWUlJSlJqaqs2bN6upqUnZ2dmSzpyqOXbsmLZu3erd5uy1KSdPntRnn32m+vp6RUREaPLkyXI4HEpOTvZ5j2HDhkmS3zgAABicAg4smZmZam1t1Zo1a+RyuZScnKyKigrFx8dLOvOguG8/k+Wqq67y/rm2tlYvvvii4uPj1djYeGHdAwCAQcFmWZYV7CYuBrfbraioKLW1tSkyMjLY7fQp7hIaXLhLCMBAcr7f3/yWEAAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGC8XgWWoqIiJSQkyOFwyOl0qqqqqsdal8ulO+64QxMnTlRISIhycnL8arZs2aL09HQNHz5cw4cP16xZs7R///7etAYAAAaggANLeXm5cnJylJ+fr7q6OqWnp2vOnDlqamrqtr69vV0jR45Ufn6+pk6d2m3
Nnj17dPvtt+utt95STU2Nxo0bp4yMDB07dizQ9gAAwABksyzLCmSD6dOna9q0aSouLvaOJSUlaf78+SooKDjnttdcc42uvPJKFRYWnrPO4/Fo+PDheuaZZ7Rw4cJua9rb29Xe3u5dd7vdiouLU1tbmyIjI8//gAaA8ct3BrsF9KHGx28IdgsAcNG43W5FRUV95/d3QDMsHR0dqq2tVUZGhs94RkaGqqure9dpN06dOqXOzk5dfvnlPdYUFBQoKirKu8TFxV209wcAAGYJKLC0tLTI4/EoJibGZzwmJkbNzc0Xranly5dr7NixmjVrVo81K1asUFtbm3c5evToRXt/AABglrDebGSz2XzWLcvyG+utJ554Qtu2bdOePXvkcDh6rLPb7bLb7RflPQEAgNkCCizR0dEKDQ31m005fvy436xLb6xfv15r167VG2+8oSlTplzw/gAAwMAQ0CmhiIgIOZ1OVVZW+oxXVlYqLS3tghpZt26dHn30Ue3atUspKSkXtC8AADCwBHxKKC8vT1lZWUpJSVFqaqo2b96spqYmZWdnSzpzbcmxY8e0detW7zb19fWSpJMnT+qzzz5TfX29IiIiNHnyZElnTgOtWrVKL774osaPH++dwbnssst02WWXXegxAgCAfi7gwJKZmanW1latWbNGLpdLycnJqqioUHx8vKQzD4r79jNZrrrqKu+fa2tr9eKLLyo+Pl6NjY2SzjyIrqOjQzfffLPPdg8//LAeeeSRQFsEAAADTMDPYTHV+d7HPRDxHJbBheewABhIvpfnsAAAAAQDgQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeL0KLEVFRUpISJDD4ZDT6VRVVVWPtS6XS3fccYcmTpyokJAQ5eTkdFu3Y8cOTZ48WXa7XZMnT9af//zn3rQGAAAGoIADS3l5uXJycpSfn6+6ujqlp6drzpw5ampq6ra+vb1dI0eOVH5+vqZOndptTU1NjTIzM5WVlaWDBw8qKytLt956q955551A2wMAAAOQzbIsK5ANpk+frmnTpqm4uNg7lpSUpPnz56ugoOCc215zzTW68sorVVhY6DOemZkpt9ut1157zTt2/fXXa/jw4dq2bVu3+2pvb1d7e7t33e12Ky4uTm1tbYqMjAzkkPq98ct3BrsF9KHGx28IdgsAcNG43W5FRUV95/d3QDMsHR0dqq2tVUZGhs94RkaGqqure9epzsywfHufs2fPPuc+CwoKFBUV5V3i4uJ6/f4AAMBsYYEUt7S0yOPxKCYmxmc8JiZGzc3NvW6iubk54H2uWLFCeXl53vWzMywAMJAwgzq4MIPas4ACy1k2m81n3bIsv7Hve592u112u/2C3hMAAPQPAZ0Sio6OVmhoqN/Mx/Hjx/1mSAIRGxt70fcJAAAGjoACS0REhJxOpyorK33GKysrlZaW1usmUlNT/fa5e/fuC9onAAAYOAI+JZSXl6esrCylpKQoNTVVmzdvVlNTk7KzsyWdubbk2LFj2rp1q3eb+vp6SdLJkyf12Wefqb6+XhEREZo8ebIk6YEHHtDMmTP1m9/8RjfddJNefvllvfHGG9q3b99FOEQAANDfBRxYMjMz1draqjVr1sjlcik5OVkVFRWKj4+XdOZBcd9+JstVV13l/XNtba1efPFFxcfHq7GxUZKUlpam7du3a+XKlVq1apUmTJig8vJyTZ8+/QIODQAADBQBP4fFVOd7H/dAxF0Egwt
3EQwufL4Hl8H4+f5ensMCAAAQDAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOP1KrAUFRUpISFBDodDTqdTVVVV56zfu3evnE6nHA6HEhMTVVJS4ldTWFioiRMn6pJLLlFcXJxyc3P1zTff9KY9AAAwwAQcWMrLy5WTk6P8/HzV1dUpPT1dc+bMUVNTU7f1hw8f1ty5c5Wenq66ujo99NBDWrp0qXbs2OGt+f3vf6/ly5fr4YcfVkNDg0pLS1VeXq4VK1b0/sgAAMCAERboBhs2bNDixYu1ZMkSSWdmRl5//XUVFxeroKDAr76kpETjxo1TYWGhJCkpKUkHDhzQ+vXrtWDBAklSTU2NZsyYoTvuuEOSNH78eN1+++3av39/b48LAAAMIAHNsHR0dKi2tlYZGRk+4xkZGaquru52m5qaGr/62bNn68CBA+rs7JQkXX311aqtrfUGlE8++UQVFRW64YYbeuylvb1dbrfbZwEAAANTQDMsLS0t8ng8iomJ8RmPiYlRc3Nzt9s0Nzd3W3/69Gm1tLRo9OjRuu222/TZZ5/p6quvlmVZOn36tO69914tX768x14KCgq0evXqQNoHAAD9VK8uurXZbD7rlmX5jX1X/f8d37Nnjx577DEVFRXp3Xff1UsvvaS//OUvevTRR3vc54oVK9TW1uZdjh492ptDAQAA/UBAMyzR0dEKDQ31m005fvy43yzKWbGxsd3Wh4WFacSIEZKkVatWKSsry3tdzBVXXKGvvvpKv/jFL5Sfn6+QEP9cZbfbZbfbA2kfAAD0UwHNsERERMjpdKqystJnvLKyUmlpad1uk5qa6le/e/dupaSkKDw8XJJ06tQpv1ASGhoqy7K8szEAAGDwCviUUF5enp599lmVlZWpoaFBubm5ampqUnZ2tqQzp2oWLlzorc/OztaRI0eUl5enhoYGlZWVqbS0VMuWLfPW3HjjjSouLtb27dt1+PBhVVZWatWqVZo3b55CQ0MvwmECAID+LODbmjMzM9Xa2qo1a9bI5XIpOTlZFRUVio+PlyS5XC6fZ7IkJCSooqJCubm52rRpk8aMGaONGzd6b2mWpJUrV8pms2nlypU6duyYRo4cqRtvvFGPPfbYRThEAADQ39msAXLOxe12KyoqSm1tbYqMjAx2O31q/PKdwW4Bfajx8Z5v98fAw+d7cBmMn+/z/f7mt4QAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxutVYCkqKlJCQoIcDoecTqeqqqrOWb937145nU45HA4lJiaqpKTEr+bLL7/Ufffdp9GjR8vhcCgpKUkVFRW9aQ8AAAwwAQeW8vJy5eTkKD8/X3V1dUpPT9ecOXPU1NTUbf3hw4c1d+5cpaenq66uTg899JCWLl2qHTt2eGs6Ojp03XXXqbGxUX/605906NAhbdmyRWPHju39kQEAgAEjLNANNmzYoMWLF2vJkiWSpML
CQr3++usqLi5WQUGBX31JSYnGjRunwsJCSVJSUpIOHDig9evXa8GCBZKksrIyff7556qurlZ4eLgkKT4+vrfHBAAABpiAZlg6OjpUW1urjIwMn/GMjAxVV1d3u01NTY1f/ezZs3XgwAF1dnZKkl555RWlpqbqvvvuU0xMjJKTk7V27Vp5PJ4ee2lvb5fb7fZZAADAwBRQYGlpaZHH41FMTIzPeExMjJqbm7vdprm5udv606dPq6WlRZL0ySef6E9/+pM8Ho8qKiq0cuVKPfnkk3rsscd67KWgoEBRUVHeJS4uLpBDAQAA/UivLrq12Ww+65Zl+Y19V/3/He/q6tKoUaO0efNmOZ1O3XbbbcrPz1dxcXGP+1yxYoXa2tq8y9GjR3tzKAAAoB8I6BqW6OhohYaG+s2mHD9+3G8W5azY2Nhu68PCwjRixAhJ0ujRoxUeHq7Q0FBvTVJSkpqbm9XR0aGIiAi//drtdtnt9kDaBwAA/VRAMywRERFyOp2qrKz0Ga+srFRaWlq326SmpvrV7969WykpKd4LbGfMmKGPP/5YXV1d3pqPPvpIo0eP7jasAACAwSXgU0J5eXl69tlnVVZWpoaGBuXm5qqpqUnZ2dmSzpyqWbhwobc+OztbR44cUV5enhoaGlRWVqbS0lItW7bMW3PvvfeqtbVVDzzwgD766CPt3LlTa9eu1X333XcRDhEAAPR3Ad/WnJmZqdbWVq1Zs0Yul0vJycmqqKjw3obscrl8nsmSkJCgiooK5ebmatOmTRozZow2btzovaVZkuLi4rR7927l5uZqypQpGjt2rB544AE9+OCDF+EQAQBAf2ezzl4B28+53W5FRUWpra1NkZGRwW6nT41fvjPYLaAPNT5+Q7BbQB/i8z24DMbP9/l+f/NbQgAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxehVYioqKlJCQIIfDIafTqaqqqnPW7927V06nUw6HQ4mJiSopKemxdvv27bLZbJo/f35vWgMAAANQwIGlvLxcOTk5ys/PV11dndLT0zVnzhw1NTV1W3/48GHNnTtX6enpqqur00MPPaSlS5dqx44dfrVHjhzRsmXLlJ6eHviRAACAASvgwLJhwwYtXrxYS5YsUVJSkgoLCxUXF6fi4uJu60tKSjRu3DgVFhYqKSlJS5Ys0aJFi7R+/XqfOo/HozvvvFOrV69WYmLid/bR3t4ut9vtswAAgIEpoMDS0dGh2tpaZWRk+IxnZGSourq6221qamr86mfPnq0DBw6os7PTO7ZmzRqNHDlSixcvPq9eCgoKFBUV5V3i4uICORQAANCPBBRYWlpa5PF4FBMT4zMeExOj5ubmbrdpbm7utv706dNqaWmRJL399tsqLS3Vli1bzruXFStWqK2tzbscPXo0kEMBAAD9SFhvNrLZbD7rlmX5jX1X/dnxEydO6K677tKWLVsUHR193j3Y7XbZ7fYAugYAAP1VQIElOjpaoaGhfrMpx48f95tFOSs2Nrbb+rCwMI0YMUIffPCBGhsbdeONN3pf7+rqOtNcWJgOHTqkCRMmBNImAAAYYAI6JRQRESGn06nKykqf8crKSqWlpXW7TWpqql/97t27lZKSovDwcE2aNEnvv/++6uvrvcu8efN07bXXqr6+nmtTAABA4KeE8vLylJWVpZSUFKWmpmrz5s1qampSdna2pDPXlhw7dkxbt26VJGVnZ+uZZ55
RXl6e7rnnHtXU1Ki0tFTbtm2TJDkcDiUnJ/u8x7BhwyTJbxwAAAxOAQeWzMxMtba2as2aNXK5XEpOTlZFRYXi4+MlSS6Xy+eZLAkJCaqoqFBubq42bdqkMWPGaOPGjVqwYMHFOwoAADCg2ayzV8D2c263W1FRUWpra1NkZGSw2+lT45fvDHYL6EONj98Q7BbQh/h8Dy6D8fN9vt/f/JYQAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHi9CixFRUVKSEiQw+GQ0+lUVVXVOev37t0rp9Mph8OhxMRElZSU+Ly+ZcsWpaena/jw4Ro+fLhmzZql/fv396Y1AAAwAAUcWMrLy5WTk6P8/HzV1dUpPT1dc+bMUVNTU7f1hw8f1ty5c5Wenq66ujo99NBDWrp0qXbs2OGt2bNnj26//Xa99dZbqqmp0bhx45SRkaFjx471/sgAAMCAYbMsywpkg+nTp2vatGkqLi72jiUlJWn+/PkqKCjwq3/wwQf1yiuvqKGhwTuWnZ2tgwcPqqamptv38Hg8Gj58uJ555hktXLiw25r29na1t7d7191ut+Li4tTW1qbIyMhADqnfG798Z7BbQB9qfPyGYLeAPsTne3AZjJ9vt9utqKio7/z+DmiGpaOjQ7W1tcrIyPAZz8jIUHV1dbfb1NTU+NXPnj1bBw4cUGdnZ7fbnDp1Sp2dnbr88st77KWgoEBRUVHeJS4uLpBDAQAA/UhAgaWlpUUej0cxMTE+4zExMWpubu52m+bm5m7rT58+rZaWlm63Wb58ucaOHatZs2b12MuKFSvU1tbmXY4ePRrIoQAAgH4krDcb2Ww2n3XLsvzGvqu+u3FJeuKJJ7Rt2zbt2bNHDoejx33a7XbZ7fZA2gYAAP1UQIElOjpaoaGhfrMpx48f95tFOSs2Nrbb+rCwMI0YMcJnfP369Vq7dq3eeOMNTZkyJZDWAADAABbQKaGIiAg5nU5VVlb6jFdWViotLa3bbVJTU/3qd+/erZSUFIWHh3vH1q1bp0cffVS7du1SSkpKIG0BAIABLuDbmvPy8vTss8+qrKxMDQ0Nys3NVVNTk7KzsyWdubbk/97Zk52drSNHjigvL08NDQ0qKytTaWmpli1b5q154okntHLlSpWVlWn8+PFqbm5Wc3OzTp48eREOEQAA9HcBX8OSmZmp1tZWrVmzRi6XS8nJyaqoqFB8fLwkyeVy+TyTJSEhQRUVFcrNzdWmTZs0ZswYbdy4UQsWLPDWFBUVqaOjQzfffLPPez388MN65JFHenloAABgoAj4OSymOt/7uAcintMwuAzG5zQMZny+B5fB+Pn+Xp7DAgAAEAwEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADj9SqwFBUVKSEhQQ6HQ06nU1VVVees37t3r5xOpxwOhxITE1VSUuJXs2PHDk2ePFl2u12TJ0/Wn//85960BgAABqCAA0t5ebl
ycnKUn5+vuro6paena86cOWpqauq2/vDhw5o7d67S09NVV1enhx56SEuXLtWOHTu8NTU1NcrMzFRWVpYOHjyorKws3XrrrXrnnXd6f2QAAGDAsFmWZQWywfTp0zVt2jQVFxd7x5KSkjR//nwVFBT41T/44IN65ZVX1NDQ4B3Lzs7WwYMHVVNTI0nKzMyU2+3Wa6+95q25/vrrNXz4cG3btu28+nK73YqKilJbW5siIyMDOaR+b/zyncFuAX2o8fEbgt0C+hCf78FlMH6+z/f7OyyQnXZ0dKi2tlbLly/3Gc/IyFB1dXW329TU1CgjI8NnbPbs2SotLVVnZ6fCw8NVU1Oj3Nxcv5rCwsIee2lvb1d7e7t3va2tTdKZAx9sutpPBbsF9KHB+Hd8MOPzPbgMxs/32WP+rvmTgAJLS0uLPB6PYmJifMZjYmLU3Nzc7TbNzc3d1p8+fVotLS0aPXp0jzU97VOSCgoKtHr1ar/xuLi48z0coF+KKgx2BwC+L4P5833ixAlFRUX1+HpAgeUsm83ms25Zlt/Yd9V/ezzQfa5YsUJ5eXne9a6uLn3++ecaMWLEObfDwOB2uxUXF6ejR48OulOAwEDH53twsSxLJ06c0JgxY85ZF1BgiY6OVmhoqN/Mx/Hjx/1mSM6KjY3ttj4sLEwjRow4Z01P+5Qku90uu93uMzZs2LDzPRQMEJGRkfwPDRig+HwPHueaWTkroLuEIiIi5HQ6VVlZ6TNeWVmptLS0brdJTU31q9+9e7dSUlIUHh5+zpqe9gkAAAaXgE8J5eXlKSsrSykpKUpNTdXmzZvV1NSk7OxsSWdO1Rw7dkxbt26VdOaOoGeeeUZ5eXm65557VFNTo9LSUp+7fx544AHNnDlTv/nNb3TTTTfp5Zdf1htvvKF9+/ZdpMMEAAD9WcCBJTMzU62trVqzZo1cLpeSk5NVUVGh+Ph4SZLL5fJ5JktCQoIqKiqUm5urTZs2acyYMdq4caMWLFjgrUlLS9P27du1cuVKrVq1ShMmTFB5ebmmT59+EQ4RA5HdbtfDDz/sd1oQQP/H5xvdCfg5LAAAAH2N3xICAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8Agv6pSNHjujDDz9UV1dXsFsBAPQBAguM9rvf/c7vV7t/8YtfKDExUVdccYWSk5N19OjR4DQHAOgzBBYYraSkxOc3Jnbt2qXnnntOW7du1d/+9jcNGzas21/tBtC/fPPNN1q3bp3mzp2rlJQUTZs2zWcBevVrzUBf+eijj5SSkuJdf/nllzVv3jzdeeedkqS1a9fq7rvvDlZ7AC6SRYsWqbKyUjfffLN+/OMfy2azBbslGIbAAqN9/fXXPr/WWl1drUWLFnnXExMT/X7pG0D/s3PnTlVUVGjGjBnBbgWG4pQQjBYfH6/a2lpJUktLiz744ANdffXV3tebm5vP62fJAZht7NixGjp0aLDbgMEILDDawoULdd999+nRRx/VLbfcokmTJsnpdHpfr66uVnJychA7BHAxPPnkk3rwwQd15MiRYLcCQ3FKCEZ78MEHderUKb300kuKjY3VH//4R5/X3377bd1+++1B6g7AxZKSkqJvvvlGiYmJGjJkiMLDw31e//zzz4PUGUzBrzWjX+vs7JTL5dK4ceOC3QqACzBr1iw1NTVp8eLFiomJ8bvo9uc//3mQOoMpCCzo1w4ePKhp06bJ4/EEuxUAF2DIkCGqqanR1KlTg90KDMU1LACAoJs0aZK+/vrrYLcBgxFYAABB9/jjj+tXv/qV9uzZo9bWVrndbp8F4JQQ+jVOCQEDQ0jImX8/f/vaFcuyZLPZ+IyDu4Rgtvfee++crx86dKiPOgHwfXrrrbeC3QIMxwwLjBYSEiKbzaZz/TXlX18AMPAxwwKjHT58+Dtrvvjiiz7oBMD3raqqSr/97W/1ySef6I9//KPGjh2rF154QQkJCT5PuMbgxEW3MFp8fHy3y7Bhw7Rz50797Gc
/83nyLYD+aceOHZo9e7YuueQSvfvuu2pvb5cknThxQmvXrg1ydzABgQX9yptvvqm77rpLo0eP1tNPP605c+bowIEDwW4LwAX69a9/rZKSEm3ZssXnKbdpaWl69913g9gZTMEpIRjvf//3f/X888+rrKxMX331lW699VZ1dnZqx44dmjx5crDbA3ARHDp0SDNnzvQbj4yM1Jdfftn3DcE4zLDAaHPnztXkyZP14Ycf6umnn9ann36qp59+OthtAbjIRo8erY8//thvfN++fUpMTAxCRzANMyww2u7du7V06VLde++9+uEPfxjsdgB8T/7t3/5NDzzwgMrKymSz2fTpp5+qpqZGy5Yt03/+538Guz0YgMACo1VVVamsrEwpKSmaNGmSsrKylJmZGey2AFxk//Ef/6G2tjZde+21+uabbzRz5kzZ7XYtW7ZMv/zlL4PdHgzAc1jQL5w6dUrbt29XWVmZ9u/fL4/How0bNmjRokUaOnRosNsDcJGcOnVKH374obq6ujR58mRddtllwW4JhiCwoN85dOiQSktL9cILL+jLL7/Uddddp1deeSXYbQG4AIsWLdJTTz3l9w+Qr776Svfff7/KysqC1BlMQWBBv+XxePTqq6+qrKyMwAL0c6GhoXK5XBo1apTPeEtLi2JjY3X69OkgdQZTcA0L+q3Q0FDNnz9f8+fPD3YrAHrJ7XbLsixZlqUTJ07I4XB4X/N4PKqoqPALMRicCCwAgKAZNmyYbDabbDabfvSjH/m9brPZtHr16iB0BtNwSggAEDR79+6VZVn6yU9+oh07dujyyy/3vhYREaH4+HiNGTMmiB3CFAQWAEDQHTlyROPGjZPNZgt2KzAUgQUAEDTvvffeedVNmTLle+4EpiOwAACCJiQkRDabTef6KrLZbPJ4PH3YFUzERbcAgKA5fPhwsFtAP8EMCwAAMB4zLAAAI3zzzTd67733dPz4cXV1dfm8Nm/evCB1BVMQWAAAQbdr1y4tXLhQLS0tfq9xDQskKSTYDQAA8Mtf/lK33HKLXC6Xurq6fBbCCiSuYQEAGCAyMlJ1dXWaMGFCsFuBoZhhAQAE3c0336w9e/YEuw0YjBkWAEDQnTp1SrfccotGjhypK664QuHh4T6vL126NEidwRQEFgBA0D377LPKzs7WJZdcohEjRvg8ot9ms+mTTz4JYncwAYEFABB0sbGxWrp0qZYvX66QEK5WgD/+VgAAgq6jo0OZmZmEFfSIvxkAgKD7+c9/rvLy8mC3AYPx4DgAQNB5PB498cQTev311zVlyhS/i243bNgQpM5gCq5hAQAE3bXXXtvjazabTW+++WYfdgMTEVgAAIDxuIYFAGCMjz/+WK+//rq+/vprSRL/psZZBBYAQNC1trbqpz/9qX70ox9p7ty5crlckqQlS5boV7/6VZC7gwkILACAoMvNzVV4eLiampo0ZMgQ73hmZqZ27doVxM5gCu4SAgAE3e7du/X666/rBz/4gc/4D3/4Qx05ciRIXcEkzLAAAILuq6++8plZOaulpUV2uz0IHcE0BBYAQNDNnDlTW7du9a7bbDZ1dXVp3bp157zlGYMHtzUDAILuww8/1DXXXCOn06k333xT8+bN0wcffKDPP/9cb7/9tiZMmBDsFhFkBBYAgBGam5tVXFys2tpadXV1adq0abrvvvs0evToYLcGAxBYAABB19TUpLi4ONlstm5fGzduXBC6gkkILACAoAsNDZXL5dKoUaN8xltbWzVq1Ch5PJ4gdQZTcNEtACDoLMvqdnbl5MmTcjgcQegIpuE5LACAoMnLy5N05q6gVatW+dza7PF49M477+jKK68MUncwCYEFABA0dXV1ks7MsLz//vuKiIjwvhYREaGpU6dq2bJlwWoPBuEaFgBA0N19993auHGjhg4dGuxWYCgCCwAgaH72s5+dV91LL730PXcC03FKCAAQNFFRUcFuAf0EMywAAMB43NYMAACMR2ABAADGI7AAAAD
jEVgAAIDxCCwALtiePXtks9n05ZdfGvNe48ePV2Fh4ffeD4C+QWABcN6qq6sVGhqq66+/Pmg9pKWlyeVyeW+Hff755zVs2LCg9QOgbxBYAJy3srIy3X///dq3b5+ampr6/P07OzsVERGh2NjYbn8oD8DARWABcF6++uor/eEPf9C9996rf/mXf9Hzzz9/zvotW7YoLi5OQ4YM0b/+679qw4YNfjMhxcXFmjBhgiIiIjRx4kS98MILPq/bbDaVlJTopptu0qWXXqpf//rXPqeE9uzZo7vvvlttbW2y2Wyy2Wx65JFHvNufOnVKixYt0tChQzVu3Dht3rzZ+1pjY6NsNpv+8Ic/KD09XZdccon+6Z/+SR999JH+9re/KSUlRZdddpmuv/56ffbZZxf6nw/AhbIA4DyUlpZaKSkplmVZ1quvvmqNHz/e6urqsizLst566y1LkvXFF19YlmVZ+/bts0JCQqx169ZZhw4dsjZt2mRdfvnlVlRUlHd/L730khUeHm5t2rTJOnTokPXkk09aoaGh1ptvvumtkWSNGjXKKi0ttf7+979bjY2NPu/V3t5uFRYWWpGRkZbL5bJcLpd14sQJy7IsKz4+3rr88sutTZs2Wf/zP/9jFRQUWCEhIVZDQ4NlWZZ1+PBhS5I1adIka9euXdaHH35o/fM//7M1bdo065prrrH27dtnvfvuu9Y//MM/WNnZ2X3wXxjAuRBYAJyXtLQ0q7Cw0LIsy+rs7LSio6OtyspKy7L8A0tmZqZ1ww03+Gx/5513+gSWtLQ065577vGpueWWW6y5c+d61yVZOTk5PjXffq/nnnvOZ79nxcfHW3fddZd3vauryxo1apRVXFxsWdb/DyzPPvust2bbtm2WJOuvf/2rd6ygoMCaOHHiuf7TAOgDnBIC8J0OHTqk/fv367bbbpMkhYWFKTMzU2VlZT3W//jHP/YZ+/Z6Q0ODZsyY4TM2Y8YMNTQ0+IylpKT0uu8pU6Z4/2yz2RQbG6vjx4/3WBMTEyNJuuKKK3zGvr0NgL7Hjx8C+E6lpaU6ffq0xo4d6x2zLEvh4eH64osv/Ooty/K7KNbq5mfLuqv59till17a677Dw8P93q+rq6vHmrPv/e2xb28DoO8xwwLgnE6fPq2tW7fqySefVH19vXc5ePCg4uPj9fvf/95vm0mTJmn//v0+YwcOHPBZT0pK0r59+3zGqqurlZSUFFB/ERER8ng8AW0DoP9hhgXAOf3lL3/RF198ocWLF3uffXLWzTffrNLSUv3Xf/2Xz/j999+vmTNnasOGDbrxxhv15ptv6rXXXvOZPfn3f/933XrrrZo2bZp++tOf6tVXX9VLL72kN954I6D+xo8fr5MnT+qvf/2rpk6dqiFDhmjIkCG9P2AARmKGBcA5lZaWatasWX5hRZIWLFig+vp6vfvuuz7jM2bMUElJiTZs2KCpU6dq165dys3NlcPh8NbMnz9fTz31lNatW6d//Md/1G9/+1s999xzuuaaawLqLy0tTdnZ2crMzNTIkSP1xBNP9Oo4AZjNZnV3YhkALrJ77rlH//3f/62qqqpgtwKgH+KUEIDvxfr163Xdddfp0ksv1Wuvvabf/e53KioqCnZbAPopZlgAfC9uvfVW7dmzRydOnFBiYqLuv/9+ZWdnB7stAP0UgQUAABiPi24BAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOP9Pz0IvsSzsXRNAAAAAElFTkSuQmCC", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAiwAAAHhCAYAAABN6eUeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/TGe4hAAAACXBIWXMAAA9hAAAPYQGoP6dpAAArW0lEQVR4nO3df1BV953/8dcV+RETwd+gFhHcVnGJpl62Lhgc0xqMZmPcaiS/sBM1XTI2CtRdf6CbxDSSqnGpiUBVSGqmUdpiJ8lKVKzR0cDEStB0Ims2GxXXXMZAEvBHBLyc7x+O97s3F4wXDecDPB8zZ8b7Oe9z7vs4XnnxOT+uw7IsSwAAAAbrYXcDAAAA34bAAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgvJ52N3CrtLS06LPPPlPv3r3lcDjsbgcAANwAy7J0/vx5DRkyRD16tD2P0mUCy2effabIyEi72wAAAO1w5swZfe9732tzfZcJLL1795Z09YBDQ0Nt7gYAANyIhoYGRUZGen6Ot6XLBJZrp4FCQ0MJLAAAdDLfdjkHF90CAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjNfT7gZw84Yv3Wl3C+hAp1683+4WAKDDMcMCAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB47Qosubm5io6OVkhIiJxOpw4ePNhmrcvl0qOPPqqRI0eqR48eSk9P96nZvHmzkpKS1LdvX/Xt21eTJ0/W4cOH29MaAADogvwOLEVFRUpPT1dWVpYqKyuVlJSkqVOnqrq6utX6xsZGDRw4UFlZWRo7dmyrNfv379cjjzyid999V+Xl5Ro2bJiSk5N19uxZf9sDAABdkMOyLMufDcaPH69x48YpLy/PMxYbG6sZM2YoOzv7uttOmjRJd911l3Jycq5b53a71bdvX73yyiuaM2dOqzWNjY1qbGz0vG5oaFBkZKTq6+sVGhp64wfUBQxfutPuFtCBTr14v90tAMAt09DQoLCwsG/9+e3XDEtTU5MqKiqUnJzsNZ6cnKyysrL2ddqKS5cuqbm5Wf369WuzJjs7W2FhYZ4lMjLylr0/AAAwi1+Bpba2Vm63W+Hh4V7j4eHhqqmpuWVNLV26VEOHDtXkyZPbrFm2bJnq6+s9y5kzZ27Z+wMAALP0bM9GDofD67VlWT5j7bVmzRpt27ZN+/fvV0hISJt1wcHBCg4OviXvCQAAzOZXYBkwYIACAgJ8ZlPOnTvnM+vSHuvWrdPq1au1d+9ejRkz5qb3BwAAuga/TgkFBQXJ6XSqtLTUa7y0tFSJiYk31cjatWv1/PPPa9euXYqPj7+pfQEAgK7F71NCmZmZSk1NVXx8vBISErRp0yZVV1crLS1N0tVrS86ePautW7d6tjl69Kgk6cKFC/r888919OhRBQUFafTo0ZKungZauXKl3njjDQ0fPtwzg3PHHXfojjvuuNljBAAAnZzfgSUlJUV1dXVatWqVXC6X4uLiVFJSoqioKElXHxT3zWey/PCHP/T8uaKiQm+88YaioqJ06tQpSVcfRNfU1KRZs2Z5bffMM8/o2Wef9bdFAADQxfj9HBZT3eh93F0Rz2HpXngOC4Cu5Dt5DgsAAIAdCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgA
AjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeO0KLLm5uYqOjlZISIicTqcOHjzYZq3L5dKjjz6qkSNHqkePHkpPT2+1rri4WKNHj1ZwcLBGjx6tP//5z+1pDQAAdEF+B5aioiKlp6crKytLlZWVSkpK0tSpU1VdXd1qfWNjowYOHKisrCyNHTu21Zry8nKlpKQoNTVVx44dU2pqqmbPnq3333/f3/YAAEAX5LAsy/Jng/Hjx2vcuHHKy8vzjMXGxmrGjBnKzs6+7raTJk3SXXfdpZycHK/xlJQUNTQ06J133vGM3Xffferbt6+2bdvW6r4aGxvV2Njoed3Q0KDIyEjV19crNDTUn0Pq9IYv3Wl3C+hAp1683+4WAOCWaWhoUFhY2Lf+/PZrhqWpqUkVFRVKTk72Gk9OTlZZWVn7OtXVGZZv7nPKlCnX3Wd2drbCwsI8S2RkZLvfHwAAmM2vwFJbWyu3263w8HCv8fDwcNXU1LS7iZqaGr/3uWzZMtXX13uWM2fOtPv9AQCA2Xq2ZyOHw+H12rIsn7Hvep/BwcEKDg6+qfcEAACdg18zLAMGDFBAQIDPzMe5c+d8Zkj8ERERccv3CQAAug6/AktQUJCcTqdKS0u9xktLS5WYmNjuJhISEnz2uWfPnpvaJwAA6Dr8PiWUmZmp1NRUxcfHKyEhQZs2bVJ1dbXS0tIkXb225OzZs9q6datnm6NHj0qSLly4oM8//1xHjx5VUFCQRo8eLUlatGiRJk6cqF//+td68MEH9eabb2rv3r06dOjQLThEAADQ2fkdWFJSUlRXV6dVq1bJ5XIpLi5OJSUlioqKknT1QXHffCbLD3/4Q8+fKyoq9MYbbygqKkqnTp2SJCUmJmr79u1asWKFVq5cqREjRqioqEjjx4+/iUMDAABdhd/PYTHVjd7H3RXxHJbuheewAOhKbvTnd7vuEgIAdAx+Iele+IWkbXz5IQAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADBeuwJLbm6uoqOjFRISIqfTqYMHD163/sCBA3I6nQoJCVFMTIzy8/N9anJycjRy5EjddtttioyMVEZGhi5fvtye9gAAQBfjd2ApKipSenq6srKyVFlZqaSkJE2dOlXV1dWt1p88eVLTpk1TUlKSKisrtXz5ci1cuFDFxcWemt///vdaunSpnnnmGVVVVamgoEBFRUVatmxZ+48MAAB0GT393WD9+vWaN2+e5s+fL+nqzMju3buVl5en7Oxsn/r8/HwNGzZMOTk5kqTY2FgdOXJE69at08yZMyVJ5eXlmjBhgh599FFJ0vDhw/XII4/o8OHDbfbR2NioxsZGz+uGhgZ/DwUAAHQSfs2wNDU1qaKiQsnJyV7jycnJKisra3Wb8vJyn/opU6boyJEjam5uliTdfffdqqio8ASUTz/9VCUlJbr//vvb7CU7O1thYWGeJTIy0p9DAQAAnYhfgaW2tlZut1vh4eFe4+Hh4aqpqWl1m5qamlbrr1y5otraWknSww8/rOeff153332
3AgMDNWLECN1zzz1aunRpm70sW7ZM9fX1nuXMmTP+HAoAAOhE/D4lJEkOh8PrtWVZPmPfVv9/x/fv368XXnhBubm5Gj9+vD755BMtWrRIgwcP1sqVK1vdZ3BwsIKDg9vTPgAA6GT8CiwDBgxQQECAz2zKuXPnfGZRromIiGi1vmfPnurfv78kaeXKlUpNTfVcF3PnnXfq4sWL+vnPf66srCz16MHd1wAAdGd+JYGgoCA5nU6VlpZ6jZeWlioxMbHVbRISEnzq9+zZo/j4eAUGBkqSLl265BNKAgICZFmWZzYGAAB0X35PXWRmZmrLli0qLCxUVVWVMjIyVF1drbS0NElXry2ZM2eOpz4tLU2nT59WZmamqqqqVFhYqIKCAi1evNhT88ADDygvL0/bt2/XyZMnVVpaqpUrV2r69OkKCAi4BYcJAAA6M7+vYUlJSVFdXZ1WrVoll8uluLg4lZSUKCoqSpLkcrm8nskSHR2tkpISZWRkaOPGjRoyZIg2bNjguaVZklasWCGHw6EVK1bo7NmzGjhwoB544AG98MILt+AQAQBAZ+ewusg5l4aGBoWFham+vl6hoaF2t9Ohhi/daXcL6ECnXmz7dn90PXy+u5fu+Pm+0Z/fXM0KAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxmtXYMnNzVV0dLRCQkLkdDp18ODB69YfOHBATqdTISEhiomJUX5+vk/NV199pQULFmjw4MEKCQlRbGysSkpK2tMeAADoYvwOLEVFRUpPT1dWVpYqKyuVlJSkqVOnqrq6utX6kydPatq0aUpKSlJlZaWWL1+uhQsXqri42FPT1NSke++9V6dOndKf/vQnnThxQps3b9bQoUPbf2QAAKDL6OnvBuvXr9e8efM0f/58SVJOTo52796tvLw8ZWdn+9Tn5+dr2LBhysnJkSTFxsbqyJEjWrdunWbOnClJKiws1BdffKGysjIFBgZKkqKiotp7TAAAoIvxa4alqalJFRUVSk5O9hpPTk5WWVlZq9uUl5f71E+ZMkVHjhxRc3OzJOmtt95SQkKCFixYoPDwcMXFxWn16tVyu91t9tLY2KiGhgavBQAAdE1+BZba2lq53W6Fh4d7jYeHh6umpqbVbWpqalqtv3LlimprayVJn376qf70pz/J7XarpKREK1as0EsvvaQXXnihzV6ys7MVFhbmWSIjI/05FAAA0Im066Jbh8Ph9dqyLJ+xb6v/v+MtLS0aNGiQNm3aJKfTqYcfflhZWVnKy8trc5/Lli1TfX29Zzlz5kx7DgUAAHQCfl3DMmDAAAUEBPjMppw7d85nFuWaiIiIVut79uyp/v37S5IGDx6swMBABQQEeGpiY2NVU1OjpqYmBQUF+ew3ODhYwcHB/rQPAAA6Kb9mWIKCguR0OlVaWuo1XlpaqsTExFa3SUhI8Knfs2eP4uPjPRfYTpgwQZ988olaWlo8NR9//LEGDx7calgBAADdi9+nhDIzM7VlyxYVFhaqqqpKGRkZqq6uVlpamqSrp2rmzJnjqU9LS9Pp06eVmZmpqqoqFRYWqqCgQIsXL/bUPPXUU6qrq9OiRYv08ccfa+fOnVq9erUWLFhwCw4RAAB0dn7f1pySkqK6ujqtWrVKLpdLcXFxKikp8dyG7HK5vJ7JEh0drZKSEmVkZGjjxo0aMmSINmzY4LmlWZIiIyO1Z88eZWRkaMy
YMRo6dKgWLVqkJUuW3IJDBAAAnZ3DunYFbCfX0NCgsLAw1dfXKzQ01O52OtTwpTvtbgEd6NSL99vdAjoQn+/upTt+vm/05zffJQQAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGa1dgyc3NVXR0tEJCQuR0OnXw4MHr1h84cEBOp1MhISGKiYlRfn5+m7Xbt2+Xw+HQjBkz2tMaAADogvwOLEVFRUpPT1dWVpYqKyuVlJSkqVOnqrq6utX6kydPatq0aUpKSlJlZaWWL1+uhQsXqri42Kf29OnTWrx4sZKSkvw/EgAA0GX5HVjWr1+vefPmaf78+YqNjVVOTo4iIyOVl5fXan1+fr6GDRumnJwcxcbGav78+Zo7d67WrVvnVed2u/XYY4/pueeeU0xMTPuOBgAAdEl+BZampiZVVFQoOTnZazw5OVllZWWtblNeXu5TP2XKFB05ckTNzc2esVWrVmngwIGaN2/eDfXS2NiohoYGrwUAAHRNfgWW2tpaud1uhYeHe42Hh4erpqam1W1qamparb9y5Ypqa2slSe+9954KCgq0efPmG+4lOztbYWFhniUyMtKfQwEAAJ1Iuy66dTgcXq8ty/IZ+7b6a+Pnz5/X448/rs2bN2vAgAE33MOyZctUX1/vWc6cOePHEQAAgM6kpz/FAwYMUEBAgM9syrlz53xmUa6JiIhotb5nz57q37+/PvroI506dUoPPPCAZ31LS8vV5nr21IkTJzRixAif/QYHBys4ONif9gEAQCfl1wxLUFCQnE6nSktLvcZLS0uVmJjY6jYJCQk+9Xv27FF8fLwCAwM1atQo/e1vf9PRo0c9y/Tp03XPPffo6NGjnOoBAAD+zbBIUmZmplJTUxUfH6+EhARt2rRJ1dXVSktLk3T1VM3Zs2e1detWSVJaWppeeeUVZWZm6sknn1R5ebkKCgq0bds2SVJISIji4uK83qNPnz6S5DMOAAC6J78DS0pKiurq6rRq1Sq5XC7FxcWppKREUVFRkiSXy+X1TJbo6GiVlJQoIyNDGzdu1JAhQ7RhwwbNnDnz1h0FAADo0hzWtStgO7mGhgaFhYWpvr5eoaGhdrfToYYv3Wl3C+hAp1683+4W0IH4fHcv3fHzfaM/v/kuIQAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADBeuwJLbm6uoqOjFRISIqfTqYMHD163/sCBA3I6nQoJCVFMTIzy8/O91m/evFlJSUnq27ev+vbtq8mTJ+vw4cPtaQ0AAHRBfgeWoqIipaenKysrS5WVlUpKStLUqVNVXV3dav3Jkyc1bdo0JSUlqbKyUsuXL9fChQtVXFzsqdm/f78eeeQRvfvuuyovL9ewYcOUnJyss2fPtv/IAABAl+GwLMvyZ4Px48dr3LhxysvL84zFxsZqxow
Zys7O9qlfsmSJ3nrrLVVVVXnG0tLSdOzYMZWXl7f6Hm63W3379tUrr7yiOXPm3FBfDQ0NCgsLU319vUJDQ/05pE5v+NKddreADnTqxfvtbgEdiM9399IdP983+vPbrxmWpqYmVVRUKDk52Ws8OTlZZWVlrW5TXl7uUz9lyhQdOXJEzc3NrW5z6dIlNTc3q1+/fm320tjYqIaGBq8FAAB0TX4FltraWrndboWHh3uNh4eHq6amptVtampqWq2/cuWKamtrW91m6dKlGjp0qCZPntxmL9nZ2QoLC/MskZGR/hwKAADoRNp10a3D4fB6bVmWz9i31bc2Lklr1qzRtm3btGPHDoWEhLS5z2XLlqm+vt6znDlzxp9DAAAAnUhPf4oHDBiggIAAn9mUc+fO+cyiXBMREdFqfc+ePdW/f3+v8XXr1mn16tXau3evxowZc91egoODFRwc7E/7AACgk/JrhiUoKEhOp1OlpaVe46WlpUpMTGx1m4SEBJ/6PXv2KD4+XoGBgZ6xtWvX6vnnn9euXbsUHx/vT1sAAKCL8/uUUGZmprZs2aLCwkJVVVUpIyND1dXVSktLk3T1VM3/vbMnLS1Np0+fVmZmpqqqqlRYWKiCggItXrzYU7NmzRqtWLFChYWFGj58uGpqalRTU6MLFy7cgkMEAACdnV+nhCQpJSVFdXV1WrVqlVwul+Li4lRSUqKoqChJksvl8nomS3R0tEpKSpSRkaGNGzdqyJAh2rBhg2bOnOmpyc3NVVNTk2bNmuX1Xs8884yeffbZdh4aAADoKvx+DoupeA4Luovu+JyG7ozPd/fSHT/f38lzWAAAAOxAYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGa1dgyc3NVXR0tEJCQuR0OnXw4MHr1h84cEBOp1MhISGKiYlRfn6+T01xcbFGjx6t4OBgjR49Wn/+85/b0xoAAOiC/A4sRUVFSk9PV1ZWliorK5WUlKSpU6equrq61fqTJ09q2rRpSkpKUmVlpZYvX66FCxequLjYU1NeXq6UlBSlpqbq2LFjSk1N1ezZs/X++++3/8gAAECX4bAsy/Jng/Hjx2vcuHHKy8vzjMXGxmrGjBnKzs72qV+yZIneeustVVVVecbS0tJ07NgxlZeXS5JSUlLU0NCgd955x1Nz3333qW/fvtq2bdsN9dXQ0KCwsDDV19crNDTUn0Pq9IYv3Wl3C+hAp1683+4W0IH4fHcv3fHzfaM/v3v6s9OmpiZVVFRo6dKlXuPJyckqKytrdZvy8nIlJyd7jU2ZMkUFBQVqbm5WYGCgysvLlZGR4VOTk5PTZi+NjY1qbGz0vK6vr5d09cC7m5bGS3a3gA7UHf+Nd2d8vruX7vj5vnbM3zZ/4ldgqa2tldvtVnh4uNd4eHi4ampqWt2mpqam1forV66otrZWgwcPbrOmrX1KUnZ2tp577jmf8cjIyBs9HKBTCsuxuwMA35Xu/Pk+f/68wsLC2lzvV2C5xuFweL22LMtn7Nvqvznu7z6XLVumzMxMz+uWlhZ98cUX6t+//3W3Q9fQ0NCgyMhInTlzptudAgS6Oj7f3YtlWTp//ryGDBly3Tq/AsuAAQMUEBDgM/Nx7tw5nxmSayIiIlqt79mzp/r373/dmrb2KUnBwcEKDg72GuvTp8+NHgq6iNDQUP5DA7ooPt/dx/VmVq7x6y6hoKAgOZ1OlZaWeo2
XlpYqMTGx1W0SEhJ86vfs2aP4+HgFBgZet6atfQIAgO7F71NCmZmZSk1NVXx8vBISErRp0yZVV1crLS1N0tVTNWfPntXWrVslXb0j6JVXXlFmZqaefPJJlZeXq6CgwOvun0WLFmnixIn69a9/rQcffFBvvvmm9u7dq0OHDt2iwwQAAJ2Z34ElJSVFdXV1WrVqlVwul+Li4lRSUqKoqChJksvl8nomS3R0tEpKSpSRkaGNGzdqyJAh2rBhg2bOnOmpSUxM1Pbt27VixQqtXLlSI0aMUFFRkcaPH38LDhFdUXBwsJ555hmf04IAOj8+32iN389hAQAA6Gh8lxAAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWNApnT59WsePH1dLS4vdrQAAOgCBBUb73e9+5/Ot3T//+c8VExOjO++8U3FxcTpz5ow9zQEAOgyBBUbLz8/3+o6JXbt26dVXX9XWrVv117/+VX369Gn1W7sBdC6XL1/W2rVrNW3aNMXHx2vcuHFeC9Cub2sGOsrHH3+s+Ph4z+s333xT06dP12OPPSZJWr16tZ544gm72gNwi8ydO1elpaWaNWuWfvSjH8nhcNjdEgxDYIHRvv76a69vay0rK9PcuXM9r2NiYny+6RtA57Nz506VlJRowoQJdrcCQ3FKCEaLiopSRUWFJKm2tlYfffSR7r77bs/6mpqaG/pacgBmGzp0qHr37m13GzAYgQVGmzNnjhYsWKDnn39eDz30kEaNGiWn0+lZX1ZWpri4OBs7BHArvPTSS1qyZIlOnz5tdyswFKeEYLQlS5bo0qVL2rFjhyIiIvTHP/7Ra/17772nRx55xKbuANwq8fHxunz5smJiYtSrVy8FBgZ6rf/iiy9s6gym4Nua0ak1NzfL5XJp2LBhdrcC4CZMnjxZ1dXVmjdvnsLDw30uuv3Zz35mU2cwBYEFndqxY8c0btw4ud1uu1sBcBN69eql8vJyjR071u5WYCiuYQEA2G7UqFH6+uuv7W4DBiOwAABs9+KLL+qXv/yl9u/fr7q6OjU0NHgtAKeE0KlxSgjoGnr0uPr78zevXbEsSw6Hg884uEsIZvvwww+vu/7EiRMd1AmA79K7775rdwswHDMsMFqPHj3kcDh0vX+m/PYFAF0fMyww2smTJ7+15ssvv+yATgB81w4ePKjf/va3+vTTT/XHP/5RQ4cO1euvv67o6GivJ1yje+KiWxgtKiqq1aVPnz7auXOnfvrTn3o9+RZA51RcXKwpU6botttu0wcffKDGxkZJ0vnz57V69Wqbu4MJCCzoVPbt26fHH39cgwcP1ssvv6ypU6fqyJEjdrcF4Cb96le/Un5+vjZv3uz1lNvExER98MEHNnYGU3BKCMb73//9X7322msqLCzUxYsXNXv2bDU3N6u4uFijR4+2uz0At8CJEyc0ceJEn/HQ0FB99dVXHd8QjMMMC4w2bdo0jR49WsePH9fLL7+szz77TC+//LLdbQG4xQYPHqxPPvnEZ/zQoUOKiYmxoSOYhhkWGG3Pnj1auHChnnrqKX3/+9+3ux0A35F/+Zd/0aJFi1RYWCiHw6HPPvtM5eXlWrx4sf793//d7vZgAAILjHbw4EEVFhYqPj5eo0aNUmpqqlJSUuxuC8At9m//9m+qr6/XPffco8uXL2vixIkKDg7W4sWL9Ytf/MLu9mAAnsOCTuHSpUvavn27CgsLdfjwYbndbq1fv15z585V79697W4PwC1y6dIlHT9+XC0tLRo9erTuuOMOu1uCIQgs6HROnDihgoICvf766/rqq69077336q233rK7LQA3Ye7cufrNb37j8wvIxYsX9fTTT6uwsNCmzmAKAgs6LbfbrbfffluFhYUEFqCTCwgIkMvl0qBBg7zGa2trFRERoStXrtjUGUzBNSzotAICAjRjxgzNmDHD7lYAtFNDQ4Msy5JlWTp//rxCQkI869xut0pKSnxCDLonAgsAwDZ9+vSRw+GQw+H
QD37wA5/1DodDzz33nA2dwTScEgIA2ObAgQOyLEs//vGPVVxcrH79+nnWBQUFKSoqSkOGDLGxQ5iCwAIAsN3p06c1bNgwORwOu1uBoQgsAADbfPjhhzdUN2bMmO+4E5iOwAIAsE2PHj3kcDh0vR9FDodDbre7A7uCibjoFgBgm5MnT9rdAjoJZlgAAIDxmGEBABjh8uXL+vDDD3Xu3Dm1tLR4rZs+fbpNXcEUBBYAgO127dqlOXPmqLa21mcd17BAknrY3QAAAL/4xS/00EMPyeVyqaWlxWshrEDiGhYAgAFCQ0NVWVmpESNG2N0KDMUMCwDAdrNmzdL+/fvtbgMGY4YFAGC7S5cu6aGHHtLAgQN15513KjAw0Gv9woULbeoMpiCwAABst2XLFqWlpem2225T//79vR7R73A49Omnn9rYHUxAYAEA2C4iIkILFy7U0qVL1aMHVyvAF/8qAAC2a2pqUkpKCmEFbeJfBgDAdj/72c9UVFRkdxswGA+OAwDYzu12a82aNdq9e7fGjBnjc9Ht+vXrbeoMpuAaFgCA7e6555421zkcDu3bt68Du4GJCCwAAMB4XMMCADDGJ598ot27d+vrr7+WJPE7Na4hsAAAbFdXV6ef/OQn+sEPfqBp06bJ5XJJkubPn69f/vKXNncHExBYAAC2y8jIUGBgoKqrq9WrVy/PeEpKinbt2mVjZzAFdwkBAGy3Z88e7d69W9/73ve8xr///e/r9OnTNnUFkzDDAgCw3cWLF71mVq6pra1VcHCwDR3BNAQWAIDtJk6cqK1bt3peOxwOtbS0aO3atde95RndB7c1AwBsd/z4cU2aNElOp1P79u3T9OnT9dFHH+mLL77Qe++9pxEjRtjdImxGYAEAGKGmpkZ5eXmqqKhQS0uLxo0bpwULFmjw4MF2twYDEFgAALarrq5WZGSkHA5Hq+uGDRtmQ1cwCYEFAGC7gIAAuVwuDRo0yGu8rq5OgwYNktvttqkzmIKLbgEAtrMsq9XZlQsXLigkJMSGjmAansMCALBNZmampKt3Ba1cudLr1ma32633339fd911l03dwSQEFgCAbSorKyVdnWH529/+pqCgIM+6oKAgjR07VosXL7arPRiEa1gAALZ74okntGHDBvXu3dvuVmAoAgsAwDY//elPb6hux44d33EnMB2nhAAAtgkLC7O7BXQSzLAAAADjcVszAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAuGn79++Xw+HQV199Zcx7DR8+XDk5Od95PwA6BoEFwA0rKytTQECA7rvvPtt6SExMlMvl8twO+9prr6lPnz629QOgYxBYANywwsJCPf300zp06JCqq6s7/P2bm5sVFBSkiIiIVr8oD0DXRWABcEMuXryoP/zhD3rqqaf0T//0T3rttdeuW79582ZFRkaqV69e+ud//metX7/eZyYkLy9PI0aMUFBQkEaOHKnXX3/da73D4VB+fr4efPBB3X777frVr37ldUpo//79euKJJ1RfXy+HwyGHw6Fnn33Ws/2lS5c0d+5c9e7dW8OGDdOmTZs8606dOiWHw6E//OEPSkpK0m233aZ/+Id/0Mcff6y//vWvio+P1x133KH77rtPn3/++c3+9QG4WRYA3ICCggIrPj7esizLevvtt63hw4dbLS0tlmVZ1rvvvmtJsr788kvLsizr0KFDVo8ePay1a9daJ06csDZu3Gj169fPCgsL8+xvx44dVmBgoLVx40brxIkT1ksvvWQFBARY+/bt89RIsgYNGmQVFBRY//M//2OdOnXK670aGxutnJwcKzQ01HK5XJbL5bLOnz9vWZZlRUVFWf369bM2btxo/fd//7eVnZ1t9ejRw6qqqrIsy7JOnjxpSbJGjRpl7dq1yzp+/Lj1j//4j9a4ceOsSZMmWYcOHbI++OAD6+/+7u+stLS0DvgbBnA9BBYANyQxMdHKycmxLMuympubrQEDBlilpaWWZfkGlpSUFOv+++/
32v6xxx7zCiyJiYnWk08+6VXz0EMPWdOmTfO8lmSlp6d71XzzvV599VWv/V4TFRVlPf74457XLS0t1qBBg6y8vDzLsv5/YNmyZYunZtu2bZYk6y9/+YtnLDs72xo5cuT1/moAdABOCQH4VidOnNDhw4f18MMPS5J69uyplJQUFRYWtln/ox/9yGvsm6+rqqo0YcIEr7EJEyaoqqrKayw+Pr7dfY8ZM8bzZ4fDoYiICJ07d67NmvDwcEnSnXfe6TX2zW0AdDy+/BDAtyooKNCVK1c0dOhQz5hlWQoMDNSXX37pU29Zls9FsVYrX1vWWs03x26//fZ29x0YGOjzfi0tLW3WXHvvb459cxsAHY8ZFgDXdeXKFW3dulUvvfSSjh496lmOHTumqKgo/f73v/fZZtSoUTp8+LDX2JEjR7xex8bG6tChQ15jZWVlio2N9au/oKAgud1uv7YB0PkwwwLguv7zP/9TX375pebNm+d59sk1s2bNUkFBgf7jP/7Da/zpp5/WxIkTtX79ej3wwAPat2+f3nnnHa/Zk3/913/V7NmzNW7cOP3kJz/R22+/rR07dmjv3r1+9Td8+HBduHBBf/nLXzR27Fj16tVLvXr1av8BAzASMywArqugoECTJ0/2CSuSNHPmTB09elQffPCB1/iECROUn5+v9evXa+zYsdq1a5cyMjIUEhLiqZkxY4Z+85vfaO3atfr7v/97/fa3v9Wrr76qSZMm+dVfYmKi0tLSlJKSooEDB2rNmjXtOk4AZnNYrZ1YBoBb7Mknn9R//dd/6eDBg3a3AqAT4pQQgO/EunXrdO+99+r222/XO++8o9/97nfKzc21uy0AnRQzLAC+E7Nnz9b+/ft1/vx5xcTE6Omnn1ZaWprdbQHopAgsAADAeFx0CwAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAY7/8BsXXWBXROp/wAAAAASUVORK5CYII=", "text/plain": [ "
" ] @@ -534,13 +533,6 @@ "source": [ "results.groupby('Algorithm').ndcg.mean().plot.bar()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/datasets.rst b/docs/datasets.rst deleted file mode 100644 index c851faa49..000000000 --- a/docs/datasets.rst +++ /dev/null @@ -1,79 +0,0 @@ -Loading Data -============ - -LensKit can work with any data in a :py:class:`pandas.DataFrame` with the expected -columns. LensKit algorithms expect a ``ratings`` frame to contain the following -columns (in any order): - -* ``user``, containing user identifiers. No requirements are placed on user IDs — - if an algorithm requires something specific, such as contiguous 0-based identifiers - for indexing into an array — it will use a :py:class:`pandas.Index` to map them. -* ``item``, containing item identifiers. The same comments apply as for ``user``. -* ``rating``, containing user ratings (if available). Implicit-feedback code will - not require ratings. - -‘Rating’ data can contain other columns as well, and is a catch-all for any user-item -interaction data. Algorithms will document any non-standard columns they can make -use of. - -:py:meth:`lenskit.algorithms.Recommender.fit` can also accept additional data objects -as keyword arguments, and algorithms that wrap other algorithms will pass this data -through unchanged. Algorithms ignore extra data objects they receive. This allows -you to build algorithms that train on data besides user-item interactions, such as -user metadata or item content. - -Data Loaders ------------- - -.. module:: lenskit.datasets - -The :py:mod:`lenskit.datasets` module provides utilities for reading a variety -of commonly-used LensKit data sets. It does not package or automatically -download them, but loads them from a local directory where you have unpacked -the data set. 
Each data set class or function takes a ``path`` parameter -specifying the location of the data set. - -The normal mode of operation for these utilities is to provide a class for the -data set; this class then exposes the data set's data as attributes. These -attributes are cached internally, so e.g. accessing :py:attr:`MovieLens.ratings` -twice will only load the data file once. - -These data files have normalized column names to fit with LensKit's general -conventions. These are the following: - -- User ID columns are called ``user``. -- Item ID columns are called ``item``. -- Rating columns are called ``rating``. -- Timestamp columns are called ``timestamp``. - -Other column names are unchanged. Data tables that provide information about -specific things, such as a table of movie titles, are indexed by the relevant -ID (e.g. :py:attr:`MovieLens.ratings` is indexed by ``item``). - -Data sets supported: - -* :class:`MovieLens` -* :class:`ML100K` -* :class:`ML1M` -* :class:`ML10M` - -MovieLens Data Sets -------------------- - -The GroupLens research group provides several data sets extracted from the -MovieLens service :cite:p:`movielens`. -These can be downloaded from https://grouplens.org/datasets/movielens/. - -.. autoclass:: MovieLens - :members: - -.. autoclass:: ML100K - :members: - -.. autoclass:: ML1M - :inherited-members: - :members: - -.. autoclass:: ML10M - :inherited-members: - :members: diff --git a/docs/index.rst b/docs/index.rst index 8bf792201..cc85cb3ce 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -39,7 +39,6 @@ Resources :caption: Running Experiments data - datasets crossfold batch evaluation/index diff --git a/docs/releases/2024.rst b/docs/releases/2024.rst index f3ec639b2..c99c8626b 100644 --- a/docs/releases/2024.rst +++ b/docs/releases/2024.rst @@ -31,6 +31,14 @@ Significant Changes * :py:class:`~lenskit.algorithms.als.ImplicitMF` * :py:class:`~lenskit.algorithms.als.BiasedMF` +* :class:`~lenskit.data.Dataset`. 
LensKit now provides an abstraction for + training data instead of working with Pandas data frames directly, that + allows components to reduce code duplication and recomputation, access data + in multiple formats (Pandas, NumPy, and PyTorch), and provided standardized + structures like mappings of user or item IDs to array indices. This also + supersedes the old bespoke dataset loading support, with functions like + :func:`~lenskit.data.load_movielens` to load standard datasets. + * Many LensKit components (batch running, model training, etc.) now report progress with :py:mod:`progress_api`, and can be connected to TQDM or Enlighten. diff --git a/lenskit-funksvd/tests/test_funksvd.py b/lenskit-funksvd/tests/test_funksvd.py index 0c80dfd59..bfa87121d 100644 --- a/lenskit-funksvd/tests/test_funksvd.py +++ b/lenskit-funksvd/tests/test_funksvd.py @@ -13,7 +13,7 @@ from pytest import approx, mark -from lenskit.data.dataset import from_interactions_df +from lenskit.data.dataset import Dataset, from_interactions_df import lenskit.funksvd as svd import lenskit.util.test as lktu @@ -139,16 +139,16 @@ def test_fsvd_predict_bad_user(): @lktu.wantjit @mark.slow -def test_fsvd_save_load(): - ratings = lktu.ml_test.ratings - +def test_fsvd_save_load(ml_ds: Dataset): original = svd.FunkSVD(20, iterations=20) - original.fit(from_interactions_df(ratings)) + original.fit(ml_ds) assert original.bias is not None - assert original.bias.mean_ == approx(ratings.rating.mean()) - assert original.item_features_.shape == (ratings.item.nunique(), 20) - assert original.user_features_.shape == (ratings.user.nunique(), 20) + assert original.bias.mean_ == approx( + ml_ds.interaction_matrix("scipy", field="rating").data.mean() + ) + assert original.item_features_.shape == (ml_ds.item_count, 20) + assert original.user_features_.shape == (ml_ds.user_count, 20) mod = pickle.dumps(original) _log.info("serialized to %d bytes", len(mod)) @@ -165,8 +165,8 @@ def test_fsvd_save_load(): @lktu.wantjit 
@mark.slow -def test_fsvd_train_binary(): - ratings = lktu.ml_test.ratings.drop(columns=["rating", "timestamp"]) +def test_fsvd_train_binary(ml_ratings: pd.DataFrame): + ratings = ml_ratings.drop(columns=["rating", "timestamp"]) original = svd.FunkSVD(20, iterations=20, bias=False) original.fit(from_interactions_df(ratings)) @@ -178,10 +178,10 @@ def test_fsvd_train_binary(): @lktu.wantjit @mark.slow -def test_fsvd_known_preds(): +def test_fsvd_known_preds(ml_ds: Dataset): algo = svd.FunkSVD(15, iterations=125, lrate=0.001) _log.info("training %s on ml data", algo) - algo.fit(from_interactions_df(lktu.ml_test.ratings)) + algo.fit(ml_ds) dir = Path(__file__).parent pred_file = dir / "funksvd-preds.csv" @@ -207,15 +207,12 @@ def test_fsvd_known_preds(): @lktu.wantjit @mark.slow @mark.eval -@mark.skipif(not lktu.ml100k.available, reason="ML100K data not present") -def test_fsvd_batch_accuracy(): +def test_fsvd_batch_accuracy(ml_100k: pd.DataFrame): import lenskit.crossfold as xf import lenskit.metrics.predict as pm from lenskit import batch from lenskit.algorithms import basic, bias - ratings = lktu.ml100k.ratings - svd_algo = svd.FunkSVD(25, 125, damping=10) algo = basic.Fallback(svd_algo, bias.Bias(damping=10)) @@ -225,7 +222,7 @@ def eval(train, test): _log.info("testing %d users", test.user.nunique()) return batch.predict(algo, test) - folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2)) + folds = xf.partition_users(ml_100k, 5, xf.SampleFrac(0.2)) preds = pd.concat(eval(train, test) for (train, test) in folds) mae = pm.mae(preds.prediction, preds.rating) assert mae == approx(0.74, abs=0.025) diff --git a/lenskit-hpf/tests/test_hpf.py b/lenskit-hpf/tests/test_hpf.py index aa615f647..26d576804 100644 --- a/lenskit-hpf/tests/test_hpf.py +++ b/lenskit-hpf/tests/test_hpf.py @@ -20,10 +20,9 @@ @mark.slow -def test_hpf_train_large(tmp_path): +def test_hpf_train_large(tmp_path, ml_ratings): algo = hpf.HPF(20) - ratings = lktu.ml_test.ratings - ratings = 
ratings.assign(rating=ratings.rating + 0.5) + ratings = ml_ratings.assign(rating=ml_ratings.rating + 0.5) ds = from_interactions_df(ratings) algo.fit(ds) @@ -51,9 +50,9 @@ def test_hpf_train_large(tmp_path): @mark.slow -def test_hpf_train_binary(tmp_path): +def test_hpf_train_binary(tmp_path, ml_ratings): algo = hpf.HPF(20) - ratings = lktu.ml_test.ratings.drop(columns=["timestamp", "rating"]) + ratings = ml_ratings.drop(columns=["timestamp", "rating"]) ds = from_interactions_df(ratings) algo.fit(ds) diff --git a/lenskit-implicit/tests/test_implicit.py b/lenskit-implicit/tests/test_implicit.py index 0a09adf09..15d74017f 100644 --- a/lenskit-implicit/tests/test_implicit.py +++ b/lenskit-implicit/tests/test_implicit.py @@ -19,13 +19,11 @@ @mark.slow -def test_implicit_als_train_rec(): +def test_implicit_als_train_rec(ml_ds): algo = ALS(25) assert algo.factors == 25 - ratings = lktu.ml_test.ratings - ds = from_interactions_df(ratings) - ret = algo.fit(ds) + ret = algo.fit(ml_ds) assert ret is algo recs = algo.recommend(100, n=20) @@ -46,14 +44,11 @@ def test_implicit_als_train_rec(): @mark.slow @mark.eval -@mark.skipif(not lktu.ml100k.available, reason="ML100K not downloaded") @mark.parametrize("n_jobs", [1, None]) -def test_implicit_als_batch_accuracy(n_jobs): +def test_implicit_als_batch_accuracy(ml_100k, n_jobs): import lenskit.crossfold as xf from lenskit import batch, topn - ratings = lktu.ml100k.ratings - algo_t = ALS(25) def eval(train, test): @@ -66,7 +61,7 @@ def eval(train, test): recs = batch.recommend(algo, users, 100, n_jobs=n_jobs) return recs - folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2))) + folds = list(xf.partition_users(ml_100k, 5, xf.SampleFrac(0.2))) test = pd.concat(f.test for f in folds) recs = pd.concat(eval(train, test) for (train, test) in folds) @@ -81,12 +76,11 @@ def eval(train, test): @mark.slow -def test_implicit_bpr_train_rec(): +def test_implicit_bpr_train_rec(ml_ds): algo = BPR(25, use_gpu=False) assert algo.factors 
== 25 - ratings = lktu.ml_test.ratings - algo.fit(from_interactions_df(ratings)) + algo.fit(ml_ds) recs = algo.recommend(100, n=20) assert len(recs) == 20 diff --git a/lenskit/lenskit/algorithms/basic.py b/lenskit/lenskit/algorithms/basic.py index 4d7bfa0c0..023a7cf36 100644 --- a/lenskit/lenskit/algorithms/basic.py +++ b/lenskit/lenskit/algorithms/basic.py @@ -10,6 +10,7 @@ import logging from collections.abc import Iterable, Sequence +from typing import overload import numpy as np import pandas as pd @@ -112,7 +113,13 @@ class Fallback(Predictor): missing values, and so forth. """ - def __init__(self, algorithms, *others): + algorithms: list[Predictor] + + @overload + def __init__(self, algorithms: Iterable[Predictor]): ... + @overload + def __init__(self, algorithms: Predictor, *others: Predictor): ... + def __init__(self, algorithms: Predictor | Iterable[Predictor], *others): """ Args: algorithms: a list of component algorithms. Each one will be trained. @@ -120,12 +127,11 @@ def __init__(self, algorithms, *others): additional algorithms, in which case ``algorithms`` is taken to be a single algorithm. 
""" - if others: - self.algorithms = [algorithms] + list(others) - elif isinstance(algorithms, Iterable) or isinstance(algorithms, Sequence): - self.algorithms = algorithms + if isinstance(algorithms, Iterable) or isinstance(algorithms, Sequence): + assert not others + self.algorithms = list(algorithms) else: - self.algorithms = [algorithms] + self.algorithms = [algorithms] + list(others) @override def fit(self, data: Dataset, **kwargs): @@ -172,7 +178,7 @@ def fit(self, data: Dataset, **kwarsg): @override def candidates(self, user, ratings=None): - return np.array([], dtype=self.dtype_) + return np.array([], dtype=self.dtype_) # type: ignore class UnratedItemCandidateSelector(CandidateSelector): diff --git a/lenskit/lenskit/algorithms/bias.py b/lenskit/lenskit/algorithms/bias.py index 9ffd3f90a..0e934922b 100644 --- a/lenskit/lenskit/algorithms/bias.py +++ b/lenskit/lenskit/algorithms/bias.py @@ -92,7 +92,7 @@ def fit(self, data: Dataset, **kwargs): """ _logger.info("building bias model for %d ratings", data.interaction_count) ratings = data.interaction_matrix("scipy", layout="coo", field="rating") - nrows, ncols = ratings.shape + nrows, ncols = ratings.shape # type: ignore self.mean_ = float(np.mean(ratings.data)) _logger.info("global mean: %.3f", self.mean_) @@ -242,7 +242,7 @@ def inverse_transform_user(self, user, ratings, user_bias=None): def fit_transform(self, data: Dataset, **kwargs) -> pd.DataFrame: """ - Fit with ratings and return the training data transformed. + Fit with ratings and return the training data matrix transformed. """ # FIXME: make this more efficient, don't rename things. 
self.fit(data) diff --git a/lenskit/lenskit/batch/_predict.py b/lenskit/lenskit/batch/_predict.py index 5fe352f90..4cc64df1c 100644 --- a/lenskit/lenskit/batch/_predict.py +++ b/lenskit/lenskit/batch/_predict.py @@ -42,9 +42,9 @@ def predict(algo, pairs, *, n_jobs=None, **kwargs): >>> from lenskit.algorithms.bias import Bias >>> from lenskit.metrics.predict import rmse - >>> from lenskit import datasets >>> from lenskit.data import from_interactions_df - >>> ratings = datasets.MovieLens('data/ml-latest-small').ratings + >>> from lenskit.data.movielens import load_movielens_df + >>> ratings = load_movielens_df('data/ml-latest-small') >>> bias = Bias() >>> bias.fit(from_interactions_df(ratings[:-1000])) diff --git a/lenskit/lenskit/crossfold.py b/lenskit/lenskit/crossfold.py index e13c0e3dd..817d8477d 100644 --- a/lenskit/lenskit/crossfold.py +++ b/lenskit/lenskit/crossfold.py @@ -67,8 +67,8 @@ def sample_rows(data, partitions, size, disjoint=True, *, rng_spec=None): We can loop over a sequence of train-test pairs:: - >>> from lenskit import datasets - >>> ratings = datasets.MovieLens('data/ml-latest-small').ratings + >>> from lenskit.data.movielens import load_movielens_df + >>> ratings = load_movielens_df('data/ml-latest-small') >>> for train, test in sample_rows(ratings, 5, 1000): ... 
print(len(test)) 1000 diff --git a/lenskit/lenskit/data/fetch.py b/lenskit/lenskit/data/fetch.py index 2bbed6b17..531f7abc9 100644 --- a/lenskit/lenskit/data/fetch.py +++ b/lenskit/lenskit/data/fetch.py @@ -9,20 +9,10 @@ import sys from pathlib import Path from urllib.request import urlopen -from zipfile import ZipFile _log = logging.getLogger("lenskit.data.fetch") ML_LOC = "http://files.grouplens.org/datasets/movielens/" -ML_DATASETS = { - "ml-100k": "ml-100k/u.data", - "ml-1m": "ml-1m/ratings.dat", - "ml-10m": "ml-10M100K/ratings.dat", - "ml-20m": "ml-20m/ratings.csv", - "ml-25m": "ml-25m/ratings.csv", - "ml-latest": "ml-latest/ratings.csv", - "ml-latest-small": "ml-latest-small/ratings.csv", -} def fetch_ml(name: str, base_dir: Path): @@ -41,15 +31,14 @@ def fetch_ml(name: str, base_dir: Path): name: The name of the dataset. base_dir: - The base directory into which data should be extracted. + The base directory into which data should be downloaded. """ zipname = f"{name}.zip" zipfile = base_dir / zipname zipurl = ML_LOC + zipname - test_file = base_dir / ML_DATASETS[name] - if test_file.exists(): - _log.info("%s already exists", test_file) + if zipfile.exists(): + _log.info("%s already exists", zipfile) return _log.info("downloading data set %s", name) @@ -61,10 +50,6 @@ def fetch_ml(name: str, base_dir: Path): zf.write(block) block = res.read(8 * 1024 * 1024) - _log.info("unpacking data set") - with ZipFile(zipfile, "r") as zf: - zf.extractall(base_dir) - def _fetch_main(): logging.basicConfig(stream=sys.stderr, level=logging.INFO) diff --git a/lenskit/lenskit/data/movielens.py b/lenskit/lenskit/data/movielens.py index f62a0c065..e5f13861c 100644 --- a/lenskit/lenskit/data/movielens.py +++ b/lenskit/lenskit/data/movielens.py @@ -10,6 +10,8 @@ import logging import re +from dataclasses import dataclass +from enum import Enum from pathlib import Path from typing import TypeAlias from zipfile import ZipFile @@ -24,6 +26,37 @@ LOC: TypeAlias = Path | 
tuple[ZipFile, str] +class MLVersion(Enum): + ML_100K = "ml-100k" + ML_1M = "ml-1m" + ML_10M = "ml-10m" + ML_20M = "ml-20m" + ML_25M = "ml-25m" + ML_LATEST_SMALL = "ml-latest-small" + ML_LATEST = "ml-latest" + ML_MODERN = "ml-modern" + + +@dataclass +class MLData: + version: MLVersion + source: Path | ZipFile + prefix: str = "" + + def __enter__(self): + return self + + def __exit__(self, *args): + if isinstance(self.source, ZipFile): + self.source.close() + + def open_file(self, name: str): + if isinstance(self.source, Path): + return open(self.source / (self.prefix + name), "r") + else: + return self.source.open(self.prefix + name) + + def load_movielens(path: str | Path) -> Dataset: """ Load a MovieLens dataset. The appropriate MovieLens format is detected @@ -37,10 +70,42 @@ def load_movielens(path: str | Path) -> Dataset: Returns: The dataset. """ + df = load_movielens_df(path) + return from_interactions_df(df) + + +def load_movielens_df(path: str | Path) -> pd.DataFrame: + """ + Load the ratings from a MovieLens dataset as a raw data frame. The + appropriate MovieLens format is detected based on the file contents. + + Args: + path: + The path to the dataset, either as an unpacked directory or a zip + file. + + Returns: + The ratings, with columns ``user``, ``item``, ``rating``, and + ``timestamp``. 
+ """ + with _ml_detect_and_open(path) as ml: + match ml.version: + case MLVersion.ML_100K: + return _load_ml_100k(ml) + case MLVersion.ML_1M | MLVersion.ML_10M: + return _load_ml_million(ml) + case _: + return _load_ml_modern(ml) + + +def _ml_detect_and_open(path: str | Path) -> MLData: loc = Path(path) + ds: MLVersion + if loc.is_file() and loc.suffix == ".zip": _log.debug("opening zip file at %s", loc) - with ZipFile(loc, "r") as zf: + zf = ZipFile(loc, "r") + try: infos = zf.infolist() first = infos[0] if not first.is_dir: @@ -53,88 +118,76 @@ def load_movielens(path: str | Path) -> Dataset: _log.error("%s: invalid directory name %s", loc, first.filename) raise RuntimeError("invalid ML zip file") - ds = dsm.group(1).lower() + ds = MLVersion(dsm.group(1).lower()) _log.debug("%s: found ML data set %s", loc, ds) - return _load_for_type((zf, first.filename), ds) + return MLData(ds, zf, first.filename) + except Exception as e: # pragma nocover + zf.close() + raise e else: _log.debug("loading from directory %s", loc) dsm = re.match(r"^(ml-\d+[MmKk])", loc.name) if dsm: - ds = dsm.group(1) + ds = MLVersion(dsm.group(1)) _log.debug("%s: inferred data set %s from dir name", loc, ds) else: _log.debug("%s: checking contents for data type", loc) if (loc / "u.data").exists(): _log.debug("%s: found u.data, interpreting as 100K") - ds = "ml-100k" + ds = MLVersion.ML_100K elif (loc / "ratings.dat").exists(): if (loc / "tags.dat").exists(): _log.debug("%s: found ratings.dat and tags.dat, interpreting as 10M", loc) - ds = "ml-10m" + ds = MLVersion.ML_10M else: _log.debug("%s: found ratings.dat but no tags, interpreting as 1M", loc) - ds = "ml-1m" + ds = MLVersion.ML_1M elif (loc / "ratings.csv").exists(): _log.debug("%s: found ratings.csv, interpreting as modern (20M and later)", loc) - ds = "ml-modern" + ds = MLVersion.ML_MODERN else: _log.error("%s: could not detect MovieLens data", loc) raise RuntimeError("invalid ML directory") - return _load_for_type(loc, ds) - - -def 
_load_for_type(loc: LOC, ds: str) -> Dataset: - "Load the specified MovieLens data set" - match ds: - case "ml-100k": - return _load_ml_100k(loc) - case "ml-1m" | "ml-10m": - return _load_ml_million(loc) - case _: - return _load_ml_modern(loc) + return MLData(ds, loc) -def _load_ml_100k(loc: LOC) -> Dataset: - with _open_file(loc, "u.data") as data: - rates_df = pd.read_csv( +def _load_ml_100k(ml: MLData) -> pd.DataFrame: + with ml.open_file("u.data") as data: + return pd.read_csv( data, sep="\t", header=None, - names=["user_id", "item_id", "rating", "timestamp"], + names=["user", "item", "rating", "timestamp"], dtype={ - "user_id": np.int32, - "item_id": np.int32, + "user": np.int32, + "item": np.int32, "rating": np.float32, "timestamp": np.int32, }, ) - return from_interactions_df(rates_df) - -def _load_ml_million(loc: LOC) -> Dataset: - with _open_file(loc, "ratings.dat") as data: - rates_df = pd.read_csv( +def _load_ml_million(ml: MLData) -> pd.DataFrame: + with ml.open_file("ratings.dat") as data: + return pd.read_csv( data, sep=":", header=None, - names=["user_id", "_ui", "item_id", "_ir", "rating", "_rt", "timestamp"], + names=["user", "_ui", "item", "_ir", "rating", "_rt", "timestamp"], usecols=[0, 2, 4, 6], dtype={ - "user_id": np.int32, - "item_id": np.int32, + "user": np.int32, + "item": np.int32, "rating": np.float32, "timestamp": np.int32, }, ) - return from_interactions_df(rates_df) - -def _load_ml_modern(loc: LOC) -> Dataset: - with _open_file(loc, "ratings.csv") as data: - rates_df = pd.read_csv( +def _load_ml_modern(ml: MLData) -> pd.DataFrame: + with ml.open_file("ratings.csv") as data: + return pd.read_csv( data, dtype={ "userId": np.int32, @@ -142,14 +195,4 @@ def _load_ml_modern(loc: LOC) -> Dataset: "rating": np.float32, "timestamp": np.int64, }, - ) - - return from_interactions_df(rates_df, item_col="movieId") - - -def _open_file(loc: LOC, name: str): - if isinstance(loc, Path): - return open(loc / name, "r") - else: - zf, root = loc - return 
zf.open(root + name) + ).rename(columns={"userId": "user", "movieId": "item"}) diff --git a/lenskit/lenskit/datasets/__init__.py b/lenskit/lenskit/datasets/__init__.py deleted file mode 100644 index 8171dfcf8..000000000 --- a/lenskit/lenskit/datasets/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file is part of LensKit. -# Copyright (C) 2018-2023 Boise State University -# Copyright (C) 2023-2024 Drexel University -# Licensed under the MIT license, see LICENSE.md for details. -# SPDX-License-Identifier: MIT - -from .movielens import * # noqa: F403 diff --git a/lenskit/lenskit/datasets/movielens.py b/lenskit/lenskit/datasets/movielens.py deleted file mode 100644 index 4155cd5f2..000000000 --- a/lenskit/lenskit/datasets/movielens.py +++ /dev/null @@ -1,444 +0,0 @@ -# This file is part of LensKit. -# Copyright (C) 2018-2023 Boise State University -# Copyright (C) 2023-2024 Drexel University -# Licensed under the MIT license, see LICENSE.md for details. -# SPDX-License-Identifier: MIT - -""" -Code to import commonly-used RecSys data sets into LensKit-compatible data frames. -""" - -import logging -import os.path -from pathlib import Path - -import numpy as np -import pandas as pd - -from lenskit.util import cached - -_log = logging.getLogger(__name__) - -__doctest_skip__ = [] -if not os.path.exists("data/ml-100k"): - __doctest_skip__.append("ML100K.*") -if not os.path.exists("data/ml-20m"): - __doctest_skip__.append("MovieLens.tag_genome") -if not os.path.exists("data/ml-1m.*"): - __doctest_skip__.append("ML1M.*") -if not os.path.exists("data/ml-10M100K"): - __doctest_skip__.append("ML10M.*") - __doctest_skip__.append("MLM.*") - -__all__ = ["MovieLens", "ML100K", "ML1M", "ML10M"] - - -class MovieLens: - """ - Code for reading current MovieLens data sets, including ML-20M, ML-Latest, and - ML-Latest-Small. - - Parameters: - path(str or pathlib.Path): Path to the directory containing the data set. 
- """ - - def __init__(self, path="data/ml-20m"): - self.path = Path(path) - - @cached - def ratings(self): - """ - The rating table. - - >>> mlsmall = MovieLens('data/ml-latest-small') - >>> mlsmall.ratings - user item rating timestamp - 0 1 31 2.5 1260759144 - 1 1 1029 3.0 1260759179 - 2 1 1061 3.0 1260759182 - 3 1 1129 2.0 1260759185 - 4 1 1172 4.0 1260759205 - ... - [100004 rows x 4 columns] - """ - - fn = self.path / "ratings.csv" - ratings = pd.read_csv( - fn, - dtype={ - "movieId": np.int32, - "userId": np.int32, - "rating": np.float64, - "timestamp": np.int32, - }, - ) - ratings.rename(columns={"userId": "user", "movieId": "item"}, inplace=True) - _log.debug("loaded %s, takes %d bytes", fn, ratings.memory_usage().sum()) - return ratings - - @cached - def movies(self): - """ - The movie table, with titles and genres. It is indexed by movie ID. - - >>> mlsmall = MovieLens('data/ml-latest-small') - >>> mlsmall.movies - title genres - item - 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy - 2 Jumanji (1995) Adventure|Children|Fantasy - 3 Grumpier Old Men (1995) Comedy|Romance - 4 Waiting to Exhale (1995) Comedy|Drama|Romance - 5 Father of the Bride Part II (1995) Comedy - ... - [9125 rows x 2 columns] - """ # noqa: E501 - - fn = self.path / "movies.csv" - movies = pd.read_csv( - fn, - dtype={ - "movieId": np.int32, - "title": object, - "genres": object, - }, - ) - movies.rename(columns={"movieId": "item"}, inplace=True) - movies.set_index("item", inplace=True) - _log.debug("loaded %s, takes %d bytes", fn, movies.memory_usage().sum()) - return movies - - @cached - def links(self): - """ - The movie link table, connecting movie IDs to external identifiers. It is indexed - by movie ID. - - >>> mlsmall = MovieLens('data/ml-latest-small') - >>> mlsmall.links - imdbId tmdbId - item - 1 114709 862 - 2 113497 8844 - 3 113228 15602 - 4 114885 31357 - 5 113041 11862 - ... 
- [9125 rows x 2 columns] - """ - - fn = self.path / "links.csv" - links = pd.read_csv( - fn, dtype={"movieId": np.int32, "imdbId": np.int64, "tmdbId": pd.Int64Dtype()} - ) - links.rename(columns={"movieId": "item"}, inplace=True) - links.set_index("item", inplace=True) - _log.debug("loaded %s, takes %d bytes", fn, links.memory_usage().sum()) - return links - - @cached - def tags(self): - """ - The tag application table, recording user-supplied tags for movies. - - - >>> mlsmall = MovieLens('data/ml-latest-small') - >>> mlsmall.tags - user ... timestamp - 0 15 ... 1138537770 - 1 15 ... 1193435061 - 2 15 ... 1170560997 - 3 15 ... 1170626366 - 4 15 ... 1141391765 - ... - [1296 rows x 4 columns] - """ - - fn = self.path / "tags.csv" - tags = pd.read_csv( - fn, - dtype={ - "movieId": np.int32, - "userId": np.int32, - "tag": object, - "timestamp": np.int32, - }, - ) - tags.rename(columns={"userId": "user", "movieId": "item"}, inplace=True) - _log.debug("loaded %s, takes %d bytes", fn, tags.memory_usage().sum()) - return tags - - @cached - def tag_genome(self): - """ - The tag genome table, recording inferred item-tag relevance scores. This gets returned - as a wide Pandas data frame, with rows indexed by item ID. - - >>> ml20m = MovieLens('data/ml-20m') - >>> ml20m.tag_genome - tag 007 007 (series) 18th century ... wwii zombie zombies - item ... - 1 0.02500 0.02500 0.05775 ... 0.03625 0.07775 0.02300 - 2 0.03975 0.04375 0.03775 ... 0.01475 0.09025 0.01875 - 3 0.04350 0.05475 0.02800 ... 0.01950 0.09700 0.01850 - 4 0.03725 0.03950 0.03675 ... 0.01525 0.06450 0.01300 - 5 0.04200 0.05275 0.05925 ... 0.01675 0.10750 0.01825 - ... 
- [10381 rows x 1128 columns] - """ - - fn = self.path / "genome-scores.csv" - tags = pd.read_csv(self.path / "genome-tags.csv") - tags = tags.set_index("tagId") - tags = tags["tag"].astype("category") - genome = pd.read_csv( - fn, - dtype={ - "movieId": np.int32, - "tagId": np.int32, - "relevance": np.float64, - }, - ) - genome.rename(columns={"userId": "user", "movieId": "item"}, inplace=True) - genome = genome.join(tags, on="tagId") - genome = genome.pivot(index="item", columns="tag", values="relevance") - _log.debug("loaded %s, takes %d bytes", fn, genome.memory_usage().sum()) - return genome - - -class ML100K: - """ - The MovieLens 100K data set. This older data set is in a different format from - the more current data sets loaded by :class:`MovieLens`. - """ - - def __init__(self, path="data/ml-100k"): - self.path = Path(path) - - @property - def available(self): - "Query whether the data set exists." - return (self.path / "u.data").exists() - - @cached - def ratings(self): - """ - Return the rating data (from ``u.data``). - - >>> ml = ML100K() - >>> ml.ratings - user item rating timestamp - 0 196 242 3.0 881250949 - 1 186 302 3.0 891717742 - 2 22 377 1.0 878887116 - 3 244 51 2.0 880606923 - 4 166 346 1.0 886397596 - ... - [100000 rows x 4 columns] - """ - fn = self.path / "u.data" - ratings = pd.read_csv( - fn, - sep="\t", - header=None, - names=["user", "item", "rating", "timestamp"], - dtype={"user": np.int32, "item": np.int32, "rating": np.float32, "timestamp": np.int32}, - ) - _log.debug("loaded %s", fn) - return ratings - - @cached - def users(self): - """ - Return the user data (from ``u.user``). - - >>> ml = ML100K() - >>> ml.users - age gender occupation zip - user - 1 24 M technician 85711 - 2 53 F other 94043 - 3 23 M writer 32067 - 4 24 M technician 43537 - 5 33 F other 15213 - ... 
- [943 rows x 4 columns] - """ - fn = self.path / "u.user" - users = pd.read_csv( - fn, - sep="|", - header=None, - names=["user", "age", "gender", "occupation", "zip"], - dtype={"user": np.int32, "age": np.int8, "occupation": "category"}, - ) - _log.debug("loaded %s", fn) - return users.set_index("user") - - @cached - def movies(self): - """ - Return the user data (from ``u.user``). - - >>> ml = ML100K() - >>> ml.movies - title release ... War Western - item ... - 1 Toy Story (1995) 01-Jan-1995 ... 0 0 - 2 GoldenEye (1995) 01-Jan-1995 ... 0 0 - 3 Four Rooms (1995) 01-Jan-1995 ... 0 0 - 4 Get Shorty (1995) 01-Jan-1995 ... 0 0 - 5 Copycat (1995) 01-Jan-1995 ... 0 0 - ... - [1682 rows x 23 columns] - """ - fn = self.path / "u.item" - genres = [ - "unknown", - "Action", - "Adventure", - "Animation", - "Children's", - "Comedy", - "Crime", - "Documentary", - "Drama", - "Fantasy", - "Film-Noir", - "Horror", - "Musical", - "Mystery", - "Romance", - "Sci-Fi", - "Thriller", - "War", - "Western", - ] - items = pd.read_csv( - fn, - sep="|", - header=None, - encoding="latin1", - names=["item", "title", "release", "vidrelease", "imdb"] + genres, - ) - _log.debug("loaded %s", fn) - return items.set_index("item") - - -class MLM: - """ - Base classes for ML1M and ML10M. - """ - - def __init__(self, path): - self.path = Path(path) - - @cached - def ratings(self): - """ - Return the rating data (from ``ratings.dat``). - - >>> ml = ML10M() - >>> ml.ratings - user item rating timestamp - 0 1 122 5.0 838985046 - 1 1 185 5.0 838983525 - 2 1 231 5.0 838983392 - 3 1 292 5.0 838983421 - 4 1 316 5.0 838983392 - ... 
- [10000054 rows x 4 columns] - """ - fn = self.path / "ratings.dat" - ratings = pd.read_csv( - fn, - sep=":", - header=None, - names=["user", "_ui", "item", "_ir", "rating", "_rt", "timestamp"], - usecols=[0, 2, 4, 6], - dtype={"user": np.int32, "item": np.int32, "rating": np.float32, "timestamp": np.int32}, - ) - _log.debug("loaded %s", fn) - return ratings - - @cached - def movies(self): - """ - Return the movie data (from ``movies.dat``). Indexed by movie ID. - - >>> ml = ML10M() - >>> ml.movies - title genres - item - 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy - 2 Jumanji (1995) Adventure|Children|Fantasy - 3 Grumpier Old Men (1995) Comedy|Romance - 4 Waiting to Exhale (1995) Comedy|Drama|Romance - 5 Father of the Bride Part II (1995) Comedy - ... - [10681 rows x 2 columns] - """ # noqa: E501 - fn = self.path / "movies.dat" - movies = pd.read_csv( - fn, - sep=":", - header=None, - names=["item", "_ir", "title", "_tg", "genres"], - usecols=[0, 2, 4], - dtype={"item": np.int32}, - ) - movies.set_index("item", inplace=True) - _log.debug("loaded %s", fn) - return movies - - -class ML10M(MLM): - """ - MovieLens 10M100K data set. - """ - - def __init__(self, path="data/ml-10M100K"): - super().__init__(path) - - -class ML1M(MLM): - """ - MovieLens 1M data set. - - .. note:: - Some documentation examples use ML-10M100K; that is because this class shares implementation - with the 10M data set. - """ - - def __init__(self, path="data/ml-1m"): - super().__init__(path) - - @cached - def users(self): - """ - Return the movie data (from ``users.dat``). Indexed by user ID. - - >>> ml = ML1M() - >>> ml.users - gender age zip - user - 1 F 1 48067 - 2 M 56 70072 - 3 M 25 55117 - 4 M 45 02460 - 5 M 25 55455 - ... 
- [6040 rows x 3 columns] - """ - fn = self.path / "users.dat" - users = pd.read_csv( - fn, - sep=":", - header=None, - names=["user", "_ug", "gender", "_ga", "age", "_ao", "occupation", "_oz", "zip"], - usecols=[0, 2, 4, 8], - dtype={"user": np.int32, "gender": "category", "age": np.int8, "timestamp": np.int32}, - ) - users.set_index("user", inplace=True) - _log.debug("loaded %s", fn) - return users diff --git a/lenskit/lenskit/util/test.py b/lenskit/lenskit/util/test.py index 3dbe6986b..d153302ce 100644 --- a/lenskit/lenskit/util/test.py +++ b/lenskit/lenskit/util/test.py @@ -26,34 +26,63 @@ from lenskit.algorithms.ranking import PlackettLuce from lenskit.batch import recommend from lenskit.crossfold import simple_test_pair -from lenskit.data.dataset import from_interactions_df +from lenskit.data.dataset import Dataset, LazyDataset, from_interactions_df from lenskit.data.matrix import torch_sparse_from_scipy -from lenskit.datasets import ML100K, MovieLens +from lenskit.data.movielens import load_movielens, load_movielens_df -ml_test = MovieLens("data/ml-latest-small") -ml100k = ML100K("data/ml-100k") +ml_test_dir = here("data/ml-latest-small") +ml_100k_zip = here("data/ml-100k.zip") +ml_test: Dataset = LazyDataset(lambda: load_movielens(ml_test_dir)) -@pytest.fixture(scope="module") + +@pytest.fixture(scope="session") def ml_ratings(): """ - Fixture to load the test MovieLens ratings. + Fixture to load the test MovieLens ratings as a data frame. To use this, + just include it as a parameter in your test:: + + def test_thing_with_data(ml_ratings: pd.DataFrame): + ... + + .. note:: + This is imported in ``conftest.py`` so it is always available in LensKit tests. 
""" - path = here("data/ml-latest-small") - yield pd.read_csv(path / "ratings.csv") + yield load_movielens_df(ml_test_dir) -@pytest.fixture +@pytest.fixture(scope="module") def ml_ds(ml_ratings: pd.DataFrame): - return from_interactions_df(ml_ratings, item_col="movieId") + """ + Fixture to load the MovieLens test dataset. To use this, just include it as + a parameter in your test:: + + def test_thing_with_data(ml_ds: Dataset): + ... + + .. note:: + This is imported in ``conftest.py`` so it is always available in LensKit tests. + """ + yield from_interactions_df(ml_ratings) + + +@pytest.fixture +def ml_100k(): + """ + Fixture to load the MovieLens 100K dataset (currently as a data frame). It skips + the test if the ML100K data is not available. + """ + if not ml_100k_zip.exists(): + pytest.skip("ML100K data not available") + yield load_movielens_df(ml_100k_zip) @pytest.fixture(scope="session") -def demo_recs(): +def demo_recs(ml_ratings: pd.DataFrame): """ A demo set of train, test, and recommendation data. 
""" - train, test = simple_test_pair(ml_test.ratings, f_rates=0.5) + train, test = simple_test_pair(ml_ratings, f_rates=0.5) users = test["user"].unique() algo = PopScore() diff --git a/lenskit/tests/test_als_explicit.py b/lenskit/tests/test_als_explicit.py index e393d3f55..0ef45a28e 100644 --- a/lenskit/tests/test_als_explicit.py +++ b/lenskit/tests/test_als_explicit.py @@ -13,7 +13,8 @@ from pytest import approx, mark -from lenskit.data.dataset import from_interactions_df +from lenskit.data.dataset import Dataset, from_interactions_df +from lenskit.data.movielens import load_movielens_df import lenskit.util.test as lktu from lenskit import batch from lenskit.algorithms import als @@ -116,14 +117,14 @@ def test_als_predict_for_new_users_with_new_ratings(): n_users = 3 n_items = 2 new_u_id = -1 - ratings = lktu.ml_test.ratings np.random.seed(45) - users = np.random.choice(ratings.user.unique(), n_users) - items = np.random.choice(ratings.item.unique(), n_items) + users = np.random.choice(lktu.ml_test.users.ids(), n_users) + items = np.random.choice(lktu.ml_test.items.ids(), n_items) + ratings = lktu.ml_test.interaction_log("pandas", original_ids=True) algo = als.BiasedMF(20, epochs=10) - algo.fit(from_interactions_df(ratings)) + algo.fit(lktu.ml_test) _log.debug("Items: " + str(items)) assert algo.bias is not None assert algo.users_ is not None @@ -133,12 +134,12 @@ def test_als_predict_for_new_users_with_new_ratings(): _log.debug(f"user: {u}") preds = algo.predict_for_user(u, items) - user_data = ratings[ratings.user == u] + user_data = ratings[ratings.user_id == u] _log.debug("user_features from fit: " + str(algo.user_features_[algo.users_.number(u), :])) new_ratings = pd.Series( - user_data.rating.to_numpy(), index=user_data.item + user_data.rating.to_numpy(), index=user_data.item_id ) # items as index and ratings as values new_preds = algo.predict_for_user(new_u_id, items, new_ratings) @@ -178,33 +179,33 @@ def test_als_predict_no_user_features_basic(): n_users 
= 1 n_items = 2 new_u_id = -1 - ratings = lktu.ml_test.ratings np.random.seed(45) - u = np.random.choice(ratings.user.unique(), n_users)[0] - items = np.random.choice(ratings.item.unique(), n_items) + u = np.random.choice(lktu.ml_test.users.ids(), n_users)[0] + items = np.random.choice(lktu.ml_test.items.ids(), n_items) algo = als.BiasedMF(5, epochs=10) - algo.fit(from_interactions_df(ratings)) + algo.fit(lktu.ml_test) _log.debug("Items: " + str(items)) assert algo.bias is not None assert algo.users_ is not None assert algo.user_features_ is not None algo_no_user_features = als.BiasedMF(5, epochs=10, save_user_features=False) - algo_no_user_features.fit(from_interactions_df(ratings)) + algo_no_user_features.fit(lktu.ml_test) assert algo_no_user_features.user_features_ is None _log.debug(f"user: {u}") preds = algo.predict_for_user(u, items) - user_data = ratings[ratings.user == u] + ratings = lktu.ml_test.interaction_log("pandas", original_ids=True) + user_data = ratings[ratings.user_id == u] _log.debug("user_features from fit: " + str(algo.user_features_[algo.users_.number(u), :])) new_ratings = pd.Series( - user_data.rating.to_numpy(), index=user_data.item + user_data.rating.to_numpy(), index=user_data.item_id ) # items as index and ratings as values new_preds = algo_no_user_features.predict_for_user(new_u_id, items, new_ratings) @@ -216,23 +217,22 @@ def test_als_predict_no_user_features_basic(): @lktu.wantjit @mark.slow -def test_als_train_large(): +def test_als_train_large(ml_ratings): algo = als.BiasedMF(20, epochs=10) - ratings = lktu.ml_test.ratings - algo.fit(from_interactions_df(ratings)) + algo.fit(lktu.ml_test) assert algo.bias is not None assert algo.users_ is not None assert algo.user_features_ is not None - assert algo.bias.mean_ == approx(ratings.rating.mean()) + assert algo.bias.mean_ == approx(ml_ratings.rating.mean()) assert algo.n_features == 20 - assert algo.n_items == ratings.item.nunique() - assert algo.n_users == ratings.user.nunique() + 
assert algo.n_items == ml_ratings.item.nunique() + assert algo.n_users == ml_ratings.user.nunique() - icounts = ratings.groupby("item").rating.count() - isums = ratings.groupby("item").rating.sum() - is2 = isums - icounts * ratings.rating.mean() + icounts = ml_ratings.groupby("item").rating.count() + isums = ml_ratings.groupby("item").rating.sum() + is2 = isums - icounts * ml_ratings.rating.mean() imeans = is2 / (icounts + 5) ibias = pd.Series(algo.bias.item_offsets_, index=algo.items_.index) imeans, ibias = imeans.align(ibias) @@ -240,13 +240,12 @@ def test_als_train_large(): # don't use wantjit, use this to do a non-JIT test -def test_als_save_load(): +def test_als_save_load(ml_ratings: pd.DataFrame): original = als.BiasedMF(5, epochs=5) - ratings = lktu.ml_test.ratings - original.fit(from_interactions_df(ratings)) + original.fit(lktu.ml_test) assert original.bias is not None - assert original.bias.mean_ == approx(ratings.rating.mean()) + assert original.bias.mean_ == approx(ml_ratings.rating.mean()) assert original.users_ is not None mod = pickle.dumps(original) @@ -268,12 +267,11 @@ def test_als_save_load(): @mark.slow @mark.eval -@mark.skipif(not lktu.ml100k.available, reason="ML100K data not present") -def test_als_batch_accuracy(): +def test_als_batch_accuracy(ml_100k): import lenskit.crossfold as xf import lenskit.metrics.predict as pm - ratings = lktu.ml100k.ratings + ratings = load_movielens_df(lktu.ml_100k_zip) algo = als.BiasedMF(25, epochs=20, damping=5) diff --git a/lenskit/tests/test_als_implicit.py b/lenskit/tests/test_als_implicit.py index 0b4c130d0..37453f9a6 100644 --- a/lenskit/tests/test_als_implicit.py +++ b/lenskit/tests/test_als_implicit.py @@ -11,9 +11,10 @@ import pandas as pd import torch -from pytest import mark +from pytest import approx, mark -from lenskit.data.dataset import from_interactions_df +from lenskit.data.dataset import Dataset, from_interactions_df +from lenskit.data.movielens import load_movielens_df import 
lenskit.util.test as lktu from lenskit.algorithms import Recommender, als @@ -86,7 +87,7 @@ def test_als_predict_basic_for_new_user_with_new_ratings(): assert abs(preds.loc[i] - new_preds.loc[i]) <= 0.1 -def test_als_predict_for_new_users_with_new_ratings(): +def test_als_predict_for_new_users_with_new_ratings(ml_ratings: pd.DataFrame, ml_ds: Dataset): """ Test if ImplicitMF predictions using the same ratings for a new user is the same as a user in ml-latest-small dataset. @@ -95,14 +96,13 @@ def test_als_predict_for_new_users_with_new_ratings(): n_users = 3 n_items = 2 new_u_id = -1 - ratings = lktu.ml_test.ratings np.random.seed(45) - users = np.random.choice(ratings.user.unique(), n_users) - items = np.random.choice(ratings.item.unique(), n_items) + users = np.random.choice(ml_ds.users, n_users) + items = np.random.choice(ml_ds.items, n_items) algo = als.ImplicitMF(20, epochs=10, use_ratings=False) - algo.fit(from_interactions_df(ratings)) + algo.fit(ml_ds) assert algo.users_ is not None assert algo.user_features_ is not None @@ -114,7 +114,7 @@ def test_als_predict_for_new_users_with_new_ratings(): upos = algo.users_.number(u) # get the user's rating series - user_data = ratings[ratings.user == u] + user_data = ml_ratings[ml_ratings.user == u] new_ratings = user_data.set_index("item")["rating"].copy() nr_info = new_ratings.to_frame() @@ -142,7 +142,9 @@ def test_als_predict_for_new_users_with_new_ratings(): assert all(diffs <= 0.1) -def test_als_recs_topn_for_new_users_with_new_ratings(rng): +def test_als_recs_topn_for_new_users_with_new_ratings( + rng, ml_ratings: pd.DataFrame, ml_ds: Dataset +): """ Test if ImplicitMF topn recommendations using the same ratings for a new user is the same as a user in ml-latest-small dataset. 
@@ -154,13 +156,12 @@ def test_als_recs_topn_for_new_users_with_new_ratings(rng): n_users = 10 new_u_id = -1 - ratings = lktu.ml_test.ratings - users = rng.choice(np.unique(ratings.user), n_users) + users = rng.choice(ml_ds.users, n_users) algo = als.ImplicitMF(20, epochs=10, use_ratings=True) rec_algo = basic.TopN(algo) - rec_algo.fit(from_interactions_df(ratings)) + rec_algo.fit(ml_ds) assert algo.users_ is not None assert algo.user_features_ is not None # _log.debug("Items: " + str(items)) @@ -168,7 +169,7 @@ def test_als_recs_topn_for_new_users_with_new_ratings(rng): correlations = pd.Series(np.nan, index=users) for u in users: recs = rec_algo.recommend(u, 10) - user_data = ratings[ratings.user == u] + user_data = ml_ratings[ml_ratings.user == u] upos = algo.users_.number(u) _log.info("user %s: %s ratings", u, len(user_data)) @@ -215,47 +216,45 @@ def test_als_predict_bad_user(): assert np.isnan(preds.loc[3]) -def test_als_predict_no_user_features_basic(): - ratings = lktu.ml_test.ratings +def test_als_predict_no_user_features_basic(ml_ratings: pd.DataFrame, ml_ds: Dataset): np.random.seed(45) - u = np.random.choice(ratings.user.unique(), 1)[0] - items = np.random.choice(ratings.item.unique(), 2) + u = np.random.choice(ml_ds.users, 1)[0] + items = np.random.choice(ml_ds.items, 2) - algo = als.ImplicitMF(5, epochs=10, use_ratings=True) - algo.fit(from_interactions_df(ratings)) + algo = als.ImplicitMF(5, epochs=10) + algo.fit(ml_ds) preds = algo.predict_for_user(u, items) - user_data = ratings[ratings.user == u] + user_data = ml_ratings[ml_ratings.user == u] new_ratings = user_data.set_index("item")["rating"].copy() algo_no_user_features = als.ImplicitMF(5, epochs=10, save_user_features=False) - algo_no_user_features.fit(from_interactions_df(ratings)) + algo_no_user_features.fit(ml_ds) preds_no_user_features = algo_no_user_features.predict_for_user(u, items, new_ratings) assert algo_no_user_features.user_features_ is None + assert preds_no_user_features.values == 
approx(preds, abs=0.1) diffs = np.abs(preds - preds_no_user_features) assert all(diffs <= 0.1) @lktu.wantjit -def test_als_train_large(): +def test_als_train_large(ml_ds: Dataset): algo = als.ImplicitMF(20, epochs=20, use_ratings=False) - ratings = lktu.ml_test.ratings - algo.fit(from_interactions_df(ratings)) + algo.fit(ml_ds) assert algo.users_ is not None assert algo.user_features_ is not None - assert len(algo.users_.index) == ratings.user.nunique() - assert len(algo.items_.index) == ratings.item.nunique() - assert algo.user_features_.shape == (ratings.user.nunique(), 20) - assert algo.item_features_.shape == (ratings.item.nunique(), 20) + assert len(algo.users_.index) == ml_ds.user_count + assert len(algo.items_.index) == ml_ds.item_count + assert algo.user_features_.shape == (ml_ds.user_count, 20) + assert algo.item_features_.shape == (ml_ds.item_count, 20) -def test_als_save_load(tmp_path): +def test_als_save_load(tmp_path, ml_ds: Dataset): "Test saving and loading ALS models, and regularized training." 
algo = als.ImplicitMF(5, epochs=5, reg=(2, 1), use_ratings=False) - ratings = lktu.ml_test.ratings - algo.fit(from_interactions_df(ratings)) + algo.fit(ml_ds) assert algo.users_ is not None fn = tmp_path / "model.bpk" @@ -272,42 +271,38 @@ def test_als_save_load(tmp_path): @lktu.wantjit -def test_als_train_large_noratings(): +def test_als_train_large_noratings(ml_ds: Dataset): algo = als.ImplicitMF(20, epochs=20) - ratings = lktu.ml_test.ratings - ratings = ratings.loc[:, ["user", "item"]] - algo.fit(from_interactions_df(ratings)) + algo.fit(ml_ds) assert algo.users_ is not None assert algo.user_features_ is not None - assert len(algo.users_.index) == ratings.user.nunique() - assert len(algo.items_.index) == ratings.item.nunique() - assert algo.user_features_.shape == (ratings.user.nunique(), 20) - assert algo.item_features_.shape == (ratings.item.nunique(), 20) + assert len(algo.users_.index) == ml_ds.user_count + assert len(algo.items_.index) == ml_ds.item_count + assert algo.user_features_.shape == (ml_ds.user_count, 20) + assert algo.item_features_.shape == (ml_ds.item_count, 20) @lktu.wantjit -def test_als_train_large_ratings(): +def test_als_train_large_ratings(ml_ds): algo = als.ImplicitMF(20, epochs=20, use_ratings=True) - ratings = lktu.ml_test.ratings - algo.fit(from_interactions_df(ratings)) + algo.fit(ml_ds) assert algo.users_ is not None assert algo.user_features_ is not None - assert len(algo.users_.index) == ratings.user.nunique() - assert len(algo.items_.index) == ratings.item.nunique() - assert algo.user_features_.shape == (ratings.user.nunique(), 20) - assert algo.item_features_.shape == (ratings.item.nunique(), 20) + assert len(algo.users_.index) == ml_ds.user_count + assert len(algo.items_.index) == ml_ds.item_count + assert algo.user_features_.shape == (ml_ds.user_count, 20) + assert algo.item_features_.shape == (ml_ds.item_count, 20) @mark.slow @mark.eval -@mark.skipif(not lktu.ml100k.available, reason="ML100K data not present") -def 
test_als_implicit_batch_accuracy(): +def test_als_implicit_batch_accuracy(ml_100k): import lenskit.crossfold as xf from lenskit import batch, topn - ratings = lktu.ml100k.ratings + ratings = load_movielens_df(lktu.ml_100k_zip) def eval(train, test): train = train.astype({"rating": np.float_}) diff --git a/lenskit/tests/test_batch_predict.py b/lenskit/tests/test_batch_predict.py index 9619925b3..ddd7dec12 100644 --- a/lenskit/tests/test_batch_predict.py +++ b/lenskit/tests/test_batch_predict.py @@ -122,17 +122,14 @@ def test_predict_include_rating(mlb: MLB): assert all(preds.rating.values == urv.loc[preds.index, :].rating.values) -@pytest.mark.skipif(not lktu.ml100k.available, reason="ML-100K required") @pytest.mark.eval @pytest.mark.parametrize("ncpus", [None, 1, 2]) -def test_bias_batch_predict(ncpus): +def test_bias_batch_predict(ml_100k, ncpus): import lenskit.crossfold as xf import lenskit.metrics.predict as pm from lenskit import batch from lenskit.algorithms import bias - ratings = lktu.ml100k.ratings - algo = bias.Bias(damping=5) def eval(train, test): @@ -143,7 +140,7 @@ def eval(train, test): return recs preds = pd.concat( - (eval(train, test) for (train, test) in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))) + (eval(train, test) for (train, test) in xf.partition_users(ml_100k, 5, xf.SampleFrac(0.2))) ) _log.info("analyzing predictions") diff --git a/lenskit/tests/test_batch_recommend.py b/lenskit/tests/test_batch_recommend.py index 6fd0e1928..b3a693545 100644 --- a/lenskit/tests/test_batch_recommend.py +++ b/lenskit/tests/test_batch_recommend.py @@ -19,7 +19,6 @@ from lenskit.algorithms import Recommender from lenskit.algorithms.basic import PopScore, TopN from lenskit.algorithms.bias import Bias -from lenskit.util.test import ml_ratings, ml_ds # noqa: F401 _log = logging.getLogger(__name__) @@ -73,11 +72,8 @@ def check_positive_ndcg(self, recs): @pytest.fixture -def ml_folds() -> MLFolds: - if not lktu.ml100k.available: - pytest.skip("ML-100K not 
available") - ratings = lktu.ml100k.ratings - return MLFolds(ratings) +def ml_folds(ml_100k) -> MLFolds: + return MLFolds(ml_100k) def test_recommend_single(mlb: MLB): diff --git a/lenskit/tests/test_bias.py b/lenskit/tests/test_bias.py index ef2424f62..7cabc9181 100644 --- a/lenskit/tests/test_bias.py +++ b/lenskit/tests/test_bias.py @@ -15,8 +15,7 @@ from lenskit import util as lku from lenskit.algorithms.bias import Bias -from lenskit.data.dataset import from_interactions_df -from lenskit.util.test import ml_test, ml_ds, ml_ratings # noqa: F401 +from lenskit.data.dataset import Dataset, from_interactions_df _log = logging.getLogger(__name__) @@ -149,6 +148,8 @@ def test_bias_global_predict(): def test_bias_item_predict(): algo = Bias(users=False) algo.fit(simple_ds) + assert algo.item_offsets_ is not None + p = algo.predict_for_user(10, [1, 2, 3]) assert len(p) == 3 @@ -172,6 +173,7 @@ def test_bias_user_predict(): def test_bias_new_user_predict(): algo = Bias() algo.fit(simple_ds) + assert algo.item_offsets_ is not None ratings = pd.DataFrame({"item": [1, 2, 3], "rating": [1.5, 2.5, 3.5]}) ratings = ratings.set_index("item").rating @@ -188,6 +190,7 @@ def test_bias_new_user_predict(): def test_bias_predict_unknown_item(): algo = Bias() algo.fit(simple_ds) + assert algo.item_offsets_ is not None p = algo.predict_for_user(10, [1, 3, 4]) @@ -200,6 +203,7 @@ def test_bias_predict_unknown_item(): def test_bias_predict_unknown_user(): algo = Bias() algo.fit(simple_ds) + assert algo.item_offsets_ is not None p = algo.predict_for_user(15, [1, 3]) @@ -207,47 +211,51 @@ def test_bias_predict_unknown_user(): assert p.values == approx((algo.item_offsets_.loc[[1, 3]] + algo.mean_).values) -def test_bias_train_ml_ratings(): +def test_bias_train_ml_ratings(ml_ratings: pd.DataFrame, ml_ds: Dataset): algo = Bias() - ratings = ml_test.ratings - algo.fit(from_interactions_df(ratings)) + algo.fit(ml_ds) + assert algo.item_offsets_ is not None - assert algo.mean_ == 
approx(ratings.rating.mean()) - imeans_data = ratings.groupby("item").rating.mean() + assert algo.mean_ == approx(ml_ratings.rating.mean()) + imeans_data = ml_ds.item_stats()["mean_rating"] imeans_algo = algo.item_offsets_ + algo.mean_ ares, data = imeans_algo.align(imeans_data) assert ares.values == approx(data.values) - urates = ratings.set_index("user").loc[2].set_index("item").rating + urates = ml_ratings.set_index("user").loc[2].set_index("item").rating umean = (urates - imeans_data[urates.index]).mean() p = algo.predict_for_user(2, [10, 11, -1]) assert len(p) == 3 assert p.iloc[0] == approx(imeans_data.loc[10] + umean) assert p.iloc[1] == approx(imeans_data.loc[11] + umean) - assert p.iloc[2] == approx(ratings.rating.mean() + umean) + assert p.iloc[2] == approx(ml_ratings.rating.mean() + umean) -def test_bias_transform(): +def test_bias_transform(ml_ds: Dataset): algo = Bias() - ratings = ml_test.ratings - normed = algo.fit_transform(from_interactions_df(ratings)) + normed = algo.fit_transform(ml_ds) - assert all(normed["user"] == ratings["user"]) - assert all(normed["item"] == ratings["item"]) + ratings = ml_ds.interaction_log("pandas", original_ids=True) + assert all(normed["user"] == ratings["user_id"]) + assert all(normed["item"] == ratings["item_id"]) denorm = algo.inverse_transform(normed) assert denorm["rating"].values == approx(ratings["rating"], 1.0e-6) - n2 = ratings.join(algo.item_offsets_, on="item") - n2 = n2.join(algo.user_offsets_, on="user") + assert algo.item_offsets_ is not None + assert algo.user_offsets_ is not None + n2 = ratings.join(algo.item_offsets_, on="item_id") + n2 = n2.join(algo.user_offsets_, on="user_id") nr = n2.rating - algo.mean_ - n2.i_off - n2.u_off assert normed["rating"].values == approx(nr.values) -def test_bias_transform_tensor(ml_ratings, ml_ds): +def test_bias_transform_tensor(ml_ds): algo = Bias() algo.fit(ml_ds) + assert algo.item_offsets_ is not None + assert algo.user_offsets_ is not None mat = 
ml_ds.interaction_matrix("torch", layout="coo") normed = algo.transform(mat) @@ -262,40 +270,49 @@ def test_bias_transform_tensor(ml_ratings, ml_ds): assert recon.values().numpy() == approx(mat.values().numpy()) -def test_bias_transform_indexes(): +def test_bias_transform_indexes(ml_ds: Dataset): algo = Bias() - ratings = ml_test.ratings - normed = algo.fit_transform(from_interactions_df(ratings), indexes=True) + normed = algo.fit_transform(ml_ds, indexes=True) + assert algo.item_offsets_ is not None + assert algo.user_offsets_ is not None + + ratings = ml_ds.interaction_log("pandas", original_ids=True) - assert all(normed["user"] == ratings["user"]) - assert all(normed["item"] == ratings["item"]) - assert all(normed["uidx"] == algo.user_offsets_.index.get_indexer(ratings["user"])) - assert all(normed["iidx"] == algo.item_offsets_.index.get_indexer(ratings["item"])) + assert all(normed["user"] == ratings["user_id"]) + assert all(normed["item"] == ratings["item_id"]) + assert all(normed["uidx"] == ml_ds.users.numbers(ratings["user_id"])) + assert all(normed["iidx"] == ml_ds.items.numbers(ratings["item_id"])) denorm = algo.inverse_transform(normed) assert denorm["rating"].values == approx(ratings["rating"].values, 1.0e-6) @mark.parametrize(["users", "items"], [(True, False), (False, True), (False, False)]) -def test_bias_transform_disable(users, items): +def test_bias_transform_disable(ml_ds: Dataset, users: bool, items: bool): algo = Bias(users=users, items=items) - ratings = ml_test.ratings - normed = algo.fit_transform(from_interactions_df(ratings)) + normed = algo.fit_transform(ml_ds) - assert all(normed["user"] == ratings["user"]) - assert all(normed["item"] == ratings["item"]) + ratings = ml_ds.interaction_log("pandas", original_ids=True) + assert all(normed["user"] == ratings["user_id"]) + assert all(normed["item"] == ratings["item_id"]) denorm = algo.inverse_transform(normed) assert denorm["rating"].values == approx(ratings["rating"], 1.0e-6) n2 = ratings nr 
= n2.rating - algo.mean_ if items: - n2 = n2.join(algo.item_offsets_, on="item") + assert algo.item_offsets_ is not None + n2 = n2.join(algo.item_offsets_, on="item_id") nr = nr - n2.i_off + else: + assert algo.item_offsets_ is None if users: - n2 = n2.join(algo.user_offsets_, on="user") + assert algo.user_offsets_ is not None + n2 = n2.join(algo.user_offsets_, on="user_id") nr = nr - n2.u_off + else: + assert algo.user_offsets_ is None assert normed["rating"].values == approx(nr.values) @@ -381,6 +398,8 @@ def test_transform_user_without_user_bias(): user = 12 algo = Bias() algo.fit(simple_ds) + assert algo.item_offsets_ is not None + assert algo.user_offsets_ is not None new_ratings = pd.Series([-0.5, 1.5], index=[2, 3]) # items as index and ratings as values diff --git a/lenskit/tests/test_candidate_selector.py b/lenskit/tests/test_candidate_selector.py index 52f47e891..f158e77e8 100644 --- a/lenskit/tests/test_candidate_selector.py +++ b/lenskit/tests/test_candidate_selector.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from lenskit.data.dataset import from_interactions_df +from lenskit.data.dataset import Dataset, from_interactions_df import lenskit.util.test as lktu from lenskit.algorithms import basic @@ -53,14 +53,13 @@ def test_unrated_override(): assert set(sel.candidates(10, [2])) == set([1, 3]) -def test_unrated_big(): - ratings = lktu.ml_test.ratings - users = ratings.user.unique() - items = ratings.item.unique() - user_items = ratings.set_index("user").item +def test_unrated_big(ml_ds: Dataset): + users = ml_ds.users.ids() + items = ml_ds.items.ids() + user_items = ml_ds.interaction_matrix("pandas", original_ids=True).set_index("user_id").item_id sel = basic.UnratedItemCandidateSelector() - s2 = sel.fit(from_interactions_df(ratings)) + s2 = sel.fit(ml_ds) assert s2 is sel # test 100 random users diff --git a/lenskit/tests/test_crossfold.py b/lenskit/tests/test_crossfold.py index 75d555c80..ff512dd66 100644 --- 
a/lenskit/tests/test_crossfold.py +++ b/lenskit/tests/test_crossfold.py @@ -10,21 +10,21 @@ import numpy as np +import pandas as pd import pytest import lenskit.crossfold as xf import lenskit.util.test as lktu -def test_partition_rows(): - ratings = lktu.ml_test.ratings - splits = xf.partition_rows(ratings, 5) +def test_partition_rows(ml_ratings: pd.DataFrame): + splits = xf.partition_rows(ml_ratings, 5) splits = list(splits) assert len(splits) == 5 for s in splits: - assert len(s.test) + len(s.train) == len(ratings) - assert all(s.test.index.union(s.train.index) == ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) test_idx = s.test.set_index(["user", "item"]).index train_idx = s.train.set_index(["user", "item"]).index assert len(test_idx.intersection(train_idx)) == 0 @@ -40,18 +40,17 @@ def test_partition_rows(): assert len(inter) == 0 union = ft.reduce(lambda i1, i2: i1.union(i2), (s.test.index for s in splits)) - assert len(union.unique()) == len(ratings) + assert len(union.unique()) == len(ml_ratings) -def test_sample_rows(): - ratings = lktu.ml_test.ratings - splits = xf.sample_rows(ratings, partitions=5, size=1000) +def test_sample_rows(ml_ratings: pd.DataFrame): + splits = xf.sample_rows(ml_ratings, partitions=5, size=1000) splits = list(splits) assert len(splits) == 5 for s in splits: assert len(s.test) == 1000 - assert len(s.test) + len(s.train) == len(ratings) + assert len(s.test) + len(s.train) == len(ml_ratings) test_idx = s.test.set_index(["user", "item"]).index train_idx = s.train.set_index(["user", "item"]).index assert len(test_idx.intersection(train_idx)) == 0 @@ -66,15 +65,14 @@ def test_sample_rows(): assert len(inter) == 0 -def test_sample_rows_more_smaller_parts(): - ratings = lktu.ml_test.ratings - splits = xf.sample_rows(ratings, partitions=10, size=500) +def test_sample_rows_more_smaller_parts(ml_ratings: pd.DataFrame): + splits = 
xf.sample_rows(ml_ratings, partitions=10, size=500) splits = list(splits) assert len(splits) == 10 for s in splits: assert len(s.test) == 500 - assert len(s.test) + len(s.train) == len(ratings) + assert len(s.test) + len(s.train) == len(ml_ratings) test_idx = s.test.set_index(["user", "item"]).index train_idx = s.train.set_index(["user", "item"]).index assert len(test_idx.intersection(train_idx)) == 0 @@ -89,15 +87,14 @@ def test_sample_rows_more_smaller_parts(): assert len(inter) == 0 -def test_sample_non_disjoint(): - ratings = lktu.ml_test.ratings - splits = xf.sample_rows(ratings, partitions=10, size=1000, disjoint=False) +def test_sample_non_disjoint(ml_ratings: pd.DataFrame): + splits = xf.sample_rows(ml_ratings, partitions=10, size=1000, disjoint=False) splits = list(splits) assert len(splits) == 10 for s in splits: assert len(s.test) == 1000 - assert len(s.test) + len(s.train) == len(ratings) + assert len(s.test) + len(s.train) == len(ml_ratings) test_idx = s.test.set_index(["user", "item"]).index train_idx = s.train.set_index(["user", "item"]).index assert len(test_idx.intersection(train_idx)) == 0 @@ -112,28 +109,25 @@ def test_sample_non_disjoint(): @pytest.mark.slow -def test_sample_oversize(): - ratings = lktu.ml_test.ratings - splits = xf.sample_rows(ratings, 50, 10000) +def test_sample_oversize(ml_ratings: pd.DataFrame): + splits = xf.sample_rows(ml_ratings, 50, 10000) splits = list(splits) assert len(splits) == 50 for s in splits: - assert len(s.test) + len(s.train) == len(ratings) - assert all(s.test.index.union(s.train.index) == ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) test_idx = s.test.set_index(["user", "item"]).index train_idx = s.train.set_index(["user", "item"]).index assert len(test_idx.intersection(train_idx)) == 0 -def test_sample_n(): - ratings = lktu.ml_test.ratings - - users = np.random.choice(ratings.user.unique(), 5, replace=False) +def 
test_sample_n(ml_ratings: pd.DataFrame): + users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) s5 = xf.SampleN(5) for u in users: - udf = ratings[ratings.user == u] + udf = ml_ratings[ml_ratings.user == u] tst = s5(udf) trn = udf.loc[udf.index.difference(tst.index), :] assert len(tst) == 5 @@ -141,20 +135,19 @@ def test_sample_n(): s10 = xf.SampleN(10) for u in users: - udf = ratings[ratings.user == u] + udf = ml_ratings[ml_ratings.user == u] tst = s10(udf) trn = udf.loc[udf.index.difference(tst.index), :] assert len(tst) == 10 assert len(tst) + len(trn) == len(udf) -def test_sample_frac(): - ratings = lktu.ml_test.ratings - users = np.random.choice(ratings.user.unique(), 5, replace=False) +def test_sample_frac(ml_ratings: pd.DataFrame): + users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) samp = xf.SampleFrac(0.2) for u in users: - udf = ratings[ratings.user == u] + udf = ml_ratings[ml_ratings.user == u] tst = samp(udf) trn = udf.loc[udf.index.difference(tst.index), :] assert len(tst) + len(trn) == len(udf) @@ -163,7 +156,7 @@ def test_sample_frac(): samp = xf.SampleFrac(0.5) for u in users: - udf = ratings[ratings.user == u] + udf = ml_ratings[ml_ratings.user == u] tst = samp(udf) trn = udf.loc[udf.index.difference(tst.index), :] assert len(tst) + len(trn) == len(udf) @@ -171,13 +164,12 @@ def test_sample_frac(): assert len(tst) <= math.ceil(len(udf) * 0.5) -def test_last_n(): - ratings = lktu.ml_test.ratings - users = np.random.choice(ratings.user.unique(), 5, replace=False) +def test_last_n(ml_ratings: pd.DataFrame): + users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) samp = xf.LastN(5) for u in users: - udf = ratings[ratings.user == u] + udf = ml_ratings[ml_ratings.user == u] tst = samp(udf) trn = udf.loc[udf.index.difference(tst.index), :] assert len(tst) == 5 @@ -186,7 +178,7 @@ def test_last_n(): samp = xf.LastN(7) for u in users: - udf = ratings[ratings.user == u] + udf = ml_ratings[ml_ratings.user == 
u] tst = samp(udf) trn = udf.loc[udf.index.difference(tst.index), :] assert len(tst) == 7 @@ -194,13 +186,12 @@ def test_last_n(): assert tst.timestamp.min() >= trn.timestamp.max() -def test_last_frac(): - ratings = lktu.ml_test.ratings - users = np.random.choice(ratings.user.unique(), 5, replace=False) +def test_last_frac(ml_ratings: pd.DataFrame): + users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) samp = xf.LastFrac(0.2, "timestamp") for u in users: - udf = ratings[ratings.user == u] + udf = ml_ratings[ml_ratings.user == u] tst = samp(udf) trn = udf.loc[udf.index.difference(tst.index), :] assert len(tst) + len(trn) == len(udf) @@ -210,7 +201,7 @@ def test_last_frac(): samp = xf.LastFrac(0.5, "timestamp") for u in users: - udf = ratings[ratings.user == u] + udf = ml_ratings[ml_ratings.user == u] tst = samp(udf) trn = udf.loc[udf.index.difference(tst.index), :] assert len(tst) + len(trn) == len(udf) @@ -219,40 +210,38 @@ def test_last_frac(): assert tst.timestamp.min() >= trn.timestamp.max() -def test_partition_users(): - ratings = lktu.ml_test.ratings - splits = xf.partition_users(ratings, 5, xf.SampleN(5)) +def test_partition_users(ml_ratings: pd.DataFrame): + splits = xf.partition_users(ml_ratings, 5, xf.SampleN(5)) splits = list(splits) assert len(splits) == 5 for s in splits: ucounts = s.test.groupby("user").agg("count") assert all(ucounts == 5) - assert all(s.test.index.union(s.train.index) == ratings.index) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) assert all(s.train["user"].isin(s.train["user"].unique())) - assert len(s.test) + len(s.train) == len(ratings) + assert len(s.test) + len(s.train) == len(ml_ratings) users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) - assert len(users) == ratings.user.nunique() - assert users == set(ratings.user) + assert len(users) == ml_ratings.user.nunique() + assert users == set(ml_ratings.user) -def test_partition_may_skip_train(): - """Partitioning 
when users may not have enough ratings to be in the train set and test set.""" - ratings = lktu.ml_test.ratings +def test_partition_may_skip_train(ml_ratings: pd.DataFrame): + """Partitioning when users may not have enough ratings to be in the train set and test set.""" # make a data set where some users only have 1 rating - ratings = ratings.sample(frac=0.1) - users = ratings.groupby("user")["rating"].count() + ml_ratings = ml_ratings.sample(frac=0.1) + users = ml_ratings.groupby("user")["rating"].count() assert users.min() == 1.0 # we should have some small users! users.name = "ur_count" - splits = xf.partition_users(ratings, 5, xf.SampleN(1)) + splits = xf.partition_users(ml_ratings, 5, xf.SampleN(1)) splits = list(splits) assert len(splits) == 5 - # now we go make sure we're missing some users! And don't have any NaN ratings + # now we go make sure we're missing some users! And don't have any NaN ratings for train, test in splits: - # no null ratings + # no null ratings assert all(train["rating"].notna()) # see if test users with 1 rating are missing from train test = test.join(users, on="user") @@ -261,30 +250,28 @@ def test_partition_may_skip_train(): assert all(test.loc[test["ur_count"] > 1, "user"].isin(train["user"].unique())) -def test_partition_users_frac(): - ratings = lktu.ml_test.ratings - splits = xf.partition_users(ratings, 5, xf.SampleFrac(0.2)) +def test_partition_users_frac(ml_ratings: pd.DataFrame): + splits = xf.partition_users(ml_ratings, 5, xf.SampleFrac(0.2)) splits = list(splits) assert len(splits) == 5 - ucounts = ratings.groupby("user").item.count() + ucounts = ml_ratings.groupby("user").item.count() uss = ucounts * 0.2 for s in splits: tucs = s.test.groupby("user").item.count() assert all(tucs >= uss.loc[tucs.index] - 1) assert all(tucs <= uss.loc[tucs.index] + 1) - assert all(s.test.index.union(s.train.index) == 
ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) # we have all users users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) - assert len(users) == ratings.user.nunique() - assert users == set(ratings.user) + assert len(users) == ml_ratings.user.nunique() + assert users == set(ml_ratings.user) -def test_sample_users(): - ratings = lktu.ml_test.ratings - splits = xf.sample_users(ratings, 5, 100, xf.SampleN(5)) +def test_sample_users(ml_ratings: pd.DataFrame): + splits = xf.sample_users(ml_ratings, 5, 100, xf.SampleN(5)) splits = list(splits) assert len(splits) == 5 @@ -293,8 +280,8 @@ def test_sample_users(): assert len(s.test) == 5 * 100 assert len(ucounts) == 100 assert all(ucounts == 5) - assert all(s.test.index.union(s.train.index) == ratings.index) - assert len(s.test) + len(s.train) == len(ratings) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) # no overlapping users for s1, s2 in it.product(splits, splits): @@ -305,12 +292,11 @@ def test_sample_users(): assert len(np.intersect1d(us1, us2)) == 0 -def test_sample_users_frac(): - ratings = lktu.ml_test.ratings - splits = xf.sample_users(ratings, 5, 100, xf.SampleFrac(0.2)) +def test_sample_users_frac(ml_ratings: pd.DataFrame): + splits = xf.sample_users(ml_ratings, 5, 100, xf.SampleFrac(0.2)) splits = list(splits) assert len(splits) == 5 - ucounts = ratings.groupby("user").item.count() + ucounts = ml_ratings.groupby("user").item.count() uss = ucounts * 0.2 for s in splits: @@ -318,8 +304,8 @@ def test_sample_users_frac(): assert len(tucs) == 100 assert all(tucs >= uss.loc[tucs.index] - 1) assert all(tucs <= uss.loc[tucs.index] + 1) - assert all(s.test.index.union(s.train.index) == ratings.index) - assert len(s.test) + len(s.train) == len(ratings) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) # no overlapping 
users for s1, s2 in it.product(splits, splits): @@ -331,9 +317,8 @@ def test_sample_users_frac(): @pytest.mark.slow -def test_sample_users_frac_oversize(): - ratings = lktu.ml_test.ratings - splits = xf.sample_users(ratings, 20, 100, xf.SampleN(5)) +def test_sample_users_frac_oversize(ml_ratings: pd.DataFrame): + splits = xf.sample_users(ml_ratings, 20, 100, xf.SampleN(5)) splits = list(splits) assert len(splits) == 20 @@ -341,12 +326,12 @@ def test_sample_users_frac_oversize(): ucounts = s.test.groupby("user").agg("count") assert len(ucounts) < 100 assert all(ucounts == 5) - assert all(s.test.index.union(s.train.index) == ratings.index) - assert len(s.test) + len(s.train) == len(ratings) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) - assert len(users) == ratings.user.nunique() - assert users == set(ratings.user) + assert len(users) == ml_ratings.user.nunique() + assert users == set(ml_ratings.user) for s1, s2 in it.product(splits, splits): if s1 is s2: continue @@ -356,9 +341,8 @@ def test_sample_users_frac_oversize(): assert len(np.intersect1d(us1, us2)) == 0 -def test_sample_users_frac_oversize_ndj(): - ratings = lktu.ml_test.ratings - splits = xf.sample_users(ratings, 20, 100, xf.SampleN(5), disjoint=False) +def test_sample_users_frac_oversize_ndj(ml_ratings: pd.DataFrame): + splits = xf.sample_users(ml_ratings, 20, 100, xf.SampleN(5), disjoint=False) splits = list(splits) assert len(splits) == 20 @@ -367,41 +351,37 @@ def test_sample_users_frac_oversize_ndj(): assert len(ucounts) == 100 assert len(s.test) == 5 * 100 assert all(ucounts == 5) - assert all(s.test.index.union(s.train.index) == ratings.index) - assert len(s.test) + len(s.train) == len(ratings) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) -def 
test_non_unique_index_partition_users(): +def test_non_unique_index_partition_users(ml_ratings: pd.DataFrame): """Partitioning users when dataframe has non-unique indices""" - ratings = lktu.ml_test.ratings - ratings = ratings.set_index("user") ##forces non-unique index + ml_ratings = ml_ratings.set_index("user") ##forces non-unique index with pytest.raises(ValueError): - for split in xf.partition_users(ratings, 5, xf.SampleN(5)): + for split in xf.partition_users(ml_ratings, 5, xf.SampleN(5)): pass -def test_sample_users_dup_index(): +def test_sample_users_dup_index(ml_ratings: pd.DataFrame): """Sampling users when dataframe has non-unique indices""" - ratings = lktu.ml_test.ratings - ratings = ratings.set_index("user") ##forces non-unique index + ml_ratings = ml_ratings.set_index("user") ##forces non-unique index with pytest.raises(ValueError): - for split in xf.sample_users(ratings, 5, 100, xf.SampleN(5)): + for split in xf.sample_users(ml_ratings, 5, 100, xf.SampleN(5)): pass -def test_sample_rows_dup_index(): - """Sampling ratings when dataframe has non-unique indices""" - ratings = lktu.ml_test.ratings - ratings = ratings.set_index("user") ##forces non-unique index +def test_sample_rows_dup_index(ml_ratings: pd.DataFrame): + """Sampling ratings when dataframe has non-unique indices""" + ml_ratings = ml_ratings.set_index("user") ##forces non-unique index with pytest.raises(ValueError): - for split in xf.sample_rows(ratings, partitions=5, size=1000): + for split in xf.sample_rows(ml_ratings, partitions=5, size=1000): pass -def test_partition_users_dup_index(): - """Partitioning ratings when dataframe has non-unique indices""" - ratings = lktu.ml_test.ratings - ratings = ratings.set_index("user") ##forces non-unique index +def test_partition_users_dup_index(ml_ratings: pd.DataFrame): + """Partitioning ratings when dataframe has non-unique indices""" + ml_ratings = ml_ratings.set_index("user") ##forces non-unique index with pytest.raises(ValueError): - for 
split in xf.partition_users(ratings, 5, xf.SampleN(5)): + for split in xf.partition_users(ml_ratings, 5, xf.SampleN(5)): pass diff --git a/lenskit/tests/test_dataset_ids.py b/lenskit/tests/test_dataset_ids.py index 205fc29b9..1605674d7 100644 --- a/lenskit/tests/test_dataset_ids.py +++ b/lenskit/tests/test_dataset_ids.py @@ -12,21 +12,21 @@ def test_from_ratings_default_names(ml_ratings: pd.DataFrame): - ratings = ml_ratings.rename(columns={"userId": "user_id", "movieId": "item_id"}) + ratings = ml_ratings.rename(columns={"user": "user_id", "item": "item_id"}) ds = from_interactions_df(ratings) assert ds.item_count == ratings["item_id"].nunique() assert ds.user_count == ratings["user_id"].nunique() def test_from_ratings_nosuffix(ml_ratings: pd.DataFrame): - ratings = ml_ratings.rename(columns={"userId": "user", "movieId": "item"}) + ratings = ml_ratings.rename(columns={"user": "user", "item": "item"}) ds = from_interactions_df(ratings) assert ds.item_count == ratings["item"].nunique() assert ds.user_count == ratings["user"].nunique() def test_from_ratings_names_upper(ml_ratings: pd.DataFrame): - ratings = ml_ratings.rename(columns={"userId": "USER", "movieId": "ITEM"}) + ratings = ml_ratings.rename(columns={"user": "USER", "item": "ITEM"}) ds = from_interactions_df(ratings) assert ds.item_count == ratings["ITEM"].nunique() assert ds.user_count == ratings["USER"].nunique() diff --git a/lenskit/tests/test_dataset_lazy.py b/lenskit/tests/test_dataset_lazy.py index c24b0f89c..fc9a83e56 100644 --- a/lenskit/tests/test_dataset_lazy.py +++ b/lenskit/tests/test_dataset_lazy.py @@ -15,24 +15,22 @@ def test_item_stats(ml_ratings: pd.DataFrame): - ml_ds = LazyDataset(lambda: from_interactions_df(ml_ratings, item_col="movieId")) + ml_ds = LazyDataset(lambda: from_interactions_df(ml_ratings, item_col="item")) stats = ml_ds.item_stats() stats.info() assert len(stats) == ml_ds.item_count assert np.all(stats.index == ml_ds.items.index) - assert np.all(stats["count"] == 
ml_ratings["movieId"].value_counts().reindex(ml_ds.items)) - assert np.all(stats["user_count"] == ml_ratings["movieId"].value_counts().reindex(ml_ds.items)) - assert np.all( - stats["rating_count"] == ml_ratings["movieId"].value_counts().reindex(ml_ds.items) - ) + assert np.all(stats["count"] == ml_ratings["item"].value_counts().reindex(ml_ds.items)) + assert np.all(stats["user_count"] == ml_ratings["item"].value_counts().reindex(ml_ds.items)) + assert np.all(stats["rating_count"] == ml_ratings["item"].value_counts().reindex(ml_ds.items)) assert stats["mean_rating"].values == approx( - ml_ratings.groupby("movieId")["rating"].mean().reindex(ml_ds.items).values + ml_ratings.groupby("item")["rating"].mean().reindex(ml_ds.items).values ) - ts = ml_ratings.groupby("movieId")["timestamp"].min().reindex(ml_ds.items) + ts = ml_ratings.groupby("item")["timestamp"].min().reindex(ml_ds.items) bad = stats["first_time"] != ts nbad = np.sum(bad) if nbad: @@ -42,23 +40,23 @@ def test_item_stats(ml_ratings: pd.DataFrame): def test_user_stats(ml_ratings: pd.DataFrame): - ml_ds = LazyDataset(lambda: from_interactions_df(ml_ratings, item_col="movieId")) + ml_ds = LazyDataset(lambda: from_interactions_df(ml_ratings, item_col="item")) stats = ml_ds.user_stats() stats.info() assert len(stats) == ml_ds.user_count assert np.all(stats.index == ml_ds.users.index) - assert np.all(stats["count"] == ml_ratings["userId"].value_counts().reindex(ml_ds.users)) - assert np.all(stats["user_count"] == ml_ratings["userId"].value_counts().reindex(ml_ds.users)) - assert np.all(stats["rating_count"] == ml_ratings["userId"].value_counts().reindex(ml_ds.users)) + assert np.all(stats["count"] == ml_ratings["user"].value_counts().reindex(ml_ds.users)) + assert np.all(stats["user_count"] == ml_ratings["user"].value_counts().reindex(ml_ds.users)) + assert np.all(stats["rating_count"] == ml_ratings["user"].value_counts().reindex(ml_ds.users)) assert stats["mean_rating"].values == approx( - 
ml_ratings.groupby("userId")["rating"].mean().reindex(ml_ds.users).values + ml_ratings.groupby("user")["rating"].mean().reindex(ml_ds.users).values ) assert np.all( - stats["first_time"] == ml_ratings.groupby("userId")["timestamp"].min().reindex(ml_ds.users) + stats["first_time"] == ml_ratings.groupby("user")["timestamp"].min().reindex(ml_ds.users) ) assert np.all( - stats["last_time"] == ml_ratings.groupby("userId")["timestamp"].max().reindex(ml_ds.users) + stats["last_time"] == ml_ratings.groupby("user")["timestamp"].max().reindex(ml_ds.users) ) diff --git a/lenskit/tests/test_dataset_log.py b/lenskit/tests/test_dataset_log.py index e56ac69ba..3ce3db7c7 100644 --- a/lenskit/tests/test_dataset_log.py +++ b/lenskit/tests/test_dataset_log.py @@ -25,9 +25,9 @@ def test_pandas_log_defaults(ml_ratings: pd.DataFrame, ml_ds: Dataset): uids = ml_ds.users.ids(int_df["user_num"]) iids = ml_ds.items.ids(int_df["item_num"]) - ml_df = ml_ratings.sort_values(["userId", "movieId"]) - assert np.all(uids == ml_df["userId"]) - assert np.all(iids == ml_df["movieId"]) + ml_df = ml_ratings.sort_values(["user", "item"]) + assert np.all(uids == ml_df["user"]) + assert np.all(iids == ml_df["item"]) assert np.all(int_df["rating"] == ml_df["rating"]) assert np.all(int_df["timestamp"] == ml_df["timestamp"]) @@ -48,9 +48,9 @@ def test_pandas_log_ids(ml_ratings: pd.DataFrame, ml_ds: Dataset): # the interact int_df = int_df.sort_values(["user_id", "item_id"]) - ml_df = ml_ratings.sort_values(["userId", "movieId"]) - assert np.all(int_df["user_id"] == ml_df["userId"]) - assert np.all(int_df["item_id"] == ml_df["movieId"]) + ml_df = ml_ratings.sort_values(["user", "item"]) + assert np.all(int_df["user_id"] == ml_df["user"]) + assert np.all(int_df["item_id"] == ml_df["item"]) assert np.all(int_df["rating"] == ml_df["rating"]) assert np.all(int_df["timestamp"] == ml_df["timestamp"]) @@ -72,9 +72,9 @@ def test_pandas_log_no_ts(ml_ratings: pd.DataFrame, ml_ds: Dataset): uids = 
ml_ds.users.ids(int_df["user_num"]) iids = ml_ds.items.ids(int_df["item_num"]) - ml_df = ml_ratings.sort_values(["userId", "movieId"]) - assert np.all(uids == ml_df["userId"]) - assert np.all(iids == ml_df["movieId"]) + ml_df = ml_ratings.sort_values(["user", "item"]) + assert np.all(uids == ml_df["user"]) + assert np.all(iids == ml_df["item"]) assert np.all(int_df["rating"] == ml_df["rating"]) # and the total length diff --git a/lenskit/tests/test_dataset_matrix.py b/lenskit/tests/test_dataset_matrix.py index de7b458c7..71b7f8df5 100644 --- a/lenskit/tests/test_dataset_matrix.py +++ b/lenskit/tests/test_dataset_matrix.py @@ -19,40 +19,40 @@ def _check_user_offset_counts(ml_ds: Dataset, ml_ratings: pd.DataFrame, offsets: ArrayLike): - user_counts = ml_ratings["userId"].value_counts().reindex(ml_ds.users.index) + user_counts = ml_ratings["user"].value_counts().reindex(ml_ds.users.index) row_lens = np.diff(offsets) assert np.all(row_lens == user_counts) def _check_user_number_counts(ml_ds: Dataset, ml_ratings: pd.DataFrame, nums: ArrayLike): users, counts = np.unique(nums, return_counts=True) - user_counts = ml_ratings["userId"].value_counts().reindex(ml_ds.users.ids(users)) + user_counts = ml_ratings["user"].value_counts().reindex(ml_ds.users.ids(users)) assert np.all(counts == user_counts) def _check_item_number_counts(ml_ds: Dataset, ml_ratings: pd.DataFrame, nums: ArrayLike): items, counts = np.unique(nums, return_counts=True) - item_counts = ml_ratings["movieId"].value_counts().reindex(ml_ds.items.ids(items)) + item_counts = ml_ratings["item"].value_counts().reindex(ml_ds.items.ids(items)) assert np.all(counts == item_counts) def _check_user_ids(ml_ds: Dataset, ml_ratings: pd.DataFrame, nums: ArrayLike): - ml_ratings = ml_ratings.sort_values(["userId", "movieId"]) - assert np.all(ml_ds.users.ids(np.asarray(nums)) == ml_ratings["userId"]) + ml_ratings = ml_ratings.sort_values(["user", "item"]) + assert np.all(ml_ds.users.ids(np.asarray(nums)) == 
ml_ratings["user"]) def _check_item_ids(ml_ds: Dataset, ml_ratings: pd.DataFrame, nums: ArrayLike): - ml_ratings = ml_ratings.sort_values(["userId", "movieId"]) - assert np.all(ml_ds.items.ids(np.asarray(nums)) == ml_ratings["movieId"]) + ml_ratings = ml_ratings.sort_values(["user", "item"]) + assert np.all(ml_ds.items.ids(np.asarray(nums)) == ml_ratings["item"]) def _check_ratings(ml_ds: Dataset, ml_ratings: pd.DataFrame, rates: ArrayLike): - ml_ratings = ml_ratings.sort_values(["userId", "movieId"]) + ml_ratings = ml_ratings.sort_values(["user", "item"]) assert np.all(rates == ml_ratings["rating"]) def _check_timestamp(ml_ds: Dataset, ml_ratings: pd.DataFrame, ts: ArrayLike): - ml_ratings = ml_ratings.sort_values(["userId", "movieId"]) + ml_ratings = ml_ratings.sort_values(["user", "item"]) assert np.all(ts == ml_ratings["timestamp"]) @@ -68,8 +68,8 @@ def test_matrix_structure(ml_ratings: pd.DataFrame, ml_ds: Dataset): assert isinstance(log, CSRStructure) assert log.nnz == len(ml_ratings) - assert log.nrows == ml_ratings["userId"].nunique() - assert log.ncols == ml_ratings["movieId"].nunique() + assert log.nrows == ml_ratings["user"].nunique() + assert log.ncols == ml_ratings["item"].nunique() _check_user_offset_counts(ml_ds, ml_ratings, log.rowptrs) _check_item_number_counts(ml_ds, ml_ratings, log.colinds) @@ -148,7 +148,7 @@ def test_matrix_pandas_indicator(ml_ratings: pd.DataFrame, ml_ds: Dataset): def test_matrix_pandas_missing_rating(ml_ratings: pd.DataFrame): - ml_ds = from_interactions_df(ml_ratings[["userId", "movieId", "timestamp"]], item_col="movieId") + ml_ds = from_interactions_df(ml_ratings[["user", "item", "timestamp"]], item_col="item") log = ml_ds.interaction_matrix(format="pandas", field="rating") assert isinstance(log, pd.DataFrame) assert len(log) == len(ml_ratings) @@ -169,8 +169,8 @@ def test_matrix_scipy_coo(ml_ratings: pd.DataFrame, ml_ds: Dataset, generation): assert log.nnz == len(ml_ratings) nrows, ncols = cast(tuple[int, int], 
log.shape) - assert nrows == ml_ratings["userId"].nunique() - assert ncols == ml_ratings["movieId"].nunique() + assert nrows == ml_ratings["user"].nunique() + assert ncols == ml_ratings["item"].nunique() assert log.row.dtype == np.int32 assert log.col.dtype == np.int32 @@ -191,8 +191,8 @@ def test_matrix_scipy_csr(ml_ratings: pd.DataFrame, ml_ds: Dataset, generation): assert log.nnz == len(ml_ratings) nrows, ncols = cast(tuple[int, int], log.shape) - assert nrows == ml_ratings["userId"].nunique() - assert ncols == ml_ratings["movieId"].nunique() + assert nrows == ml_ratings["user"].nunique() + assert ncols == ml_ratings["item"].nunique() assert log.indptr.dtype == np.int32 assert log.indices.dtype == np.int32 @@ -209,8 +209,8 @@ def test_matrix_scipy_timestamp(ml_ratings: pd.DataFrame, ml_ds: Dataset, genera assert log.nnz == len(ml_ratings) nrows, ncols = cast(tuple[int, int], log.shape) - assert nrows == ml_ratings["userId"].nunique() - assert ncols == ml_ratings["movieId"].nunique() + assert nrows == ml_ratings["user"].nunique() + assert ncols == ml_ratings["item"].nunique() _check_user_offset_counts(ml_ds, ml_ratings, log.indptr) _check_item_number_counts(ml_ds, ml_ratings, log.indices) @@ -225,8 +225,8 @@ def test_matrix_scipy_indicator(ml_ratings: pd.DataFrame, ml_ds: Dataset, genera assert log.nnz == len(ml_ratings) nrows, ncols = cast(tuple[int, int], log.shape) - assert nrows == ml_ratings["userId"].nunique() - assert ncols == ml_ratings["movieId"].nunique() + assert nrows == ml_ratings["user"].nunique() + assert ncols == ml_ratings["item"].nunique() _check_user_offset_counts(ml_ds, ml_ratings, log.indptr) _check_item_number_counts(ml_ds, ml_ratings, log.indices) @@ -238,14 +238,14 @@ def test_matrix_scipy_indicator(ml_ratings: pd.DataFrame, ml_ds: Dataset, genera @mark.parametrize("generation", ["modern", "legacy"]) def test_matrix_scipy_missing_rating(ml_ratings: pd.DataFrame, generation): - ml_ds = from_interactions_df(ml_ratings[["userId", "movieId", 
"timestamp"]], item_col="movieId") + ml_ds = from_interactions_df(ml_ratings[["user", "item", "timestamp"]], item_col="item") log = ml_ds.interaction_matrix(format="scipy", field="rating", legacy=generation == "legacy") assert isinstance(log, sps.csr_array if generation == "modern" else sps.csr_matrix) assert log.nnz == len(ml_ratings) nrows, ncols = cast(tuple[int, int], log.shape) - assert nrows == ml_ratings["userId"].nunique() - assert ncols == ml_ratings["movieId"].nunique() + assert nrows == ml_ratings["user"].nunique() + assert ncols == ml_ratings["item"].nunique() _check_user_offset_counts(ml_ds, ml_ratings, log.indptr) _check_item_number_counts(ml_ds, ml_ratings, log.indices) @@ -260,8 +260,8 @@ def test_matrix_torch_csr(ml_ratings: pd.DataFrame, ml_ds: Dataset): assert log.values().shape == torch.Size([len(ml_ratings)]) nrows, ncols = log.shape - assert nrows == ml_ratings["userId"].nunique() - assert ncols == ml_ratings["movieId"].nunique() + assert nrows == ml_ratings["user"].nunique() + assert ncols == ml_ratings["item"].nunique() _check_user_offset_counts(ml_ds, ml_ratings, log.crow_indices()) _check_item_number_counts(ml_ds, ml_ratings, log.col_indices()) @@ -279,8 +279,8 @@ def test_matrix_torch_indicator(ml_ratings: pd.DataFrame, ml_ds: Dataset): assert log.values().shape == torch.Size([len(ml_ratings)]) nrows, ncols = log.shape - assert nrows == ml_ratings["userId"].nunique() - assert ncols == ml_ratings["movieId"].nunique() + assert nrows == ml_ratings["user"].nunique() + assert ncols == ml_ratings["item"].nunique() _check_user_offset_counts(ml_ds, ml_ratings, log.crow_indices()) _check_item_number_counts(ml_ds, ml_ratings, log.col_indices()) @@ -295,8 +295,8 @@ def test_matrix_torch_coo(ml_ratings: pd.DataFrame, ml_ds: Dataset): assert log.values().shape == torch.Size([len(ml_ratings)]) nrows, ncols = cast(tuple[int, int], log.shape) - assert nrows == ml_ratings["userId"].nunique() - assert ncols == ml_ratings["movieId"].nunique() + assert nrows 
== ml_ratings["user"].nunique() + assert ncols == ml_ratings["item"].nunique() _check_user_number_counts(ml_ds, ml_ratings, log.indices()[0, :]) _check_user_ids(ml_ds, ml_ratings, log.indices()[0, :]) @@ -306,15 +306,15 @@ def test_matrix_torch_coo(ml_ratings: pd.DataFrame, ml_ds: Dataset): def test_matrix_torch_missing_rating(ml_ratings: pd.DataFrame): - ml_ds = from_interactions_df(ml_ratings[["userId", "movieId", "timestamp"]], item_col="movieId") + ml_ds = from_interactions_df(ml_ratings[["user", "item", "timestamp"]], item_col="item") log = ml_ds.interaction_matrix(format="torch", field="rating") assert isinstance(log, torch.Tensor) assert log.is_sparse_csr assert log.values().shape == torch.Size([len(ml_ratings)]) nrows, ncols = cast(tuple[int, int], log.shape) - assert nrows == ml_ratings["userId"].nunique() - assert ncols == ml_ratings["movieId"].nunique() + assert nrows == ml_ratings["user"].nunique() + assert ncols == ml_ratings["item"].nunique() _check_user_offset_counts(ml_ds, ml_ratings, log.crow_indices()) _check_item_number_counts(ml_ds, ml_ratings, log.col_indices()) @@ -329,8 +329,8 @@ def test_matrix_torch_timestamp(ml_ratings: pd.DataFrame, ml_ds: Dataset): assert log.values().shape == torch.Size([len(ml_ratings)]) nrows, ncols = log.shape - assert nrows == ml_ratings["userId"].nunique() - assert ncols == ml_ratings["movieId"].nunique() + assert nrows == ml_ratings["user"].nunique() + assert ncols == ml_ratings["item"].nunique() _check_user_offset_counts(ml_ds, ml_ratings, log.crow_indices()) _check_item_number_counts(ml_ds, ml_ratings, log.col_indices()) diff --git a/lenskit/tests/test_dataset_stats.py b/lenskit/tests/test_dataset_stats.py index fad32704d..321fec6b9 100644 --- a/lenskit/tests/test_dataset_stats.py +++ b/lenskit/tests/test_dataset_stats.py @@ -18,17 +18,15 @@ def test_item_stats(ml_ratings: pd.DataFrame, ml_ds: Dataset): assert len(stats) == ml_ds.item_count assert np.all(stats.index == ml_ds.items.index) - assert 
np.all(stats["count"] == ml_ratings["movieId"].value_counts().reindex(ml_ds.items)) - assert np.all(stats["user_count"] == ml_ratings["movieId"].value_counts().reindex(ml_ds.items)) - assert np.all( - stats["rating_count"] == ml_ratings["movieId"].value_counts().reindex(ml_ds.items) - ) + assert np.all(stats["count"] == ml_ratings["item"].value_counts().reindex(ml_ds.items)) + assert np.all(stats["user_count"] == ml_ratings["item"].value_counts().reindex(ml_ds.items)) + assert np.all(stats["rating_count"] == ml_ratings["item"].value_counts().reindex(ml_ds.items)) assert stats["mean_rating"].values == approx( - ml_ratings.groupby("movieId")["rating"].mean().reindex(ml_ds.items).values + ml_ratings.groupby("item")["rating"].mean().reindex(ml_ds.items).values ) - ts = ml_ratings.groupby("movieId")["timestamp"].min().reindex(ml_ds.items) + ts = ml_ratings.groupby("item")["timestamp"].min().reindex(ml_ds.items) bad = stats["first_time"] != ts nbad = np.sum(bad) if nbad: @@ -44,16 +42,16 @@ def test_user_stats(ml_ratings: pd.DataFrame, ml_ds: Dataset): assert len(stats) == ml_ds.user_count assert np.all(stats.index == ml_ds.users.index) - assert np.all(stats["count"] == ml_ratings["userId"].value_counts().reindex(ml_ds.users)) - assert np.all(stats["user_count"] == ml_ratings["userId"].value_counts().reindex(ml_ds.users)) - assert np.all(stats["rating_count"] == ml_ratings["userId"].value_counts().reindex(ml_ds.users)) + assert np.all(stats["count"] == ml_ratings["user"].value_counts().reindex(ml_ds.users)) + assert np.all(stats["user_count"] == ml_ratings["user"].value_counts().reindex(ml_ds.users)) + assert np.all(stats["rating_count"] == ml_ratings["user"].value_counts().reindex(ml_ds.users)) assert stats["mean_rating"].values == approx( - ml_ratings.groupby("userId")["rating"].mean().reindex(ml_ds.users).values + ml_ratings.groupby("user")["rating"].mean().reindex(ml_ds.users).values ) assert np.all( - stats["first_time"] == 
ml_ratings.groupby("userId")["timestamp"].min().reindex(ml_ds.users) + stats["first_time"] == ml_ratings.groupby("user")["timestamp"].min().reindex(ml_ds.users) ) assert np.all( - stats["last_time"] == ml_ratings.groupby("userId")["timestamp"].max().reindex(ml_ds.users) + stats["last_time"] == ml_ratings.groupby("user")["timestamp"].max().reindex(ml_ds.users) ) diff --git a/lenskit/tests/test_fallback.py b/lenskit/tests/test_fallback.py index 5de40de33..2a50edbd9 100644 --- a/lenskit/tests/test_fallback.py +++ b/lenskit/tests/test_fallback.py @@ -11,6 +11,7 @@ from pytest import approx +from lenskit.data.dataset import Dataset import lenskit.util.test as lktu from lenskit import util as lku from lenskit.algorithms import basic @@ -85,8 +86,10 @@ def test_fallback_predict(ml_ratings, ml_ds): def exp_val(user, item): v = bias.mean_ if user is not None: + assert bias.user_offsets_ is not None v += bias.user_offsets_.loc[user] if item is not None: + assert bias.item_offsets_ is not None v += bias.item_offsets_.loc[item] return v @@ -114,7 +117,7 @@ def exp_val(user, item): assert preds.loc[-23081] == approx(exp_val(10, None)) -def test_fallback_save_load(tmp_path, ml_ds): +def test_fallback_save_load(tmp_path, ml_ratings: pd.DataFrame, ml_ds: Dataset): original = basic.Fallback(basic.Memorized(simple_df), Bias()) original.fit(ml_ds) @@ -125,7 +128,7 @@ def test_fallback_save_load(tmp_path, ml_ds): algo = pickle.load(pf) bias = algo.algorithms[1] - assert bias.mean_ == approx(lktu.ml_test.ratings.rating.mean()) + assert bias.mean_ == approx(ml_ratings.rating.mean()) def exp_val(user, item): v = bias.mean_ diff --git a/lenskit/tests/test_knn_item_item.py b/lenskit/tests/test_knn_item_item.py index d12c6cff0..e69ca3d16 100644 --- a/lenskit/tests/test_knn_item_item.py +++ b/lenskit/tests/test_knn_item_item.py @@ -56,9 +56,9 @@ @fixture(scope="module") def ml_subset(ml_ratings): "Fixture that returns a subset of the MovieLens database." 
- icounts = ml_ratings.groupby("movieId").rating.count() + icounts = ml_ratings.groupby("item").rating.count() top = icounts.nlargest(500) - top_rates = ml_ratings[ml_ratings["movieId"].isin(top.index)] + top_rates = ml_ratings[ml_ratings["item"].isin(top.index)] _log.info("top 500 items yield %d of %d ratings", len(top_rates), len(ml_ratings)) return top_rates @@ -200,13 +200,11 @@ def test_ii_warns_wa_with_no_use_ratings(): @lktu.wantjit @mark.slow -@mark.skipif(not lktu.ml100k.available, reason="ML100K data not present") -def test_ii_train_ml100k(tmp_path): +def test_ii_train_ml100k(tmp_path, ml_100k): "Test an unbounded model on ML-100K" - ratings = lktu.ml100k.ratings algo = knn.ItemItem(30) _log.info("training model") - algo.fit(from_interactions_df(ratings)) + algo.fit(from_interactions_df(ml_100k)) _log.info("testing model") @@ -218,7 +216,7 @@ def test_ii_train_ml100k(tmp_path): assert algo.item_counts_.sum() == len(algo.sim_matrix_.values()) - means = ratings.groupby("item").rating.mean() + means = ml_100k.groupby("item").rating.mean() assert means[algo.items_.ids()].values == approx(algo.item_means_) # save @@ -258,18 +256,18 @@ def test_ii_large_models(rng, ml_ratings, ml_ds): # a little tolerance assert algo_lim.sim_matrix_.values().max() <= 1 - means = ml_ratings.groupby("movieId").rating.mean() + means = ml_ratings.groupby("item").rating.mean() assert means[algo_lim.items_.ids()].values == approx(algo_lim.item_means_) assert all(np.logical_not(np.isnan(algo_ub.sim_matrix_.values()))) assert algo_ub.sim_matrix_.values().min() > 0 assert algo_ub.sim_matrix_.values().max() <= 1 - means = ml_ratings.groupby("movieId").rating.mean() + means = ml_ratings.groupby("item").rating.mean() assert means[algo_ub.items_.ids()].values == approx(algo_ub.item_means_) mc_rates = ( - ml_ratings.set_index("movieId") + ml_ratings.set_index("item") .join(pd.DataFrame({"item_mean": means})) .assign(rating=lambda df: df.rating - df.item_mean) ) @@ -297,7 +295,7 @@ def 
test_ii_large_models(rng, ml_ratings, ml_ds): ipos = algo_ub.items_.number(i) _log.debug("checking item %d at position %d", i, ipos) assert ipos == algo_lim.items_.number(i) - irates = mc_rates.loc[[i], :].set_index("userId").rating + irates = mc_rates.loc[[i], :].set_index("user").rating ub_row = mat_ub[ipos] b_row = mat_lim[ipos] @@ -322,7 +320,7 @@ def test_ii_large_models(rng, ml_ratings, ml_ds): _log.debug("checking equal similarities") for n in rng.choice(ub_cols, min(10, len(ub_cols))): n_id = algo_ub.items_.id(n) - n_rates = mc_rates.loc[n_id, :].set_index("userId").rating + n_rates = mc_rates.loc[n_id, :].set_index("user").rating ir, nr = irates.align(n_rates, fill_value=0) cor = ir.corr(nr) assert mat_ub[ipos, n].item() == approx(cor, abs=1.0e-6) @@ -371,10 +369,10 @@ def test_ii_implicit_large(rng, ml_ratings): algo = knn.ItemItem(NBRS, feedback="implicit") _log.info("agg: %s", algo.aggregate) algo = Recommender.adapt(algo) - algo.fit(from_interactions_df(ml_ratings[["userId", "movieId"]], item_col="movieId")) + algo.fit(from_interactions_df(ml_ratings[["user", "item"]], item_col="item")) assert isinstance(algo, TopN) - users = rng.choice(ml_ratings["userId"].unique(), NUSERS) + users = rng.choice(ml_ratings["user"].unique(), NUSERS) items: Vocabulary[EntityId] = algo.predictor.items_ mat: torch.Tensor = algo.predictor.sim_matrix_.to_dense() @@ -383,9 +381,9 @@ def test_ii_implicit_large(rng, ml_ratings): recs = algo.recommend(user, NRECS) _log.info("user %s recs\n%s", user, recs) assert len(recs) == NRECS - urates = ml_ratings[ml_ratings["userId"] == user] + urates = ml_ratings[ml_ratings["user"] == user] - smat = mat[torch.from_numpy(items.numbers(urates["movieId"].values)), :] + smat = mat[torch.from_numpy(items.numbers(urates["item"].values)), :] for row in recs.itertuples(): col = smat[:, items.number(row.item)] top, _is = torch.topk(col, NBRS) @@ -407,7 +405,7 @@ def test_ii_save_load(tmp_path, ml_ratings, ml_subset): "Save and load a model" 
original = knn.ItemItem(30, save_nbrs=500) _log.info("building model") - original.fit(from_interactions_df(ml_subset, item_col="movieId")) + original.fit(from_interactions_df(ml_subset, item_col="item")) fn = tmp_path / "ii.mod" _log.info("saving model to %s", fn) @@ -435,7 +433,7 @@ def test_ii_save_load(tmp_path, ml_ratings, ml_subset): o_mat = original.sim_matrix_ assert all(r_mat.crow_indices() == o_mat.crow_indices()) - means = ml_ratings.groupby("movieId").rating.mean() + means = ml_ratings.groupby("item").rating.mean() assert means[algo.items_.ids()].values == approx(original.item_means_) @@ -444,7 +442,7 @@ def test_ii_implicit_save_load(tmp_path, ml_subset): "Save and load a model" original = knn.ItemItem(30, save_nbrs=500, center=False, aggregate="sum") _log.info("building model") - original.fit(from_interactions_df(ml_subset.loc[:, ["userId", "movieId"]], item_col="movieId")) + original.fit(from_interactions_df(ml_subset.loc[:, ["user", "item"]], item_col="item")) fn = tmp_path / "ii.mod" _log.info("saving model to %s", fn) @@ -475,9 +473,9 @@ def test_ii_implicit_save_load(tmp_path, ml_subset): @mark.slow def test_ii_old_implicit(ml_ratings): algo = knn.ItemItem(20, save_nbrs=100, center=False, aggregate="sum") - data = ml_ratings.loc[:, ["userId", "movieId"]] + data = ml_ratings.loc[:, ["user", "item"]] - algo.fit(from_interactions_df(data, item_col="movieId")) + algo.fit(from_interactions_df(data, item_col="item")) assert algo.item_counts_.sum() == algo.sim_matrix_.values().shape[0] assert all(algo.sim_matrix_.values() > 0) assert all(algo.item_counts_ <= 100) @@ -490,7 +488,7 @@ def test_ii_old_implicit(ml_ratings): @mark.slow def test_ii_no_ratings(ml_ratings, ml_ds): a1 = knn.ItemItem(20, save_nbrs=100, center=False, aggregate="sum") - a1.fit(from_interactions_df(ml_ratings.loc[:, ["userId", "movieId"]], item_col="movieId")) + a1.fit(from_interactions_df(ml_ratings.loc[:, ["user", "item"]], item_col="item")) algo = knn.ItemItem(20, save_nbrs=100, 
feedback="implicit") @@ -508,15 +506,12 @@ def test_ii_no_ratings(ml_ratings, ml_ds): @mark.slow @mark.eval -@mark.skipif(not lktu.ml100k.available, reason="ML100K data not present") -def test_ii_batch_accuracy(): +def test_ii_batch_accuracy(ml_100k): import lenskit.crossfold as xf import lenskit.metrics.predict as pm from lenskit import batch from lenskit.algorithms import basic, bias - ratings = lktu.ml100k.ratings - ii_algo = knn.ItemItem(30) algo = basic.Fallback(ii_algo, bias.Bias()) @@ -527,7 +522,7 @@ def eval(train, test): return batch.predict(algo, test, n_jobs=1) preds = pd.concat( - (eval(train, test) for (train, test) in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))) + (eval(train, test) for (train, test) in xf.partition_users(ml_100k, 5, xf.SampleFrac(0.2))) ) mae = pm.mae(preds.prediction, preds.rating) assert mae == approx(0.70, abs=0.025) @@ -583,14 +578,11 @@ def test_ii_known_preds(ml_ds): @lktu.wantjit @mark.slow @mark.eval -@mark.skipif(not lktu.ml100k.available, reason="ML100K not available") @mark.parametrize("ncpus", [1, 2]) -def test_ii_batch_recommend(ncpus): +def test_ii_batch_recommend(ml_100k, ncpus): import lenskit.crossfold as xf from lenskit import topn - ratings = lktu.ml100k.ratings - def eval(train, test): _log.info("running training") algo = knn.ItemItem(30) @@ -602,7 +594,7 @@ def eval(train, test): test_frames = [] recs = [] - for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)): + for train, test in xf.partition_users(ml_100k, 5, xf.SampleFrac(0.2)): test_frames.append(test) recs.append(eval(train, test)) diff --git a/lenskit/tests/test_knn_user_user.py b/lenskit/tests/test_knn_user_user.py index 5c6db519b..6ce8932de 100644 --- a/lenskit/tests/test_knn_user_user.py +++ b/lenskit/tests/test_knn_user_user.py @@ -15,7 +15,8 @@ from pytest import approx, fail, mark import lenskit.algorithms.knn.user as knn -from lenskit.data.dataset import from_interactions_df +from lenskit.algorithms.ranking import TopN +from 
lenskit.data.dataset import Dataset, from_interactions_df import lenskit.util.test as lktu from lenskit.algorithms import Recommender from lenskit.util import clone @@ -69,20 +70,20 @@ def test_uu_train(ml_ratings, ml_ds): # it should have computed correct means u_stats = ml_ds.user_stats() mlmeans = pd.Series(algo.user_means_.numpy(), index=algo.users_.ids(), name="mean") - mlmeans.index.name = "userId" + mlmeans.index.name = "user" umeans, mlmeans = u_stats["mean_rating"].align(mlmeans) assert mlmeans.values == approx(umeans.values) # we should be able to reconstruct rating values - uir = ml_ratings.set_index(["userId", "movieId"]).rating + uir = ml_ratings.set_index(["user", "item"]).rating rates = algo.user_ratings_.to_sparse_coo() ui_rbdf = pd.DataFrame( { - "userId": algo.users_.ids(rates.indices()[0]), - "movieId": algo.items_.ids(rates.indices()[1]), + "user": algo.users_.ids(rates.indices()[0]), + "item": algo.items_.ids(rates.indices()[1]), "nrating": rates.values(), } - ).set_index(["userId", "movieId"]) + ).set_index(["user", "item"]) ui_rbdf = ui_rbdf.join(mlmeans) ui_rbdf["rating"] = ui_rbdf["nrating"] + ui_rbdf["mean"] ui_rbdf["orig_rating"] = uir @@ -96,6 +97,7 @@ def test_uu_train_adapt(ml_ds): uu = knn.UserUser(30) uu = Recommender.adapt(uu) ret = uu.fit(ml_ds) + assert isinstance(uu, TopN) assert ret is uu assert isinstance(uu.predictor, knn.UserUser) @@ -132,10 +134,10 @@ def test_uu_predict_too_few_blended(ml_ds): def test_uu_predict_live_ratings(ml_ratings): algo = knn.UserUser(30, min_nbrs=2) - no4 = ml_ratings[ml_ratings.userId != 4] - algo.fit(from_interactions_df(no4, item_col="movieId")) + no4 = ml_ratings[ml_ratings.user != 4] + algo.fit(from_interactions_df(no4, item_col="item")) - ratings = ml_ratings[ml_ratings.userId == 4].set_index("movieId").rating + ratings = ml_ratings[ml_ratings.user == 4].set_index("item").rating preds = algo.predict_for_user(20381, [1016, 2091], ratings) assert len(preds) == 2 @@ -162,20 +164,20 @@ def 
test_uu_save_load(tmp_path, ml_ratings, ml_ds): # it should have computed correct means umeans = ml_ds.user_stats()["mean_rating"] mlmeans = pd.Series(algo.user_means_, index=algo.users_, name="mean") - mlmeans.index.name = "userId" + mlmeans.index.name = "user" umeans, mlmeans = umeans.align(mlmeans) assert mlmeans.values == approx(umeans.values) # we should be able to reconstruct rating values - uir = ml_ratings.set_index(["userId", "movieId"]).rating + uir = ml_ratings.set_index(["user", "item"]).rating rates = algo.user_ratings_.to_sparse_coo() ui_rbdf = pd.DataFrame( { - "userId": algo.users_.ids(rates.indices()[0]), - "movieId": algo.items_.ids(rates.indices()[1]), + "user": algo.users_.ids(rates.indices()[0]), + "item": algo.items_.ids(rates.indices()[1]), "nrating": rates.values(), } - ).set_index(["userId", "movieId"]) + ).set_index(["user", "item"]) ui_rbdf = ui_rbdf.join(mlmeans) ui_rbdf["rating"] = ui_rbdf["nrating"] + ui_rbdf["mean"] ui_rbdf["orig_rating"] = uir @@ -200,9 +202,9 @@ def test_uu_predict_unknown_empty(ml_ds): def test_uu_implicit(ml_ratings): "Train and use user-user on an implicit data set." algo = knn.UserUser(20, feedback="implicit") - data = ml_ratings.loc[:, ["userId", "movieId"]] + data = ml_ratings.loc[:, ["user", "item"]] - algo.fit(from_interactions_df(data, item_col="movieId")) + algo.fit(from_interactions_df(data, item_col="item")) assert algo.user_means_ is None mat = algo.user_vectors_ @@ -218,9 +220,9 @@ def test_uu_implicit(ml_ratings): def test_uu_save_load_implicit(tmp_path, ml_ratings): "Save and load user-user on an implicit data set." 
orig = knn.UserUser(20, feedback="implicit") - data = ml_ratings.loc[:, ["userId", "movieId"]] + data = ml_ratings.loc[:, ["user", "item"]] - orig.fit(from_interactions_df(data, item_col="movieId")) + orig.fit(from_interactions_df(data, item_col="item")) ser = pickle.dumps(orig) algo = pickle.loads(ser) @@ -231,12 +233,12 @@ def test_uu_save_load_implicit(tmp_path, ml_ratings): @mark.slow -def test_uu_known_preds(): +def test_uu_known_preds(ml_ds: Dataset): from lenskit import batch algo = knn.UserUser(30, min_sim=1.0e-6) _log.info("training %s on ml data", algo) - algo.fit(from_interactions_df(lktu.ml_test.ratings)) + algo.fit(ml_ds) dir = Path(__file__).parent pred_file = dir / "user-user-preds.csv" @@ -275,18 +277,15 @@ def __batch_eval(job): @mark.slow @mark.eval -@mark.skipif(not lktu.ml100k.available, reason="ML100K data not present") -def test_uu_batch_accuracy(): +def test_uu_batch_accuracy(ml_100k: pd.DataFrame): import lenskit.crossfold as xf import lenskit.metrics.predict as pm from lenskit.algorithms import basic, bias - ratings = lktu.ml100k.ratings - uu_algo = knn.UserUser(30) algo = basic.Fallback(uu_algo, bias.Bias()) - folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2)) + folds = xf.partition_users(ml_100k, 5, xf.SampleFrac(0.2)) preds = [__batch_eval((algo, train, test)) for (train, test) in folds] preds = pd.concat(preds) mae = pm.mae(preds.prediction, preds.rating) @@ -298,16 +297,13 @@ def test_uu_batch_accuracy(): @mark.slow @mark.eval -@mark.skipif(not lktu.ml100k.available, reason="ML100K data not present") -def test_uu_implicit_batch_accuracy(): +def test_uu_implicit_batch_accuracy(ml_100k: pd.DataFrame): import lenskit.crossfold as xf from lenskit import batch, topn - ratings = lktu.ml100k.ratings - algo = knn.UserUser(30, center=False, aggregate="sum") - folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2))) + folds = list(xf.partition_users(ml_100k, 5, xf.SampleFrac(0.2))) all_test = pd.concat(f.test for f in folds) 
rec_lists = [] diff --git a/lenskit/tests/test_ml20m.py b/lenskit/tests/test_ml20m.py index 4ed03d1cd..e0703f42a 100644 --- a/lenskit/tests/test_ml20m.py +++ b/lenskit/tests/test_ml20m.py @@ -17,21 +17,17 @@ from lenskit.algorithms import Recommender from lenskit.algorithms.basic import PopScore from lenskit.data.dataset import Dataset, from_interactions_df -from lenskit.datasets import MovieLens +from lenskit.data.movielens import load_movielens _log = logging.getLogger(__name__) _ml_path = Path("data/ml-20m") -if _ml_path.exists(): - _ml_20m = MovieLens(_ml_path) -else: - _ml_20m = None -@pytest.fixture +@pytest.fixture(scope="module") def ml20m(): - if _ml_20m: - return from_interactions_df(_ml_20m.ratings) + if _ml_path.exists(): + return load_movielens(_ml_path) else: pytest.skip("ML-20M not available") diff --git a/lenskit/tests/test_predict_metrics.py b/lenskit/tests/test_predict_metrics.py index 805e136b5..18ad019d9 100644 --- a/lenskit/tests/test_predict_metrics.py +++ b/lenskit/tests/test_predict_metrics.py @@ -166,13 +166,11 @@ def test_mae_series_two(): @mark.slow @mark.eval -@mark.skipif(not lktu.ml100k.available, reason="ML100K data not present") -def test_batch_rmse(): +def test_batch_rmse(ml_100k): import lenskit.algorithms.bias as bs import lenskit.batch as batch import lenskit.crossfold as xf - ratings = lktu.ml100k.ratings algo = bs.Bias(damping=5) def eval(train, test): @@ -181,13 +179,13 @@ def eval(train, test): return preds.set_index(["user", "item"]) results = pd.concat( - (eval(train, test) for (train, test) in xf.partition_users(ratings, 5, xf.SampleN(5))) + (eval(train, test) for (train, test) in xf.partition_users(ml_100k, 5, xf.SampleN(5))) ) user_rmse = results.groupby("user").apply(lambda df: pm.rmse(df.prediction, df.rating)) # we should have all users - users = ratings.user.unique() + users = ml_100k.user.unique() assert len(user_rmse) == len(users) missing = np.setdiff1d(users, user_rmse.index) assert len(missing) == 0 @@ -200,12 
+198,12 @@ def eval(train, test): @mark.slow -def test_global_metric(): +def test_global_metric(ml_ratings: pd.DataFrame): import lenskit.batch as batch import lenskit.crossfold as xf from lenskit.algorithms.bias import Bias - train, test = next(xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5))) + train, test = next(xf.sample_users(ml_ratings, 1, 200, xf.SampleFrac(0.5))) algo = Bias() algo.fit(from_interactions_df(train)) @@ -219,12 +217,12 @@ def test_global_metric(): @mark.slow -def test_user_metric(): +def test_user_metric(ml_ratings: pd.DataFrame): import lenskit.batch as batch import lenskit.crossfold as xf from lenskit.algorithms.bias import Bias - train, test = next(xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5))) + train, test = next(xf.sample_users(ml_ratings, 1, 200, xf.SampleFrac(0.5))) algo = Bias() algo.fit(from_interactions_df(train)) diff --git a/lenskit/tests/test_svd.py b/lenskit/tests/test_svd.py index 9e69147ba..b183565f5 100644 --- a/lenskit/tests/test_svd.py +++ b/lenskit/tests/test_svd.py @@ -12,7 +12,7 @@ from pytest import approx, mark -from lenskit.data.dataset import from_interactions_df +from lenskit.data.dataset import Dataset, from_interactions_df import lenskit.util.test as lktu from lenskit.algorithms import svd from lenskit.util import clone @@ -86,11 +86,9 @@ def test_svd_clone(): @need_skl @mark.slow -def test_svd_save_load(): - ratings = lktu.ml_test.ratings - +def test_svd_save_load(ml_ds: Dataset): original = svd.BiasedSVD(20) - original.fit(from_interactions_df(ratings)) + original.fit(ml_ds) mod = pickle.dumps(original) _log.info("serialized to %d bytes", len(mod)) @@ -105,15 +103,12 @@ def test_svd_save_load(): @need_skl @mark.slow @mark.eval -@mark.skipif(not lktu.ml100k.available, reason="ML100K data not present") -def test_svd_batch_accuracy(): +def test_svd_batch_accuracy(ml_100k: pd.DataFrame): import lenskit.crossfold as xf import lenskit.metrics.predict as pm from lenskit import batch 
from lenskit.algorithms import basic, bias - ratings = lktu.ml100k.ratings - svd_algo = svd.BiasedSVD(25, damping=10) algo = basic.Fallback(svd_algo, bias.Bias(damping=10)) @@ -123,7 +118,7 @@ def eval(train, test): _log.info("testing %d users", test.user.nunique()) return batch.predict(algo, test) - folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2)) + folds = xf.partition_users(ml_100k, 5, xf.SampleFrac(0.2)) preds = pd.concat(eval(train, test) for (train, test) in folds) mae = pm.mae(preds.prediction, preds.rating) assert mae == approx(0.74, abs=0.025) diff --git a/lenskit/tests/test_topn_analysis.py b/lenskit/tests/test_topn_analysis.py index ee951a8bf..459a6320c 100644 --- a/lenskit/tests/test_topn_analysis.py +++ b/lenskit/tests/test_topn_analysis.py @@ -19,7 +19,7 @@ from lenskit.algorithms.knn.user import UserUser from lenskit.data.dataset import from_interactions_df from lenskit.metrics.topn import _dcg, precision, recall -from lenskit.util.test import demo_recs, ml_test # noqa: F401 +from lenskit.util.test import demo_recs # noqa: F401 _log = logging.getLogger(__name__) @@ -205,93 +205,6 @@ def test_java_equiv(): assert umm["err"].values == approx(0, abs=1.0e-6) -@mark.skip("disabled for user-user") -@mark.slow -def test_fill_users(): - rla = topn.RecListAnalysis() - rla.add_metric(topn.precision) - rla.add_metric(topn.recall) - - algo = UserUser(20, min_nbrs=10) - algo = Recommender.adapt(algo) - - splits = xf.sample_users(ml_test.ratings, 1, 50, xf.SampleN(5)) - train, test = next(splits) - algo.fit(from_interactions_df(train)) - - rec_users = test["user"].sample(50).unique() - assert len(rec_users) < 50 - recs = batch.recommend(algo, rec_users, 25) - - scores = rla.compute(recs, test, include_missing=True) - assert len(scores) == test["user"].nunique() - assert scores["recall"].notna().sum() == len(rec_users) - assert all(scores["ntruth"] == 5) - - mscores = rla.compute(recs, test) - assert len(mscores) < len(scores) - - recall = 
scores.loc[scores["recall"].notna(), "recall"].copy() - recall, mrecall = recall.align(mscores["recall"]) - assert all(recall == mrecall) - - -@mark.skip("disabled for user-user") -@mark.slow -def test_adv_fill_users(): - rla = topn.RecListAnalysis() - rla.add_metric(topn.precision) - rla.add_metric(topn.recall) - - a_uu = UserUser(30, min_nbrs=10) - a_uu = Recommender.adapt(a_uu) - a_ii = ItemItem(20, min_nbrs=4) - a_ii = Recommender.adapt(a_ii) - - splits = xf.sample_users(ml_test.ratings, 2, 50, xf.SampleN(5)) - all_recs = {} - all_test = {} - for i, (train, test) in enumerate(splits): - a_uu.fit(from_interactions_df(train)) - rec_users = test["user"].sample(50).unique() - all_recs[(i + 1, "UU")] = batch.recommend(a_uu, rec_users, 25) - - a_ii.fit(from_interactions_df(train)) - rec_users = test["user"].sample(50).unique() - all_recs[(i + 1, "II")] = batch.recommend(a_ii, rec_users, 25) - all_test[i + 1] = test - - recs = pd.concat(all_recs, names=["part", "algo"]) - recs.reset_index(["part", "algo"], inplace=True) - recs.reset_index(drop=True, inplace=True) - - test = pd.concat(all_test, names=["part"]) - test.reset_index(["part"], inplace=True) - test.reset_index(drop=True, inplace=True) - - scores = rla.compute(recs, test, include_missing=True) - inames = scores.index.names - scores.sort_index(inplace=True) - assert len(scores) == 50 * 4 - assert all(scores["ntruth"] == 5) - assert scores["recall"].isna().sum() > 0 - _log.info("scores:\n%s", scores) - - ucounts = scores.reset_index().groupby("algo")["user"].agg(["count", "nunique"]) - assert all(ucounts["count"] == 100) - assert all(ucounts["nunique"] == 100) - - mscores = rla.compute(recs, test) - mscores = mscores.reset_index().set_index(inames) - mscores.sort_index(inplace=True) - assert len(mscores) < len(scores) - _log.info("mscores:\n%s", mscores) - - recall = scores.loc[scores["recall"].notna(), "recall"].copy() - recall, mrecall = recall.align(mscores["recall"]) - assert all(recall == mrecall) - - 
@mark.parametrize("drop_rating", [False, True]) def test_pr_bulk_match(demo_recs, drop_rating): "bulk and normal match" diff --git a/lenskit/tests/test_topn_recs.py b/lenskit/tests/test_topn_recs.py index bbb922d9b..1e00b0d61 100644 --- a/lenskit/tests/test_topn_recs.py +++ b/lenskit/tests/test_topn_recs.py @@ -9,7 +9,7 @@ from pytest import approx -from lenskit.data.dataset import from_interactions_df +from lenskit.data.dataset import Dataset, from_interactions_df import lenskit.util.test as lktu from lenskit.algorithms import basic, bias @@ -47,14 +47,13 @@ def test_topn_config(): assert rs.startswith("TopN/") -def test_topn_big(): - ratings = lktu.ml_test.ratings - users = ratings.user.unique() - items = ratings.item.unique() - user_items = ratings.set_index("user").item +def test_topn_big(ml_ds: Dataset): + users = ml_ds.users.ids() + items = ml_ds.items.ids() + user_items = ml_ds.interaction_matrix("pandas", original_ids=True).set_index("user_id").item_id algo = basic.TopN(bias.Bias()) - a2 = algo.fit(from_interactions_df(ratings)) + a2 = algo.fit(ml_ds) assert a2 is algo # test 100 random users diff --git a/lkdev/workflows/test.py b/lkdev/workflows/test.py index 78ab152c7..bd5cc3f1b 100644 --- a/lkdev/workflows/test.py +++ b/lkdev/workflows/test.py @@ -211,7 +211,7 @@ def steps_mldata(options: JobOptions, datasets: list[str]) -> list[GHStep]: data !data/*.zip """), - "key": f"test-mldata-000-{ds_hash}", + "key": f"test-mldata-001-{ds_hash}", }, }, { diff --git a/utils/dump-iknn.py b/utils/dump-iknn.py index 3b5f22213..0435b94a5 100644 --- a/utils/dump-iknn.py +++ b/utils/dump-iknn.py @@ -26,7 +26,7 @@ from docopt import docopt from lenskit.algorithms.knn.item import ItemItem -from lenskit.datasets import MovieLens +from lenskit.data import load_movielens _log = logging.getLogger("dump-iknn") @@ -35,7 +35,7 @@ def main(args): logging.basicConfig(stream=sys.stderr, level=logging.INFO) data = args["--dataset"] _log.info("loading data %s", data) - ml = 
MovieLens(f"data/{data}") + ml = load_movielens(f"data/{data}") ii_args = {} if args["-n"]: @@ -47,11 +47,11 @@ def main(args): algo = ItemItem(20, **ii_args) _log.info("training algorithm") - algo.fit(ml.ratings) + algo.fit(ml) i_outf = args["--item-output"] _log.info("saving items to %s", i_outf) - items = algo.item_index_ + items = ml.items.ids() stats = pd.DataFrame( {"mean": algo.item_means_.numpy(), "nnbrs": algo.item_counts_.numpy()}, index=items ) diff --git a/utils/recommend.py b/utils/recommend.py index 0bdb17c4f..abec6c7e5 100755 --- a/utils/recommend.py +++ b/utils/recommend.py @@ -33,7 +33,7 @@ from docopt import docopt from lenskit import batch -from lenskit.datasets import MovieLens +from lenskit.data import load_movielens _log = logging.getLogger("test-algo") @@ -44,7 +44,7 @@ def main(args): data = args["--dataset"] _log.info("loading data %s", data) - ml = MovieLens(f"data/{data}") + ml = load_movielens(f"data/{data}") _log.info("reading model from %s", args["MODEL"]) with open(args["MODEL"], "rb") as f: @@ -55,7 +55,7 @@ def main(args): if args["--random-users"]: n = int(args["--random-users"]) _log.info("selecting %d random users", n) - users = rng.choice(ml.ratings["user"].unique(), n) + users = rng.choice(ml.users.ids(), n) else: _log.info("using %d specified users", len(args["USER"])) users = [int(u) for u in args["USER"]] diff --git a/utils/train-model.py b/utils/train-model.py index 8b7cfe06d..1b2e4d49b 100755 --- a/utils/train-model.py +++ b/utils/train-model.py @@ -26,7 +26,7 @@ from lenskit.algorithms import Recommender from lenskit.algorithms.knn.item import ItemItem -from lenskit.datasets import MovieLens +from lenskit.data import load_movielens _log = logging.getLogger("train-model") @@ -35,7 +35,7 @@ def main(args): logging.basicConfig(stream=sys.stderr, level=logging.INFO) data = args["--dataset"] _log.info("loading data %s", data) - ml = MovieLens(f"data/{data}") + ml = load_movielens(f"data/{data}") if args["--item-item"]: algo = 
ItemItem(20) @@ -45,7 +45,7 @@ def main(args): algo = Recommender.adapt(algo) _log.info("training algorithm") - algo.fit(ml.ratings) + algo.fit(ml) _log.info("training complete") file = args["FILE"]