From 491a4bb93a5ab90ac87527d2aba3e2b95f2d86c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A1=D0=BE=D0=BA=D0=BE=D0=BB=D0=BE=D0=B2=20=D0=9C=D0=B8?= =?UTF-8?q?=D1=85=D0=B0=D0=B8=D0=BB?= Date: Mon, 15 Jan 2024 21:58:46 +0300 Subject: [PATCH 1/5] added arp metric --- CHANGELOG.md | 1 + rectools/metrics/__init__.py | 3 + rectools/metrics/popularity.py | 175 +++++++++++++++++++++++++++++++ rectools/metrics/scoring.py | 9 ++ tests/metrics/test_popularity.py | 63 +++++++++++ tests/metrics/test_scoring.py | 4 + 6 files changed, 255 insertions(+) create mode 100644 rectools/metrics/popularity.py create mode 100644 tests/metrics/test_popularity.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a9eb3c3c..5a6f06da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Methods for conversion `Interactions` to raw form and for getting raw interactions from `Dataset` ([#69](https://github.com/MobileTeleSystems/RecTools/pull/69)) +- `ARP (Average Recommendation Popularity)` to `metrics` ### Changed - Loosened `pandas`, `torch` and `torch-light` versions for `python >= 3.8` ([#58](https://github.com/MobileTeleSystems/RecTools/pull/58)) diff --git a/rectools/metrics/__init__.py b/rectools/metrics/__init__.py index 16fbb95e..2b37911e 100644 --- a/rectools/metrics/__init__.py +++ b/rectools/metrics/__init__.py @@ -31,6 +31,7 @@ `metrics.MRR` `metrics.MeanInvUserFreq` `metrics.IntraListDiversity` +`metrics.ARP` `metrics.Serendipity` Tools @@ -49,6 +50,7 @@ ) from .diversity import IntraListDiversity from .novelty import MeanInvUserFreq +from .popularity import ARP from .ranking import MAP, MRR, NDCG from .scoring import calc_metrics from .serendipity import Serendipity @@ -64,6 +66,7 @@ "MRR", "MeanInvUserFreq", "IntraListDiversity", + "ARP", "Serendipity", "calc_metrics", "PairwiseDistanceCalculator", diff --git a/rectools/metrics/popularity.py b/rectools/metrics/popularity.py new file mode 100644 index 00000000..ab3662ac --- /dev/null +++ b/rectools/metrics/popularity.py @@ -0,0 +1,175 @@ +# Copyright 2024 MTS (Mobile Telesystems) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Popularity metrics.""" + +import typing as tp +from collections import Counter + +import attr +import pandas as pd + +from rectools import Columns +from rectools.metrics.base import MetricAtK +from rectools.utils import select_by_type + + +@attr.s +class ARP(MetricAtK): + r""" + Average Recommendations Popularity metric. + + Calculate the average popularity of the recommended items in each list, + where "popularity" of item is the average number of ratings (interactions) + for this item. + + .. math:: + ARP@k = \frac{1}{\left|U_{t}\right|}\sum_{u\in U_{t}^{}}\frac{\sum_{i\in L_{u}}\phi (i)}{\left | L_{u} \right |} + + where + :math:`\phi (i)` is the number of times item i has been rated in the training set. + :math:`|U_{t}|` is the number of users in the test set. + :math:`L_{u}` is the list of recommended items for user u. 
+ + Parameters + ---------- + k : int + Number of items at the top of recommendations list that will be used to calculate metric. + + Examples + -------- + >>> reco = pd.DataFrame( + ... { + ... Columns.User: [1, 1, 2, 2, 2, 3, 3], + ... Columns.Item: [1, 2, 3, 1, 2, 3, 2], + ... Columns.Rank: [1, 2, 1, 2, 3, 1, 2], + ... } + ... ) + >>> prev_interactions = pd.DataFrame( + ... { + ... Columns.User: [1, 1, 2, 2, 3, 3], + ... Columns.Item: [1, 2, 1, 3, 1, 2], + ... } + ... ) + >>> ARP(k=1).calc_per_user(reco, prev_interactions).values + array([3., 1., 1.]) + >>> ARP(k=3).calc_per_user(reco, prev_interactions).values + array([2.5, 2. , 1.5]) + """ + + def calc(self, reco: pd.DataFrame, prev_interactions: pd.DataFrame) -> float: + """ + Calculate metric value. + + Parameters + ---------- + reco : pd.DataFrame + Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`. + prev_interactions : pd.DataFrame + Table with previous user-item interactions, + with columns `Columns.User`, `Columns.Item`. + + Returns + ------- + float + Value of metric (average between users). + """ + per_user = self.calc_per_user(reco, prev_interactions) + return per_user.mean() + + def calc_per_user( + self, + reco: pd.DataFrame, + prev_interactions: pd.DataFrame, + ) -> pd.Series: + """ + Calculate metric values for all users. + + Parameters + ---------- + reco : pd.DataFrame + Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`. + prev_interactions : pd.DataFrame + Table with previous user-item interactions, + with columns `Columns.User`, `Columns.Item`. + + Returns + ------- + pd.Series + Values of metric (index - user id, values - metric value for every user). + """ + pop_items = self.get_pop(prev_interactions) + arp = reco.groupby(Columns.User).apply( + lambda x: sum(pop_items[i] for i in x[Columns.Item][: self.k]) / len(x[Columns.Item][: self.k]) + ) + return arp + + def get_pop(self, prev_interactions: pd.DataFrame) -> tp.Dict[int, int]: + """ + Calculate rating for each item in train set. + + Parameters + ---------- + prev_interactions : pd.DataFrame + Table with previous user-item interactions, + with columns `Columns.User`, `Columns.Item`. + + Returns + ------- + dict(int->int) + Set with items' popularity rating (key - item id, value - number of interactions with item in training set). + """ + return Counter(prev_interactions[Columns.Item]) + + +PopularityMetric = ARP + + +def calc_popularity_metrics( + metrics: tp.Dict[str, PopularityMetric], + reco: pd.DataFrame, + prev_interactions: pd.DataFrame, +) -> tp.Dict[str, float]: + """ + Calculate popularity metrics (only AvgRP now). + + Warning: It is not recommended to use this function directly. + Use `calc_metrics` instead. + + Parameters + ---------- + metrics : dict(str -> PopularityMetric) + Dict of metric objects to calculate, + where key is metric name and value is metric object. + reco : pd.DataFrame + Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`. + prev_interactions : pd.DataFrame + Table with previous user-item interactions, + with columns `Columns.User`, `Columns.Item`. + + Returns + ------- + dict(str->float) + Dictionary where keys are the same as keys in `metrics` + and values are metric calculation results. 
+ """ + results = {} + + # ARP + pop_metrics: tp.Dict[str, ARP] = select_by_type(metrics, ARP) + if pop_metrics: + for name, metric in pop_metrics.items(): + results[name] = metric.calc(reco, prev_interactions) + + return results diff --git a/rectools/metrics/scoring.py b/rectools/metrics/scoring.py index 26de47c2..9649b4ee 100644 --- a/rectools/metrics/scoring.py +++ b/rectools/metrics/scoring.py @@ -25,6 +25,7 @@ from .classification import ClassificationMetric, SimpleClassificationMetric, calc_classification_metrics from .diversity import DiversityMetric, calc_diversity_metrics from .novelty import NoveltyMetric, calc_novelty_metrics +from .popularity import PopularityMetric, calc_popularity_metrics from .ranking import RankingMetric, calc_ranking_metrics from .serendipity import SerendipityMetric, calc_serendipity_metrics @@ -131,6 +132,14 @@ def calc_metrics( # noqa # pylint: disable=too-many-branches novelty_values = calc_novelty_metrics(novelty_metrics, reco, prev_interactions) results.update(novelty_values) + # Popularity + popularity_metrics = select_by_type(metrics, PopularityMetric) + if popularity_metrics: + if prev_interactions is None: + raise ValueError("For calculating popularity metrics it's necessary to set 'prev_interactions'") + popularity_values = calc_popularity_metrics(popularity_metrics, reco, prev_interactions) + results.update(popularity_values) + # Diversity diversity_metrics = select_by_type(metrics, DiversityMetric) if diversity_metrics: diff --git a/tests/metrics/test_popularity.py b/tests/metrics/test_popularity.py new file mode 100644 index 00000000..36125335 --- /dev/null +++ b/tests/metrics/test_popularity.py @@ -0,0 +1,63 @@ +# Copyright 2022 MTS (Mobile Telesystems) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pandas as pd +import pytest + +from rectools import Columns +from rectools.metrics.popularity import ARP + + +class TestARP: + @pytest.fixture + def interactions(self) -> pd.DataFrame: + interactions = pd.DataFrame( + [["u1", "i1"], ["u1", "i2"], ["u2", "i1"], ["u2", "i3"], ["u3", "i1"], ["u3", "i2"]], + columns=[Columns.User, Columns.Item], + ) + return interactions + + @pytest.fixture + def recommendations(self) -> pd.DataFrame: + recommendations = pd.DataFrame( + [ + ["u1", "i1", 1], + ["u1", "i2", 2], + ["u2", "i3", 1], + ["u2", "i1", 2], + ["u2", "i2", 3], + ["u3", "i3", 1], + ["u3", "i2", 2], + ], + columns=[Columns.User, Columns.Item, Columns.Rank], + ) + return recommendations + + @pytest.mark.parametrize( + "k,expected", + ( + (1, pd.Series(index=["u1", "u2", "u3"], data=[3.0, 1.0, 1.0])), + (3, pd.Series(index=["u1", "u2", "u3"], data=[2.5, 2.0, 1.5])), + ), + ) + def test_correct_arp_values( + self, recommendations: pd.DataFrame, interactions: pd.DataFrame, k: int, expected: pd.Series + ) -> None: + arp = ARP(k) + + actual = arp.calc_per_user(recommendations, interactions) + pd.testing.assert_series_equal(actual, expected, check_names=False) + + actual_mean = arp.calc(recommendations, interactions) + assert actual_mean == expected.mean() diff --git a/tests/metrics/test_scoring.py b/tests/metrics/test_scoring.py index d95c64eb..a0e94dba 100644 --- a/tests/metrics/test_scoring.py +++ b/tests/metrics/test_scoring.py @@ -19,6 +19,7 @@ from rectools import Columns from rectools.metrics import ( + ARP, MAP, MRR, NDCG, @@ -76,6 +77,7 @@ def test_success(self) -> None: "ndcg@1": NDCG(k=1, log_base=3), "mrr@1": MRR(k=1), "miuf": MeanInvUserFreq(k=3), + "arp": ARP(k=2), "ild": IntraListDiversity(k=3, distance_calculator=self.calculator), "serendipity": Serendipity(k=3), "custom": MetricAtK(k=1), @@ -92,6 +94,7 @@ def test_success(self) -> None: "ndcg@1": 0.25, "mrr@1": 0.25, "miuf": 0.125, + "arp": 2.75, "ild": 0.25, "serendipity": 0, } @@ -103,6 +106,7 @@ def test_success(self) -> None: (Precision(k=1), ["reco"]), (MAP(k=1), ["reco"]), (MeanInvUserFreq(k=1), ["reco"]), + (ARP(k=1), ["reco"]), (Serendipity(k=1), ["reco"]), (Serendipity(k=1), ["reco", "interactions"]), (Serendipity(k=1), ["reco", "interactions", "prev_interactions"]), From 256a3f913e306a54919ff7ac6b8a02cceaae07b0 Mon Sep 17 00:00:00 2001 From: Mike <78963317+mikesokolovv@users.noreply.github.com> Date: Wed, 17 Jan 2024 14:38:23 +0300 Subject: [PATCH 2/5] optimization of calc_per_user method; tests added (#1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Соколов Михаил --- CHANGELOG.md | 2 +- rectools/metrics/__init__.py | 6 ++-- rectools/metrics/popularity.py | 29 +++++++++---------- tests/metrics/test_popularity.py | 49 ++++++++++++++++++++++++++++++-- tests/metrics/test_scoring.py | 6 ++-- 5 files changed, 67 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a6f06da..cd7d4e03 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Methods for conversion `Interactions` to raw form and for getting raw interactions from `Dataset` ([#69](https://github.com/MobileTeleSystems/RecTools/pull/69)) -- `ARP (Average Recommendation Popularity)` to `metrics` +- `AvgRecPopularity (Average Recommendation Popularity)` to `metrics` ([#81](https://github.com/MobileTeleSystems/RecTools/pull/81)) ### Changed - Loosened `pandas`, `torch` and 
`torch-light` versions for `python >= 3.8` ([#58](https://github.com/MobileTeleSystems/RecTools/pull/58)) diff --git a/rectools/metrics/__init__.py b/rectools/metrics/__init__.py index 2b37911e..d360558d 100644 --- a/rectools/metrics/__init__.py +++ b/rectools/metrics/__init__.py @@ -31,7 +31,7 @@ `metrics.MRR` `metrics.MeanInvUserFreq` `metrics.IntraListDiversity` -`metrics.ARP` +`metrics.AvgRecPopularity` `metrics.Serendipity` Tools @@ -50,7 +50,7 @@ ) from .diversity import IntraListDiversity from .novelty import MeanInvUserFreq -from .popularity import ARP +from .popularity import AvgRecPopularity from .ranking import MAP, MRR, NDCG from .scoring import calc_metrics from .serendipity import Serendipity @@ -66,7 +66,7 @@ "MRR", "MeanInvUserFreq", "IntraListDiversity", - "ARP", + "AvgRecPopularity", "Serendipity", "calc_metrics", "PairwiseDistanceCalculator", diff --git a/rectools/metrics/popularity.py b/rectools/metrics/popularity.py index ab3662ac..1a804005 100644 --- a/rectools/metrics/popularity.py +++ b/rectools/metrics/popularity.py @@ -15,9 +15,7 @@ """Popularity metrics.""" import typing as tp -from collections import Counter -import attr import pandas as pd from rectools import Columns @@ -25,8 +23,7 @@ from rectools.utils import select_by_type -@attr.s -class ARP(MetricAtK): +class AvgRecPopularity(MetricAtK): r""" Average Recommendations Popularity metric. @@ -35,7 +32,7 @@ class ARP(MetricAtK): for this item. .. math:: - ARP@k = \frac{1}{\left|U_{t}\right|}\sum_{u\in U_{t}^{}}\frac{\sum_{i\in L_{u}}\phi (i)}{\left | L_{u} \right |} + ARP@k = \frac{1}{\left|U_{t}\right|}\sum_{u\in U_{t}^{}}\frac{\sum_{i\in L_{u}}\phi (i)}{\left| L_{u} \right |} where :math:`\phi (i)` is the number of times item i has been rated in the training set. @@ -62,9 +59,9 @@ class ARP(MetricAtK): ... Columns.Item: [1, 2, 1, 3, 1, 2], ... } ... ) - >>> ARP(k=1).calc_per_user(reco, prev_interactions).values + >>> AvgRecPopularity(k=1).calc_per_user(reco, prev_interactions).values array([3., 1., 1.]) - >>> ARP(k=3).calc_per_user(reco, prev_interactions).values + >>> AvgRecPopularity(k=3).calc_per_user(reco, prev_interactions).values array([2.5, 2. , 1.5]) """ @@ -110,12 +107,13 @@ def calc_per_user( Values of metric (index - user id, values - metric value for every user). """ pop_items = self.get_pop(prev_interactions) - arp = reco.groupby(Columns.User).apply( - lambda x: sum(pop_items[i] for i in x[Columns.Item][: self.k]) / len(x[Columns.Item][: self.k]) + reco_prepared = reco.query(f"{Columns.Rank} <= @self.k") + arp = reco_prepared.groupby(Columns.User)[Columns.Item].agg( + lambda x: sum(pop_items[i] if i in pop_items else 0 for i in x) / x.nunique() ) return arp - def get_pop(self, prev_interactions: pd.DataFrame) -> tp.Dict[int, int]: + def get_pop(self, prev_interactions: pd.DataFrame) -> pd.Series: """ Calculate rating for each item in train set. @@ -127,13 +125,14 @@ def get_pop(self, prev_interactions: pd.DataFrame) -> tp.Dict[int, int]: Returns ------- - dict(int->int) - Set with items' popularity rating (key - item id, value - number of interactions with item in training set). + pd.Series + Series with items' popularity rating (index - item id, + value - number of interactions with item in training set). 
""" - return Counter(prev_interactions[Columns.Item]) + return prev_interactions[Columns.Item].value_counts() -PopularityMetric = ARP +PopularityMetric = AvgRecPopularity def calc_popularity_metrics( @@ -167,7 +166,7 @@ def calc_popularity_metrics( results = {} # ARP - pop_metrics: tp.Dict[str, ARP] = select_by_type(metrics, ARP) + pop_metrics: tp.Dict[str, AvgRecPopularity] = select_by_type(metrics, AvgRecPopularity) if pop_metrics: for name, metric in pop_metrics.items(): results[name] = metric.calc(reco, prev_interactions) diff --git a/tests/metrics/test_popularity.py b/tests/metrics/test_popularity.py index 36125335..ff99ea5f 100644 --- a/tests/metrics/test_popularity.py +++ b/tests/metrics/test_popularity.py @@ -12,14 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np import pandas as pd import pytest from rectools import Columns -from rectools.metrics.popularity import ARP +from rectools.metrics.popularity import AvgRecPopularity -class TestARP: +class TestAvgRecPopularity: @pytest.fixture def interactions(self) -> pd.DataFrame: interactions = pd.DataFrame( @@ -54,10 +55,52 @@ def recommendations(self) -> pd.DataFrame: def test_correct_arp_values( self, recommendations: pd.DataFrame, interactions: pd.DataFrame, k: int, expected: pd.Series ) -> None: - arp = ARP(k) + arp = AvgRecPopularity(k) actual = arp.calc_per_user(recommendations, interactions) pd.testing.assert_series_equal(actual, expected, check_names=False) actual_mean = arp.calc(recommendations, interactions) assert actual_mean == expected.mean() + + def test_when_no_interactions( + self, + recommendations: pd.DataFrame, + ) -> None: + expected = pd.Series(index=recommendations[Columns.User].unique(), data=[0.0, 0.0, 0.0]) + empty_interactions = pd.DataFrame(columns=[Columns.User, Columns.Item], dtype=int) + arp = AvgRecPopularity(k=2) + + actual = arp.calc_per_user(recommendations, empty_interactions) + pd.testing.assert_series_equal(actual, expected, check_names=False) + + actual_mean = arp.calc(recommendations, empty_interactions) + assert actual_mean == expected.mean() + + @pytest.mark.parametrize( + "k,expected", + ( + (1, pd.Series(index=["u1", "u2", "u3"], data=[3.0, 1.0, 1.0])), + (3, pd.Series(index=["u1", "u2", "u3"], data=[2.5, np.divide(4, 3), 1.5])), + ), + ) + def test_when_new_item_in_reco(self, interactions: pd.DataFrame, k: int, expected: pd.Series) -> None: + reco = pd.DataFrame( + [ + ["u1", "i1", 1], + ["u1", "i2", 2], + ["u2", "i3", 1], + ["u2", "i1", 2], + ["u2", "i4", 3], + ["u3", "i3", 1], + ["u3", "i2", 2], + ], + columns=[Columns.User, Columns.Item, Columns.Rank], + ) + arp = AvgRecPopularity(k) + + actual = arp.calc_per_user(reco, interactions) + pd.testing.assert_series_equal(actual, expected, check_names=False) + + actual_mean = arp.calc(reco, interactions) + assert actual_mean == expected.mean() diff --git a/tests/metrics/test_scoring.py b/tests/metrics/test_scoring.py index a0e94dba..40cc71f5 100644 --- a/tests/metrics/test_scoring.py +++ b/tests/metrics/test_scoring.py @@ -19,11 +19,11 @@ from rectools import Columns from rectools.metrics import ( - ARP, MAP, MRR, NDCG, Accuracy, + AvgRecPopularity, IntraListDiversity, MeanInvUserFreq, PairwiseHammingDistanceCalculator, @@ -77,7 +77,7 @@ def test_success(self) -> None: "ndcg@1": NDCG(k=1, log_base=3), "mrr@1": MRR(k=1), "miuf": MeanInvUserFreq(k=3), - "arp": ARP(k=2), + "arp": AvgRecPopularity(k=2), "ild": IntraListDiversity(k=3, 
distance_calculator=self.calculator), "serendipity": Serendipity(k=3), "custom": MetricAtK(k=1), @@ -106,7 +106,7 @@ def test_success(self) -> None: (Precision(k=1), ["reco"]), (MAP(k=1), ["reco"]), (MeanInvUserFreq(k=1), ["reco"]), - (ARP(k=1), ["reco"]), + (AvgRecPopularity(k=1), ["reco"]), (Serendipity(k=1), ["reco"]), (Serendipity(k=1), ["reco", "interactions"]), (Serendipity(k=1), ["reco", "interactions", "prev_interactions"]), From df692ffb233e1738e5b9bbb5d90bfcbb370afab5 Mon Sep 17 00:00:00 2001 From: Mike <78963317+mikesokolovv@users.noreply.github.com> Date: Thu, 18 Jan 2024 18:14:22 +0300 Subject: [PATCH 3/5] computations changed to vector form MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * optimization of calc_per_user method; tests added * computations changed to vector form --------- Co-authored-by: Соколов Михаил --- rectools/metrics/popularity.py | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/rectools/metrics/popularity.py b/rectools/metrics/popularity.py index 1a804005..902a7eda 100644 --- a/rectools/metrics/popularity.py +++ b/rectools/metrics/popularity.py @@ -106,30 +106,16 @@ def calc_per_user( pd.Series Values of metric (index - user id, values - metric value for every user). """ - pop_items = self.get_pop(prev_interactions) - reco_prepared = reco.query(f"{Columns.Rank} <= @self.k") - arp = reco_prepared.groupby(Columns.User)[Columns.Item].agg( - lambda x: sum(pop_items[i] if i in pop_items else 0 for i in x) / x.nunique() - ) - return arp + item_popularity = prev_interactions[Columns.Item].value_counts() + item_popularity.name = "popularity" - def get_pop(self, prev_interactions: pd.DataFrame) -> pd.Series: - """ - Calculate rating for each item in train set. + reco_k = reco.query(f"{Columns.Rank} <= @self.k") + reco_max_k = reco_k.groupby(Columns.User)[Columns.Rank].transform("count") + reco_prepared = reco_k.join(item_popularity, on=Columns.Item, how="left") + reco_prepared["popularity"] = reco_prepared["popularity"].fillna(0) / reco_max_k - Parameters - ---------- - prev_interactions : pd.DataFrame - Table with previous user-item interactions, - with columns `Columns.User`, `Columns.Item`. - - Returns - ------- - pd.Series - Series with items' popularity rating (index - item id, - value - number of interactions with item in training set). 
- """ - return prev_interactions[Columns.Item].value_counts() + arp = reco_prepared.groupby(Columns.User)["popularity"].sum() + return arp PopularityMetric = AvgRecPopularity From 760bd831cbe3418c2db4b7b86992b2852087719d Mon Sep 17 00:00:00 2001 From: Mike <78963317+mikesokolovv@users.noreply.github.com> Date: Fri, 19 Jan 2024 11:30:27 +0300 Subject: [PATCH 4/5] arp calc_per_user update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * calc_per_user update --------- Co-authored-by: Соколов Михаил --- rectools/metrics/popularity.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/rectools/metrics/popularity.py b/rectools/metrics/popularity.py index 902a7eda..e32565ea 100644 --- a/rectools/metrics/popularity.py +++ b/rectools/metrics/popularity.py @@ -110,11 +110,9 @@ def calc_per_user( item_popularity.name = "popularity" reco_k = reco.query(f"{Columns.Rank} <= @self.k") - reco_max_k = reco_k.groupby(Columns.User)[Columns.Rank].transform("count") - reco_prepared = reco_k.join(item_popularity, on=Columns.Item, how="left") - reco_prepared["popularity"] = reco_prepared["popularity"].fillna(0) / reco_max_k + reco_prepared = reco_k.join(item_popularity, on=Columns.Item, how="left").fillna(0) - arp = reco_prepared.groupby(Columns.User)["popularity"].sum() + arp = reco_prepared.groupby(Columns.User)["popularity"].agg(lambda x: x.sum() / x.count()) return arp @@ -153,8 +151,7 @@ def calc_popularity_metrics( # ARP pop_metrics: tp.Dict[str, AvgRecPopularity] = select_by_type(metrics, AvgRecPopularity) - if pop_metrics: - for name, metric in pop_metrics.items(): - results[name] = metric.calc(reco, prev_interactions) + for name, metric in pop_metrics.items(): + results[name] = metric.calc(reco, prev_interactions) return results From b5365583a17eeb489a0ff4356de8d892d2272628 Mon Sep 17 00:00:00 2001 From: Mike <78963317+mikesokolovv@users.noreply.github.com> Date: Sat, 20 Jan 2024 12:45:33 +0300 Subject: [PATCH 5/5] update calc_per_user, docstrings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * arp calc_per_user updated; docstrings updated --------- Co-authored-by: Соколов Михаил --- rectools/metrics/popularity.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/rectools/metrics/popularity.py b/rectools/metrics/popularity.py index e32565ea..21b8adfe 100644 --- a/rectools/metrics/popularity.py +++ b/rectools/metrics/popularity.py @@ -28,16 +28,16 @@ class AvgRecPopularity(MetricAtK): Average Recommendations Popularity metric. Calculate the average popularity of the recommended items in each list, - where "popularity" of item is the average number of ratings (interactions) - for this item. + where "popularity" of an item is the number of previous interactions + with this item. .. math:: ARP@k = \frac{1}{\left|U_{t}\right|}\sum_{u\in U_{t}^{}}\frac{\sum_{i\in L_{u}}\phi (i)}{\left| L_{u} \right |} where - :math:`\phi (i)` is the number of times item i has been rated in the training set. + :math:`\phi (i)` is the number of previous interactions with item i. :math:`|U_{t}|` is the number of users in the test set. - :math:`L_{u}` is the list of recommended items for user u. + :math:`L_{u}` is the list of top k recommended items for user u. 
Parameters ---------- @@ -110,9 +110,10 @@ def calc_per_user( item_popularity.name = "popularity" reco_k = reco.query(f"{Columns.Rank} <= @self.k") - reco_prepared = reco_k.join(item_popularity, on=Columns.Item, how="left").fillna(0) + reco_prepared = reco_k.join(item_popularity, on=Columns.Item, how="left") + reco_prepared["popularity"] = reco_prepared["popularity"].fillna(0) - arp = reco_prepared.groupby(Columns.User)["popularity"].agg(lambda x: x.sum() / x.count()) + arp = reco_prepared.groupby(Columns.User)["popularity"].mean() return arp
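
---

Editor's note: for reference, below is a minimal standalone sketch of the ARP@k computation as it stands after this final patch. It mirrors the vectorized `calc_per_user` logic above using plain pandas only; the literal column names "user_id", "item_id" and "rank" are stand-ins for `Columns.User`, `Columns.Item` and `Columns.Rank`, and the tiny frames reproduce the doctest data from popularity.py, so treat this as an illustration rather than the library code itself.

# Sketch of AvgRecPopularity@k with plain pandas (assumed column names,
# not the rectools.Columns constants). Data mirrors the doctest above.
import pandas as pd

reco = pd.DataFrame(
    {
        "user_id": [1, 1, 2, 2, 2, 3, 3],
        "item_id": [1, 2, 3, 1, 2, 3, 2],
        "rank":    [1, 2, 1, 2, 3, 1, 2],
    }
)
prev_interactions = pd.DataFrame(
    {
        "user_id": [1, 1, 2, 2, 3, 3],
        "item_id": [1, 2, 1, 3, 1, 2],
    }
)

def avg_rec_popularity(reco: pd.DataFrame, prev_interactions: pd.DataFrame, k: int) -> pd.Series:
    # Popularity of an item = number of previous interactions with it.
    item_popularity = prev_interactions["item_id"].value_counts()
    item_popularity.name = "popularity"

    # Keep only the top-k recommendations for each user.
    reco_k = reco[reco["rank"] <= k]

    # Attach popularity; items never seen in training count as 0.
    reco_prepared = reco_k.join(item_popularity, on="item_id", how="left")
    reco_prepared["popularity"] = reco_prepared["popularity"].fillna(0)

    # Average popularity of each user's recommendation list.
    return reco_prepared.groupby("user_id")["popularity"].mean()

print(avg_rec_popularity(reco, prev_interactions, k=1).values)  # [3. 1. 1.]
print(avg_rec_popularity(reco, prev_interactions, k=3).values)  # [2.5 2.  1.5]
print(avg_rec_popularity(reco, prev_interactions, k=3).mean())  # ARP@3 averaged over users: 2.0

Items absent from `prev_interactions` contribute a popularity of 0 to their user's average, which is the behaviour exercised by `test_when_new_item_in_reco` and `test_when_no_interactions` in the tests added by this series.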