Source code for recbole.evaluator.evaluators

# -*- encoding: utf-8 -*-
# @Time    :   2020/08/04
# @Author  :   Kaiyuan Li
# @email   :   tsotfsk@outlook.com

# UPDATE
# @Time    :   2021/01/07, 2020/08/11, 2020/12/18
# @Author  :   Kaiyuan Li, Yupeng Hou, Zhichao Feng
# @email   :   tsotfsk@outlook.com, houyupeng@ruc.edu.cn, fzcbupt@gmail.com

"""
recbole.evaluator.evaluators
#####################################
"""

from collections import ChainMap

import numpy as np
import torch

from recbole.evaluator.abstract_evaluator import GroupedEvaluator, IndividualEvaluator
from recbole.evaluator.metrics import metrics_dict

# These metrics are typical in topk recommendations
topk_metrics = {metric.lower(): metric for metric in ['Hit', 'Recall', 'MRR', 'Precision', 'NDCG', 'MAP']}
# These metrics are typical in loss-based tasks such as rating and CTR prediction
loss_metrics = {metric.lower(): metric for metric in ['AUC', 'RMSE', 'MAE', 'LOGLOSS']}
# For GAUC
rank_metrics = {metric.lower(): metric for metric in ['GAUC']}

# group-based metrics
group_metrics = ChainMap(topk_metrics, rank_metrics)
# not group-based metrics
individual_metrics = ChainMap(loss_metrics)
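# e.g. topk_metrics['ndcg'] == 'NDCG': each table maps the lower-cased metric name back to its
# canonical spelling, so configured metric names can be matched case-insensitively (see the
# `metric.lower()` lookups below). group_metrics covers everything evaluated per user (top-k and
# rank metrics); individual_metrics covers metrics evaluated directly on (label, score) pairs.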


class TopKEvaluator(GroupedEvaluator):
    r"""TopK Evaluator is mainly used in ranking tasks. Now, we support six topk metrics which
    contain `'Hit', 'Recall', 'MRR', 'Precision', 'NDCG', 'MAP'`.

    Note:
        These metrics are group-based: each score is computed per user and then averaged across
        users. Some of them are also limited to k.

    """

    def __init__(self, config, metrics):
        super().__init__(config, metrics)

        self.topk = config['topk']
        self._check_args()

    def collect(self, interaction, scores_tensor):
        """Collect the topk intermediate result of one batch. This function mainly implements
        padding and top-k finding, and is called at the end of each batch.

        Args:
            interaction (Interaction): the :class:`Interaction` of the batch
            scores_tensor (tensor): the tensor of model output with size of `(N, )`

        Returns:
            torch.Tensor: a matrix which packs the topk index matrix and the shape column

        """
        user_len_list = interaction.user_len_list
        scores_matrix = self.get_score_matrix(scores_tensor, user_len_list)
        scores_matrix = torch.flip(scores_matrix, dims=[-1])
        shape_matrix = torch.full((len(user_len_list), 1), scores_matrix.shape[1], device=scores_matrix.device)

        # get topk
        _, topk_idx = torch.topk(scores_matrix, max(self.topk), dim=-1)  # n_users x k

        # pack topk_idx and shape_matrix
        result = torch.cat((topk_idx, shape_matrix), dim=1)
        return result

    def evaluate(self, batch_matrix_list, eval_data):
        """calculate the metrics of all batches. It is called at the end of each epoch

        Args:
            batch_matrix_list (list): the results of all batches
            eval_data (Dataset): the class of test data

        Returns:
            dict: such as ``{'Hit@20': 0.3824, 'Recall@20': 0.0527, 'Hit@10': 0.3153, 'Recall@10': 0.0329}``

        """
        pos_len_list = eval_data.get_pos_len_list()
        batch_result = torch.cat(batch_matrix_list, dim=0).cpu().numpy()

        # unpack topk_idx and shape_matrix
        topk_idx = batch_result[:, :-1]
        shapes = batch_result[:, -1]
        assert len(pos_len_list) == len(topk_idx)

        # get metrics
        metric_dict = {}
        result_list = self._calculate_metrics(pos_len_list, topk_idx, shapes)
        for metric, value in zip(self.metrics, result_list):
            for k in self.topk:
                key = '{}@{}'.format(metric, k)
                metric_dict[key] = round(value[k - 1], self.precision)

        return metric_dict

    def _check_args(self):

        # Check topk:
        if isinstance(self.topk, (int, list)):
            if isinstance(self.topk, int):
                self.topk = [self.topk]
            for topk in self.topk:
                if topk <= 0:
                    raise ValueError(
                        'topk must be a positive integer or a list of positive integers, '
                        'but got `{}`'.format(topk)
                    )
        else:
            raise TypeError('The topk must be an integer or a list of integers')

    def _calculate_metrics(self, pos_len_list, topk_idx, shapes):
        """integrate the results of each batch and evaluate the topk metrics by users

        Args:
            pos_len_list (numpy.ndarray): the number of positive items for each user
            topk_idx (numpy.ndarray): a matrix which contains the index of the topk items for users
            shapes (numpy.ndarray): the padded width (number of columns) of the batch matrix for each user

        Returns:
            numpy.ndarray: a matrix which contains the metrics result

        """
        pos_idx_matrix = (topk_idx >= (shapes - pos_len_list).reshape(-1, 1))
        result_list = []
        for metric in self.metrics:
            metric_fuc = metrics_dict[metric.lower()]
            result = metric_fuc(pos_idx_matrix, pos_len_list)
            result_list.append(result)
        # stack: len(metrics) x n_users x len(ranks), then average over users
        result = np.stack(result_list, axis=0).mean(axis=1)  # len(metrics) x len(ranks)
        return result

    def __str__(self):
        msg = 'The TopK Evaluator Info:\n' + \
              '\tMetrics:[' + \
              ', '.join([topk_metrics[metric.lower()] for metric in self.metrics]) + \
              '], TopK:[' + \
              ', '.join(map(str, self.topk)) + \
              ']'
        return msg

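# An illustrative walk-through of the packing trick used by TopKEvaluator.collect / evaluate
# (the numbers are made up for illustration). get_score_matrix, defined in GroupedEvaluator and
# not shown on this page, pads every user's scores into one row; the code above assumes the
# user's positive items occupy the first `pos_len` columns (the same assumption
# RankEvaluator.collect makes via `desc_index < pos_len_list`) and that padding gets a score low
# enough that top-k never selects it.
#
#   unflipped row (user_len = 5, pos_len = 2, padded width shape = 6):
#       [p0, p1, i2, i3, i4, PAD]
#   after torch.flip(..., dims=[-1]):
#       [PAD, i4, i3, i2, p1, p0]       -> positives now sit at indices >= shape - pos_len = 4
#   top-3 indices, e.g. topk_idx = [5, 2, 4]:
#       pos_idx_matrix = topk_idx >= (shape - pos_len) = [True, False, True]
#
# The per-row `shape` column packed in collect() lets evaluate() recover the padded width of
# each batch even after batches with different widths have been concatenated.
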

class RankEvaluator(GroupedEvaluator):
    r"""Rank Evaluator is mainly used in ranking tasks other than top-k tasks. Now, we support
    one rank metric containing `'GAUC'`.

    Note:
        Like the top-k metrics, these are group-based: scores are computed per user and then
        averaged across users, but they are not truncated at k.

    """

    def __init__(self, config, metrics):
        super().__init__(config, metrics)

    def get_user_pos_len_list(self, interaction, scores_tensor):
        """Get the number of positive items and of all items in the test set of each user.

        Args:
            interaction (Interaction): the :class:`Interaction` of the batch
            scores_tensor (tensor): the tensor of model output with size of `(N, )`

        Returns:
            torch.Tensor: the number of positive items per user,
            list: the number of all items per user

        """
        pos_len_list = torch.Tensor(interaction.pos_len_list).to(scores_tensor.device)
        user_len_list = interaction.user_len_list
        return pos_len_list, user_len_list

    def average_rank(self, scores):
        """Get the ranking of an ordered tensor, and take the average of the ranking for positions
        with equal values.

        Args:
            scores (tensor): an ordered score matrix, with size of `(N, M)`

        Returns:
            torch.Tensor: average_rank

        Example:
            >>> average_rank(tensor([[1,2,2,2,3,3,6],[2,2,2,2,4,5,5]]))
            tensor([[1.0000, 3.0000, 3.0000, 3.0000, 5.5000, 5.5000, 7.0000],
            [2.5000, 2.5000, 2.5000, 2.5000, 5.0000, 6.5000, 6.5000]])

        Reference:
            https://github.com/scipy/scipy/blob/v0.17.1/scipy/stats/stats.py#L5262-L5352

        """
        length, width = scores.shape
        device = scores.device
        true_tensor = torch.full((length, 1), True, dtype=torch.bool, device=device)

        obs = torch.cat([true_tensor, scores[:, 1:] != scores[:, :-1]], dim=1)
        # bias added to dense
        bias = torch.arange(0, length, device=device).repeat(width).reshape(width, -1). \
            transpose(1, 0).reshape(-1)
        dense = obs.view(-1).cumsum(0) + bias

        # cumulative counts of each unique value
        count = torch.where(torch.cat([obs, true_tensor], dim=1))[1]

        # get average rank
        avg_rank = .5 * (count[dense] + count[dense - 1] + 1).view(length, -1)

        return avg_rank

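    # An illustrative single-row trace of average_rank (hand-computed; the second row of the
    # docstring example works the same way, with `bias` keeping the flattened rows from
    # interleaving):
    #
    #   scores = [[1, 2, 2, 2, 3, 3, 6]]      already sorted; length = 1, so bias is all zeros
    #   obs    = [[T, T, F, F, T, F, T]]      True marks the start of each run of equal values
    #   dense  = [1, 2, 2, 2, 3, 3, 4]        dense rank of every element (cumsum of obs)
    #   count  = [0, 1, 4, 6, 7]              0-based start positions of the runs, plus the row end
    #   avg_rank = 0.5 * (count[dense] + count[dense - 1] + 1)
    #            = [1.0, 3.0, 3.0, 3.0, 5.5, 5.5, 7.0]
    #
    # i.e. a run of ties occupying 1-based positions a..b gets the average rank (a + b) / 2.
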
    def collect(self, interaction, scores_tensor):
        """Collect the rank intermediate result of one batch. This function mainly implements
        ranking and calculating the sum of ranks for positive items, and is called at the end
        of each batch.

        Args:
            interaction (Interaction): the :class:`Interaction` of the batch
            scores_tensor (tensor): the tensor of model output with size of `(N, )`

        Returns:
            torch.Tensor: a column vector with the sum of the (average) ranks of each user's
            positive items

        """
        pos_len_list, user_len_list = self.get_user_pos_len_list(interaction, scores_tensor)
        scores_matrix = self.get_score_matrix(scores_tensor, user_len_list)
        desc_scores, desc_index = torch.sort(scores_matrix, dim=-1, descending=True)

        # get the index of positive items in the ranking list
        pos_index = (desc_index < pos_len_list.reshape(-1, 1))

        avg_rank = self.average_rank(desc_scores)
        pos_rank_sum = torch.where(pos_index, avg_rank, torch.zeros_like(avg_rank)).sum(axis=-1).reshape(-1, 1)

        return pos_rank_sum

    def evaluate(self, batch_matrix_list, eval_data):
        """calculate the metrics of all batches. It is called at the end of each epoch

        Args:
            batch_matrix_list (list): the results of all batches
            eval_data (Dataset): the class of test data

        Returns:
            dict: such as ``{'GAUC': 0.9286}``

        """
        pos_len_list = eval_data.get_pos_len_list()
        user_len_list = eval_data.get_user_len_list()
        pos_rank_sum = torch.cat(batch_matrix_list, dim=0).cpu().numpy()
        assert len(pos_len_list) == len(pos_rank_sum)

        # get metrics
        metric_dict = {}
        result_list = self._calculate_metrics(user_len_list, pos_len_list, pos_rank_sum)
        for metric, value in zip(self.metrics, result_list):
            key = '{}'.format(metric)
            metric_dict[key] = round(value, self.precision)

        return metric_dict

    def _calculate_metrics(self, user_len_list, pos_len_list, pos_rank_sum):
        """integrate the results of each batch and evaluate the rank metrics by users

        Args:
            user_len_list (numpy.ndarray): the number of items in each user's test set
            pos_len_list (numpy.ndarray): the number of positive items for each user
            pos_rank_sum (numpy.ndarray): the sum of the ranks of each user's positive items

        Returns:
            list: a list which contains the metrics result

        """
        result_list = []
        for metric in self.metrics:
            metric_fuc = metrics_dict[metric.lower()]
            result = metric_fuc(user_len_list, pos_len_list, pos_rank_sum)
            result_list.append(result)
        return result_list

    def __str__(self):
        msg = 'The Rank Evaluator Info:\n' + \
              '\tMetrics:[' + \
              ', '.join([rank_metrics[metric.lower()] for metric in self.metrics]) + \
              ']'
        return msg

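# A small hand-worked example of what RankEvaluator.collect produces (illustrative values only):
#
#   one user, user_len = 5, pos_len = 2, positives in the first two columns:
#       scores_matrix = [[0.9, 0.3, 0.7, 0.1, 0.5]]
#   sorting in descending order:
#       desc_scores = [[0.9, 0.7, 0.5, 0.3, 0.1]],  desc_index = [[0, 2, 4, 1, 3]]
#       pos_index   = desc_index < pos_len = [[True, False, False, True, False]]
#       avg_rank    = [[1., 2., 3., 4., 5.]]        (no ties)
#       pos_rank_sum = 1 + 4 = 5
#
# evaluate() then hands (user_len_list, pos_len_list, pos_rank_sum) to the GAUC function
# registered in metrics_dict; the GAUC formula itself lives in recbole.evaluator.metrics and is
# not shown on this page.
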

class LossEvaluator(IndividualEvaluator):
    r"""Loss Evaluator is mainly used in rating prediction and click-through rate prediction.
    Now, we support four loss metrics which contain `'AUC', 'RMSE', 'MAE', 'LOGLOSS'`.

    Note:
        These metrics are not group-based and are not limited to k: they are calculated on the
        entire set of predictions, regardless of the user they belong to.

    """

    def __init__(self, config, metrics):
        super().__init__(config, metrics)

        self.label_field = config['LABEL_FIELD']

    def collect(self, interaction, pred_scores):
        """Collect the loss intermediate result of one batch. This function mainly pairs the
        predicted scores with the true labels, and is called at the end of each batch.

        Args:
            interaction (Interaction): the :class:`Interaction` of the batch
            pred_scores (tensor): the tensor of model output with a size of `(N, )`

        Returns:
            tensor: a batch of scores with a size of `(N, 2)`

        """
        true_scores = interaction[self.label_field].to(pred_scores.device)
        assert len(true_scores) == len(pred_scores)
        return self.get_score_matrix(true_scores, pred_scores)

    def evaluate(self, batch_matrix_list, *args):
        """calculate the metrics of all batches. It is called at the end of each epoch

        Args:
            batch_matrix_list (list): the results of all batches

        Returns:
            dict: such as ``{'AUC': 0.83}``

        """
        concat = torch.cat(batch_matrix_list, dim=0).cpu().numpy()

        trues = concat[:, 0]
        preds = concat[:, 1]

        # get metrics
        metric_dict = {}
        result_list = self._calculate_metrics(trues, preds)
        for metric, value in zip(self.metrics, result_list):
            key = '{}'.format(metric)
            metric_dict[key] = round(value, self.precision)

        return metric_dict

    def _calculate_metrics(self, trues, preds):
        """get metrics result

        Args:
            trues (numpy.ndarray): the list of true scores
            preds (numpy.ndarray): the list of predicted scores

        Returns:
            list: a list of metrics result

        """
        result_list = []
        for metric in self.metrics:
            metric_fuc = metrics_dict[metric.lower()]
            result = metric_fuc(trues, preds)
            result_list.append(result)
        return result_list

    def __str__(self):
        msg = 'The Loss Evaluator Info:\n' + \
              '\tMetrics:[' + \
              ', '.join([loss_metrics[metric.lower()] for metric in self.metrics]) + \
              ']'
        return msg

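# LossEvaluator works on (true, pred) pairs rather than per-user groups. collect() packs them
# into an `(N, 2)` matrix, with column 0 = true label and column 1 = predicted score (this is
# how evaluate() unpacks them), e.g. for a hypothetical batch:
#
#   true_scores = [1, 0, 1],  pred_scores = [0.8, 0.2, 0.6]
#   -> batch matrix [[1.0, 0.8], [0.0, 0.2], [1.0, 0.6]]
#
# After all batches are concatenated, evaluate() feeds the two columns to every configured loss
# metric, e.g. metrics_dict['auc'](trues, preds).
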
metric_eval_bind = [(topk_metrics, TopKEvaluator), (loss_metrics, LossEvaluator), (rank_metrics, RankEvaluator)]
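

if __name__ == '__main__':
    # A minimal sketch of how the (metric table, evaluator class) bindings above could be used
    # to group a configured metric list by the evaluator that supports it. The grouping loop is
    # illustrative only (RecBole's real dispatch happens elsewhere), and the metric list below
    # is hypothetical.
    configured_metrics = ['Recall', 'NDCG', 'GAUC', 'AUC']

    grouped = {}
    for metric_name in configured_metrics:
        for metric_table, evaluator_cls in metric_eval_bind:
            if metric_name.lower() in metric_table:
                grouped.setdefault(evaluator_cls.__name__, []).append(metric_name)
                break

    print(grouped)
    # {'TopKEvaluator': ['Recall', 'NDCG'], 'RankEvaluator': ['GAUC'], 'LossEvaluator': ['AUC']}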