# -*- coding: utf-8 -*-
# @Time : 2020/8/18
# @Author : Zihan Lin
# @Email : linzihan.super@foxmail.com
r"""
ItemKNN
################################################
Reference:
Aiolli,F et al. Efficient top-n recommendation for very large scale binary rated datasets.
In Proceedings of the 7th ACM conference on Recommender systems (pp. 273-280). ACM.
"""
import numpy as np
import scipy.sparse as sp
import torch

from recbole.utils import InputType, ModelType
from recbole.model.abstract_recommender import GeneralRecommender

class ComputeSimilarity:
    def __init__(self, dataMatrix, topk=100, shrink=0, normalize=True):
        r"""Computes the cosine similarity on the columns of dataMatrix.

        If it is computed on :math:`URM=|users| \times |items|`, pass the URM.
        If it is computed on :math:`ICM=|items| \times |features|`, pass the ICM transposed.

        Args:
            dataMatrix (scipy.sparse.csr_matrix): The sparse data matrix.
            topk (int): The k value in KNN.
            shrink (int): Shrinkage term added to the denominator of the cosine similarity.
            normalize (bool): If True, divide the dot product by the product of the norms.
        """
        super(ComputeSimilarity, self).__init__()

        self.shrink = shrink
        self.normalize = normalize
        self.n_rows, self.n_columns = dataMatrix.shape
        self.TopK = min(topk, self.n_columns)
        self.dataMatrix = dataMatrix.copy()
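
    # Usage sketch (illustrative; `urm_csr` and `icm_csr` are hypothetical
    # matrices, not defined in this module):
    #   item_sim = ComputeSimilarity(urm_csr, topk=100, shrink=10).compute_similarity()
    # For feature-based item similarity, pass the ICM transposed: icm_csr.T.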
    def compute_similarity(self, block_size=100):
        r"""Compute the similarity for the given dataset.

        Args:
            block_size (int): Process the columns in blocks of this size when
                computing the cosine similarity, to limit memory usage.

        Returns:
            scipy.sparse.csc_matrix: Sparse similarity matrix W of shape
            (self.n_columns, self.n_columns).
        """
        values = []
        rows = []
        cols = []

        self.dataMatrix = self.dataMatrix.astype(np.float32)

        # Compute sum of squared values to be used in normalization
        sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()
        sumOfSquared = np.sqrt(sumOfSquared)

        end_col_local = self.n_columns
        start_col_block = 0

        # Compute all similarities for each item using vectorization
        while start_col_block < end_col_local:
            end_col_block = min(start_col_block + block_size, end_col_local)
            this_block_size = end_col_block - start_col_block

            # All data points for a given item
            item_data = self.dataMatrix[:, start_col_block:end_col_block]
            item_data = item_data.toarray().squeeze()

            if item_data.ndim == 1:
                item_data = np.expand_dims(item_data, axis=1)

            # Compute item similarities
            this_block_weights = self.dataMatrix.T.dot(item_data)

            for col_index_in_block in range(this_block_size):
                if this_block_size == 1:
                    this_column_weights = this_block_weights.squeeze()
                else:
                    this_column_weights = this_block_weights[:, col_index_in_block]

                columnIndex = col_index_in_block + start_col_block
                this_column_weights[columnIndex] = 0.0

                # Apply normalization and shrinkage, ensure denominator != 0
                if self.normalize:
                    denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights, 1 / denominator)
                elif self.shrink != 0:
                    this_column_weights = this_column_weights / self.shrink

                # Sort indices and select TopK.
                # Sorting is done in three steps. Faster than plain np.argsort for a high number of items:
                # - Partition the data to extract the set of relevant items
                # - Sort only the relevant items
                # - Get the original item index
                relevant_items_partition = (-this_column_weights).argpartition(self.TopK - 1)[0:self.TopK]
                relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition])
                top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

                # Incrementally build sparse matrix, do not add zeros
                notZerosMask = this_column_weights[top_k_idx] != 0.0
                numNotZeros = np.sum(notZerosMask)

                values.extend(this_column_weights[top_k_idx][notZerosMask])
                rows.extend(top_k_idx[notZerosMask])
                cols.extend(np.ones(numNotZeros) * columnIndex)

            start_col_block += block_size

        # End while on columns

        W_sparse = sp.csr_matrix((values, (rows, cols)),
                                 shape=(self.n_columns, self.n_columns),
                                 dtype=np.float32)
        return W_sparse.tocsc()
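
# Worked example of the shrunk-cosine weight above (illustrative, not from the
# original source): for binary item columns r_i = [1, 1, 0] and r_j = [1, 0, 1],
# the dot product is 1 and both norms are sqrt(2), so with shrink = 0 the weight
# is 1 / (sqrt(2) * sqrt(2) + 1e-6) ~= 0.5; a positive shrink pulls the
# similarity of sparsely co-rated item pairs toward zero.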

class ItemKNN(GeneralRecommender):
    r"""ItemKNN is a basic model that computes item similarity from the interaction matrix."""

    input_type = InputType.POINTWISE
    type = ModelType.TRADITIONAL
    def __init__(self, config, dataset):
        super(ItemKNN, self).__init__(config, dataset)

        # load parameters info
        self.k = config['k']
        self.shrink = config['shrink'] if 'shrink' in config else 0.0

        self.interaction_matrix = dataset.inter_matrix(form='csr').astype(np.float32)
        shape = self.interaction_matrix.shape
        assert self.n_users == shape[0] and self.n_items == shape[1]
        self.w = ComputeSimilarity(self.interaction_matrix, topk=self.k, shrink=self.shrink).compute_similarity()
        # Precompute all scores as R * W: user-item interactions weighted by
        # item-item similarity
        self.pred_mat = self.interaction_matrix.dot(self.w).tolil()

        self.fake_loss = torch.nn.Parameter(torch.zeros(1))
    def forward(self, user, item):
        pass

    def calculate_loss(self, interaction):
        # ItemKNN has no trainable parameters; return a constant zero "loss"
        # so that the generic training loop can run unchanged.
        return torch.nn.Parameter(torch.zeros(1))
    def predict(self, interaction):
        user = interaction[self.USER_ID]
        item = interaction[self.ITEM_ID]
        user = user.cpu().numpy().astype(int)
        item = item.cpu().numpy().astype(int)

        # Look up the precomputed score for each (user, item) pair
        result = []
        for index in range(len(user)):
            uid = user[index]
            iid = item[index]
            score = self.pred_mat[uid, iid]
            result.append(score)
        result = torch.from_numpy(np.array(result)).to(self.device)
        return result
    def full_sort_predict(self, interaction):
        user = interaction[self.USER_ID]
        user = user.cpu().numpy()

        score = self.pred_mat[user, :].toarray().flatten()
        result = torch.from_numpy(score).to(self.device)
        return result
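
# A minimal, self-contained sketch of the similarity computation (illustrative
# only: the toy matrix and hyper-parameters are assumptions, and exercising the
# full ItemKNN model would additionally require a RecBole `config` and `dataset`).
if __name__ == "__main__":
    # 4 users x 3 items, binary implicit feedback
    toy_urm = sp.csr_matrix(np.array([[1, 0, 1],
                                      [1, 1, 0],
                                      [0, 1, 1],
                                      [1, 0, 0]], dtype=np.float32))
    w = ComputeSimilarity(toy_urm, topk=2, shrink=0).compute_similarity()
    print(w.toarray())               # (3, 3) item-item similarities, zero diagonal
    print(toy_urm.dot(w).toarray())  # (4, 3) user-item scores, like pred_mat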