# -*- coding: utf-8 -*-
# @Time : 2022/8/29
# @Author : Yifan Li
# @Email : 295435096@qq.com
r"""
DCN V2
################################################
Reference:
Ruoxi Wang at al. "Dcn v2: Improved deep & cross network and practical lessons for web-scale
learning to rank systems." in WWW 2021.
Reference code:
https://github.com/shenweichen/DeepCTR-Torch
"""
import torch
import torch.nn as nn
from recbole.model.abstract_recommender import ContextRecommender
from recbole.model.init import xavier_normal_initialization
from recbole.model.layers import MLPLayers
from recbole.model.loss import RegLoss


class DCNV2(ContextRecommender):
    r"""DCNV2 improves the cross network by extending the original weight vector to a matrix,
    which significantly improves the expressiveness of DCN. It also introduces mixture-of-experts
    (MoE) and low-rank techniques to reduce the time cost.
    """

    def __init__(self, config, dataset):
        super(DCNV2, self).__init__(config, dataset)

        # load and compute parameters info
        self.mixed = config["mixed"]
        self.structure = config["structure"]
        self.cross_layer_num = config["cross_layer_num"]
        self.embedding_size = config["embedding_size"]
        self.mlp_hidden_size = config["mlp_hidden_size"]
        self.reg_weight = config["reg_weight"]
        self.dropout_prob = config["dropout_prob"]
        if self.mixed:
            self.expert_num = config["expert_num"]
            self.low_rank = config["low_rank"]
        self.in_feature_num = self.num_feature_field * self.embedding_size

        # define cross layers and bias
        if self.mixed:
            # U: (expert_num, in_feature_num, low_rank) per layer
            self.cross_layer_u = nn.ParameterList(
                nn.Parameter(
                    torch.randn(self.expert_num, self.in_feature_num, self.low_rank)
                )
                for _ in range(self.cross_layer_num)
            )
            # V: (expert_num, in_feature_num, low_rank) per layer
            self.cross_layer_v = nn.ParameterList(
                nn.Parameter(
                    torch.randn(self.expert_num, self.in_feature_num, self.low_rank)
                )
                for _ in range(self.cross_layer_num)
            )
            # C: (expert_num, low_rank, low_rank) per layer
            self.cross_layer_c = nn.ParameterList(
                nn.Parameter(torch.randn(self.expert_num, self.low_rank, self.low_rank))
                for _ in range(self.cross_layer_num)
            )
            # one scalar gating score per expert
            self.gating = nn.ModuleList(
                nn.Linear(self.in_feature_num, 1) for _ in range(self.expert_num)
            )
        else:
            # W: (in_feature_num, in_feature_num) per layer
            self.cross_layer_w = nn.ParameterList(
                nn.Parameter(torch.randn(self.in_feature_num, self.in_feature_num))
                for _ in range(self.cross_layer_num)
            )
        # bias: (in_feature_num, 1) per layer, shared by both variants
        self.bias = nn.ParameterList(
            nn.Parameter(torch.zeros(self.in_feature_num, 1))
            for _ in range(self.cross_layer_num)
        )

        # define deep and predict layers
        mlp_size_list = [self.in_feature_num] + self.mlp_hidden_size
        self.mlp_layers = MLPLayers(mlp_size_list, dropout=self.dropout_prob, bn=True)
        if self.structure == "parallel":
            self.predict_layer = nn.Linear(
                self.in_feature_num + self.mlp_hidden_size[-1], 1
            )
        elif self.structure == "stacked":
            self.predict_layer = nn.Linear(self.mlp_hidden_size[-1], 1)

        # define loss and activation functions
        self.reg_loss = RegLoss()
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)
        self.loss = nn.BCELoss()

        # parameters initialization
        self.apply(xavier_normal_initialization)

    def cross_network(self, x_0):
        r"""Cross network is composed of cross layers, with each layer having the following formula.

        .. math:: x_{l+1} = x_0 \odot (W_l x_l + b_l) + x_l

        :math:`x_l`, :math:`x_{l+1}` are column vectors denoting the outputs from the l-th and
        (l + 1)-th cross layers, respectively.
        :math:`W_l`, :math:`b_l` are the weight and bias parameters of the l-th layer.

        Args:
            x_0 (torch.Tensor): Embedding vectors of all features, input of cross network.

        Returns:
            torch.Tensor: output of cross network, [batch_size, num_feature_field * embedding_size]
        """
        x_0 = x_0.unsqueeze(dim=2)
        x_l = x_0  # (batch_size, in_feature_num, 1)
        for i in range(self.cross_layer_num):
            xl_w = torch.matmul(self.cross_layer_w[i], x_l)  # W_l x_l
            xl_w = xl_w + self.bias[i]  # W_l x_l + b_l
            xl_dot = torch.mul(x_0, xl_w)  # x_0 \odot (W_l x_l + b_l)
            x_l = xl_dot + x_l  # residual connection

        x_l = x_l.squeeze(dim=2)
        return x_l
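
    # Shape sketch for one cross layer above (illustrative sizes: batch_size=2,
    # in_feature_num=4); matmul broadcasts the (4, 4) weight over the batch:
    #
    #     x_0 = torch.randn(2, 4, 1)
    #     W, b = torch.randn(4, 4), torch.zeros(4, 1)
    #     x_1 = x_0 * (W @ x_0 + b) + x_0  # -> (2, 4, 1)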

    def cross_network_mix(self, x_0):
        r"""Cross network part of DCN-mix, which adds MoE and nonlinear transformation in low-rank space.

        .. math::
            x_{l+1} = \sum_{i=1}^K G_i(x_l) E_i(x_l) + x_l

        .. math::
            E_i(x_l) = x_0 \odot (U_l^i \cdot g(C_l^i \cdot g({V_l^i}^T x_l)) + b_l)

        :math:`E_i` and :math:`G_i` represent the i-th expert and its gating score, respectively.
        :math:`U_l^i`, :math:`C_l^i`, :math:`V_l^i` stand for the low-rank decomposition of the
        weight matrix, and :math:`g` is the nonlinear activation function.

        Args:
            x_0 (torch.Tensor): Embedding vectors of all features, input of cross network.

        Returns:
            torch.Tensor: output of mixed cross network, [batch_size, num_feature_field * embedding_size]
        """
        x_0 = x_0.unsqueeze(dim=2)
        x_l = x_0  # (batch_size, in_feature_num, 1)
        for i in range(self.cross_layer_num):
            expert_output_list = []
            gating_output_list = []
            for expert in range(self.expert_num):
                # compute gating output
                gating_output_list.append(
                    self.gating[expert](x_l.squeeze(dim=2))
                )  # (batch_size, 1)

                # project to low-rank subspace
                xl_v = torch.matmul(
                    self.cross_layer_v[i][expert].T, x_l
                )  # (batch_size, low_rank, 1)

                # nonlinear activation in subspace
                xl_c = self.tanh(xl_v)
                xl_c = torch.matmul(
                    self.cross_layer_c[i][expert], xl_c
                )  # (batch_size, low_rank, 1)
                xl_c = self.tanh(xl_c)

                # project back to feature space
                xl_u = torch.matmul(
                    self.cross_layer_u[i][expert], xl_c
                )  # (batch_size, in_feature_num, 1)

                # dot with x_0
                xl_dot = xl_u + self.bias[i]
                xl_dot = torch.mul(x_0, xl_dot)

                expert_output_list.append(
                    xl_dot.squeeze(dim=2)
                )  # (batch_size, in_feature_num)

            expert_output = torch.stack(
                expert_output_list, dim=2
            )  # (batch_size, in_feature_num, expert_num)
            gating_output = torch.stack(
                gating_output_list, dim=1
            )  # (batch_size, expert_num, 1)
            moe_output = torch.matmul(
                expert_output, self.softmax(gating_output)
            )  # (batch_size, in_feature_num, 1)
            x_l = x_l + moe_output  # residual connection

        x_l = x_l.squeeze(dim=2)  # (batch_size, in_feature_num)
        return x_l
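
    # Shape sketch for the MoE combination above (illustrative: batch=2,
    # in_feature_num=4, expert_num=3). The matmul is a convex combination of
    # the expert outputs, weighted by the softmaxed gating scores:
    #
    #     experts = torch.randn(2, 4, 3)
    #     gates = torch.softmax(torch.randn(2, 3, 1), dim=1)
    #     combined = torch.matmul(experts, gates)  # -> (2, 4, 1)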

    def forward(self, interaction):
        dcn_all_embeddings = self.concat_embed_input_fields(
            interaction
        )  # (batch_size, num_field, embed_dim)
        batch_size = dcn_all_embeddings.shape[0]
        dcn_all_embeddings = dcn_all_embeddings.view(
            batch_size, -1
        )  # (batch_size, in_feature_num)

        if self.structure == "parallel":
            deep_output = self.mlp_layers(
                dcn_all_embeddings
            )  # (batch_size, mlp_hidden_size)
            if self.mixed:
                cross_output = self.cross_network_mix(
                    dcn_all_embeddings
                )  # (batch_size, in_feature_num)
            else:
                cross_output = self.cross_network(dcn_all_embeddings)
            concat_output = torch.cat(
                [cross_output, deep_output], dim=-1
            )  # (batch_size, in_feature_num + mlp_hidden_size)
            output = self.sigmoid(self.predict_layer(concat_output))
            return output.squeeze(dim=1)
        elif self.structure == "stacked":
            if self.mixed:
                cross_output = self.cross_network_mix(
                    dcn_all_embeddings
                )  # (batch_size, in_feature_num)
            else:
                cross_output = self.cross_network(dcn_all_embeddings)
            deep_output = self.mlp_layers(cross_output)  # (batch_size, mlp_hidden_size)
            output = self.sigmoid(self.predict_layer(deep_output))
            return output.squeeze(dim=1)
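
    # Note on the two structures: "parallel" runs the cross network and the MLP
    # side by side on the flattened embeddings and concatenates their outputs,
    # while "stacked" feeds the cross-network output into the MLP. In both
    # cases the sigmoid maps the final logit to a probability in (0, 1),
    # matching the BCELoss used in ``calculate_loss`` below.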

    def calculate_loss(self, interaction):
        label = interaction[self.LABEL]
        output = self.forward(interaction)
        if self.mixed:
            reg_loss = (
                self.reg_loss(self.cross_layer_c)
                + self.reg_loss(self.cross_layer_v)
                + self.reg_loss(self.cross_layer_u)
            )
        else:
            reg_loss = self.reg_loss(self.cross_layer_w)
        l2_loss = self.reg_weight * reg_loss
        return self.loss(output, label) + l2_loss
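
    # Only the cross-layer parameters enter the regularization term above;
    # ``reg_weight`` scales that penalty before it is added to the binary
    # cross-entropy loss.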

    def predict(self, interaction):
        return self.forward(interaction)
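

if __name__ == "__main__":
    # Standalone numerical sketch of a single low-rank expert from
    # ``cross_network_mix`` (toy sizes, plain torch, no RecBole objects):
    # E(x_0) = x_0 * (U @ tanh(C @ tanh(V.T @ x_0)) + b).
    torch.manual_seed(0)
    batch_size, in_feature_num, low_rank = 2, 6, 3
    x0 = torch.randn(batch_size, in_feature_num, 1)
    U = torch.randn(in_feature_num, low_rank)
    V = torch.randn(in_feature_num, low_rank)
    C = torch.randn(low_rank, low_rank)
    b = torch.zeros(in_feature_num, 1)
    expert = x0 * (U @ torch.tanh(C @ torch.tanh(V.T @ x0)) + b)
    print(expert.shape)  # torch.Size([2, 6, 1])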