class SequentialDataset(Dataset):
    """:class:`SequentialDataset` is based on
    :class:`~recbole.data.dataset.dataset.Dataset`, and provides augmentation
    interface to adapt to Sequential Recommendation, which can accelerate the
    data loader.

    Attributes:
        max_item_list_len (int): Max length of historical item list.
        item_list_length_field (str): Field name for item lists' length.
    """

    def __init__(self, config):
        """Read the sequential settings from ``config`` and build the dataset.

        Args:
            config: Configuration object; ``MAX_ITEM_LIST_LENGTH``,
                ``ITEM_LIST_LENGTH_FIELD`` and ``benchmark_filename`` are read
                here.
        """
        # Assigned before ``super().__init__`` because the augmentation helpers
        # read them -- presumably the base constructor triggers
        # ``_change_feat_format``; confirm against ``Dataset.__init__``.
        self.max_item_list_len = config["MAX_ITEM_LIST_LENGTH"]
        self.item_list_length_field = config["ITEM_LIST_LENGTH_FIELD"]
        super().__init__(config)
        # Benchmark data skips augmentation (see ``_change_feat_format``), so
        # only its presets are applied.
        if config["benchmark_filename"] is not None:
            self._benchmark_presets()

    def _change_feat_format(self):
        """Change feat format from :class:`pandas.DataFrame` to
        :class:`Interaction`, then perform data augmentation.

        Augmentation is skipped when ``benchmark_filename`` is configured.
        """
        super()._change_feat_format()

        if self.config["benchmark_filename"] is not None:
            return
        self.logger.debug("Augmentation for sequential recommendation.")
        self.data_augmentation()

    def _aug_presets(self):
        """Register the list fields that augmentation will create.

        For every interaction field except the user id field, the list field
        name ``<field><LIST_SUFFIX>`` is stored on the instance as
        ``<field>_list_field`` and registered through ``set_field_property``.
        The item-list-length field is registered as a ``TOKEN`` of length 1.
        """
        list_suffix = self.config["LIST_SUFFIX"]
        token_like = (FeatureType.TOKEN, FeatureType.TOKEN_SEQ)
        already_seq = (FeatureType.TOKEN_SEQ, FeatureType.FLOAT_SEQ)

        for field in self.inter_feat:
            if field == self.uid_field:
                continue

            list_field = field + list_suffix
            setattr(self, f"{field}_list_field", list_field)
            ftype = self.field2type[field]

            # Token-valued fields become token sequences; everything else is
            # treated as a float sequence.
            if ftype in token_like:
                list_ftype = FeatureType.TOKEN_SEQ
            else:
                list_ftype = FeatureType.FLOAT_SEQ

            # A field that is already a sequence gains a leading history axis,
            # so its length becomes a 2-tuple instead of a plain int.
            if ftype in already_seq:
                list_len = (self.max_item_list_len, self.field2seqlen[field])
            else:
                list_len = self.max_item_list_len

            self.set_field_property(
                list_field, list_ftype, FeatureSource.INTERACTION, list_len
            )

        self.set_field_property(
            self.item_list_length_field,
            FeatureType.TOKEN,
            FeatureSource.INTERACTION,
            1,
        )

    def data_augmentation(self):
        """Augmentation processing for sequential dataset.

        E.g., ``u1`` has purchase sequence ``<i1, i2, i3, i4>``,
        then after augmentation, we will generate three cases.

        ``u1, <i1> | i2``

        (Which means given user_id ``u1`` and item_seq ``<i1>``,
        we need to predict the next item ``i2``.)

        The other cases are below:

        ``u1, <i1, i2> | i3``

        ``u1, <i1, i2, i3> | i4``

        ``self.inter_feat`` is replaced by the augmented interactions: one row
        per target interaction, extended with the padded history list fields
        and the history length field.
        """
        self.logger.debug("data_augmentation")

        self._aug_presets()

        self._check_field("uid_field", "time_field")
        # Use the same limit ``_aug_presets`` used to size the list fields, so
        # the sliding window can never be longer than the padded tensors.
        max_item_list_len = self.max_item_list_len
        self.sort(by=[self.uid_field, self.time_field], ascending=True)

        last_uid = None
        item_list_index, target_index, item_list_length = [], [], []
        seq_start = 0
        for i, uid in enumerate(self.inter_feat[self.uid_field].numpy()):
            if last_uid != uid:
                # First interaction of a user: it only opens a new history
                # window and is never emitted as a target itself.
                last_uid = uid
                seq_start = i
                continue
            # Slide the window start forward so that the history keeps at most
            # the ``max_item_list_len`` most recent interactions.
            if i - seq_start > max_item_list_len:
                seq_start += 1
            item_list_index.append(slice(seq_start, i))
            target_index.append(i)
            item_list_length.append(i - seq_start)

        item_list_index = np.array(item_list_index)
        target_index = np.array(target_index)
        item_list_length = np.array(item_list_length, dtype=np.int64)

        new_length = len(item_list_index)
        new_data = self.inter_feat[target_index]
        new_dict = {
            self.item_list_length_field: torch.tensor(item_list_length),
        }

        for field in self.inter_feat:
            if field == self.uid_field:
                continue

            list_field = getattr(self, f"{field}_list_field")
            list_len = self.field2seqlen[list_field]
            if isinstance(list_len, int):
                shape = (new_length, list_len)
            else:
                shape = (new_length,) + list_len
            # Float fields listed in ``numerical_features`` get an extra
            # trailing axis of size 2 (NOTE(review): the meaning of the two
            # slots is defined where those features are produced -- confirm).
            if (
                self.field2type[field] in [FeatureType.FLOAT, FeatureType.FLOAT_SEQ]
                and field in self.config["numerical_features"]
            ):
                shape += (2,)

            value = self.inter_feat[field]
            padded = torch.zeros(shape, dtype=value.dtype)
            # Copy each history slice into the front of its zero-padded row.
            for row, (index, length) in enumerate(
                zip(item_list_index, item_list_length)
            ):
                padded[row][:length] = value[index]
            new_dict[list_field] = padded

        new_data.update(Interaction(new_dict))
        self.inter_feat = new_data

    def inter_matrix(self, form="coo", value_field=None):
        """Get sparse matrix that describe interactions between user_id and
        item_id.

        Sparse matrix has shape (user_num, item_num). For a row of
        <src, tgt>, ``matrix[src, tgt] = 1`` if ``value_field`` is ``None``,
        else the entry is taken from ``value_field``.

        The rows of ``self.inter_feat`` are extended with the first element of
        the history list of every row whose history length is 1. After
        :meth:`data_augmentation` a user's first interaction only appears
        inside history lists, so this puts it back into the matrix.

        Args:
            form (str, optional): Sparse matrix format. Defaults to ``coo``.
            value_field (str, optional): Data of sparse matrix, passed through
                to ``_create_sparse_matrix``. Defaults to ``None``.

        Returns:
            scipy.sparse: Sparse matrix in form ``coo`` or ``csr``.

        Raises:
            ValueError: If ``uid_field`` or ``iid_field`` is not set.
        """
        if not self.uid_field or not self.iid_field:
            raise ValueError(
                "dataset does not exist uid/iid, thus can not converted to sparse matrix."
            )

        l1_idx = self.inter_feat[self.item_list_length_field] == 1
        l1_inter_dict = self.inter_feat[l1_idx].interaction
        new_dict = {}
        list_suffix = self.config["LIST_SUFFIX"]
        for field in l1_inter_dict:
            if field != self.uid_field and field + list_suffix in l1_inter_dict:
                # Field with a history list: append the single history entry.
                new_dict[field] = torch.cat(
                    [self.inter_feat[field], l1_inter_dict[field + list_suffix][:, 0]]
                )
            elif (not field.endswith(list_suffix)) and (
                field != self.item_list_length_field
            ):
                # Plain field without a list counterpart (e.g. the user id):
                # repeat the value of the selected rows.
                new_dict[field] = torch.cat(
                    [self.inter_feat[field], l1_inter_dict[field]]
                )
        local_inter_feat = Interaction(new_dict)
        return self._create_sparse_matrix(
            local_inter_feat, self.uid_field, self.iid_field, form, value_field
        )

    def build(self):
        """Processing dataset according to evaluation setting, including
        Group, Order and Split.

        The evaluation settings are read from ``self.config["eval_args"]``;
        sequential recommendation only supports the ``"TO"`` ordering.
        See :class:`~recbole.config.eval_setting.EvalSetting` for details.

        Returns:
            list: List of built :class:`Dataset`.

        Raises:
            ValueError: If ``eval_args["order"]`` is not ``"TO"``.
        """
        ordering_args = self.config["eval_args"]["order"]
        if ordering_args != "TO":
            raise ValueError(
                "The ordering args for sequential recommendation has to be 'TO'"
            )

        return super().build()