Source code for surprise.trainset

"""This module contains the Trainset class."""


import numpy as np


class Trainset:
    """A trainset contains all useful data that constitute a training set.

    It is used by the :meth:`fit()
    <surprise.prediction_algorithms.algo_base.AlgoBase.fit>` method of every
    prediction algorithm. You should not try to build such an object on your
    own but rather use the :meth:`Dataset.folds()
    <surprise.dataset.Dataset.folds>` method or the
    :meth:`DatasetAutoFolds.build_full_trainset()
    <surprise.dataset.DatasetAutoFolds.build_full_trainset>` method.

    Trainsets are different from :class:`Datasets <surprise.dataset.Dataset>`.
    You can think of a :class:`Dataset <surprise.dataset.Dataset>` as the raw
    data, and Trainsets as higher-level data where useful methods are defined.
    Also, a :class:`Dataset <surprise.dataset.Dataset>` may be comprised of
    multiple Trainsets (e.g. when doing cross validation).

    Attributes:
        ur(:obj:`defaultdict` of :obj:`list`): The users ratings. This is a
            dictionary containing lists of tuples of the form
            ``(item_inner_id, rating)``. The keys are user inner ids.
        ir(:obj:`defaultdict` of :obj:`list`): The items ratings. This is a
            dictionary containing lists of tuples of the form
            ``(user_inner_id, rating)``. The keys are item inner ids.
        n_users: Total number of users :math:`|U|`.
        n_items: Total number of items :math:`|I|`.
        n_ratings: Total number of ratings :math:`|R_{train}|`.
        rating_scale(tuple): The minimum and maximal rating of the rating
            scale.
        global_mean: The mean of all ratings :math:`\\mu`.
    """

    def __init__(
        self,
        ur,
        ir,
        n_users,
        n_items,
        n_ratings,
        rating_scale,
        raw2inner_id_users,
        raw2inner_id_items,
    ):
        """Store the rating structures and the raw-to-inner id mappings.

        Args:
            ur: Mapping from user inner id to a list of
                ``(item_inner_id, rating)`` tuples.
            ir: Mapping from item inner id to a list of
                ``(user_inner_id, rating)`` tuples.
            n_users: Total number of users.
            n_items: Total number of items.
            n_ratings: Total number of ratings.
            rating_scale(tuple): The minimum and maximal rating.
            raw2inner_id_users(dict): Mapping from user raw id to inner id.
            raw2inner_id_items(dict): Mapping from item raw id to inner id.
        """
        self.ur = ur
        self.ir = ir
        self.n_users = n_users
        self.n_items = n_items
        self.n_ratings = n_ratings
        self.rating_scale = rating_scale
        self._raw2inner_id_users = raw2inner_id_users
        self._raw2inner_id_items = raw2inner_id_items
        # Computed on first access of the ``global_mean`` property.
        self._global_mean = None
        # inner2raw dicts could be built right now (or even before) but they
        # are not always useful so we wait until we need them.
        self._inner2raw_id_users = None
        self._inner2raw_id_items = None

    def knows_user(self, uid):
        """Indicate if the user is part of the trainset.

        A user is part of the trainset if the user has at least one rating.

        Args:
            uid(int): The (inner) user id. See :ref:`this
                note<raw_inner_note>`.

        Returns:
            ``True`` if user is part of the trainset, else ``False``.
        """
        return uid in self.ur

    def knows_item(self, iid):
        """Indicate if the item is part of the trainset.

        An item is part of the trainset if the item was rated at least once.

        Args:
            iid(int): The (inner) item id. See :ref:`this
                note<raw_inner_note>`.

        Returns:
            ``True`` if item is part of the trainset, else ``False``.
        """
        return iid in self.ir

    def to_inner_uid(self, ruid):
        """Convert a **user** raw id to an inner id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            ruid(str): The user raw id.

        Returns:
            int: The user inner id.

        Raises:
            ValueError: When user is not part of the trainset.
        """
        try:
            return self._raw2inner_id_users[ruid]
        except KeyError as err:
            # Chain explicitly so the failed lookup is reported as the cause.
            raise ValueError(
                "User " + str(ruid) + " is not part of the trainset."
            ) from err

    def to_raw_uid(self, iuid):
        """Convert a **user** inner id to a raw id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            iuid(int): The user inner id.

        Returns:
            str: The user raw id.

        Raises:
            ValueError: When ``iuid`` is not an inner id.
        """
        # The reverse mapping is built lazily, on the first conversion.
        if self._inner2raw_id_users is None:
            self._inner2raw_id_users = {
                inner: raw for (raw, inner) in self._raw2inner_id_users.items()
            }

        try:
            return self._inner2raw_id_users[iuid]
        except KeyError as err:
            raise ValueError(str(iuid) + " is not a valid inner id.") from err

    def to_inner_iid(self, riid):
        """Convert an **item** raw id to an inner id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            riid(str): The item raw id.

        Returns:
            int: The item inner id.

        Raises:
            ValueError: When item is not part of the trainset.
        """
        try:
            return self._raw2inner_id_items[riid]
        except KeyError as err:
            # Chain explicitly so the failed lookup is reported as the cause.
            raise ValueError(
                "Item " + str(riid) + " is not part of the trainset."
            ) from err

    def to_raw_iid(self, iiid):
        """Convert an **item** inner id to a raw id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            iiid(int): The item inner id.

        Returns:
            str: The item raw id.

        Raises:
            ValueError: When ``iiid`` is not an inner id.
        """
        # The reverse mapping is built lazily, on the first conversion.
        if self._inner2raw_id_items is None:
            self._inner2raw_id_items = {
                inner: raw for (raw, inner) in self._raw2inner_id_items.items()
            }

        try:
            return self._inner2raw_id_items[iiid]
        except KeyError as err:
            raise ValueError(str(iiid) + " is not a valid inner id.") from err

    def all_ratings(self):
        """Generator function to iterate over all ratings.

        Yields:
            A tuple ``(uid, iid, rating)`` where ids are inner ids (see
            :ref:`this note <raw_inner_note>`).
        """
        for u, u_ratings in self.ur.items():
            for i, r in u_ratings:
                yield u, i, r

    def build_testset(self):
        """Return a list of ratings that can be used as a testset in the
        :meth:`test() <surprise.prediction_algorithms.algo_base.AlgoBase.test>`
        method.

        The ratings are all the ratings that are in the trainset, i.e. all the
        ratings returned by the :meth:`all_ratings()
        <surprise.Trainset.all_ratings>` generator. This is useful in cases
        where you want to test your algorithm on the trainset.

        Returns:
            A list of tuples ``(uid, iid, rating)`` where ids are raw ids.
        """
        return [
            (self.to_raw_uid(u), self.to_raw_iid(i), r)
            for (u, i, r) in self.all_ratings()
        ]

    def build_anti_testset(self, fill=None):
        """Return a list of ratings that can be used as a testset in the
        :meth:`test() <surprise.prediction_algorithms.algo_base.AlgoBase.test>`
        method.

        The ratings are all the ratings that are **not** in the trainset, i.e.
        all the ratings :math:`r_{ui}` where the user :math:`u` is known, the
        item :math:`i` is known, but the rating :math:`r_{ui}` is not in the
        trainset. As :math:`r_{ui}` is unknown, it is either replaced by the
        :code:`fill` value or assumed to be equal to the mean of all ratings
        :meth:`global_mean <surprise.Trainset.global_mean>`.

        Args:
            fill(float): The value to fill unknown ratings. If :code:`None` the
                global mean of all ratings :meth:`global_mean
                <surprise.Trainset.global_mean>` will be used.

        Returns:
            A list of tuples ``(uid, iid, fill)`` where ids are raw ids.
        """
        fill = self.global_mean if fill is None else float(fill)

        anti_testset = []
        for u in self.all_users():
            # ``ur`` is typically a defaultdict: use a non-inserting lookup so
            # that a user without ratings is not silently added to the
            # trainset (which would make ``knows_user`` return True for them).
            user_items = {j for (j, _) in self.ur.get(u, ())}
            missing_items = [i for i in self.all_items() if i not in user_items]
            if not missing_items:
                continue

            # The raw user id does not depend on the item: resolve it once.
            raw_uid = self.to_raw_uid(u)
            anti_testset.extend(
                (raw_uid, self.to_raw_iid(i), fill) for i in missing_items
            )
        return anti_testset

    def all_users(self):
        """Return an iterable over all users.

        Returns:
            A :obj:`range` over the inner ids of users, i.e.
            ``range(n_users)``.
        """
        return range(self.n_users)

    def all_items(self):
        """Return an iterable over all items.

        Returns:
            A :obj:`range` over the inner ids of items, i.e.
            ``range(n_items)``.
        """
        return range(self.n_items)

    @property
    def global_mean(self):
        """The mean of all ratings :math:`\\mu`.

        Computed with ``numpy.mean`` over :meth:`all_ratings` on first access
        and cached for subsequent accesses.
        """
        if self._global_mean is None:
            self._global_mean = np.mean([r for (_, _, r) in self.all_ratings()])

        return self._global_mean