"""This module contains the Trainset class."""
import numpy as np
class Trainset:
    """A trainset contains all useful data that constitute a training set.

    It is used by the :meth:`fit()
    <surprise.prediction_algorithms.algo_base.AlgoBase.fit>` method of every
    prediction algorithm. You should not try to build such an object on your
    own but rather use the :meth:`Dataset.folds()
    <surprise.dataset.Dataset.folds>` method or the
    :meth:`DatasetAutoFolds.build_full_trainset()
    <surprise.dataset.DatasetAutoFolds.build_full_trainset>` method.

    Trainsets are different from :class:`Datasets <surprise.dataset.Dataset>`.
    You can think of a :class:`Dataset <surprise.dataset.Dataset>` as the raw
    data, and Trainsets as higher-level data where useful methods are defined.
    Also, a :class:`Dataset <surprise.dataset.Dataset>` may be comprised of
    multiple Trainsets (e.g. when doing cross validation).

    Attributes:
        ur(:obj:`defaultdict` of :obj:`list`): The users ratings. This is a
            dictionary containing lists of tuples of the form
            ``(item_inner_id, rating)``. The keys are user inner ids.
        ir(:obj:`defaultdict` of :obj:`list`): The items ratings. This is a
            dictionary containing lists of tuples of the form
            ``(user_inner_id, rating)``. The keys are item inner ids.
        n_users: Total number of users :math:`|U|`.
        n_items: Total number of items :math:`|I|`.
        n_ratings: Total number of ratings :math:`|R_{train}|`.
        rating_scale(tuple): The minimum and maximum rating of the rating
            scale.
        global_mean: The mean of all ratings :math:`\\mu`.
    """

    def __init__(
        self,
        ur,
        ir,
        n_users,
        n_items,
        n_ratings,
        rating_scale,
        raw2inner_id_users,
        raw2inner_id_items,
    ):
        self.ur = ur
        self.ir = ir
        self.n_users = n_users
        self.n_items = n_items
        self.n_ratings = n_ratings
        self.rating_scale = rating_scale
        self._raw2inner_id_users = raw2inner_id_users
        self._raw2inner_id_items = raw2inner_id_items
        # Computed lazily by the ``global_mean`` property.
        self._global_mean = None
        # inner2raw dicts could be built right now (or even before) but they
        # are not always useful so we wait until we need them.
        self._inner2raw_id_users = None
        self._inner2raw_id_items = None

    def knows_user(self, uid):
        """Indicate if the user is part of the trainset.

        A user is part of the trainset if the user has at least one rating.

        Args:
            uid(int): The (inner) user id. See :ref:`this
                note<raw_inner_note>`.

        Returns:
            ``True`` if user is part of the trainset, else ``False``.
        """
        return uid in self.ur

    def knows_item(self, iid):
        """Indicate if the item is part of the trainset.

        An item is part of the trainset if the item was rated at least once.

        Args:
            iid(int): The (inner) item id. See :ref:`this
                note<raw_inner_note>`.

        Returns:
            ``True`` if item is part of the trainset, else ``False``.
        """
        return iid in self.ir

    def to_inner_uid(self, ruid):
        """Convert a **user** raw id to an inner id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            ruid(str): The user raw id.

        Returns:
            int: The user inner id.

        Raises:
            ValueError: When user is not part of the trainset.
        """
        try:
            return self._raw2inner_id_users[ruid]
        except KeyError:
            # The KeyError carries no extra information; hide it so the
            # traceback shows only the meaningful ValueError.
            raise ValueError(
                f"User {ruid!s} is not part of the trainset."
            ) from None

    def to_raw_uid(self, iuid):
        """Convert a **user** inner id to a raw id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            iuid(int): The user inner id.

        Returns:
            str: The user raw id.

        Raises:
            ValueError: When ``iuid`` is not an inner id.
        """
        # Build the reverse mapping on first use only.
        if self._inner2raw_id_users is None:
            self._inner2raw_id_users = {
                inner: raw for (raw, inner) in self._raw2inner_id_users.items()
            }
        try:
            return self._inner2raw_id_users[iuid]
        except KeyError:
            raise ValueError(f"{iuid!s} is not a valid inner id.") from None

    def to_inner_iid(self, riid):
        """Convert an **item** raw id to an inner id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            riid(str): The item raw id.

        Returns:
            int: The item inner id.

        Raises:
            ValueError: When item is not part of the trainset.
        """
        try:
            return self._raw2inner_id_items[riid]
        except KeyError:
            raise ValueError(
                f"Item {riid!s} is not part of the trainset."
            ) from None

    def to_raw_iid(self, iiid):
        """Convert an **item** inner id to a raw id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            iiid(int): The item inner id.

        Returns:
            str: The item raw id.

        Raises:
            ValueError: When ``iiid`` is not an inner id.
        """
        # Build the reverse mapping on first use only.
        if self._inner2raw_id_items is None:
            self._inner2raw_id_items = {
                inner: raw for (raw, inner) in self._raw2inner_id_items.items()
            }
        try:
            return self._inner2raw_id_items[iiid]
        except KeyError:
            raise ValueError(f"{iiid!s} is not a valid inner id.") from None

    def all_ratings(self):
        """Generator function to iterate over all ratings.

        Yields:
            A tuple ``(uid, iid, rating)`` where ids are inner ids (see
            :ref:`this note <raw_inner_note>`).
        """
        for u, u_ratings in self.ur.items():
            for i, r in u_ratings:
                yield u, i, r

    def build_testset(self):
        """Return a list of ratings that can be used as a testset in the
        :meth:`test() <surprise.prediction_algorithms.algo_base.AlgoBase.test>`
        method.

        The ratings are all the ratings that are in the trainset, i.e. all the
        ratings returned by the :meth:`all_ratings()
        <surprise.Trainset.all_ratings>` generator. This is useful in
        cases where you want to test your algorithm on the trainset.

        Returns:
            A list of tuples ``(uid, iid, rating)`` where ids are raw ids.
        """
        return [
            (self.to_raw_uid(u), self.to_raw_iid(i), r)
            for (u, i, r) in self.all_ratings()
        ]

    def build_anti_testset(self, fill=None):
        """Return a list of ratings that can be used as a testset in the
        :meth:`test() <surprise.prediction_algorithms.algo_base.AlgoBase.test>`
        method.

        The ratings are all the ratings that are **not** in the trainset, i.e.
        all the ratings :math:`r_{ui}` where the user :math:`u` is known, the
        item :math:`i` is known, but the rating :math:`r_{ui}` is not in the
        trainset. As :math:`r_{ui}` is unknown, it is either replaced by the
        :code:`fill` value or assumed to be equal to the mean of all ratings
        :meth:`global_mean <surprise.Trainset.global_mean>`.

        Args:
            fill(float): The value to fill unknown ratings. If :code:`None` the
                global mean of all ratings :meth:`global_mean
                <surprise.Trainset.global_mean>` will be used.

        Returns:
            A list of tuples ``(uid, iid, fill)`` where ids are raw ids.
        """
        fill = self.global_mean if fill is None else float(fill)

        anti_testset = []
        for u in self.all_users():
            # Use .get() rather than self.ur[u]: ``ur`` is a defaultdict, so
            # plain indexing would insert an empty entry for a user without
            # ratings and make knows_user() report that user as known.
            user_items = {j for (j, _) in self.ur.get(u, ())}
            anti_testset += [
                (self.to_raw_uid(u), self.to_raw_iid(i), fill)
                for i in self.all_items()
                if i not in user_items
            ]
        return anti_testset

    def all_users(self):
        """Return an iterable over all users.

        Returns:
            A ``range`` over the inner ids of users, i.e.
            ``range(n_users)``.
        """
        return range(self.n_users)

    def all_items(self):
        """Return an iterable over all items.

        Returns:
            A ``range`` over the inner ids of items, i.e.
            ``range(n_items)``.
        """
        return range(self.n_items)

    @property
    def global_mean(self):
        """The mean of all ratings, computed on first access and cached."""
        if self._global_mean is None:
            self._global_mean = np.mean([r for (_, _, r) in self.all_ratings()])
        return self._global_mean