Source code for surprise.model_selection.split

"""
The :mod:`model_selection.split<surprise.model_selection.split>` module
contains various cross-validation iterators. Design and tools are inspired from
the mighty scikit learn.

The available iterators are:

.. autosummary::
    :nosignatures:

    KFold
    RepeatedKFold
    ShuffleSplit
    LeaveOneOut
    PredefinedKFold

This module also contains a function for splitting datasets into trainset and
testset:

.. autosummary::
    :nosignatures:

    train_test_split

"""

import numbers
from collections import defaultdict
from itertools import chain
from math import ceil, floor

import numpy as np

from ..utils import get_rng


def get_cv(cv):
    """Return a 'validated' CV iterator."""

    if cv is None:
        return KFold(n_splits=5)
    if isinstance(cv, numbers.Integral):
        return KFold(n_splits=cv)
    if hasattr(cv, "split") and not isinstance(cv, str):
        return cv  # str have split

    raise ValueError(
        "Wrong CV object. Expecting None, an int or CV iterator, "
        "got a {}".format(type(cv))
    )



[docs]
class KFold:
    """A basic cross-validation iterator.

    Each fold is used once as a testset while the k - 1 remaining folds are
    used for training.

    See an example in the :ref:`User Guide <use_cross_validation_iterators>`.

    Args:
        n_splits(int): The number of folds.
        random_state(int, RandomState instance from numpy, or ``None``):
            Determines the RNG that will be used for determining the folds. If
            int, ``random_state`` will be used as a seed for a new RNG. This is
            useful to get the same splits over multiple calls to ``split()``.
            If RandomState instance, this same instance is used as RNG. If
            ``None``, the current RNG from numpy is used. ``random_state`` is
            only used if ``shuffle`` is ``True``.  Default is ``None``.
        shuffle(bool): Whether to shuffle the ratings in the ``data`` parameter
            of the ``split()`` method. Shuffling is not done in-place. Default
            is ``True``.
    """

    def __init__(self, n_splits=5, random_state=None, shuffle=True):

        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state


[docs]
    def split(self, data):
        """Generator function to iterate over trainsets and testsets.

        Args:
            data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
                ratings that will be divided into trainsets and testsets.

        Yields:
            tuple of (trainset, testset)
        """

        if self.n_splits > len(data.raw_ratings) or self.n_splits < 2:
            raise ValueError(
                "Incorrect value for n_splits={}. "
                "Must be >=2 and less than the number "
                "of ratings".format(len(data.raw_ratings))
            )

        # We use indices to avoid shuffling the original data.raw_ratings list.
        indices = np.arange(len(data.raw_ratings))

        if self.shuffle:
            get_rng(self.random_state).shuffle(indices)

        start, stop = 0, 0
        for fold_i in range(self.n_splits):
            start = stop
            stop += len(indices) // self.n_splits
            if fold_i < len(indices) % self.n_splits:
                stop += 1

            raw_trainset = [
                data.raw_ratings[i] for i in chain(indices[:start], indices[stop:])
            ]
            raw_testset = [data.raw_ratings[i] for i in indices[start:stop]]

            trainset = data.construct_trainset(raw_trainset)
            testset = data.construct_testset(raw_testset)

            yield trainset, testset


    def get_n_folds(self):

        return self.n_splits




[docs]
class RepeatedKFold:
    """
    Repeated :class:`KFold` cross validator.

    Repeats :class:`KFold` n times with different randomization in each
    repetition.

    See an example in the :ref:`User Guide <use_cross_validation_iterators>`.

    Args:
        n_splits(int): The number of folds.
        n_repeats(int): The number of repetitions.
        random_state(int, RandomState instance from numpy, or ``None``):
            Determines the RNG that will be used for determining the folds. If
            int, ``random_state`` will be used as a seed for a new RNG. This is
            useful to get the same splits over multiple calls to ``split()``.
            If RandomState instance, this same instance is used as RNG. If
            ``None``, the current RNG from numpy is used. ``random_state`` is
            only used if ``shuffle`` is ``True``.  Default is ``None``.
        shuffle(bool): Whether to shuffle the ratings in the ``data`` parameter
            of the ``split()`` method. Shuffling is not done in-place. Default
            is ``True``.
    """

    def __init__(self, n_splits=5, n_repeats=10, random_state=None):

        self.n_repeats = n_repeats
        self.random_state = random_state
        self.n_splits = n_splits


[docs]
    def split(self, data):
        """Generator function to iterate over trainsets and testsets.

        Args:
            data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
                ratings that will be divided into trainsets and testsets.

        Yields:
            tuple of (trainset, testset)
        """

        rng = get_rng(self.random_state)

        for _ in range(self.n_repeats):
            cv = KFold(n_splits=self.n_splits, random_state=rng, shuffle=True)
            yield from cv.split(data)


    def get_n_folds(self):

        return self.n_repeats * self.n_splits




[docs]
class ShuffleSplit:
    """A basic cross-validation iterator with random trainsets and testsets.

    Contrary to other cross-validation strategies, random splits do not
    guarantee that all folds will be different, although this is still very
    likely for sizeable datasets.

    See an example in the :ref:`User Guide <use_cross_validation_iterators>`.

    Args:
        n_splits(int): The number of folds.
        test_size(float or int ``None``): If float, it represents the
            proportion of ratings to include in the testset. If int,
            represents the absolute number of ratings in the testset. If
            ``None``, the value is set to the complement of the trainset size.
            Default is ``.2``.
        train_size(float or int or ``None``): If float, it represents the
            proportion of ratings to include in the trainset. If int,
            represents the absolute number of ratings in the trainset. If
            ``None``, the value is set to the complement of the testset size.
            Default is ``None``.
        random_state(int, RandomState instance from numpy, or ``None``):
            Determines the RNG that will be used for determining the folds. If
            int, ``random_state`` will be used as a seed for a new RNG. This is
            useful to get the same splits over multiple calls to ``split()``.
            If RandomState instance, this same instance is used as RNG. If
            ``None``, the current RNG from numpy is used. ``random_state`` is
            only used if ``shuffle`` is ``True``.  Default is ``None``.
        shuffle(bool): Whether to shuffle the ratings in the ``data`` parameter
            of the ``split()`` method. Shuffling is not done in-place. Setting
            this to `False` defeats the purpose of this iterator, but it's
            useful for the implementation of :func:`train_test_split`. Default
            is ``True``.
    """

    def __init__(
        self,
        n_splits=5,
        test_size=0.2,
        train_size=None,
        random_state=None,
        shuffle=True,
    ):

        if n_splits <= 0:
            raise ValueError(
                "n_splits = {} should be strictly greater than " "0.".format(n_splits)
            )
        if test_size is not None and test_size <= 0:
            raise ValueError(
                "test_size={} should be strictly greater than " "0".format(test_size)
            )

        if train_size is not None and train_size <= 0:
            raise ValueError(
                "train_size={} should be strictly greater than " "0".format(train_size)
            )

        self.n_splits = n_splits
        self.test_size = test_size
        self.train_size = train_size
        self.random_state = random_state
        self.shuffle = shuffle

    def validate_train_test_sizes(self, test_size, train_size, n_ratings):

        if test_size is not None and test_size >= n_ratings:
            raise ValueError(
                "test_size={} should be less than the number of "
                "ratings {}".format(test_size, n_ratings)
            )

        if train_size is not None and train_size >= n_ratings:
            raise ValueError(
                "train_size={} should be less than the number of"
                " ratings {}".format(train_size, n_ratings)
            )

        if np.asarray(test_size).dtype.kind == "f":
            test_size = ceil(test_size * n_ratings)

        if train_size is None:
            train_size = n_ratings - test_size
        elif np.asarray(train_size).dtype.kind == "f":
            train_size = floor(train_size * n_ratings)

        if test_size is None:
            test_size = n_ratings - train_size

        if train_size + test_size > n_ratings:
            raise ValueError(
                "The sum of train_size and test_size ({}) "
                "should be smaller than the number of "
                "ratings {}.".format(train_size + test_size, n_ratings)
            )

        return int(train_size), int(test_size)


[docs]
    def split(self, data):
        """Generator function to iterate over trainsets and testsets.

        Args:
            data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
                ratings that will be divided into trainsets and testsets.

        Yields:
            tuple of (trainset, testset)
        """

        test_size, train_size = self.validate_train_test_sizes(
            self.test_size, self.train_size, len(data.raw_ratings)
        )
        rng = get_rng(self.random_state)

        for _ in range(self.n_splits):

            if self.shuffle:
                permutation = rng.permutation(len(data.raw_ratings))
            else:
                permutation = np.arange(len(data.raw_ratings))

            raw_trainset = [data.raw_ratings[i] for i in permutation[:test_size]]
            raw_testset = [
                data.raw_ratings[i]
                for i in permutation[test_size : (test_size + train_size)]
            ]

            trainset = data.construct_trainset(raw_trainset)
            testset = data.construct_testset(raw_testset)

            yield trainset, testset


    def get_n_folds(self):

        return self.n_splits




[docs]
def train_test_split(
    data, test_size=0.2, train_size=None, random_state=None, shuffle=True
):
    """Split a dataset into trainset and testset.

    See an example in the :ref:`User Guide <train_test_split_example>`.

    Note: this function cannot be used as a cross-validation iterator.

    Args:
        data(:obj:`Dataset <surprise.dataset.Dataset>`): The dataset to split
            into trainset and testset.
        test_size(float or int ``None``): If float, it represents the
            proportion of ratings to include in the testset. If int,
            represents the absolute number of ratings in the testset. If
            ``None``, the value is set to the complement of the trainset size.
            Default is ``.2``.
        train_size(float or int or ``None``): If float, it represents the
            proportion of ratings to include in the trainset. If int,
            represents the absolute number of ratings in the trainset. If
            ``None``, the value is set to the complement of the testset size.
            Default is ``None``.
        random_state(int, RandomState instance from numpy, or ``None``):
            Determines the RNG that will be used for determining the folds. If
            int, ``random_state`` will be used as a seed for a new RNG. This is
            useful to get the same splits over multiple calls to ``split()``.
            If RandomState instance, this same instance is used as RNG. If
            ``None``, the current RNG from numpy is used. ``random_state`` is
            only used if ``shuffle`` is ``True``.  Default is ``None``.
        shuffle(bool): Whether to shuffle the ratings in the ``data``
            parameter. Shuffling is not done in-place. Default is ``True``.
    """
    ss = ShuffleSplit(
        n_splits=1,
        test_size=test_size,
        train_size=train_size,
        random_state=random_state,
        shuffle=shuffle,
    )
    return next(ss.split(data))




[docs]
class LeaveOneOut:
    """Cross-validation iterator where each user has exactly one rating in the
    testset.

    Contrary to other cross-validation strategies, ``LeaveOneOut`` does not
    guarantee that all folds will be different, although this is still very
    likely for sizeable datasets.

    See an example in the :ref:`User Guide <use_cross_validation_iterators>`.

    Args:
        n_splits(int): The number of folds.
        random_state(int, RandomState instance from numpy, or ``None``):
            Determines the RNG that will be used for determining the folds. If
            int, ``random_state`` will be used as a seed for a new RNG. This is
            useful to get the same splits over multiple calls to ``split()``.
            If RandomState instance, this same instance is used as RNG. If
            ``None``, the current RNG from numpy is used. ``random_state`` is
            only used if ``shuffle`` is ``True``.  Default is ``None``.
        min_n_ratings(int): Minimum number of ratings for each user in the
            trainset. E.g. if ``min_n_ratings`` is ``2``, we are sure each user
            has at least ``2`` ratings in the trainset (and ``1`` in the
            testset). Other users are discarded. Default is ``0``, so some
            users (having only one rating) may be in the testset and not in the
            trainset.
    """

    def __init__(self, n_splits=5, random_state=None, min_n_ratings=0):

        self.n_splits = n_splits
        self.random_state = random_state
        self.min_n_ratings = min_n_ratings


[docs]
    def split(self, data):
        """Generator function to iterate over trainsets and testsets.

        Args:
            data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
                ratings that will be divided into trainsets and testsets.

        Yields:
            tuple of (trainset, testset)
        """

        # map ratings to the users ids
        user_ratings = defaultdict(list)
        for uid, iid, r_ui, _ in data.raw_ratings:
            user_ratings[uid].append((uid, iid, r_ui, None))

        rng = get_rng(self.random_state)

        for _ in range(self.n_splits):
            # for each user, randomly choose a rating and put it in the
            # testset.
            raw_trainset, raw_testset = [], []
            for uid, ratings in user_ratings.items():
                if len(ratings) > self.min_n_ratings:
                    i = rng.randint(0, len(ratings))
                    raw_testset.append(ratings[i])
                    raw_trainset += [
                        rating for (j, rating) in enumerate(ratings) if j != i
                    ]

            if not raw_trainset:
                raise ValueError(
                    "Could not build any trainset. Maybe " "min_n_ratings is too high?"
                )
            trainset = data.construct_trainset(raw_trainset)
            testset = data.construct_testset(raw_testset)

            yield trainset, testset


    def get_n_folds(self):

        return self.n_splits




[docs]
class PredefinedKFold:
    """A cross-validation iterator to when a dataset has been loaded with the
    :meth:`load_from_folds <surprise.dataset.Dataset.load_from_folds>`
    method.

    See an example in the :ref:`User Guide <load_from_folds_example>`.
    """


[docs]
    def split(self, data):
        """Generator function to iterate over trainsets and testsets.

        Args:
            data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
                ratings that will be divided into trainsets and testsets.

        Yields:
            tuple of (trainset, testset)
        """

        self.n_splits = len(data.folds_files)
        for train_file, test_file in data.folds_files:

            raw_trainset = data.read_ratings(train_file)
            raw_testset = data.read_ratings(test_file)
            trainset = data.construct_trainset(raw_trainset)
            testset = data.construct_testset(raw_testset)

            yield trainset, testset


    def get_n_folds(self):

        return self.n_splits