Source code for surprise.evaluate

"""
The :mod:`evaluate` module defines the :func:`evaluate` function.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
from collections import defaultdict
import time
import os

import numpy as np
from .six import iteritems
from .six import itervalues

from . import accuracy
from .dump import dump


def evaluate(algo, data, measures=['rmse', 'mae'], with_dump=False,
             dump_dir=None, verbose=1):
    """Evaluate the performance of the algorithm on given data.

    Depending on the nature of the ``data`` parameter, it may or may not
    perform cross validation.

    Args:
        algo(:obj:`AlgoBase <surprise.prediction_algorithms.bases.AlgoBase>`):
            The algorithm to evaluate.
        data(:obj:`Dataset <surprise.dataset.Dataset>`): The dataset on which
            to evaluate the algorithm.
        measures(list of string): The performance measures to compute. Allowed
            names are function names as defined in the :mod:`accuracy
            <surprise.accuracy>` module. Default is ``['rmse', 'mae']``.
        with_dump(bool): If True, the predictions, the trainsets and the
            algorithm parameters will be dumped at each fold for later
            analysis (see :ref:`User Guide <dumping>`). The file names are
            set as ``'<date>-<algorithm name>-<fold number>'``. Default is
            ``False``.
        dump_dir(str): The directory where to dump the files. Default is
            ``'~/.surprise_data/dumps/'``.
        verbose(int): Level of verbosity. If 0, nothing is printed. If 1
            (default), accuracy measures for each fold are printed, with a
            final summary. If 2, every prediction is printed.

    Returns:
        A dictionary containing measures as keys and lists as values. Each
        list contains one entry per fold.
    """

    performances = CaseInsensitiveDefaultDict(list)

    print('Evaluating {0} of algorithm {1}.'.format(
        ', '.join((m.upper() for m in measures)),
        algo.__class__.__name__))
    print()

    for fold_i, (trainset, testset) in enumerate(data.folds()):

        if verbose:
            print('-' * 12)
            print('Fold ' + str(fold_i + 1))

        # train and test algorithm. Keep all rating predictions in a list
        algo.train(trainset)
        predictions = algo.test(testset, verbose=(verbose == 2))

        # compute needed performance statistics
        for measure in measures:
            f = getattr(accuracy, measure.lower())
            performances[measure].append(f(predictions, verbose=verbose))

        if with_dump:

            if dump_dir is None:
                dump_dir = os.path.expanduser('~') + '/.surprise_data/dumps/'

            if not os.path.exists(dump_dir):
                os.makedirs(dump_dir)

            date = time.strftime('%y%m%d-%Hh%Mm%S', time.localtime())
            file_name = date + '-' + algo.__class__.__name__
            file_name += '-fold{0}'.format(fold_i + 1)
            file_name = os.path.join(dump_dir, file_name)

            dump(file_name, predictions, trainset, algo)

    if verbose:
        print('-' * 12)
        print('-' * 12)
        for measure in measures:
            print('Mean {0:4s}: {1:1.4f}'.format(
                measure.upper(), np.mean(performances[measure])))
        print('-' * 12)
        print('-' * 12)

    return performances

class CaseInsensitiveDefaultDict(defaultdict):
    """From here:
    http://stackoverflow.com/questions/2082152/case-insensitive-dictionary.

    As pointed out in the comments, this only covers a few cases and we should
    override a lot of other methods, but oh well...

    Used for the returned dict, so that users can use perf['RMSE'] or
    perf['rmse'] indifferently.
    """

    def __setitem__(self, key, value):
        super(CaseInsensitiveDefaultDict, self).__setitem__(key.lower(), value)

    def __getitem__(self, key):
        return super(CaseInsensitiveDefaultDict, self).__getitem__(key.lower())

    def __str__(self):
        # retrieve number of folds. Kind of ugly...
        n_folds = [len(values) for values in itervalues(self)][0]

        row_format = '{:<8}' * (n_folds + 2)
        s = row_format.format(
            '',
            *['Fold {0}'.format(i + 1) for i in range(n_folds)] + ['Mean'])
        s += '\n'
        s += '\n'.join(row_format.format(
            key.upper(),
            *['{:1.4f}'.format(v) for v in vals] +
            ['{:1.4f}'.format(np.mean(vals))])
            for (key, vals) in iteritems(self))

        return s
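
Example usage: a minimal sketch, assuming the surprise 1.0-era public API in
which ``SVD``, ``Dataset`` and ``evaluate`` are importable from the top-level
``surprise`` package and ``Dataset.split`` defines the cross-validation folds
consumed by ``data.folds()`` above.

    from surprise import SVD, Dataset, evaluate

    # Load the built-in MovieLens-100k dataset and split it into 3 folds.
    data = Dataset.load_builtin('ml-100k')
    data.split(n_folds=3)

    # Evaluate an SVD algorithm with RMSE and MAE on each fold.
    algo = SVD()
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

    # The returned CaseInsensitiveDefaultDict accepts keys in any case and
    # prints as a per-fold table with a final mean column.
    print(perf['rmse'])  # same list as perf['RMSE']
    print(perf)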