From eb4d1796bfd6b7cccd28d134cb66d391e50861d8 Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Thu, 9 Nov 2017 13:20:58 +0100
Subject: [PATCH 01/36] Remove deprecated (0.18) cross_validation.py in favor
of model_selection
---
sklearn/__init__.py | 18 +-
sklearn/cross_validation.py | 2075 ------------------------
sklearn/tests/test_cross_validation.py | 1252 --------------
3 files changed, 9 insertions(+), 3336 deletions(-)
delete mode 100644 sklearn/cross_validation.py
delete mode 100644 sklearn/tests/test_cross_validation.py
diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index c45728106ad53..27879e16be363 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -135,15 +135,15 @@ def config_context(**new_config):
__check_build # avoid flakes unused variable error
__all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition',
- 'cross_validation', 'datasets', 'decomposition', 'dummy',
- 'ensemble', 'exceptions', 'externals', 'feature_extraction',
- 'feature_selection', 'gaussian_process', 'grid_search',
- 'isotonic', 'kernel_approximation', 'kernel_ridge',
- 'learning_curve', 'linear_model', 'manifold', 'metrics',
- 'mixture', 'model_selection', 'multiclass', 'multioutput',
- 'naive_bayes', 'neighbors', 'neural_network', 'pipeline',
- 'preprocessing', 'random_projection', 'semi_supervised',
- 'svm', 'tree', 'discriminant_analysis',
+ 'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions',
+ 'externals', 'feature_extraction', 'feature_selection',
+ 'gaussian_process', 'grid_search', 'isotonic',
+ 'kernel_approximation', 'kernel_ridge', 'learning_curve',
+ 'linear_model', 'manifold', 'metrics', 'mixture',
+ 'model_selection', 'multiclass', 'multioutput', 'naive_bayes',
+ 'neighbors', 'neural_network', 'pipeline', 'preprocessing',
+ 'random_projection', 'semi_supervised', 'svm', 'tree',
+ 'discriminant_analysis',
# Non-modules:
'clone']
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
deleted file mode 100644
index 7646459da3936..0000000000000
--- a/sklearn/cross_validation.py
+++ /dev/null
@@ -1,2075 +0,0 @@
-"""
-The :mod:`sklearn.cross_validation` module includes utilities for cross-
-validation and performance evaluation.
-"""
-
-# Author: Alexandre Gramfort,
-#         Gael Varoquaux,
-#         Olivier Grisel
-# License: BSD 3 clause
-
-from __future__ import print_function
-from __future__ import division
-
-import warnings
-from itertools import chain, combinations
-from math import ceil, floor, factorial
-import numbers
-import time
-from abc import ABCMeta, abstractmethod
-
-import numpy as np
-import scipy.sparse as sp
-
-from .base import is_classifier, clone
-from .utils import indexable, check_random_state, safe_indexing
-from .utils.validation import (_is_arraylike, _num_samples,
- column_or_1d)
-from .utils.multiclass import type_of_target
-from .externals.joblib import Parallel, delayed, logger
-from .externals.six import with_metaclass
-from .externals.six.moves import zip
-from .metrics.scorer import check_scoring
-from .gaussian_process.kernels import Kernel as GPKernel
-from .exceptions import FitFailedWarning
-
-
-warnings.warn("This module was deprecated in version 0.18 in favor of the "
- "model_selection module into which all the refactored classes "
- "and functions are moved. Also note that the interface of the "
- "new CV iterators are different from that of this module. "
- "This module will be removed in 0.20.", DeprecationWarning)
-
-
-__all__ = ['KFold',
- 'LabelKFold',
- 'LeaveOneLabelOut',
- 'LeaveOneOut',
- 'LeavePLabelOut',
- 'LeavePOut',
- 'ShuffleSplit',
- 'StratifiedKFold',
- 'StratifiedShuffleSplit',
- 'PredefinedSplit',
- 'LabelShuffleSplit',
- 'check_cv',
- 'cross_val_score',
- 'cross_val_predict',
- 'permutation_test_score',
- 'train_test_split']
-
-
-class _PartitionIterator(with_metaclass(ABCMeta)):
- """Base class for CV iterators where train_mask = ~test_mask
-
- Implementations must define `_iter_test_masks` or `_iter_test_indices`.
-
- Parameters
- ----------
- n : int
- Total number of elements in dataset.
- """
-
- def __init__(self, n):
- if abs(n - int(n)) >= np.finfo('f').eps:
- raise ValueError("n must be an integer")
- self.n = int(n)
-
- def __iter__(self):
- ind = np.arange(self.n)
- for test_index in self._iter_test_masks():
- train_index = np.logical_not(test_index)
- train_index = ind[train_index]
- test_index = ind[test_index]
- yield train_index, test_index
-
- # Since subclasses must implement either _iter_test_masks or
- # _iter_test_indices, neither can be abstract.
- def _iter_test_masks(self):
- """Generates boolean masks corresponding to test sets.
-
- By default, delegates to _iter_test_indices()
- """
- for test_index in self._iter_test_indices():
- test_mask = self._empty_mask()
- test_mask[test_index] = True
- yield test_mask
-
- def _iter_test_indices(self):
- """Generates integer indices corresponding to test sets."""
- raise NotImplementedError
-
- def _empty_mask(self):
- return np.zeros(self.n, dtype=np.bool)
-
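
As a minimal sketch of this protocol, a hypothetical subclass only needs to
define ``_iter_test_indices``; the base class derives the boolean test masks
and the complementary train indices:

>>> import numpy as np
>>> class _TestFirstSample(_PartitionIterator):  # hypothetical subclass
...     def _iter_test_indices(self):
...         yield np.array([0])
>>> [(train.tolist(), test.tolist()) for train, test in _TestFirstSample(3)]
[([1, 2], [0])]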
-
-class LeaveOneOut(_PartitionIterator):
- """Leave-One-Out cross validation iterator.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.LeaveOneOut` instead.
-
- Provides train/test indices to split data in train test sets. Each
- sample is used once as a test set (singleton) while the remaining
- samples form the training set.
-
- Note: ``LeaveOneOut(n)`` is equivalent to ``KFold(n, n_folds=n)`` and
- ``LeavePOut(n, p=1)``.
-
- Due to the high number of test sets (which is the same as the
- number of samples) this cross validation method can be very costly.
- For large datasets one should favor KFold, StratifiedKFold or
- ShuffleSplit.
-
- Read more in the :ref:`User Guide <cross_validation>`.
-
- Parameters
- ----------
- n : int
- Total number of elements in dataset.
-
- Examples
- --------
- >>> from sklearn import cross_validation
- >>> X = np.array([[1, 2], [3, 4]])
- >>> y = np.array([1, 2])
- >>> loo = cross_validation.LeaveOneOut(2)
- >>> len(loo)
- 2
- >>> print(loo)
- sklearn.cross_validation.LeaveOneOut(n=2)
- >>> for train_index, test_index in loo:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- ... print(X_train, X_test, y_train, y_test)
- TRAIN: [1] TEST: [0]
- [[3 4]] [[1 2]] [2] [1]
- TRAIN: [0] TEST: [1]
- [[1 2]] [[3 4]] [1] [2]
-
- See also
- --------
- LeaveOneLabelOut for splitting the data according to explicit,
- domain-specific stratification of the dataset.
- """
-
- def _iter_test_indices(self):
- return range(self.n)
-
- def __repr__(self):
- return '%s.%s(n=%i)' % (
- self.__class__.__module__,
- self.__class__.__name__,
- self.n,
- )
-
- def __len__(self):
- return self.n
-
-
-class LeavePOut(_PartitionIterator):
- """Leave-P-Out cross validation iterator
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.LeavePOut` instead.
-
- Provides train/test indices to split data in train test sets. This results
- in testing on all distinct samples of size p, while the remaining n - p
- samples form the training set in each iteration.
-
- Note: ``LeavePOut(n, p)`` is NOT equivalent to ``KFold(n, n_folds=n // p)``
- which creates non-overlapping test sets.
-
- Due to the high number of iterations which grows combinatorically with the
- number of samples this cross validation method can be very costly. For
- large datasets one should favor KFold, StratifiedKFold or ShuffleSplit.
-
- Read more in the :ref:`User Guide <cross_validation>`.
-
- Parameters
- ----------
- n : int
- Total number of elements in dataset.
-
- p : int
- Size of the test sets.
-
- Examples
- --------
- >>> from sklearn import cross_validation
- >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
- >>> y = np.array([1, 2, 3, 4])
- >>> lpo = cross_validation.LeavePOut(4, 2)
- >>> len(lpo)
- 6
- >>> print(lpo)
- sklearn.cross_validation.LeavePOut(n=4, p=2)
- >>> for train_index, test_index in lpo:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- TRAIN: [2 3] TEST: [0 1]
- TRAIN: [1 3] TEST: [0 2]
- TRAIN: [1 2] TEST: [0 3]
- TRAIN: [0 3] TEST: [1 2]
- TRAIN: [0 2] TEST: [1 3]
- TRAIN: [0 1] TEST: [2 3]
- """
-
- def __init__(self, n, p):
- super(LeavePOut, self).__init__(n)
- self.p = p
-
- def _iter_test_indices(self):
- for comb in combinations(range(self.n), self.p):
- yield np.array(comb)
-
- def __repr__(self):
- return '%s.%s(n=%i, p=%i)' % (
- self.__class__.__module__,
- self.__class__.__name__,
- self.n,
- self.p,
- )
-
- def __len__(self):
- return int(factorial(self.n) / factorial(self.n - self.p)
- / factorial(self.p))
-
-
-class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)):
- """Base class to validate KFold approaches"""
-
- @abstractmethod
- def __init__(self, n, n_folds, shuffle, random_state):
- super(_BaseKFold, self).__init__(n)
-
- if abs(n_folds - int(n_folds)) >= np.finfo('f').eps:
- raise ValueError("n_folds must be an integer")
- self.n_folds = n_folds = int(n_folds)
-
- if n_folds <= 1:
- raise ValueError(
- "k-fold cross validation requires at least one"
- " train / test split by setting n_folds=2 or more,"
- " got n_folds={0}.".format(n_folds))
- if n_folds > self.n:
- raise ValueError(
- ("Cannot have number of folds n_folds={0} greater"
- " than the number of samples: {1}.").format(n_folds, n))
-
- if not isinstance(shuffle, bool):
- raise TypeError("shuffle must be True or False;"
- " got {0}".format(shuffle))
- self.shuffle = shuffle
- self.random_state = random_state
-
-
-class KFold(_BaseKFold):
- """K-Folds cross validation iterator.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.KFold` instead.
-
- Provides train/test indices to split data in train test sets. Split
- dataset into k consecutive folds (without shuffling by default).
-
- Each fold is then used as a validation set once while the k - 1 remaining
- fold(s) form the training set.
-
- Read more in the :ref:`User Guide <cross_validation>`.
-
- Parameters
- ----------
- n : int
- Total number of elements.
-
- n_folds : int, default=3
- Number of folds. Must be at least 2.
-
- shuffle : boolean, optional
- Whether to shuffle the data before splitting into batches.
-
- random_state : int, RandomState instance or None, optional, default=None
- If int, random_state is the seed used by the random number
- generator; If RandomState instance, random_state is the random number
- generator; If None, the random number generator is the RandomState
- instance used by `np.random`. Used when ``shuffle`` == True.
-
- Examples
- --------
- >>> from sklearn.cross_validation import KFold
- >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
- >>> y = np.array([1, 2, 3, 4])
- >>> kf = KFold(4, n_folds=2)
- >>> len(kf)
- 2
- >>> print(kf) # doctest: +NORMALIZE_WHITESPACE
- sklearn.cross_validation.KFold(n=4, n_folds=2, shuffle=False,
- random_state=None)
- >>> for train_index, test_index in kf:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- TRAIN: [2 3] TEST: [0 1]
- TRAIN: [0 1] TEST: [2 3]
-
- Notes
- -----
- The first n % n_folds folds have size n // n_folds + 1, other folds have
- size n // n_folds.
-
- See also
- --------
- StratifiedKFold take label information into account to avoid building
- folds with imbalanced class distributions (for binary or multiclass
- classification tasks).
-
- LabelKFold: K-fold iterator variant with non-overlapping labels.
- """
-
- def __init__(self, n, n_folds=3, shuffle=False,
- random_state=None):
- super(KFold, self).__init__(n, n_folds, shuffle, random_state)
- self.idxs = np.arange(n)
- if shuffle:
- rng = check_random_state(self.random_state)
- rng.shuffle(self.idxs)
-
- def _iter_test_indices(self):
- n = self.n
- n_folds = self.n_folds
- fold_sizes = (n // n_folds) * np.ones(n_folds, dtype=np.int)
- fold_sizes[:n % n_folds] += 1
- current = 0
- for fold_size in fold_sizes:
- start, stop = current, current + fold_size
- yield self.idxs[start:stop]
- current = stop
-
- def __repr__(self):
- return '%s.%s(n=%i, n_folds=%i, shuffle=%s, random_state=%s)' % (
- self.__class__.__module__,
- self.__class__.__name__,
- self.n,
- self.n_folds,
- self.shuffle,
- self.random_state,
- )
-
- def __len__(self):
- return self.n_folds
-
-
-class LabelKFold(_BaseKFold):
- """K-fold iterator variant with non-overlapping labels.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.GroupKFold` instead.
-
- The same label will not appear in two different folds (the number of
- distinct labels has to be at least equal to the number of folds).
-
- The folds are approximately balanced in the sense that the number of
- distinct labels is approximately the same in each fold.
-
- .. versionadded:: 0.17
-
- Parameters
- ----------
- labels : array-like with shape (n_samples, )
- Contains a label for each sample.
- The folds are built so that the same label does not appear in two
- different folds.
-
- n_folds : int, default=3
- Number of folds. Must be at least 2.
-
- Examples
- --------
- >>> from sklearn.cross_validation import LabelKFold
- >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
- >>> y = np.array([1, 2, 3, 4])
- >>> labels = np.array([0, 0, 2, 2])
- >>> label_kfold = LabelKFold(labels, n_folds=2)
- >>> len(label_kfold)
- 2
- >>> print(label_kfold)
- sklearn.cross_validation.LabelKFold(n_labels=4, n_folds=2)
- >>> for train_index, test_index in label_kfold:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- ... print(X_train, X_test, y_train, y_test)
- ...
- TRAIN: [0 1] TEST: [2 3]
- [[1 2]
- [3 4]] [[5 6]
- [7 8]] [1 2] [3 4]
- TRAIN: [2 3] TEST: [0 1]
- [[5 6]
- [7 8]] [[1 2]
- [3 4]] [3 4] [1 2]
-
- See also
- --------
- LeaveOneLabelOut for splitting the data according to explicit,
- domain-specific stratification of the dataset.
- """
- def __init__(self, labels, n_folds=3):
- super(LabelKFold, self).__init__(len(labels), n_folds,
- shuffle=False, random_state=None)
-
- unique_labels, labels = np.unique(labels, return_inverse=True)
- n_labels = len(unique_labels)
-
- if n_folds > n_labels:
- raise ValueError(
- ("Cannot have number of folds n_folds={0} greater"
- " than the number of labels: {1}.").format(n_folds,
- n_labels))
-
- # Weight labels by their number of occurrences
- n_samples_per_label = np.bincount(labels)
-
- # Distribute the most frequent labels first
- indices = np.argsort(n_samples_per_label)[::-1]
- n_samples_per_label = n_samples_per_label[indices]
-
- # Total weight of each fold
- n_samples_per_fold = np.zeros(n_folds)
-
- # Mapping from label index to fold index
- label_to_fold = np.zeros(len(unique_labels))
-
- # Distribute samples by adding the largest weight to the lightest fold
- for label_index, weight in enumerate(n_samples_per_label):
- lightest_fold = np.argmin(n_samples_per_fold)
- n_samples_per_fold[lightest_fold] += weight
- label_to_fold[indices[label_index]] = lightest_fold
-
- self.idxs = label_to_fold[labels]
-
- def _iter_test_indices(self):
- for f in range(self.n_folds):
- yield np.where(self.idxs == f)[0]
-
- def __repr__(self):
- return '{0}.{1}(n_labels={2}, n_folds={3})'.format(
- self.__class__.__module__,
- self.__class__.__name__,
- self.n,
- self.n_folds,
- )
-
- def __len__(self):
- return self.n_folds
-
-
-class StratifiedKFold(_BaseKFold):
- """Stratified K-Folds cross validation iterator
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.StratifiedKFold` instead.
-
- Provides train/test indices to split data in train test sets.
-
- This cross-validation object is a variation of KFold that
- returns stratified folds. The folds are made by preserving
- the percentage of samples for each class.
-
- Read more in the :ref:`User Guide <cross_validation>`.
-
- Parameters
- ----------
- y : array-like, [n_samples]
- Samples to split in K folds.
-
- n_folds : int, default=3
- Number of folds. Must be at least 2.
-
- shuffle : boolean, optional
- Whether to shuffle each stratification of the data before splitting
- into batches.
-
- random_state : int, RandomState instance or None, optional, default=None
- If int, random_state is the seed used by the random number
- generator; If RandomState instance, random_state is the random number
- generator; If None, the random number generator is the RandomState
- instance used by `np.random`. Used when ``shuffle`` == True.
-
- Examples
- --------
- >>> from sklearn.cross_validation import StratifiedKFold
- >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
- >>> y = np.array([0, 0, 1, 1])
- >>> skf = StratifiedKFold(y, n_folds=2)
- >>> len(skf)
- 2
- >>> print(skf) # doctest: +NORMALIZE_WHITESPACE
- sklearn.cross_validation.StratifiedKFold(labels=[0 0 1 1], n_folds=2,
- shuffle=False, random_state=None)
- >>> for train_index, test_index in skf:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- TRAIN: [1 3] TEST: [0 2]
- TRAIN: [0 2] TEST: [1 3]
-
- Notes
- -----
- All the folds have size trunc(n_samples / n_folds); the last one has
- the complementary size.
-
- See also
- --------
- LabelKFold: K-fold iterator variant with non-overlapping labels.
- """
-
- def __init__(self, y, n_folds=3, shuffle=False,
- random_state=None):
- super(StratifiedKFold, self).__init__(
- len(y), n_folds, shuffle, random_state)
- y = np.asarray(y)
- n_samples = y.shape[0]
- unique_labels, y_inversed = np.unique(y, return_inverse=True)
- label_counts = np.bincount(y_inversed)
- min_labels = np.min(label_counts)
- if np.all(self.n_folds > label_counts):
- raise ValueError("All the n_labels for individual classes"
- " are less than %d folds."
- % (self.n_folds))
- if self.n_folds > min_labels:
- warnings.warn(("The least populated class in y has only %d"
- " members, which is too few. The minimum"
- " number of labels for any class cannot"
- " be less than n_folds=%d."
- % (min_labels, self.n_folds)), Warning)
-
- # don't want to use the same seed in each label's shuffle
- if self.shuffle:
- rng = check_random_state(self.random_state)
- else:
- rng = self.random_state
-
- # pre-assign each sample to a test fold index using individual KFold
- # splitting strategies for each label so as to respect the
- # balance of labels
- per_label_cvs = [
- KFold(max(c, self.n_folds), self.n_folds, shuffle=self.shuffle,
- random_state=rng) for c in label_counts]
- test_folds = np.zeros(n_samples, dtype=np.int)
- for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)):
- for label, (_, test_split) in zip(unique_labels, per_label_splits):
- label_test_folds = test_folds[y == label]
- # the test split can be too big because we used
- # KFold(max(c, self.n_folds), self.n_folds) instead of
- # KFold(c, self.n_folds) to make it possible to not crash even
- # if the data is not 100% stratifiable for all the labels
- # (we use a warning instead of raising an exception)
- # If this is the case, let's trim it:
- test_split = test_split[test_split < len(label_test_folds)]
- label_test_folds[test_split] = test_fold_idx
- test_folds[y == label] = label_test_folds
-
- self.test_folds = test_folds
- self.y = y
-
- def _iter_test_masks(self):
- for i in range(self.n_folds):
- yield self.test_folds == i
-
- def __repr__(self):
- return '%s.%s(labels=%s, n_folds=%i, shuffle=%s, random_state=%s)' % (
- self.__class__.__module__,
- self.__class__.__name__,
- self.y,
- self.n_folds,
- self.shuffle,
- self.random_state,
- )
-
- def __len__(self):
- return self.n_folds
-
-
-class LeaveOneLabelOut(_PartitionIterator):
- """Leave-One-Label_Out cross-validation iterator
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.LeaveOneGroupOut` instead.
-
- Provides train/test indices to split data according to a third-party
- provided label. This label information can be used to encode arbitrary
- domain specific stratifications of the samples as integers.
-
- For instance the labels could be the year of collection of the samples
- and thus allow for cross-validation against time-based splits.
-
- Read more in the :ref:`User Guide <cross_validation>`.
-
- Parameters
- ----------
- labels : array-like of int with shape (n_samples,)
- Arbitrary domain-specific stratification of the data to be used
- to draw the splits.
-
- Examples
- --------
- >>> from sklearn import cross_validation
- >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
- >>> y = np.array([1, 2, 1, 2])
- >>> labels = np.array([1, 1, 2, 2])
- >>> lol = cross_validation.LeaveOneLabelOut(labels)
- >>> len(lol)
- 2
- >>> print(lol)
- sklearn.cross_validation.LeaveOneLabelOut(labels=[1 1 2 2])
- >>> for train_index, test_index in lol:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- ... print(X_train, X_test, y_train, y_test)
- TRAIN: [2 3] TEST: [0 1]
- [[5 6]
- [7 8]] [[1 2]
- [3 4]] [1 2] [1 2]
- TRAIN: [0 1] TEST: [2 3]
- [[1 2]
- [3 4]] [[5 6]
- [7 8]] [1 2] [1 2]
-
- See also
- --------
- LabelKFold: K-fold iterator variant with non-overlapping labels.
- """
-
- def __init__(self, labels):
- super(LeaveOneLabelOut, self).__init__(len(labels))
- # We make a copy of labels to avoid side-effects during iteration
- self.labels = np.array(labels, copy=True)
- self.unique_labels = np.unique(labels)
- self.n_unique_labels = len(self.unique_labels)
-
- def _iter_test_masks(self):
- for i in self.unique_labels:
- yield self.labels == i
-
- def __repr__(self):
- return '%s.%s(labels=%s)' % (
- self.__class__.__module__,
- self.__class__.__name__,
- self.labels,
- )
-
- def __len__(self):
- return self.n_unique_labels
-
-
-class LeavePLabelOut(_PartitionIterator):
- """Leave-P-Label_Out cross-validation iterator
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.LeavePGroupsOut` instead.
-
- Provides train/test indices to split data according to a third-party
- provided label. This label information can be used to encode arbitrary
- domain specific stratifications of the samples as integers.
-
- For instance the labels could be the year of collection of the samples
- and thus allow for cross-validation against time-based splits.
-
- The difference between LeavePLabelOut and LeaveOneLabelOut is that
- the former builds the test sets with all the samples assigned to
- ``p`` different values of the labels, while the latter builds each
- test set from the samples that all share a single label.
-
- Read more in the :ref:`User Guide <cross_validation>`.
-
- Parameters
- ----------
- labels : array-like of int with shape (n_samples,)
- Arbitrary domain-specific stratification of the data to be used
- to draw the splits.
-
- p : int
- Number of samples to leave out in the test split.
-
- Examples
- --------
- >>> from sklearn import cross_validation
- >>> X = np.array([[1, 2], [3, 4], [5, 6]])
- >>> y = np.array([1, 2, 1])
- >>> labels = np.array([1, 2, 3])
- >>> lpl = cross_validation.LeavePLabelOut(labels, p=2)
- >>> len(lpl)
- 3
- >>> print(lpl)
- sklearn.cross_validation.LeavePLabelOut(labels=[1 2 3], p=2)
- >>> for train_index, test_index in lpl:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- ... print(X_train, X_test, y_train, y_test)
- TRAIN: [2] TEST: [0 1]
- [[5 6]] [[1 2]
- [3 4]] [1] [1 2]
- TRAIN: [1] TEST: [0 2]
- [[3 4]] [[1 2]
- [5 6]] [2] [1 1]
- TRAIN: [0] TEST: [1 2]
- [[1 2]] [[3 4]
- [5 6]] [1] [2 1]
-
- See also
- --------
- LabelKFold: K-fold iterator variant with non-overlapping labels.
- """
-
- def __init__(self, labels, p):
- # We make a copy of labels to avoid side-effects during iteration
- super(LeavePLabelOut, self).__init__(len(labels))
- self.labels = np.array(labels, copy=True)
- self.unique_labels = np.unique(labels)
- self.n_unique_labels = len(self.unique_labels)
- self.p = p
-
- def _iter_test_masks(self):
- comb = combinations(range(self.n_unique_labels), self.p)
- for idx in comb:
- test_index = self._empty_mask()
- idx = np.array(idx)
- for l in self.unique_labels[idx]:
- test_index[self.labels == l] = True
- yield test_index
-
- def __repr__(self):
- return '%s.%s(labels=%s, p=%s)' % (
- self.__class__.__module__,
- self.__class__.__name__,
- self.labels,
- self.p,
- )
-
- def __len__(self):
- return int(factorial(self.n_unique_labels) /
- factorial(self.n_unique_labels - self.p) /
- factorial(self.p))
-
-
-class BaseShuffleSplit(with_metaclass(ABCMeta)):
- """Base class for ShuffleSplit and StratifiedShuffleSplit"""
-
- def __init__(self, n, n_iter=10, test_size=0.1, train_size=None,
- random_state=None):
- self.n = n
- self.n_iter = n_iter
- self.test_size = test_size
- self.train_size = train_size
- self.random_state = random_state
- self.n_train, self.n_test = _validate_shuffle_split(n, test_size,
- train_size)
-
- def __iter__(self):
- for train, test in self._iter_indices():
- yield train, test
- return
-
- @abstractmethod
- def _iter_indices(self):
- """Generate (train, test) indices"""
-
-
-class ShuffleSplit(BaseShuffleSplit):
- """Random permutation cross-validation iterator.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.ShuffleSplit` instead.
-
- Yields indices to split data into training and test sets.
-
- Note: contrary to other cross-validation strategies, random splits
- do not guarantee that all folds will be different, although this is
- still very likely for sizeable datasets.
-
- Read more in the :ref:`User Guide <cross_validation>`.
-
- Parameters
- ----------
- n : int
- Total number of elements in the dataset.
-
- n_iter : int (default 10)
- Number of re-shuffling & splitting iterations.
-
- test_size : float (default 0.1), int, or None
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the dataset to include in the test split. If
- int, represents the absolute number of test samples. If None,
- the value is automatically set to the complement of the train size.
-
- train_size : float, int, or None (default is None)
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the dataset to include in the train split. If
- int, represents the absolute number of train samples. If None,
- the value is automatically set to the complement of the test size.
-
- random_state : int, RandomState instance or None, optional (default None)
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- Examples
- --------
- >>> from sklearn import cross_validation
- >>> rs = cross_validation.ShuffleSplit(4, n_iter=3,
- ... test_size=.25, random_state=0)
- >>> len(rs)
- 3
- >>> print(rs)
- ... # doctest: +ELLIPSIS
- ShuffleSplit(4, n_iter=3, test_size=0.25, ...)
- >>> for train_index, test_index in rs:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ...
- TRAIN: [3 1 0] TEST: [2]
- TRAIN: [2 1 3] TEST: [0]
- TRAIN: [0 2 1] TEST: [3]
-
- >>> rs = cross_validation.ShuffleSplit(4, n_iter=3,
- ... train_size=0.5, test_size=.25, random_state=0)
- >>> for train_index, test_index in rs:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ...
- TRAIN: [3 1] TEST: [2]
- TRAIN: [2 1] TEST: [0]
- TRAIN: [0 2] TEST: [3]
-
- """
-
- def _iter_indices(self):
- rng = check_random_state(self.random_state)
- for i in range(self.n_iter):
- # random partition
- permutation = rng.permutation(self.n)
- ind_test = permutation[:self.n_test]
- ind_train = permutation[self.n_test:self.n_test + self.n_train]
- yield ind_train, ind_test
-
- def __repr__(self):
- return ('%s(%d, n_iter=%d, test_size=%s, '
- 'random_state=%s)' % (
- self.__class__.__name__,
- self.n,
- self.n_iter,
- str(self.test_size),
- self.random_state,
- ))
-
- def __len__(self):
- return self.n_iter
-
-
-def _validate_shuffle_split(n, test_size, train_size):
- if test_size is None and train_size is None:
- raise ValueError(
- 'test_size and train_size can not both be None')
-
- if test_size is not None:
- if np.asarray(test_size).dtype.kind == 'f':
- if test_size >= 1.:
- raise ValueError(
- 'test_size=%f should be smaller '
- 'than 1.0 or be an integer' % test_size)
- elif np.asarray(test_size).dtype.kind == 'i':
- if test_size >= n:
- raise ValueError(
- 'test_size=%d should be smaller '
- 'than the number of samples %d' % (test_size, n))
- else:
- raise ValueError("Invalid value for test_size: %r" % test_size)
-
- if train_size is not None:
- if np.asarray(train_size).dtype.kind == 'f':
- if train_size >= 1.:
- raise ValueError("train_size=%f should be smaller "
- "than 1.0 or be an integer" % train_size)
- elif np.asarray(test_size).dtype.kind == 'f' and \
- train_size + test_size > 1.:
- raise ValueError('The sum of test_size and train_size = %f, '
- 'should be smaller than 1.0. Reduce '
- 'test_size and/or train_size.' %
- (train_size + test_size))
- elif np.asarray(train_size).dtype.kind == 'i':
- if train_size >= n:
- raise ValueError("train_size=%d should be smaller "
- "than the number of samples %d" %
- (train_size, n))
- else:
- raise ValueError("Invalid value for train_size: %r" % train_size)
-
- if np.asarray(test_size).dtype.kind == 'f':
- n_test = ceil(test_size * n)
- elif np.asarray(test_size).dtype.kind == 'i':
- n_test = float(test_size)
-
- if train_size is None:
- n_train = n - n_test
- else:
- if np.asarray(train_size).dtype.kind == 'f':
- n_train = floor(train_size * n)
- else:
- n_train = float(train_size)
-
- if test_size is None:
- n_test = n - n_train
-
- if n_train + n_test > n:
- raise ValueError('The sum of train_size and test_size = %d, '
- 'should be smaller than the number of '
- 'samples %d. Reduce test_size and/or '
- 'train_size.' % (n_train + n_test, n))
-
- return int(n_train), int(n_test)
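
A doctest-style sketch of the resolution rules above: float test sizes are
ceiled, float train sizes are floored, and a ``None`` size becomes the
complement of the other:

>>> _validate_shuffle_split(10, 0.2, None)
(8, 2)
>>> _validate_shuffle_split(10, 3, 5)
(5, 3)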
-
-
-def _approximate_mode(class_counts, n_draws, rng):
- """Computes approximate mode of multivariate hypergeometric.
-
- This is an approximation to the mode of the multivariate
- hypergeometric given by class_counts and n_draws.
- It shouldn't be off by more than one.
-
- It is the most likely outcome of drawing n_draws
- samples from the population given by class_counts.
-
- Parameters
- ----------
- class_counts : ndarray of int
- Population per class.
- n_draws : int
- Number of draws (samples to draw) from the overall population.
- rng : random state
- Used to break ties.
-
- Returns
- -------
- sampled_classes : ndarray of int
- Number of samples drawn from each class.
- np.sum(sampled_classes) == n_draws
- """
- # this computes a bad approximation to the mode of the
- # multivariate hypergeometric given by class_counts and n_draws
- continuous = n_draws * class_counts / class_counts.sum()
- # floored means we don't overshoot n_samples, but probably undershoot
- floored = np.floor(continuous)
- # we add samples according to how much "left over" probability
- # they had, until we arrive at n_samples
- need_to_add = int(n_draws - floored.sum())
- if need_to_add > 0:
- remainder = continuous - floored
- values = np.sort(np.unique(remainder))[::-1]
- # add according to remainder, but break ties
- # randomly to avoid biases
- for value in values:
- inds, = np.where(remainder == value)
- # if we need_to_add less than what's in inds
- # we draw randomly from them.
- # if we need to add more, we add them all and
- # go to the next value
- add_now = min(len(inds), need_to_add)
- inds = rng.choice(inds, size=add_now, replace=False)
- floored[inds] += 1
- need_to_add -= add_now
- if need_to_add == 0:
- break
- return floored.astype(np.int)
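
A quick sketch on a case with no leftover probability mass, where the
floored allocation is already exact and the tie-breaking RNG is never
consulted:

>>> import numpy as np
>>> _approximate_mode(np.array([4, 2]), 3, np.random.RandomState(0))
array([2, 1])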
-
-
-class StratifiedShuffleSplit(BaseShuffleSplit):
- """Stratified ShuffleSplit cross validation iterator
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.StratifiedShuffleSplit` instead.
-
- Provides train/test indices to split data in train test sets.
-
- This cross-validation object is a merge of StratifiedKFold and
- ShuffleSplit, which returns stratified randomized folds. The folds
- are made by preserving the percentage of samples for each class.
-
- Note: like the ShuffleSplit strategy, stratified random splits
- do not guarantee that all folds will be different, although this is
- still very likely for sizeable datasets.
-
- Read more in the :ref:`User Guide <cross_validation>`.
-
- Parameters
- ----------
- y : array, [n_samples]
- Labels of samples.
-
- n_iter : int (default 10)
- Number of re-shuffling & splitting iterations.
-
- test_size : float (default 0.1), int, or None
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the dataset to include in the test split. If
- int, represents the absolute number of test samples. If None,
- the value is automatically set to the complement of the train size.
-
- train_size : float, int, or None (default is None)
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the dataset to include in the train split. If
- int, represents the absolute number of train samples. If None,
- the value is automatically set to the complement of the test size.
-
- random_state : int, RandomState instance or None, optional (default None)
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- Examples
- --------
- >>> from sklearn.cross_validation import StratifiedShuffleSplit
- >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
- >>> y = np.array([0, 0, 1, 1])
- >>> sss = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0)
- >>> len(sss)
- 3
- >>> print(sss) # doctest: +ELLIPSIS
- StratifiedShuffleSplit(labels=[0 0 1 1], n_iter=3, ...)
- >>> for train_index, test_index in sss:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- TRAIN: [1 2] TEST: [3 0]
- TRAIN: [0 2] TEST: [1 3]
- TRAIN: [0 2] TEST: [3 1]
- """
-
- def __init__(self, y, n_iter=10, test_size=0.1, train_size=None,
- random_state=None):
-
- super(StratifiedShuffleSplit, self).__init__(
- len(y), n_iter, test_size, train_size, random_state)
-
- self.y = np.array(y)
- self.classes, self.y_indices = np.unique(y, return_inverse=True)
- n_cls = self.classes.shape[0]
-
- if np.min(np.bincount(self.y_indices)) < 2:
- raise ValueError("The least populated class in y has only 1"
- " member, which is too few. The minimum"
- " number of labels for any class cannot"
- " be less than 2.")
-
- if self.n_train < n_cls:
- raise ValueError('The train_size = %d should be greater or '
- 'equal to the number of classes = %d' %
- (self.n_train, n_cls))
- if self.n_test < n_cls:
- raise ValueError('The test_size = %d should be greater or '
- 'equal to the number of classes = %d' %
- (self.n_test, n_cls))
-
- def _iter_indices(self):
- rng = check_random_state(self.random_state)
- cls_count = np.bincount(self.y_indices)
-
- for n in range(self.n_iter):
- # if there are ties in the class-counts, we want
- # to make sure to break them anew in each iteration
- n_i = _approximate_mode(cls_count, self.n_train, rng)
- class_counts_remaining = cls_count - n_i
- t_i = _approximate_mode(class_counts_remaining, self.n_test, rng)
-
- train = []
- test = []
-
- for i, _ in enumerate(self.classes):
- permutation = rng.permutation(cls_count[i])
- perm_indices_class_i = np.where(
- (i == self.y_indices))[0][permutation]
-
- train.extend(perm_indices_class_i[:n_i[i]])
- test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])
- train = rng.permutation(train)
- test = rng.permutation(test)
-
- yield train, test
-
- def __repr__(self):
- return ('%s(labels=%s, n_iter=%d, test_size=%s, '
- 'random_state=%s)' % (
- self.__class__.__name__,
- self.y,
- self.n_iter,
- str(self.test_size),
- self.random_state,
- ))
-
- def __len__(self):
- return self.n_iter
-
-
-class PredefinedSplit(_PartitionIterator):
- """Predefined split cross validation iterator
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.PredefinedSplit` instead.
-
- Splits the data into training/test set folds according to a predefined
- scheme. Each sample can be assigned to at most one test set fold, as
- specified by the user through the ``test_fold`` parameter.
-
- Read more in the :ref:`User Guide <cross_validation>`.
-
- Parameters
- ----------
- test_fold : "array-like, shape (n_samples,)
- test_fold[i] gives the test set fold of sample i. A value of -1
- indicates that the corresponding sample is not part of any test set
- folds, but will instead always be put into the training fold.
-
- Examples
- --------
- >>> from sklearn.cross_validation import PredefinedSplit
- >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
- >>> y = np.array([0, 0, 1, 1])
- >>> ps = PredefinedSplit(test_fold=[0, 1, -1, 1])
- >>> len(ps)
- 2
- >>> print(ps) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
- sklearn.cross_validation.PredefinedSplit(test_fold=[ 0 1 -1 1])
- >>> for train_index, test_index in ps:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- TRAIN: [1 2 3] TEST: [0]
- TRAIN: [0 2] TEST: [1 3]
- """
-
- def __init__(self, test_fold):
- super(PredefinedSplit, self).__init__(len(test_fold))
- self.test_fold = np.array(test_fold, dtype=np.int)
- self.test_fold = column_or_1d(self.test_fold)
- self.unique_folds = np.unique(self.test_fold)
- self.unique_folds = self.unique_folds[self.unique_folds != -1]
-
- def _iter_test_indices(self):
- for f in self.unique_folds:
- yield np.where(self.test_fold == f)[0]
-
- def __repr__(self):
- return '%s.%s(test_fold=%s)' % (
- self.__class__.__module__,
- self.__class__.__name__,
- self.test_fold)
-
- def __len__(self):
- return len(self.unique_folds)
-
-
-class LabelShuffleSplit(ShuffleSplit):
- """Shuffle-Labels-Out cross-validation iterator
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.GroupShuffleSplit` instead.
-
- Provides randomized train/test indices to split data according to a
- third-party provided label. This label information can be used to encode
- arbitrary domain specific stratifications of the samples as integers.
-
- For instance the labels could be the year of collection of the samples
- and thus allow for cross-validation against time-based splits.
-
- The difference between LeavePLabelOut and LabelShuffleSplit is that
- the former generates splits using all subsets of size ``p`` unique labels,
- whereas LabelShuffleSplit generates a user-determined number of random
- test splits, each with a user-determined fraction of unique labels.
-
- For example, a less computationally intensive alternative to
- ``LeavePLabelOut(labels, p=10)`` would be
- ``LabelShuffleSplit(labels, test_size=10, n_iter=100)``.
-
- Note: The parameters ``test_size`` and ``train_size`` refer to labels,
- not to samples as they do in ShuffleSplit.
-
- .. versionadded:: 0.17
-
- Parameters
- ----------
- labels : array, [n_samples]
- Labels of samples
-
- n_iter : int (default 5)
- Number of re-shuffling and splitting iterations.
-
- test_size : float (default 0.2), int, or None
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the labels to include in the test split. If
- int, represents the absolute number of test labels. If None,
- the value is automatically set to the complement of the train size.
-
- train_size : float, int, or None (default is None)
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the labels to include in the train split. If
- int, represents the absolute number of train labels. If None,
- the value is automatically set to the complement of the test size.
-
- random_state : int, RandomState instance or None, optional (default None)
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- """
- def __init__(self, labels, n_iter=5, test_size=0.2, train_size=None,
- random_state=None):
-
- classes, label_indices = np.unique(labels, return_inverse=True)
-
- super(LabelShuffleSplit, self).__init__(
- len(classes),
- n_iter=n_iter,
- test_size=test_size,
- train_size=train_size,
- random_state=random_state)
-
- self.labels = labels
- self.classes = classes
- self.label_indices = label_indices
-
- def __repr__(self):
- return ('%s(labels=%s, n_iter=%d, test_size=%s, '
- 'random_state=%s)' % (
- self.__class__.__name__,
- self.labels,
- self.n_iter,
- str(self.test_size),
- self.random_state,
- ))
-
- def __len__(self):
- return self.n_iter
-
- def _iter_indices(self):
- for label_train, label_test in super(LabelShuffleSplit,
- self)._iter_indices():
- # these are the indices of classes in the partition
- # invert them into data indices
-
- train = np.flatnonzero(np.in1d(self.label_indices, label_train))
- test = np.flatnonzero(np.in1d(self.label_indices, label_test))
-
- yield train, test
-
-
-##############################################################################
-def _index_param_value(X, v, indices):
- """Private helper function for parameter value indexing."""
- if not _is_arraylike(v) or _num_samples(v) != _num_samples(X):
- # pass through: skip indexing
- return v
- if sp.issparse(v):
- v = v.tocsr()
- return safe_indexing(v, indices)
-
-
-def cross_val_predict(estimator, X, y=None, cv=None, n_jobs=1,
- verbose=0, fit_params=None, pre_dispatch='2*n_jobs'):
- """Generate cross-validated estimates for each input data point
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :func:`sklearn.model_selection.cross_val_predict` instead.
-
- Read more in the :ref:`User Guide <cross_validation>`.
-
- Parameters
- ----------
- estimator : estimator object implementing 'fit' and 'predict'
- The object to use to fit the data.
-
- X : array-like
- The data to fit. Can be, for example a list, or an array at least 2d.
-
- y : array-like, optional, default: None
- The target variable to try to predict in the case of
- supervised learning.
-
- cv : int, cross-validation generator or an iterable, optional
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
-
- - None, to use the default 3-fold cross-validation,
- - integer, to specify the number of folds.
- - An object to be used as a cross-validation generator.
- - An iterable yielding train/test splits.
-
- For integer/None inputs, if the estimator is a classifier and ``y`` is
- either binary or multiclass, :class:`StratifiedKFold` is used. In all
- other cases, :class:`KFold` is used.
-
- Refer :ref:`User Guide <cross_validation>` for the various
- cross-validation strategies that can be used here.
-
- n_jobs : integer, optional
- The number of CPUs to use to do the computation. -1 means
- 'all CPUs'.
-
- verbose : integer, optional
- The verbosity level.
-
- fit_params : dict, optional
- Parameters to pass to the fit method of the estimator.
-
- pre_dispatch : int, or string, optional
- Controls the number of jobs that get dispatched during parallel
- execution. Reducing this number can be useful to avoid an
- explosion of memory consumption when more jobs get dispatched
- than CPUs can process. This parameter can be:
-
- - None, in which case all the jobs are immediately
- created and spawned. Use this for lightweight and
- fast-running jobs, to avoid delays due to on-demand
- spawning of the jobs
-
- - An int, giving the exact number of total jobs that are
- spawned
-
- - A string, giving an expression as a function of n_jobs,
- as in '2*n_jobs'
-
- Returns
- -------
- preds : ndarray
- This is the result of calling 'predict'
-
- Examples
- --------
- >>> from sklearn import datasets, linear_model
- >>> from sklearn.cross_validation import cross_val_predict
- >>> diabetes = datasets.load_diabetes()
- >>> X = diabetes.data[:150]
- >>> y = diabetes.target[:150]
- >>> lasso = linear_model.Lasso()
- >>> y_pred = cross_val_predict(lasso, X, y)
- """
- X, y = indexable(X, y)
-
- cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
- # We clone the estimator to make sure that all the folds are
- # independent, and that it is pickle-able.
- parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
- pre_dispatch=pre_dispatch)
- preds_blocks = parallel(delayed(_fit_and_predict)(clone(estimator), X, y,
- train, test, verbose,
- fit_params)
- for train, test in cv)
-
- preds = [p for p, _ in preds_blocks]
- locs = np.concatenate([loc for _, loc in preds_blocks])
- if not _check_is_partition(locs, _num_samples(X)):
- raise ValueError('cross_val_predict only works for partitions')
- inv_locs = np.empty(len(locs), dtype=int)
- inv_locs[locs] = np.arange(len(locs))
-
- # Check for sparse predictions
- if sp.issparse(preds[0]):
- preds = sp.vstack(preds, format=preds[0].format)
- else:
- preds = np.concatenate(preds)
- return preds[inv_locs]
-
-
-def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params):
- """Fit estimator and predict values for a given dataset split.
-
- Read more in the :ref:`User Guide <cross_validation>`.
-
- Parameters
- ----------
- estimator : estimator object implementing 'fit' and 'predict'
- The object to use to fit the data.
-
- X : array-like of shape at least 2D
- The data to fit.
-
- y : array-like, optional, default: None
- The target variable to try to predict in the case of
- supervised learning.
-
- train : array-like, shape (n_train_samples,)
- Indices of training samples.
-
- test : array-like, shape (n_test_samples,)
- Indices of test samples.
-
- verbose : integer
- The verbosity level.
-
- fit_params : dict or None
- Parameters that will be passed to ``estimator.fit``.
-
- Returns
- -------
- preds : sequence
- Result of calling 'estimator.predict'
-
- test : array-like
- This is the value of the test parameter
- """
- # Adjust length of sample weights
- fit_params = fit_params if fit_params is not None else {}
- fit_params = dict([(k, _index_param_value(X, v, train))
- for k, v in fit_params.items()])
-
- X_train, y_train = _safe_split(estimator, X, y, train)
- X_test, _ = _safe_split(estimator, X, y, test, train)
-
- if y_train is None:
- estimator.fit(X_train, **fit_params)
- else:
- estimator.fit(X_train, y_train, **fit_params)
- preds = estimator.predict(X_test)
- return preds, test
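
A minimal sketch of this helper in isolation (``Ridge`` is an arbitrary
estimator choice for illustration):

>>> import numpy as np
>>> from sklearn.linear_model import Ridge
>>> X, y = np.arange(10.).reshape(5, 2), np.arange(5.)
>>> preds, test = _fit_and_predict(Ridge(), X, y, np.array([0, 1, 2]),
...                                np.array([3, 4]), 0, None)
>>> preds.shape, test.tolist()
((2,), [3, 4])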
-
-
-def _check_is_partition(locs, n):
- """Check whether locs is a reordering of the array np.arange(n)
-
- Parameters
- ----------
- locs : ndarray
- integer array to test
- n : int
- number of expected elements
-
- Returns
- -------
- is_partition : bool
- True iff sorted(locs) is range(n)
- """
- if len(locs) != n:
- return False
- hit = np.zeros(n, bool)
- hit[locs] = True
- if not np.all(hit):
- return False
- return True
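
Two doctest-style cases illustrating the check:

>>> import numpy as np
>>> _check_is_partition(np.array([2, 0, 1]), 3)
True
>>> _check_is_partition(np.array([0, 0, 2]), 3)
False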
-
-
-def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
- verbose=0, fit_params=None, pre_dispatch='2*n_jobs'):
- """Evaluate a score by cross-validation
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :func:`sklearn.model_selection.cross_val_score` instead.
-
- Read more in the :ref:`User Guide <cross_validation>`.
-
- Parameters
- ----------
- estimator : estimator object implementing 'fit'
- The object to use to fit the data.
-
- X : array-like
- The data to fit. Can be, for example a list, or an array at least 2d.
-
- y : array-like, optional, default: None
- The target variable to try to predict in the case of
- supervised learning.
-
- scoring : string, callable or None, optional, default: None
- A string (see model evaluation documentation) or
- a scorer callable object / function with signature
- ``scorer(estimator, X, y)``.
-
- cv : int, cross-validation generator or an iterable, optional
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
-
- - None, to use the default 3-fold cross-validation,
- - integer, to specify the number of folds.
- - An object to be used as a cross-validation generator.
- - An iterable yielding train/test splits.
-
- For integer/None inputs, if the estimator is a classifier and ``y`` is
- either binary or multiclass, :class:`StratifiedKFold` is used. In all
- other cases, :class:`KFold` is used.
-
- Refer :ref:`User Guide <cross_validation>` for the various
- cross-validation strategies that can be used here.
-
- n_jobs : integer, optional
- The number of CPUs to use to do the computation. -1 means
- 'all CPUs'.
-
- verbose : integer, optional
- The verbosity level.
-
- fit_params : dict, optional
- Parameters to pass to the fit method of the estimator.
-
- pre_dispatch : int, or string, optional
- Controls the number of jobs that get dispatched during parallel
- execution. Reducing this number can be useful to avoid an
- explosion of memory consumption when more jobs get dispatched
- than CPUs can process. This parameter can be:
-
- - None, in which case all the jobs are immediately
- created and spawned. Use this for lightweight and
- fast-running jobs, to avoid delays due to on-demand
- spawning of the jobs
-
- - An int, giving the exact number of total jobs that are
- spawned
-
- - A string, giving an expression as a function of n_jobs,
- as in '2*n_jobs'
-
- Returns
- -------
- scores : array of float, shape=(len(list(cv)),)
- Array of scores of the estimator for each run of the cross validation.
-
- Examples
- --------
- >>> from sklearn import datasets, linear_model
- >>> from sklearn.cross_validation import cross_val_score
- >>> diabetes = datasets.load_diabetes()
- >>> X = diabetes.data[:150]
- >>> y = diabetes.target[:150]
- >>> lasso = linear_model.Lasso()
- >>> print(cross_val_score(lasso, X, y)) # doctest: +ELLIPSIS
- [ 0.33150734  0.08022311  0.03531764]
-
- See Also
- ---------
- :func:`sklearn.metrics.make_scorer`:
- Make a scorer from a performance metric or loss function.
-
- """
- X, y = indexable(X, y)
-
- cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
- scorer = check_scoring(estimator, scoring=scoring)
- # We clone the estimator to make sure that all the folds are
- # independent, and that it is pickle-able.
- parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
- pre_dispatch=pre_dispatch)
- scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
- train, test, verbose, None,
- fit_params)
- for train, test in cv)
- return np.array(scores)[:, 0]
-
-
-def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
- parameters, fit_params, return_train_score=False,
- return_parameters=False, error_score='raise'):
- """Fit estimator and compute scores for a given dataset split.
-
- Parameters
- ----------
- estimator : estimator object implementing 'fit'
- The object to use to fit the data.
-
- X : array-like of shape at least 2D
- The data to fit.
-
- y : array-like, optional, default: None
- The target variable to try to predict in the case of
- supervised learning.
-
- scorer : callable
- A scorer callable object / function with signature
- ``scorer(estimator, X, y)``.
-
- train : array-like, shape (n_train_samples,)
- Indices of training samples.
-
- test : array-like, shape (n_test_samples,)
- Indices of test samples.
-
- verbose : integer
- The verbosity level.
-
- error_score : 'raise' (default) or numeric
- Value to assign to the score if an error occurs in estimator fitting.
- If set to 'raise', the error is raised. If a numeric value is given,
- FitFailedWarning is raised. This parameter does not affect the refit
- step, which will always raise the error.
-
- parameters : dict or None
- Parameters to be set on the estimator.
-
- fit_params : dict or None
- Parameters that will be passed to ``estimator.fit``.
-
- return_train_score : boolean, optional, default: False
- Compute and return score on training set.
-
- return_parameters : boolean, optional, default: False
- Return parameters that have been used for the estimator.
-
- Returns
- -------
- train_score : float, optional
- Score on training set, returned only if `return_train_score` is `True`.
-
- test_score : float
- Score on test set.
-
- n_test_samples : int
- Number of test samples.
-
- scoring_time : float
- Time spent for fitting and scoring in seconds.
-
- parameters : dict or None, optional
- The parameters that have been evaluated.
- """
- if verbose > 1:
- if parameters is None:
- msg = ''
- else:
- msg = '%s' % (', '.join('%s=%s' % (k, v)
- for k, v in parameters.items()))
- print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))
-
- # Adjust length of sample weights
- fit_params = fit_params if fit_params is not None else {}
- fit_params = dict([(k, _index_param_value(X, v, train))
- for k, v in fit_params.items()])
-
- if parameters is not None:
- estimator.set_params(**parameters)
-
- start_time = time.time()
-
- X_train, y_train = _safe_split(estimator, X, y, train)
- X_test, y_test = _safe_split(estimator, X, y, test, train)
-
- try:
- if y_train is None:
- estimator.fit(X_train, **fit_params)
- else:
- estimator.fit(X_train, y_train, **fit_params)
-
- except Exception as e:
- if error_score == 'raise':
- raise
- elif isinstance(error_score, numbers.Number):
- test_score = error_score
- if return_train_score:
- train_score = error_score
- warnings.warn("Classifier fit failed. The score on this train-test"
- " partition for these parameters will be set to %f. "
- "Details: \n%r" % (error_score, e), FitFailedWarning)
- else:
- raise ValueError("error_score must be the string 'raise' or a"
- " numeric value. (Hint: if using 'raise', please"
- " make sure that it has been spelled correctly.)"
- )
-
- else:
- test_score = _score(estimator, X_test, y_test, scorer)
- if return_train_score:
- train_score = _score(estimator, X_train, y_train, scorer)
-
- scoring_time = time.time() - start_time
-
- if verbose > 2:
- msg += ", score=%f" % test_score
- if verbose > 1:
- end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
- print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
-
- ret = [train_score] if return_train_score else []
- ret.extend([test_score, _num_samples(X_test), scoring_time])
- if return_parameters:
- ret.append(parameters)
- return ret
-
-
-def _safe_split(estimator, X, y, indices, train_indices=None):
- """Create subset of dataset and properly handle kernels."""
- if hasattr(estimator, 'kernel') and callable(estimator.kernel) \
- and not isinstance(estimator.kernel, GPKernel):
- # cannot compute the kernel values with custom function
- raise ValueError("Cannot use a custom kernel function. "
- "Precompute the kernel matrix instead.")
-
- if not hasattr(X, "shape"):
- if getattr(estimator, "_pairwise", False):
- raise ValueError("Precomputed kernels or affinity matrices have "
- "to be passed as arrays or sparse matrices.")
- X_subset = [X[idx] for idx in indices]
- else:
- if getattr(estimator, "_pairwise", False):
- # X is a precomputed square kernel matrix
- if X.shape[0] != X.shape[1]:
- raise ValueError("X should be a square kernel matrix")
- if train_indices is None:
- X_subset = X[np.ix_(indices, indices)]
- else:
- X_subset = X[np.ix_(indices, train_indices)]
- else:
- X_subset = safe_indexing(X, indices)
-
- if y is not None:
- y_subset = safe_indexing(y, indices)
- else:
- y_subset = None
-
- return X_subset, y_subset
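
In the common non-pairwise branch this reduces to plain indexing; a sketch:

>>> import numpy as np
>>> from sklearn.linear_model import Ridge
>>> X, y = np.array([[1.], [2.], [3.]]), np.array([1., 2., 3.])
>>> X_sub, y_sub = _safe_split(Ridge(), X, y, np.array([0, 2]))
>>> X_sub.tolist(), y_sub.tolist()
([[1.0], [3.0]], [1.0, 3.0])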
-
-
-def _score(estimator, X_test, y_test, scorer):
- """Compute the score of an estimator on a given test set."""
- if y_test is None:
- score = scorer(estimator, X_test)
- else:
- score = scorer(estimator, X_test, y_test)
- if hasattr(score, 'item'):
- try:
- # e.g. unwrap memmapped scalars
- score = score.item()
- except ValueError:
- # non-scalar?
- pass
- if not isinstance(score, numbers.Number):
- raise ValueError("scoring must return a number, got %s (%s) instead."
- % (str(score), type(score)))
- return score
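
A sketch with a trivial stand-in scorer (any callable with the
``scorer(estimator, X, y)`` signature works):

>>> import numpy as np
>>> _score(None, np.ones((2, 2)), np.zeros(2), lambda est, X, y: 0.5)
0.5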
-
-
-def _permutation_test_score(estimator, X, y, cv, scorer):
- """Auxiliary function for permutation_test_score"""
- avg_score = []
- for train, test in cv:
- X_train, y_train = _safe_split(estimator, X, y, train)
- X_test, y_test = _safe_split(estimator, X, y, test, train)
- estimator.fit(X_train, y_train)
- avg_score.append(scorer(estimator, X_test, y_test))
- return np.mean(avg_score)
-
-
-def _shuffle(y, labels, random_state):
- """Return a shuffled copy of y eventually shuffle among same labels."""
- if labels is None:
- ind = random_state.permutation(len(y))
- else:
- ind = np.arange(len(labels))
- for label in np.unique(labels):
- this_mask = (labels == label)
- ind[this_mask] = random_state.permutation(ind[this_mask])
- return safe_indexing(y, ind)
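
The labelled branch above permutes positions only inside each label group, so values never cross group boundaries. A toy illustration:

import numpy as np

rng = np.random.RandomState(0)
y = np.array([10, 11, 12, 20, 21, 22])
labels = np.array([0, 0, 0, 1, 1, 1])
ind = np.arange(len(labels))
for label in np.unique(labels):
    mask = labels == label
    ind[mask] = rng.permutation(ind[mask])
# y[ind] mixes 10/11/12 only among themselves, and likewise 20/21/22
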
-
-
-def check_cv(cv, X=None, y=None, classifier=False):
- """Input checker utility for building a CV in a user friendly way.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :func:`sklearn.model_selection.check_cv` instead.
-
- Parameters
- ----------
- cv : int, cross-validation generator or an iterable, optional
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
-
- - None, to use the default 3-fold cross-validation,
- - integer, to specify the number of folds,
- - an object to be used as a cross-validation generator,
- - an iterable yielding train/test splits.
-
- For integer/None inputs, if classifier is True and ``y`` is binary or
- multiclass, :class:`StratifiedKFold` is used. In all other cases,
- :class:`KFold` is used.
-
- Refer to the :ref:`User Guide ` for the various
- cross-validation strategies that can be used here.
-
- X : array-like
- The data the cross-val object will be applied on.
-
- y : array-like
- The target variable for a supervised learning problem.
-
- classifier : boolean, optional
- Whether the task is a classification task, in which case
- stratified KFold will be used.
-
- Returns
- -------
- checked_cv : a cross-validation generator instance.
- The return value is guaranteed to be a cv generator instance, whatever
- the input type.
- """
- is_sparse = sp.issparse(X)
- if cv is None:
- cv = 3
- if isinstance(cv, numbers.Integral):
- if classifier:
- if type_of_target(y) in ['binary', 'multiclass']:
- cv = StratifiedKFold(y, cv)
- else:
- cv = KFold(_num_samples(y), cv)
- else:
- if not is_sparse:
- n_samples = len(X)
- else:
- n_samples = X.shape[0]
- cv = KFold(n_samples, cv)
- return cv
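
To illustrate the dispatch just implemented (a sketch assuming this module's KFold and StratifiedKFold are in scope): an integer cv combined with a classifier and a binary or multiclass y yields stratified folds, anything else plain KFold.

import numpy as np

X = np.ones((6, 2))
y = np.array([0, 1, 0, 1, 0, 1])
cv = check_cv(3, X, y, classifier=True)   # old-style StratifiedKFold(y, 3)
cv = check_cv(3, X, y, classifier=False)  # old-style KFold(6, 3)
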
-
-
-def permutation_test_score(estimator, X, y, cv=None,
- n_permutations=100, n_jobs=1, labels=None,
- random_state=0, verbose=0, scoring=None):
- """Evaluate the significance of a cross-validated score with permutations
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :func:`sklearn.model_selection.permutation_test_score` instead.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- estimator : estimator object implementing 'fit'
- The object to use to fit the data.
-
- X : array-like of shape at least 2D
- The data to fit.
-
- y : array-like
- The target variable to try to predict in the case of
- supervised learning.
-
- scoring : string, callable or None, optional, default: None
- A string (see model evaluation documentation) or
- a scorer callable object / function with signature
- ``scorer(estimator, X, y)``.
-
- cv : int, cross-validation generator or an iterable, optional
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
-
- - None, to use the default 3-fold cross-validation,
- - integer, to specify the number of folds,
- - an object to be used as a cross-validation generator,
- - an iterable yielding train/test splits.
-
- For integer/None inputs, if the estimator is a classifier and ``y`` is
- either binary or multiclass, :class:`StratifiedKFold` is used. In all
- other cases, :class:`KFold` is used.
-
- Refer to the :ref:`User Guide ` for the various
- cross-validation strategies that can be used here.
-
- n_permutations : integer, optional
- Number of times to permute ``y``.
-
- n_jobs : integer, optional
- The number of CPUs to use to do the computation. -1 means
- 'all CPUs'.
-
- labels : array-like of shape [n_samples] (optional)
- Labels constrain the permutation among groups of samples with
- the same label.
-
- random_state : int, RandomState instance or None, optional (default=0)
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- verbose : integer, optional
- The verbosity level.
-
- Returns
- -------
- score : float
- The true score without permuting targets.
-
- permutation_scores : array, shape (n_permutations,)
- The scores obtained for each permutation.
-
- pvalue : float
- The p-value, which approximates the probability that the score would
- be obtained by chance. This is calculated as:
-
- `(C + 1) / (n_permutations + 1)`
-
- Where C is the number of permutations whose score >= the true score.
-
- The best possible p-value is 1/(n_permutations + 1), the worst is 1.0.
-
- Notes
- -----
- This function implements Test 1 in:
-
- Ojala and Garriga. Permutation Tests for Studying Classifier
- Performance. The Journal of Machine Learning Research (2010)
- vol. 11
-
- """
- X, y = indexable(X, y)
- cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
- scorer = check_scoring(estimator, scoring=scoring)
- random_state = check_random_state(random_state)
-
- # We clone the estimator to make sure that all the folds are
- # independent, and that it is pickle-able.
- score = _permutation_test_score(clone(estimator), X, y, cv, scorer)
- permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
- delayed(_permutation_test_score)(
- clone(estimator), X, _shuffle(y, labels, random_state), cv,
- scorer)
- for _ in range(n_permutations))
- permutation_scores = np.array(permutation_scores)
- pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
- return score, permutation_scores, pvalue
-
-
-permutation_test_score.__test__ = False  # to avoid a problem with nosetests
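
A worked instance of the p-value formula above, with toy numbers:

import numpy as np

score = 0.9
permutation_scores = np.array([0.95, 0.40, 0.92, 0.10])  # toy values
pvalue = ((np.sum(permutation_scores >= score) + 1.0)
          / (len(permutation_scores) + 1))
# C = 2 of the 4 permutations reach the true score: (2 + 1) / (4 + 1) = 0.6
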
-
-
-def train_test_split(*arrays, **options):
- """Split arrays or matrices into random train and test subsets
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :func:`sklearn.model_selection.train_test_split` instead.
-
- Quick utility that wraps input validation and
- ``next(iter(ShuffleSplit(n_samples)))`` and application to input
- data into a single call for splitting (and optionally subsampling)
- data in a one-liner.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- *arrays : sequence of indexables with same length / shape[0]
- Allowed inputs are lists, numpy arrays, scipy-sparse
- matrices or pandas dataframes.
-
- test_size : float, int, or None (default is None)
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the dataset to include in the test split. If
- int, represents the absolute number of test samples. If None,
- the value is automatically set to the complement of the train size.
- If train size is also None, test size is set to 0.25.
-
- train_size : float, int, or None (default is None)
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the dataset to include in the train split. If
- int, represents the absolute number of train samples. If None,
- the value is automatically set to the complement of the test size.
-
- random_state : int, RandomState instance or None, optional (default=None)
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- stratify : array-like or None (default is None)
- If not None, data is split in a stratified fashion, using this as
- the labels array.
-
- .. versionadded:: 0.17
- *stratify* splitting
-
- Returns
- -------
- splitting : list, length = 2 * len(arrays)
- List containing train-test split of inputs.
-
- .. versionadded:: 0.16
- If the input is sparse, the output will be a
- ``scipy.sparse.csr_matrix``. Else, output type is the same as the
- input type.
-
- Examples
- --------
- >>> import numpy as np
- >>> from sklearn.cross_validation import train_test_split
- >>> X, y = np.arange(10).reshape((5, 2)), range(5)
- >>> X
- array([[0, 1],
- [2, 3],
- [4, 5],
- [6, 7],
- [8, 9]])
- >>> list(y)
- [0, 1, 2, 3, 4]
-
- >>> X_train, X_test, y_train, y_test = train_test_split(
- ... X, y, test_size=0.33, random_state=42)
- ...
- >>> X_train
- array([[4, 5],
- [0, 1],
- [6, 7]])
- >>> y_train
- [2, 0, 3]
- >>> X_test
- array([[2, 3],
- [8, 9]])
- >>> y_test
- [1, 4]
-
- """
- n_arrays = len(arrays)
- if n_arrays == 0:
- raise ValueError("At least one array required as input")
-
- test_size = options.pop('test_size', None)
- train_size = options.pop('train_size', None)
- random_state = options.pop('random_state', None)
- stratify = options.pop('stratify', None)
-
- if options:
- raise TypeError("Invalid parameters passed: %s" % str(options))
-
- if test_size is None and train_size is None:
- test_size = 0.25
- arrays = indexable(*arrays)
- if stratify is not None:
- cv = StratifiedShuffleSplit(stratify, test_size=test_size,
- train_size=train_size,
- random_state=random_state)
- else:
- n_samples = _num_samples(arrays[0])
- cv = ShuffleSplit(n_samples, test_size=test_size,
- train_size=train_size,
- random_state=random_state)
-
- train, test = next(iter(cv))
- return list(chain.from_iterable((safe_indexing(a, train),
- safe_indexing(a, test)) for a in arrays))
-
-
-train_test_split.__test__ = False  # to avoid a problem with nosetests
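
The return statement above interleaves a (train, test) pair per input array, which is why the output reads X_train, X_test, y_train, y_test, and so on. A sketch of the flattening:

from itertools import chain

arrays = [[0, 1, 2, 3], ['a', 'b', 'c', 'd']]
train, test = [0, 2], [1, 3]
split = list(chain.from_iterable(
    ([a[i] for i in train], [a[i] for i in test]) for a in arrays))
# [[0, 2], [1, 3], ['a', 'c'], ['b', 'd']]
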
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
deleted file mode 100644
index 4d756bdaa0cf8..0000000000000
--- a/sklearn/tests/test_cross_validation.py
+++ /dev/null
@@ -1,1252 +0,0 @@
-"""Test the cross_validation module"""
-from __future__ import division
-import warnings
-
-import numpy as np
-from scipy.sparse import coo_matrix
-from scipy.sparse import csr_matrix
-from scipy import stats
-
-from sklearn.exceptions import ConvergenceWarning
-from sklearn.utils.testing import assert_true
-from sklearn.utils.testing import assert_false
-from sklearn.utils.testing import assert_equal
-from sklearn.utils.testing import assert_almost_equal
-from sklearn.utils.testing import assert_raises
-from sklearn.utils.testing import assert_greater
-from sklearn.utils.testing import assert_greater_equal
-from sklearn.utils.testing import assert_less
-from sklearn.utils.testing import assert_not_equal
-from sklearn.utils.testing import assert_array_almost_equal
-from sklearn.utils.testing import assert_array_equal
-from sklearn.utils.testing import assert_warns_message
-from sklearn.utils.testing import assert_raise_message
-from sklearn.utils.testing import ignore_warnings
-from sklearn.utils.mocking import CheckingClassifier, MockDataFrame
-
-with warnings.catch_warnings():
- warnings.simplefilter('ignore')
- from sklearn import cross_validation as cval
-
-from sklearn.datasets import make_regression
-from sklearn.datasets import load_boston
-from sklearn.datasets import load_digits
-from sklearn.datasets import load_iris
-from sklearn.datasets import make_multilabel_classification
-from sklearn.metrics import explained_variance_score
-from sklearn.metrics import make_scorer
-from sklearn.metrics import precision_score
-from sklearn.externals import six
-from sklearn.externals.six.moves import zip
-
-from sklearn.linear_model import Ridge
-from sklearn.multiclass import OneVsRestClassifier
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.svm import SVC
-from sklearn.cluster import KMeans
-
-from sklearn.preprocessing import Imputer
-from sklearn.pipeline import Pipeline
-
-
-class MockClassifier(object):
- """Dummy classifier to test the cross-validation"""
-
- def __init__(self, a=0, allow_nd=False):
- self.a = a
- self.allow_nd = allow_nd
-
- def fit(self, X, Y=None, sample_weight=None, class_prior=None,
- sparse_sample_weight=None, sparse_param=None, dummy_int=None,
- dummy_str=None, dummy_obj=None, callback=None):
- """The dummy arguments are to test that this fit function can
- accept non-array arguments through cross-validation, such as:
- - int
- - str (this is actually array-like)
- - object
- - function
- """
- self.dummy_int = dummy_int
- self.dummy_str = dummy_str
- self.dummy_obj = dummy_obj
- if callback is not None:
- callback(self)
-
- if self.allow_nd:
- X = X.reshape(len(X), -1)
- if X.ndim >= 3 and not self.allow_nd:
- raise ValueError('X cannot be d')
- if sample_weight is not None:
- assert_true(sample_weight.shape[0] == X.shape[0],
- 'MockClassifier extra fit_param sample_weight.shape[0]'
- ' is {0}, should be {1}'.format(sample_weight.shape[0],
- X.shape[0]))
- if class_prior is not None:
- assert_true(class_prior.shape[0] == len(np.unique(y)),
- 'MockClassifier extra fit_param class_prior.shape[0]'
- ' is {0}, should be {1}'.format(class_prior.shape[0],
- len(np.unique(y))))
- if sparse_sample_weight is not None:
- fmt = ('MockClassifier extra fit_param sparse_sample_weight'
- '.shape[0] is {0}, should be {1}')
- assert_true(sparse_sample_weight.shape[0] == X.shape[0],
- fmt.format(sparse_sample_weight.shape[0], X.shape[0]))
- if sparse_param is not None:
- fmt = ('MockClassifier extra fit_param sparse_param.shape '
- 'is ({0}, {1}), should be ({2}, {3})')
- assert_true(sparse_param.shape == P_sparse.shape,
- fmt.format(sparse_param.shape[0],
- sparse_param.shape[1],
- P_sparse.shape[0], P_sparse.shape[1]))
- return self
-
- def predict(self, T):
- if self.allow_nd:
- T = T.reshape(len(T), -1)
- return T[:, 0]
-
- def score(self, X=None, Y=None):
- return 1. / (1 + np.abs(self.a))
-
- def get_params(self, deep=False):
- return {'a': self.a, 'allow_nd': self.allow_nd}
-
-X = np.ones((10, 2))
-X_sparse = coo_matrix(X)
-W_sparse = coo_matrix((np.array([1]), (np.array([1]), np.array([0]))),
- shape=(10, 1))
-P_sparse = coo_matrix(np.eye(5))
-
-# avoid StratifiedKFold's Warning about least populated class in y
-y = np.arange(10) % 3
-
-##############################################################################
-# Tests
-
-
-def check_valid_split(train, test, n_samples=None):
- # Use python sets to get more informative assertion failure messages
- train, test = set(train), set(test)
-
- # Train and test split should not overlap
- assert_equal(train.intersection(test), set())
-
- if n_samples is not None:
- # Check that the union of the train and test splits covers all the indices
- assert_equal(train.union(test), set(range(n_samples)))
-
-
-def check_cv_coverage(cv, expected_n_iter=None, n_samples=None):
- # Check that all the samples appear at least once in a test fold
- if expected_n_iter is not None:
- assert_equal(len(cv), expected_n_iter)
- else:
- expected_n_iter = len(cv)
-
- collected_test_samples = set()
- iterations = 0
- for train, test in cv:
- check_valid_split(train, test, n_samples=n_samples)
- iterations += 1
- collected_test_samples.update(test)
-
- # Check that the accumulated test samples cover the whole dataset
- assert_equal(iterations, expected_n_iter)
- if n_samples is not None:
- assert_equal(collected_test_samples, set(range(n_samples)))
-
-
-def test_kfold_valueerrors():
- # Check that errors are raised if there are not enough samples
- assert_raises(ValueError, cval.KFold, 3, 4)
-
- # Check that a warning is raised if the least populated class has too few
- # members.
- y = [3, 3, -1, -1, 3]
-
- cv = assert_warns_message(Warning, "The least populated class",
- cval.StratifiedKFold, y, 3)
-
- # Check that despite the warning the folds are still computed, even
- # though not every class is necessarily represented on each side of
- # every split
- check_cv_coverage(cv, expected_n_iter=3, n_samples=len(y))
-
- # Check that errors are raised if the number of samples in some
- # individual class is too small for n_folds.
- y = [3, 3, -1, -1, 2]
-
- assert_raises(ValueError, cval.StratifiedKFold, y, 3)
-
- # Error when number of folds is <= 1
- assert_raises(ValueError, cval.KFold, 2, 0)
- assert_raises(ValueError, cval.KFold, 2, 1)
- error_string = ("k-fold cross validation requires at least one"
- " train / test split")
- assert_raise_message(ValueError, error_string,
- cval.StratifiedKFold, y, 0)
- assert_raise_message(ValueError, error_string,
- cval.StratifiedKFold, y, 1)
-
- # When n is not an integer:
- assert_raises(ValueError, cval.KFold, 2.5, 2)
-
- # When n_folds is not an integer:
- assert_raises(ValueError, cval.KFold, 5, 1.5)
- assert_raises(ValueError, cval.StratifiedKFold, y, 1.5)
-
-
-def test_kfold_indices():
- # Check all indices are returned in the test folds
- kf = cval.KFold(300, 3)
- check_cv_coverage(kf, expected_n_iter=3, n_samples=300)
-
- # Check all indices are returned in the test folds even when equal-sized
- # folds are not possible
- kf = cval.KFold(17, 3)
- check_cv_coverage(kf, expected_n_iter=3, n_samples=17)
-
-
-def test_kfold_no_shuffle():
- # Manually check that KFold preserves the data ordering on toy datasets
- splits = iter(cval.KFold(4, 2))
- train, test = next(splits)
- assert_array_equal(test, [0, 1])
- assert_array_equal(train, [2, 3])
-
- train, test = next(splits)
- assert_array_equal(test, [2, 3])
- assert_array_equal(train, [0, 1])
-
- splits = iter(cval.KFold(5, 2))
- train, test = next(splits)
- assert_array_equal(test, [0, 1, 2])
- assert_array_equal(train, [3, 4])
-
- train, test = next(splits)
- assert_array_equal(test, [3, 4])
- assert_array_equal(train, [0, 1, 2])
-
-
-def test_stratified_kfold_no_shuffle():
- # Manually check that StratifiedKFold preserves the data ordering as much
- # as possible on toy datasets in order to avoid hiding sample dependencies
- # when possible
- splits = iter(cval.StratifiedKFold([1, 1, 0, 0], 2))
- train, test = next(splits)
- assert_array_equal(test, [0, 2])
- assert_array_equal(train, [1, 3])
-
- train, test = next(splits)
- assert_array_equal(test, [1, 3])
- assert_array_equal(train, [0, 2])
-
- splits = iter(cval.StratifiedKFold([1, 1, 1, 0, 0, 0, 0], 2))
- train, test = next(splits)
- assert_array_equal(test, [0, 1, 3, 4])
- assert_array_equal(train, [2, 5, 6])
-
- train, test = next(splits)
- assert_array_equal(test, [2, 5, 6])
- assert_array_equal(train, [0, 1, 3, 4])
-
-
-def test_stratified_kfold_ratios():
- # Check that stratified kfold preserves label ratios in individual splits
- # Repeat with shuffling turned off and on
- n_samples = 1000
- labels = np.array([4] * int(0.10 * n_samples) +
- [0] * int(0.89 * n_samples) +
- [1] * int(0.01 * n_samples))
- for shuffle in [False, True]:
- for train, test in cval.StratifiedKFold(labels, 5, shuffle=shuffle):
- assert_almost_equal(np.sum(labels[train] == 4) / len(train), 0.10,
- 2)
- assert_almost_equal(np.sum(labels[train] == 0) / len(train), 0.89,
- 2)
- assert_almost_equal(np.sum(labels[train] == 1) / len(train), 0.01,
- 2)
- assert_almost_equal(np.sum(labels[test] == 4) / len(test), 0.10, 2)
- assert_almost_equal(np.sum(labels[test] == 0) / len(test), 0.89, 2)
- assert_almost_equal(np.sum(labels[test] == 1) / len(test), 0.01, 2)
-
-
-def test_kfold_balance():
- # Check that KFold returns folds with balanced sizes
- for kf in [cval.KFold(i, 5) for i in range(11, 17)]:
- sizes = []
- for _, test in kf:
- sizes.append(len(test))
-
- assert_true((np.max(sizes) - np.min(sizes)) <= 1)
- assert_equal(np.sum(sizes), kf.n)
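
As a concrete instance of the balance property asserted here, with the old-style API used throughout this file:

sizes = [len(test) for _, test in cval.KFold(13, 5)]
# sizes == [3, 3, 3, 2, 2]: max - min == 1 and sum(sizes) == 13
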
-
-
-def test_stratifiedkfold_balance():
- # Check that StratifiedKFold returns folds with balanced sizes (only
- # when stratification is possible)
- # Repeat with shuffling turned off and on
- labels = [0] * 3 + [1] * 14
- for shuffle in [False, True]:
- for skf in [cval.StratifiedKFold(labels[:i], 3, shuffle=shuffle)
- for i in range(11, 17)]:
- sizes = []
- for _, test in skf:
- sizes.append(len(test))
-
- assert_true((np.max(sizes) - np.min(sizes)) <= 1)
- assert_equal(np.sum(sizes), skf.n)
-
-
-def test_shuffle_kfold():
- # Check the indices are shuffled properly, and that all indices are
- # returned in the different test folds
- kf = cval.KFold(300, 3, shuffle=True, random_state=0)
- ind = np.arange(300)
-
- all_folds = None
- for train, test in kf:
- assert_true(np.any(np.arange(100) != ind[test]))
- assert_true(np.any(np.arange(100, 200) != ind[test]))
- assert_true(np.any(np.arange(200, 300) != ind[test]))
-
- if all_folds is None:
- all_folds = ind[test].copy()
- else:
- all_folds = np.concatenate((all_folds, ind[test]))
-
- all_folds.sort()
- assert_array_equal(all_folds, ind)
-
-
-def test_shuffle_stratifiedkfold():
- # Check that shuffling is happening when requested, and for proper
- # sample coverage
- labels = [0] * 20 + [1] * 20
- kf0 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=0))
- kf1 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=1))
- for (_, test0), (_, test1) in zip(kf0, kf1):
- assert_true(set(test0) != set(test1))
- check_cv_coverage(kf0, expected_n_iter=5, n_samples=40)
-
-
-def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372
- # The digits samples are dependent: they are apparently grouped by authors,
- # although we don't have any information on the group boundaries for this
- # data. We can highlight this fact by computing k-fold cross-validation with
- # and without shuffling: the shuffling case wrongly makes the IID assumption
- # and is therefore too optimistic, estimating a much higher accuracy (around
- # 0.96) than the non-shuffling variant (around 0.86).
-
- digits = load_digits()
- X, y = digits.data[:800], digits.target[:800]
- model = SVC(C=10, gamma=0.005)
- n = len(y)
-
- cv = cval.KFold(n, 5, shuffle=False)
- mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
- assert_greater(0.88, mean_score)
- assert_greater(mean_score, 0.85)
-
- # Shuffling the data artificially breaks the dependency and hides the
- # overfitting of the model with regard to the writing style of the authors
- # by yielding a seriously overestimated score:
-
- cv = cval.KFold(n, 5, shuffle=True, random_state=0)
- mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
- assert_greater(mean_score, 0.95)
-
- cv = cval.KFold(n, 5, shuffle=True, random_state=1)
- mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
- assert_greater(mean_score, 0.95)
-
- # Similarly, StratifiedKFold should try to shuffle the data as little
- # as possible (while respecting the balanced class constraints)
- # and thus be able to detect the dependency by not overestimating
- # the CV score either. As the digits dataset is approximately balanced,
- # the estimated mean score is close to the score measured with
- # non-shuffled KFold.
-
- cv = cval.StratifiedKFold(y, 5)
- mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
- assert_greater(0.88, mean_score)
- assert_greater(mean_score, 0.85)
-
-
-def test_label_kfold():
- rng = np.random.RandomState(0)
-
- # Parameters of the test
- n_labels = 15
- n_samples = 1000
- n_folds = 5
-
- # Construct the test data
- tolerance = 0.05 * n_samples # 5 percent error allowed
- labels = rng.randint(0, n_labels, n_samples)
- folds = cval.LabelKFold(labels, n_folds=n_folds).idxs
- ideal_n_labels_per_fold = n_samples // n_folds
-
- # Check that folds have approximately the same size
- assert_equal(len(folds), len(labels))
- for i in np.unique(folds):
- assert_greater_equal(tolerance,
- abs(sum(folds == i) - ideal_n_labels_per_fold))
-
- # Check that each label appears only in 1 fold
- for label in np.unique(labels):
- assert_equal(len(np.unique(folds[labels == label])), 1)
-
- # Check that no label is on both sides of the split
- labels = np.asarray(labels, dtype=object)
- for train, test in cval.LabelKFold(labels, n_folds=n_folds):
- assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)
-
- # Construct the test data
- labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
- 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
- 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
- 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
- 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
- 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis',
- 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']
- labels = np.asarray(labels, dtype=object)
-
- n_labels = len(np.unique(labels))
- n_samples = len(labels)
- n_folds = 5
- tolerance = 0.05 * n_samples # 5 percent error allowed
- folds = cval.LabelKFold(labels, n_folds=n_folds).idxs
- ideal_n_labels_per_fold = n_samples // n_folds
-
- # Check that folds have approximately the same size
- assert_equal(len(folds), len(labels))
- for i in np.unique(folds):
- assert_greater_equal(tolerance,
- abs(sum(folds == i) - ideal_n_labels_per_fold))
-
- # Check that each label appears only in 1 fold
- for label in np.unique(labels):
- assert_equal(len(np.unique(folds[labels == label])), 1)
-
- # Check that no label is on both sides of the split
- for train, test in cval.LabelKFold(labels, n_folds=n_folds):
- assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)
-
- # Should fail if there are more folds than labels
- labels = np.array([1, 1, 1, 2, 2])
- assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3)
-
-
-def test_shuffle_split():
- ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
- ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)
- ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0)
- for typ in six.integer_types:
- ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0)
- for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
- assert_array_equal(t1[0], t2[0])
- assert_array_equal(t2[0], t3[0])
- assert_array_equal(t3[0], t4[0])
- assert_array_equal(t1[1], t2[1])
- assert_array_equal(t2[1], t3[1])
- assert_array_equal(t3[1], t4[1])
-
-
-def test_stratified_shuffle_split_init():
- y = np.asarray([0, 1, 1, 1, 2, 2, 2])
- # Check that error is raised if there is a class with only one sample
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.2)
-
- # Check that error is raised if the test set size is smaller than n_classes
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 2)
- # Check that error is raised if the train set size is smaller than
- # n_classes
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 3, 2)
-
- y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
- # Check that errors are raised if there are not enough samples
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.5, 0.6)
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 8, 0.6)
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.6, 8)
-
- # Train size or test size too small
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, train_size=2)
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, test_size=2)
-
-
-def test_stratified_shuffle_split_iter():
- ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
- np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
- np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
- np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
- np.array([-1] * 800 + [1] * 50)
- ]
-
- for y in ys:
- sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
- random_state=0)
- test_size = np.ceil(0.33 * len(y))
- train_size = len(y) - test_size
- for train, test in sss:
- assert_array_equal(np.unique(y[train]), np.unique(y[test]))
- # Check that the folds preserve the class proportions
- p_train = (np.bincount(np.unique(y[train],
- return_inverse=True)[1]) /
- float(len(y[train])))
- p_test = (np.bincount(np.unique(y[test],
- return_inverse=True)[1]) /
- float(len(y[test])))
- assert_array_almost_equal(p_train, p_test, 1)
- assert_equal(len(train) + len(test), y.size)
- assert_equal(len(train), train_size)
- assert_equal(len(test), test_size)
- assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
-
-
-def test_stratified_shuffle_split_even():
- # Test that StratifiedShuffleSplit draws the indices with an
- # equal chance
- n_folds = 5
- n_iter = 1000
-
- def assert_counts_are_ok(idx_counts, p):
- # Here we test that the distribution of the counts
- # per index is close enough to a binomial
- threshold = 0.05 / n_splits
- bf = stats.binom(n_splits, p)
- for count in idx_counts:
- p = bf.pmf(count)
- assert_true(p > threshold,
- "An index is not drawn with chance corresponding "
- "to even draws")
-
- for n_samples in (6, 22):
- labels = np.array((n_samples // 2) * [0, 1])
- splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter,
- test_size=1. / n_folds,
- random_state=0)
-
- train_counts = [0] * n_samples
- test_counts = [0] * n_samples
- n_splits = 0
- for train, test in splits:
- n_splits += 1
- for counter, ids in [(train_counts, train), (test_counts, test)]:
- for id in ids:
- counter[id] += 1
- assert_equal(n_splits, n_iter)
-
- assert_equal(len(train), splits.n_train)
- assert_equal(len(test), splits.n_test)
- assert_equal(len(set(train).intersection(test)), 0)
-
- label_counts = np.unique(labels)
- assert_equal(splits.test_size, 1.0 / n_folds)
- assert_equal(splits.n_train + splits.n_test, len(labels))
- assert_equal(len(label_counts), 2)
- ex_test_p = float(splits.n_test) / n_samples
- ex_train_p = float(splits.n_train) / n_samples
-
- assert_counts_are_ok(train_counts, ex_train_p)
- assert_counts_are_ok(test_counts, ex_test_p)
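
The binomial check above rests on a simple model: if draws are even, each sample lands in the test set with probability p = n_test / n_samples independently on every iteration, so its accumulated test count should look Binomial(n_iter, p). A toy computation with scipy (the values are illustrative):

from scipy import stats

n_iter, p = 1000, 0.2
bf = stats.binom(n_iter, p)
print(bf.pmf(200))  # near the mode, comfortably above the threshold
print(bf.pmf(400))  # deep in the tail, the kind of count flagged above
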
-
-
-def test_stratified_shuffle_split_overlap_train_test_bug():
- # See https://github.com/scikit-learn/scikit-learn/issues/6121 for
- # the original bug report
- labels = [0, 1, 2, 3] * 3 + [4, 5] * 5
-
- splits = cval.StratifiedShuffleSplit(labels, n_iter=1,
- test_size=0.5, random_state=0)
- train, test = next(iter(splits))
-
- assert_array_equal(np.intersect1d(train, test), [])
-
-
-def test_predefinedsplit_with_kfold_split():
- # Check that PredefinedSplit can reproduce a split generated by KFold.
- folds = -1 * np.ones(10)
- kf_train = []
- kf_test = []
- for i, (train_ind, test_ind) in enumerate(cval.KFold(10, 5, shuffle=True)):
- kf_train.append(train_ind)
- kf_test.append(test_ind)
- folds[test_ind] = i
- ps_train = []
- ps_test = []
- ps = cval.PredefinedSplit(folds)
- for train_ind, test_ind in ps:
- ps_train.append(train_ind)
- ps_test.append(test_ind)
- assert_array_equal(ps_train, kf_train)
- assert_array_equal(ps_test, kf_test)
-
-
-def test_label_shuffle_split():
- ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
- np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
- np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
- np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
- ]
-
- for y in ys:
- n_iter = 6
- test_size = 1. / 3
- slo = cval.LabelShuffleSplit(y, n_iter, test_size=test_size,
- random_state=0)
-
- # Make sure the repr works
- repr(slo)
-
- # Test that the length is correct
- assert_equal(len(slo), n_iter)
-
- y_unique = np.unique(y)
-
- for train, test in slo:
- # First test: no train label is in the test set and vice versa
- y_train_unique = np.unique(y[train])
- y_test_unique = np.unique(y[test])
- assert_false(np.any(np.in1d(y[train], y_test_unique)))
- assert_false(np.any(np.in1d(y[test], y_train_unique)))
-
- # Second test: train and test add up to all the data
- assert_equal(y[train].size + y[test].size, y.size)
-
- # Third test: train and test are disjoint
- assert_array_equal(np.intersect1d(train, test), [])
-
- # Fourth test: the numbers of unique train and test labels are
- # correct, +/- 1 for rounding error
- assert_true(abs(len(y_test_unique) -
- round(test_size * len(y_unique))) <= 1)
- assert_true(abs(len(y_train_unique) -
- round((1.0 - test_size) * len(y_unique))) <= 1)
-
-
-def test_leave_label_out_changing_labels():
- # Check that LeaveOneLabelOut and LeavePLabelOut work normally if
- # the labels variable is changed before calling __iter__
- labels = np.array([0, 1, 2, 1, 1, 2, 0, 0])
- labels_changing = np.array(labels, copy=True)
- lolo = cval.LeaveOneLabelOut(labels)
- lolo_changing = cval.LeaveOneLabelOut(labels_changing)
- lplo = cval.LeavePLabelOut(labels, p=2)
- lplo_changing = cval.LeavePLabelOut(labels_changing, p=2)
- labels_changing[:] = 0
- for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]:
- for (train, test), (train_chan, test_chan) in zip(llo, llo_changing):
- assert_array_equal(train, train_chan)
- assert_array_equal(test, test_chan)
-
-
-def test_cross_val_score():
- clf = MockClassifier()
- for a in range(-10, 10):
- clf.a = a
- # Smoke test
- scores = cval.cross_val_score(clf, X, y)
- assert_array_equal(scores, clf.score(X, y))
-
- # test with multioutput y
- scores = cval.cross_val_score(clf, X_sparse, X)
- assert_array_equal(scores, clf.score(X_sparse, X))
-
- scores = cval.cross_val_score(clf, X_sparse, y)
- assert_array_equal(scores, clf.score(X_sparse, y))
-
- # test with multioutput y
- scores = cval.cross_val_score(clf, X_sparse, X)
- assert_array_equal(scores, clf.score(X_sparse, X))
-
- # test with X and y as list
- list_check = lambda x: isinstance(x, list)
- clf = CheckingClassifier(check_X=list_check)
- scores = cval.cross_val_score(clf, X.tolist(), y.tolist())
-
- clf = CheckingClassifier(check_y=list_check)
- scores = cval.cross_val_score(clf, X, y.tolist())
-
- assert_raises(ValueError, cval.cross_val_score, clf, X, y,
- scoring="sklearn")
-
- # test with 3d X
- X_3d = X[:, :, np.newaxis]
- clf = MockClassifier(allow_nd=True)
- scores = cval.cross_val_score(clf, X_3d, y)
-
- clf = MockClassifier(allow_nd=False)
- assert_raises(ValueError, cval.cross_val_score, clf, X_3d, y)
-
-
-def test_cross_val_score_pandas():
- # check cross_val_score doesn't destroy pandas dataframe
- types = [(MockDataFrame, MockDataFrame)]
- try:
- from pandas import Series, DataFrame
- types.append((Series, DataFrame))
- except ImportError:
- pass
- for TargetType, InputFeatureType in types:
- # X dataframe, y series
- X_df, y_ser = InputFeatureType(X), TargetType(y)
- check_df = lambda x: isinstance(x, InputFeatureType)
- check_series = lambda x: isinstance(x, TargetType)
- clf = CheckingClassifier(check_X=check_df, check_y=check_series)
- cval.cross_val_score(clf, X_df, y_ser)
-
-
-def test_cross_val_score_mask():
- # test that cross_val_score works with boolean masks
- svm = SVC(kernel="linear")
- iris = load_iris()
- X, y = iris.data, iris.target
- cv_indices = cval.KFold(len(y), 5)
- scores_indices = cval.cross_val_score(svm, X, y, cv=cv_indices)
- cv_indices = cval.KFold(len(y), 5)
- cv_masks = []
- for train, test in cv_indices:
- mask_train = np.zeros(len(y), dtype=np.bool)
- mask_test = np.zeros(len(y), dtype=np.bool)
- mask_train[train] = 1
- mask_test[test] = 1
- cv_masks.append((mask_train, mask_test))
- scores_masks = cval.cross_val_score(svm, X, y, cv=cv_masks)
- assert_array_equal(scores_indices, scores_masks)
-
-
-def test_cross_val_score_precomputed():
- # test for svm with precomputed kernel
- svm = SVC(kernel="precomputed")
- iris = load_iris()
- X, y = iris.data, iris.target
- linear_kernel = np.dot(X, X.T)
- score_precomputed = cval.cross_val_score(svm, linear_kernel, y)
- svm = SVC(kernel="linear")
- score_linear = cval.cross_val_score(svm, X, y)
- assert_array_equal(score_precomputed, score_linear)
-
- # Error raised for non-square X
- svm = SVC(kernel="precomputed")
- assert_raises(ValueError, cval.cross_val_score, svm, X, y)
-
- # test error is raised when the precomputed kernel is not array-like
- # or sparse
- assert_raises(ValueError, cval.cross_val_score, svm,
- linear_kernel.tolist(), y)
-
-
-def test_cross_val_score_fit_params():
- clf = MockClassifier()
- n_samples = X.shape[0]
- n_classes = len(np.unique(y))
-
- DUMMY_INT = 42
- DUMMY_STR = '42'
- DUMMY_OBJ = object()
-
- def assert_fit_params(clf):
- # Function to test that the values are passed correctly to the
- # classifier arguments for non-array type
-
- assert_equal(clf.dummy_int, DUMMY_INT)
- assert_equal(clf.dummy_str, DUMMY_STR)
- assert_equal(clf.dummy_obj, DUMMY_OBJ)
-
- fit_params = {'sample_weight': np.ones(n_samples),
- 'class_prior': np.ones(n_classes) / n_classes,
- 'sparse_sample_weight': W_sparse,
- 'sparse_param': P_sparse,
- 'dummy_int': DUMMY_INT,
- 'dummy_str': DUMMY_STR,
- 'dummy_obj': DUMMY_OBJ,
- 'callback': assert_fit_params}
- cval.cross_val_score(clf, X, y, fit_params=fit_params)
-
-
-def test_cross_val_score_score_func():
- clf = MockClassifier()
- _score_func_args = []
-
- def score_func(y_test, y_predict):
- _score_func_args.append((y_test, y_predict))
- return 1.0
-
- with warnings.catch_warnings(record=True):
- scoring = make_scorer(score_func)
- score = cval.cross_val_score(clf, X, y, scoring=scoring)
- assert_array_equal(score, [1.0, 1.0, 1.0])
- assert len(_score_func_args) == 3
-
-
-def test_cross_val_score_errors():
- class BrokenEstimator:
- pass
-
- assert_raises(TypeError, cval.cross_val_score, BrokenEstimator(), X)
-
-
-def test_train_test_split_errors():
- assert_raises(ValueError, cval.train_test_split)
- assert_raises(ValueError, cval.train_test_split, range(3), train_size=1.1)
- assert_raises(ValueError, cval.train_test_split, range(3), test_size=0.6,
- train_size=0.6)
- assert_raises(ValueError, cval.train_test_split, range(3),
- test_size=np.float32(0.6), train_size=np.float32(0.6))
- assert_raises(ValueError, cval.train_test_split, range(3),
- test_size="wrong_type")
- assert_raises(ValueError, cval.train_test_split, range(3), test_size=2,
- train_size=4)
- assert_raises(TypeError, cval.train_test_split, range(3),
- some_argument=1.1)
- assert_raises(ValueError, cval.train_test_split, range(3), range(42))
-
-
-def test_train_test_split():
- X = np.arange(100).reshape((10, 10))
- X_s = coo_matrix(X)
- y = np.arange(10)
-
- # simple test
- split = cval.train_test_split(X, y, test_size=None, train_size=.5)
- X_train, X_test, y_train, y_test = split
- assert_equal(len(y_test), len(y_train))
- # test correspondence of X and y
- assert_array_equal(X_train[:, 0], y_train * 10)
- assert_array_equal(X_test[:, 0], y_test * 10)
-
- # conversion of lists to arrays (deprecated?)
- with warnings.catch_warnings(record=True):
- split = cval.train_test_split(X, X_s, y.tolist())
- X_train, X_test, X_s_train, X_s_test, y_train, y_test = split
- assert_array_equal(X_train, X_s_train.toarray())
- assert_array_equal(X_test, X_s_test.toarray())
-
- # don't convert lists to anything else by default
- split = cval.train_test_split(X, X_s, y.tolist())
- X_train, X_test, X_s_train, X_s_test, y_train, y_test = split
- assert_true(isinstance(y_train, list))
- assert_true(isinstance(y_test, list))
-
- # allow nd-arrays
- X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2)
- y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11)
- split = cval.train_test_split(X_4d, y_3d)
- assert_equal(split[0].shape, (7, 5, 3, 2))
- assert_equal(split[1].shape, (3, 5, 3, 2))
- assert_equal(split[2].shape, (7, 7, 11))
- assert_equal(split[3].shape, (3, 7, 11))
-
- # test stratification option
- y = np.array([1, 1, 1, 1, 2, 2, 2, 2])
- for test_size, exp_test_size in zip([2, 4, 0.25, 0.5, 0.75],
- [2, 4, 2, 4, 6]):
- train, test = cval.train_test_split(y,
- test_size=test_size,
- stratify=y,
- random_state=0)
- assert_equal(len(test), exp_test_size)
- assert_equal(len(test) + len(train), len(y))
- # check the 1:1 ratio of ones and twos in the data is preserved
- assert_equal(np.sum(train == 1), np.sum(train == 2))
-
-
-def test_train_test_split_pandas():
- # check train_test_split doesn't destroy pandas dataframe
- types = [MockDataFrame]
- try:
- from pandas import DataFrame
- types.append(DataFrame)
- except ImportError:
- pass
- for InputFeatureType in types:
- # X dataframe
- X_df = InputFeatureType(X)
- X_train, X_test = cval.train_test_split(X_df)
- assert_true(isinstance(X_train, InputFeatureType))
- assert_true(isinstance(X_test, InputFeatureType))
-
-def test_train_test_split_mock_pandas():
- # X mock dataframe
- X_df = MockDataFrame(X)
- X_train, X_test = cval.train_test_split(X_df)
- assert_true(isinstance(X_train, MockDataFrame))
- assert_true(isinstance(X_test, MockDataFrame))
-
-
-def test_cross_val_score_with_score_func_classification():
- iris = load_iris()
- clf = SVC(kernel='linear')
-
- # Default score (should be the accuracy score)
- scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5)
- assert_array_almost_equal(scores, [0.97, 1., 0.97, 0.97, 1.], 2)
-
- # Correct classification score (aka. zero / one score) - should be the
- # same as the default estimator score
- zo_scores = cval.cross_val_score(clf, iris.data, iris.target,
- scoring="accuracy", cv=5)
- assert_array_almost_equal(zo_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
-
- # F1 score (classes are balanced, so the f1_score should be equal to the
- # zero/one score)
- f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
- scoring="f1_weighted", cv=5)
- assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
-
-
-def test_cross_val_score_with_score_func_regression():
- X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
- random_state=0)
- reg = Ridge()
-
- # Default score of the Ridge regression estimator
- scores = cval.cross_val_score(reg, X, y, cv=5)
- assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
-
- # R2 score (a.k.a. coefficient of determination) - should be the
- # same as the default estimator score
- r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
- assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
-
- # Mean squared error; this is a loss function, so "scores" are negative
- neg_mse_scores = cval.cross_val_score(reg, X, y, cv=5,
- scoring="neg_mean_squared_error")
- expected_neg_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
- assert_array_almost_equal(neg_mse_scores, expected_neg_mse, 2)
-
- # Explained variance
- scoring = make_scorer(explained_variance_score)
- ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=scoring)
- assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
-
-
-def test_permutation_score():
- iris = load_iris()
- X = iris.data
- X_sparse = coo_matrix(X)
- y = iris.target
- svm = SVC(kernel='linear')
- cv = cval.StratifiedKFold(y, 2)
-
- score, scores, pvalue = cval.permutation_test_score(
- svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
- assert_greater(score, 0.9)
- assert_almost_equal(pvalue, 0.0, 1)
-
- score_label, _, pvalue_label = cval.permutation_test_score(
- svm, X, y, n_permutations=30, cv=cv, scoring="accuracy",
- labels=np.ones(y.size), random_state=0)
- assert_true(score_label == score)
- assert_true(pvalue_label == pvalue)
-
- # check that we obtain the same results with a sparse representation
- svm_sparse = SVC(kernel='linear')
- cv_sparse = cval.StratifiedKFold(y, 2)
- score_label, _, pvalue_label = cval.permutation_test_score(
- svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse,
- scoring="accuracy", labels=np.ones(y.size), random_state=0)
-
- assert_true(score_label == score)
- assert_true(pvalue_label == pvalue)
-
- # test with custom scoring object
- def custom_score(y_true, y_pred):
- return (((y_true == y_pred).sum() - (y_true != y_pred).sum())
- / y_true.shape[0])
-
- scorer = make_scorer(custom_score)
- score, _, pvalue = cval.permutation_test_score(
- svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0)
- assert_almost_equal(score, .93, 2)
- assert_almost_equal(pvalue, 0.01, 3)
-
- # set random y
- y = np.mod(np.arange(len(y)), 3)
-
- score, scores, pvalue = cval.permutation_test_score(
- svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
-
- assert_less(score, 0.5)
- assert_greater(pvalue, 0.2)
-
-
-def test_cross_val_generator_with_indices():
- X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
- y = np.array([1, 1, 2, 2])
- labels = np.array([1, 2, 3, 4])
- # explicitly passing indices value is deprecated
- loo = cval.LeaveOneOut(4)
- lpo = cval.LeavePOut(4, 2)
- kf = cval.KFold(4, 2)
- skf = cval.StratifiedKFold(y, 2)
- lolo = cval.LeaveOneLabelOut(labels)
- lopo = cval.LeavePLabelOut(labels, 2)
- ps = cval.PredefinedSplit([1, 1, 2, 2])
- ss = cval.ShuffleSplit(2)
- for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
- for train, test in cv:
- assert_not_equal(np.asarray(train).dtype.kind, 'b')
- assert_not_equal(np.asarray(test).dtype.kind, 'b')
- X[train], X[test]
- y[train], y[test]
-
-
-@ignore_warnings
-def test_cross_val_generator_with_default_indices():
- X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
- y = np.array([1, 1, 2, 2])
- labels = np.array([1, 2, 3, 4])
- loo = cval.LeaveOneOut(4)
- lpo = cval.LeavePOut(4, 2)
- kf = cval.KFold(4, 2)
- skf = cval.StratifiedKFold(y, 2)
- lolo = cval.LeaveOneLabelOut(labels)
- lopo = cval.LeavePLabelOut(labels, 2)
- ss = cval.ShuffleSplit(2)
- ps = cval.PredefinedSplit([1, 1, 2, 2])
- for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
- for train, test in cv:
- assert_not_equal(np.asarray(train).dtype.kind, 'b')
- assert_not_equal(np.asarray(test).dtype.kind, 'b')
- X[train], X[test]
- y[train], y[test]
-
-
-def test_shufflesplit_errors():
- assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=2.0)
- assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=1.0)
- assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=0.1,
- train_size=0.95)
- assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=11)
- assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10)
- assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3)
- assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j)
- assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None,
- train_size=None)
-
-
-def test_shufflesplit_reproducible():
- # Check that iterating twice on the ShuffleSplit gives the same
- # sequence of train-test when the random_state is given
- ss = cval.ShuffleSplit(10, random_state=21)
- assert_array_equal(list(a for a, b in ss), list(a for a, b in ss))
-
-
-def test_safe_split_with_precomputed_kernel():
- clf = SVC()
- clfp = SVC(kernel="precomputed")
-
- iris = load_iris()
- X, y = iris.data, iris.target
- K = np.dot(X, X.T)
-
- cv = cval.ShuffleSplit(X.shape[0], test_size=0.25, random_state=0)
- tr, te = list(cv)[0]
-
- X_tr, y_tr = cval._safe_split(clf, X, y, tr)
- K_tr, y_tr2 = cval._safe_split(clfp, K, y, tr)
- assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T))
-
- X_te, y_te = cval._safe_split(clf, X, y, te, tr)
- K_te, y_te2 = cval._safe_split(clfp, K, y, te, tr)
- assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T))
-
-
-def test_cross_val_score_allow_nans():
- # Check that cross_val_score allows input data with NaNs
- X = np.arange(200, dtype=np.float64).reshape(10, -1)
- X[2, :] = np.nan
- y = np.repeat([0, 1], X.shape[0] // 2)
- p = Pipeline([
- ('imputer', Imputer(strategy='mean', missing_values='NaN')),
- ('classifier', MockClassifier()),
- ])
- cval.cross_val_score(p, X, y, cv=5)
-
-
-def test_train_test_split_allow_nans():
- # Check that train_test_split allows input data with NaNs
- X = np.arange(200, dtype=np.float64).reshape(10, -1)
- X[2, :] = np.nan
- y = np.repeat([0, 1], X.shape[0] // 2)
- cval.train_test_split(X, y, test_size=0.2, random_state=42)
-
-
-def test_permutation_test_score_allow_nans():
- # Check that permutation_test_score allows input data with NaNs
- X = np.arange(200, dtype=np.float64).reshape(10, -1)
- X[2, :] = np.nan
- y = np.repeat([0, 1], X.shape[0] // 2)
- p = Pipeline([
- ('imputer', Imputer(strategy='mean', missing_values='NaN')),
- ('classifier', MockClassifier()),
- ])
- cval.permutation_test_score(p, X, y, cv=5)
-
-
-def test_check_cv_return_types():
- X = np.ones((9, 2))
- cv = cval.check_cv(3, X, classifier=False)
- assert_true(isinstance(cv, cval.KFold))
-
- y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
- cv = cval.check_cv(3, X, y_binary, classifier=True)
- assert_true(isinstance(cv, cval.StratifiedKFold))
-
- y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
- cv = cval.check_cv(3, X, y_multiclass, classifier=True)
- assert_true(isinstance(cv, cval.StratifiedKFold))
-
- X = np.ones((5, 2))
- y_multilabel = [[1, 0, 1], [1, 1, 0], [0, 0, 0], [0, 1, 1], [1, 0, 0]]
- cv = cval.check_cv(3, X, y_multilabel, classifier=True)
- assert_true(isinstance(cv, cval.KFold))
-
- y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
- cv = cval.check_cv(3, X, y_multioutput, classifier=True)
- assert_true(isinstance(cv, cval.KFold))
-
-
-def test_cross_val_score_multilabel():
- X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
- [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
- y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
- [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
- clf = KNeighborsClassifier(n_neighbors=1)
- scoring_micro = make_scorer(precision_score, average='micro')
- scoring_macro = make_scorer(precision_score, average='macro')
- scoring_samples = make_scorer(precision_score, average='samples')
- score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
- score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
- score_samples = cval.cross_val_score(clf, X, y,
- scoring=scoring_samples, cv=5)
- assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
- assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
- assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
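
As a small refresher on the averaging modes compared above (toy arrays, independent of the folds used in the test): micro-averaging pools true and false positives over all labels, while macro-averaging takes the unweighted mean of the per-label precision.

import numpy as np
from sklearn.metrics import precision_score

y_true = np.array([[1, 1], [0, 1]])
y_pred = np.array([[1, 0], [1, 1]])
print(precision_score(y_true, y_pred, average='micro'))  # 2 TP / 3 -> 0.667
print(precision_score(y_true, y_pred, average='macro'))  # mean(0.5, 1.0) = 0.75
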
-
-
-def test_cross_val_predict():
- boston = load_boston()
- X, y = boston.data, boston.target
- cv = cval.KFold(len(boston.target))
-
- est = Ridge()
-
- # Naive loop (should be same as cross_val_predict):
- preds2 = np.zeros_like(y)
- for train, test in cv:
- est.fit(X[train], y[train])
- preds2[test] = est.predict(X[test])
-
- preds = cval.cross_val_predict(est, X, y, cv=cv)
- assert_array_almost_equal(preds, preds2)
-
- preds = cval.cross_val_predict(est, X, y)
- assert_equal(len(preds), len(y))
-
- cv = cval.LeaveOneOut(len(y))
- preds = cval.cross_val_predict(est, X, y, cv=cv)
- assert_equal(len(preds), len(y))
-
- Xsp = X.copy()
- Xsp *= (Xsp > np.median(Xsp))
- Xsp = coo_matrix(Xsp)
- preds = cval.cross_val_predict(est, Xsp, y)
- assert_array_almost_equal(len(preds), len(y))
-
- preds = cval.cross_val_predict(KMeans(), X)
- assert_equal(len(preds), len(y))
-
- def bad_cv():
- for i in range(4):
- yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8])
-
- assert_raises(ValueError, cval.cross_val_predict, est, X, y, cv=bad_cv())
-
-
-def test_cross_val_predict_input_types():
- clf = Ridge()
- # Smoke test
- predictions = cval.cross_val_predict(clf, X, y)
- assert_equal(predictions.shape, (10,))
-
- # test with multioutput y
- with ignore_warnings(category=ConvergenceWarning):
- predictions = cval.cross_val_predict(clf, X_sparse, X)
- assert_equal(predictions.shape, (10, 2))
-
- predictions = cval.cross_val_predict(clf, X_sparse, y)
- assert_array_equal(predictions.shape, (10,))
-
- # test with multioutput y
- with ignore_warnings(category=ConvergenceWarning):
- predictions = cval.cross_val_predict(clf, X_sparse, X)
- assert_array_equal(predictions.shape, (10, 2))
-
- # test with X and y as list
- list_check = lambda x: isinstance(x, list)
- clf = CheckingClassifier(check_X=list_check)
- predictions = cval.cross_val_predict(clf, X.tolist(), y.tolist())
-
- clf = CheckingClassifier(check_y=list_check)
- predictions = cval.cross_val_predict(clf, X, y.tolist())
-
- # test with 3d X
- X_3d = X[:, :, np.newaxis]
- check_3d = lambda x: x.ndim == 3
- clf = CheckingClassifier(check_X=check_3d)
- predictions = cval.cross_val_predict(clf, X_3d, y)
- assert_array_equal(predictions.shape, (10,))
-
-
-def test_cross_val_predict_pandas():
- # check cross_val_predict doesn't destroy pandas dataframe
- types = [(MockDataFrame, MockDataFrame)]
- try:
- from pandas import Series, DataFrame
- types.append((Series, DataFrame))
- except ImportError:
- pass
- for TargetType, InputFeatureType in types:
- # X dataframe, y series
- X_df, y_ser = InputFeatureType(X), TargetType(y)
- check_df = lambda x: isinstance(x, InputFeatureType)
- check_series = lambda x: isinstance(x, TargetType)
- clf = CheckingClassifier(check_X=check_df, check_y=check_series)
- cval.cross_val_predict(clf, X_df, y_ser)
-
-
-def test_sparse_fit_params():
- iris = load_iris()
- X, y = iris.data, iris.target
- clf = MockClassifier()
- fit_params = {'sparse_sample_weight': coo_matrix(np.eye(X.shape[0]))}
- a = cval.cross_val_score(clf, X, y, fit_params=fit_params)
- assert_array_equal(a, np.ones(3))
-
-
-def test_check_is_partition():
- p = np.arange(100)
- assert_true(cval._check_is_partition(p, 100))
- assert_false(cval._check_is_partition(np.delete(p, 23), 100))
-
- p[0] = 23
- assert_false(cval._check_is_partition(p, 100))
-
-
-def test_cross_val_predict_sparse_prediction():
- # check that cross_val_predict gives same result for sparse and dense input
- X, y = make_multilabel_classification(n_classes=2, n_labels=1,
- allow_unlabeled=False,
- return_indicator=True,
- random_state=1)
- X_sparse = csr_matrix(X)
- y_sparse = csr_matrix(y)
- classif = OneVsRestClassifier(SVC(kernel='linear'))
- preds = cval.cross_val_predict(classif, X, y, cv=10)
- preds_sparse = cval.cross_val_predict(classif, X_sparse, y_sparse, cv=10)
- preds_sparse = preds_sparse.toarray()
- assert_array_almost_equal(preds_sparse, preds)
From 6dfe9aa732a6860ea0d24489b62efe98b289cd06 Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Thu, 9 Nov 2017 15:44:44 +0100
Subject: [PATCH 02/36] Fix imports (from cross_validation module to
model_selection module)
---
sklearn/feature_selection/rfe.py | 3 ++-
sklearn/grid_search.py | 4 ++--
sklearn/learning_curve.py | 5 +++--
sklearn/tests/test_grid_search.py | 2 +-
sklearn/tests/test_learning_curve.py | 2 +-
5 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py
index 5bde9e57c3f9f..576c872982f5a 100644
--- a/sklearn/feature_selection/rfe.py
+++ b/sklearn/feature_selection/rfe.py
@@ -9,6 +9,7 @@
import numpy as np
from ..utils import check_X_y, safe_sqr
from ..utils.metaestimators import if_delegate_has_method
+from ..utils.metaestimators import _safe_split
from ..utils.validation import check_is_fitted
from ..base import BaseEstimator
from ..base import MetaEstimatorMixin
@@ -16,7 +17,7 @@
from ..base import is_classifier
from ..externals.joblib import Parallel, delayed
from ..model_selection import check_cv
-from ..model_selection._validation import _safe_split, _score
+from ..model_selection._validation import _score
from ..metrics.scorer import check_scoring
from .base import SelectorMixin
diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py
index 76cdaa7cb1de5..e36d22c501621 100644
--- a/sklearn/grid_search.py
+++ b/sklearn/grid_search.py
@@ -21,8 +21,8 @@
from .base import BaseEstimator, is_classifier, clone
from .base import MetaEstimatorMixin
-from .cross_validation import check_cv
-from .cross_validation import _fit_and_score
+from .model_selection import check_cv
+from .model_selection._validation import _fit_and_score
from .externals.joblib import Parallel, delayed
from .externals import six
from .utils import check_random_state
diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py
index 5571138d68d83..0bb24046680ec 100644
--- a/sklearn/learning_curve.py
+++ b/sklearn/learning_curve.py
@@ -9,9 +9,10 @@
import numpy as np
from .base import is_classifier, clone
-from .cross_validation import check_cv
+from .model_selection import check_cv
from .externals.joblib import Parallel, delayed
-from .cross_validation import _safe_split, _score, _fit_and_score
+from .utils.metaestimators import _safe_split
+from .model_selection._validation import _fit_and_score, _score
from .metrics.scorer import check_scoring
from .utils import indexable
diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py
index f3c003e8c5be5..3605da1613e13 100644
--- a/sklearn/tests/test_grid_search.py
+++ b/sklearn/tests/test_grid_search.py
@@ -45,12 +45,12 @@
from sklearn.linear_model import Ridge
from sklearn.exceptions import FitFailedWarning
+from sklearn.model_selection import KFold, StratifiedKFold
with warnings.catch_warnings():
warnings.simplefilter('ignore')
from sklearn.grid_search import (GridSearchCV, RandomizedSearchCV,
ParameterGrid, ParameterSampler)
- from sklearn.cross_validation import KFold, StratifiedKFold
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py
index afaae84b92b04..d75e6bc82f6b3 100644
--- a/sklearn/tests/test_learning_curve.py
+++ b/sklearn/tests/test_learning_curve.py
@@ -14,11 +14,11 @@
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_false
from sklearn.datasets import make_classification
+from sklearn.model_selection import KFold
with warnings.catch_warnings():
warnings.simplefilter('ignore')
from sklearn.learning_curve import learning_curve, validation_curve
- from sklearn.cross_validation import KFold
from sklearn.linear_model import PassiveAggressiveClassifier
From af424240be12734ef2a365fb4205892d32acd72d Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Thu, 9 Nov 2017 15:45:40 +0100
Subject: [PATCH 03/36] Remove tests checking old implementation
---
sklearn/model_selection/tests/test_split.py | 26 ---------------------
1 file changed, 26 deletions(-)
diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py
index 3f54aaf3c66fc..0071129d8ce73 100644
--- a/sklearn/model_selection/tests/test_split.py
+++ b/sklearn/model_selection/tests/test_split.py
@@ -1210,36 +1210,10 @@ def test_check_cv():
cv = check_cv(3, y_multioutput, classifier=True)
np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))
- # Check if the old style classes are wrapped to have a split method
- X = np.ones(9)
- y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
- cv1 = check_cv(3, y_multiclass, classifier=True)
-
- with warnings.catch_warnings(record=True):
- from sklearn.cross_validation import StratifiedKFold as OldSKF
-
- cv2 = check_cv(OldSKF(y_multiclass, n_folds=3))
- np.testing.assert_equal(list(cv1.split(X, y_multiclass)),
- list(cv2.split()))
-
assert_raises(ValueError, check_cv, cv="lolo")
def test_cv_iterable_wrapper():
- y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
-
- with warnings.catch_warnings(record=True):
- from sklearn.cross_validation import StratifiedKFold as OldSKF
-
- cv = OldSKF(y_multiclass, n_folds=3)
- wrapped_old_skf = _CVIterableWrapper(cv)
-
- # Check if split works correctly
- np.testing.assert_equal(list(cv), list(wrapped_old_skf.split()))
-
- # Check if get_n_splits works correctly
- assert_equal(len(cv), wrapped_old_skf.get_n_splits())
-
kf_iter = KFold(n_splits=5).split(X, y)
kf_iter_wrapped = check_cv(kf_iter)
# Since the wrapped iterable is enlisted and stored,
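The surviving part of test_cv_iterable_wrapper still exercises the iterable-wrapping path; a hedged sketch of that behaviour with illustrative data:

    import numpy as np
    from sklearn.model_selection import KFold, check_cv

    X, y = np.ones((10, 2)), np.arange(10) % 2
    kf_iter = KFold(n_splits=5).split(X, y)
    cv = check_cv(kf_iter)      # wraps and stores the yielded splits
    print(cv.get_n_splits())    # 5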
From 2362011efcbf6651ed6ce4c3cea2cafab67857e2 Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Thu, 9 Nov 2017 16:47:37 +0100
Subject: [PATCH 04/36] Remove grid_search and learning_curve also deprecated
---
sklearn/__init__.py | 13 +-
sklearn/grid_search.py | 1046 --------------------------
sklearn/learning_curve.py | 361 ---------
sklearn/tests/test_grid_search.py | 815 --------------------
sklearn/tests/test_learning_curve.py | 312 --------
5 files changed, 6 insertions(+), 2541 deletions(-)
delete mode 100644 sklearn/grid_search.py
delete mode 100644 sklearn/learning_curve.py
delete mode 100644 sklearn/tests/test_grid_search.py
delete mode 100644 sklearn/tests/test_learning_curve.py
diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index 27879e16be363..4c1f6f8e829e0 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -137,13 +137,12 @@ def config_context(**new_config):
__all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition',
'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions',
'externals', 'feature_extraction', 'feature_selection',
- 'gaussian_process', 'grid_search', 'isotonic',
- 'kernel_approximation', 'kernel_ridge', 'learning_curve',
- 'linear_model', 'manifold', 'metrics', 'mixture',
- 'model_selection', 'multiclass', 'multioutput', 'naive_bayes',
- 'neighbors', 'neural_network', 'pipeline', 'preprocessing',
- 'random_projection', 'semi_supervised', 'svm', 'tree',
- 'discriminant_analysis',
+ 'gaussian_process', 'isotonic', 'kernel_approximation',
+ 'kernel_ridge', 'linear_model', 'manifold', 'metrics',
+ 'mixture', 'model_selection', 'multiclass', 'multioutput',
+ 'naive_bayes', 'neighbors', 'neural_network', 'pipeline',
+ 'preprocessing', 'random_projection', 'semi_supervised', 'svm',
+ 'tree', 'discriminant_analysis',
# Non-modules:
'clone']
diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py
deleted file mode 100644
index e36d22c501621..0000000000000
--- a/sklearn/grid_search.py
+++ /dev/null
@@ -1,1046 +0,0 @@
-"""
-The :mod:`sklearn.grid_search` includes utilities to fine-tune the parameters
-of an estimator.
-"""
-from __future__ import print_function
-
-# Author: Alexandre Gramfort ,
-# Gael Varoquaux
-# Andreas Mueller
-# Olivier Grisel
-# License: BSD 3 clause
-
-from abc import ABCMeta, abstractmethod
-from collections import Mapping, namedtuple, Sized
-from functools import partial, reduce
-from itertools import product
-import operator
-import warnings
-
-import numpy as np
-
-from .base import BaseEstimator, is_classifier, clone
-from .base import MetaEstimatorMixin
-from .model_selection import check_cv
-from .model_selection._validation import _fit_and_score
-from .externals.joblib import Parallel, delayed
-from .externals import six
-from .utils import check_random_state
-from .utils.random import sample_without_replacement
-from .utils.validation import _num_samples, indexable
-from .utils.metaestimators import if_delegate_has_method
-from .metrics.scorer import check_scoring
-
-
-__all__ = ['GridSearchCV', 'ParameterGrid', 'fit_grid_point',
- 'ParameterSampler', 'RandomizedSearchCV']
-
-
-warnings.warn("This module was deprecated in version 0.18 in favor of the "
- "model_selection module into which all the refactored classes "
- "and functions are moved. This module will be removed in 0.20.",
- DeprecationWarning)
-
-
-class ParameterGrid(object):
- """Grid of parameters with a discrete number of values for each.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.ParameterGrid` instead.
-
- Can be used to iterate over parameter value combinations with the
- Python built-in function iter.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- param_grid : dict of string to sequence, or sequence of such
- The parameter grid to explore, as a dictionary mapping estimator
- parameters to sequences of allowed values.
-
- An empty dict signifies default parameters.
-
- A sequence of dicts signifies a sequence of grids to search, and is
- useful to avoid exploring parameter combinations that make no sense
- or have no effect. See the examples below.
-
- Examples
- --------
- >>> from sklearn.grid_search import ParameterGrid
- >>> param_grid = {'a': [1, 2], 'b': [True, False]}
- >>> list(ParameterGrid(param_grid)) == (
- ... [{'a': 1, 'b': True}, {'a': 1, 'b': False},
- ... {'a': 2, 'b': True}, {'a': 2, 'b': False}])
- True
-
- >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}]
- >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'},
- ... {'kernel': 'rbf', 'gamma': 1},
- ... {'kernel': 'rbf', 'gamma': 10}]
- True
- >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1}
- True
-
- See also
- --------
- :class:`GridSearchCV`:
- uses ``ParameterGrid`` to perform a full parallelized parameter search.
- """
-
- def __init__(self, param_grid):
- if isinstance(param_grid, Mapping):
- # wrap dictionary in a singleton list to support either dict
- # or list of dicts
- param_grid = [param_grid]
- self.param_grid = param_grid
-
- def __iter__(self):
- """Iterate over the points in the grid.
-
- Returns
- -------
- params : iterator over dict of string to any
- Yields dictionaries mapping each estimator parameter to one of its
- allowed values.
- """
- for p in self.param_grid:
- # Always sort the keys of a dictionary, for reproducibility
- items = sorted(p.items())
- if not items:
- yield {}
- else:
- keys, values = zip(*items)
- for v in product(*values):
- params = dict(zip(keys, v))
- yield params
-
- def __len__(self):
- """Number of points on the grid."""
- # Product function that can handle iterables (np.product can't).
- product = partial(reduce, operator.mul)
- return sum(product(len(v) for v in p.values()) if p else 1
- for p in self.param_grid)
-
- def __getitem__(self, ind):
- """Get the parameters that would be ``ind``th in iteration
-
- Parameters
- ----------
- ind : int
- The iteration index
-
- Returns
- -------
- params : dict of string to any
- Equal to list(self)[ind]
- """
- # This is used to make discrete sampling without replacement memory
- # efficient.
- for sub_grid in self.param_grid:
- # XXX: could memoize information used here
- if not sub_grid:
- if ind == 0:
- return {}
- else:
- ind -= 1
- continue
-
- # Reverse so the most frequently cycling parameter comes first
- keys, values_lists = zip(*sorted(sub_grid.items())[::-1])
- sizes = [len(v_list) for v_list in values_lists]
- total = np.product(sizes)
-
- if ind >= total:
- # Try the next grid
- ind -= total
- else:
- out = {}
- for key, v_list, n in zip(keys, values_lists, sizes):
- ind, offset = divmod(ind, n)
- out[key] = v_list[offset]
- return out
-
- raise IndexError('ParameterGrid index out of range')
-
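The divmod loop in __getitem__ above is a mixed-radix decoding of the index; the same arithmetic in isolation (a standalone sketch with made-up values):

    sub_grid = {'a': [1, 2], 'b': [True, False, None]}
    # Reverse-sorted keys: the most frequently cycling parameter first.
    keys, values_lists = zip(*sorted(sub_grid.items())[::-1])
    ind, out = 4, {}
    for key, v_list in zip(keys, values_lists):
        ind, offset = divmod(ind, len(v_list))
        out[key] = v_list[offset]
    print(out)  # {'b': False, 'a': 2}, i.e. list(ParameterGrid(sub_grid))[4]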
-
-class ParameterSampler(object):
- """Generator on parameters sampled from given distributions.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.ParameterSampler` instead.
-
- Non-deterministic iterable over random candidate combinations for hyper-
- parameter search. If all parameters are presented as a list,
- sampling without replacement is performed. If at least one parameter
- is given as a distribution, sampling with replacement is used.
- It is highly recommended to use continuous distributions for continuous
- parameters.
-
- Note that as of SciPy 0.12, the ``scipy.stats.distributions`` do not accept
- a custom RNG instance and always use the singleton RNG from
- ``numpy.random``. Hence setting ``random_state`` will not guarantee a
- deterministic iteration whenever ``scipy.stats`` distributions are used to
- define the parameter search space.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- param_distributions : dict
- Dictionary where the keys are parameters and values
- are distributions from which a parameter is to be sampled.
- Distributions either have to provide a ``rvs`` function
- to sample from them, or can be given as a list of values,
- where a uniform distribution is assumed.
-
- n_iter : integer
- Number of parameter settings that are produced.
-
- random_state : int, RandomState instance or None, optional (default=None)
- Pseudo random number generator state used for random uniform sampling
- from lists of possible values instead of scipy.stats distributions.
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- Returns
- -------
- params : dict of string to any
- **Yields** dictionaries mapping each estimator parameter to
- a sampled value.
-
- Examples
- --------
- >>> from sklearn.grid_search import ParameterSampler
- >>> from scipy.stats.distributions import expon
- >>> import numpy as np
- >>> np.random.seed(0)
- >>> param_grid = {'a':[1, 2], 'b': expon()}
- >>> param_list = list(ParameterSampler(param_grid, n_iter=4))
- >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items())
- ... for d in param_list]
- >>> rounded_list == [{'b': 0.89856, 'a': 1},
- ... {'b': 0.923223, 'a': 1},
- ... {'b': 1.878964, 'a': 2},
- ... {'b': 1.038159, 'a': 2}]
- True
- """
- def __init__(self, param_distributions, n_iter, random_state=None):
- self.param_distributions = param_distributions
- self.n_iter = n_iter
- self.random_state = random_state
-
- def __iter__(self):
- # check if all distributions are given as lists
- # in this case we want to sample without replacement
- all_lists = np.all([not hasattr(v, "rvs")
- for v in self.param_distributions.values()])
- rnd = check_random_state(self.random_state)
-
- if all_lists:
- # look up sampled parameter settings in parameter grid
- param_grid = ParameterGrid(self.param_distributions)
- grid_size = len(param_grid)
-
- if grid_size < self.n_iter:
- raise ValueError(
- "The total space of parameters %d is smaller "
- "than n_iter=%d." % (grid_size, self.n_iter)
- + " For exhaustive searches, use GridSearchCV.")
- for i in sample_without_replacement(grid_size, self.n_iter,
- random_state=rnd):
- yield param_grid[i]
-
- else:
- # Always sort the keys of a dictionary, for reproducibility
- items = sorted(self.param_distributions.items())
- for _ in six.moves.range(self.n_iter):
- params = dict()
- for k, v in items:
- if hasattr(v, "rvs"):
- params[k] = v.rvs()
- else:
- params[k] = v[rnd.randint(len(v))]
- yield params
-
- def __len__(self):
- """Number of points that will be sampled."""
- return self.n_iter
-
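The two sampling regimes described in the docstring, shown against the model_selection counterpart (a hedged sketch; the parameter values are arbitrary):

    from scipy.stats import expon
    from sklearn.model_selection import ParameterSampler

    lists_only = {'a': [1, 2, 3]}            # sampled without replacement
    mixed = {'a': [1, 2, 3], 'b': expon()}   # sampled with replacement
    print(len(list(ParameterSampler(lists_only, 3, random_state=0))))  # 3
    print(len(list(ParameterSampler(mixed, 5, random_state=0))))       # 5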
-
-def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
- verbose, error_score='raise', **fit_params):
- """Run fit on one set of parameters.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :func:`sklearn.model_selection.fit_grid_point` instead.
-
- Parameters
- ----------
- X : array-like, sparse matrix or list
- Input data.
-
- y : array-like or None
- Targets for input data.
-
- estimator : estimator object
- An object of that type is instantiated for each grid point.
- This is assumed to implement the scikit-learn estimator interface.
- Either estimator needs to provide a ``score`` function,
- or ``scoring`` must be passed.
-
- parameters : dict
- Parameters to be set on estimator for this grid point.
-
- train : ndarray, dtype int or bool
- Boolean mask or indices for training set.
-
- test : ndarray, dtype int or bool
- Boolean mask or indices for test set.
-
- scorer : callable or None.
- If provided must be a scorer callable object / function with signature
- ``scorer(estimator, X, y)``.
-
- verbose : int
- Verbosity level.
-
- error_score : 'raise' (default) or numeric
- Value to assign to the score if an error occurs in estimator fitting.
- If set to 'raise', the error is raised. If a numeric value is given,
- FitFailedWarning is raised. This parameter does not affect the refit
- step, which will always raise the error.
-
- **fit_params : kwargs
- Additional parameters passed to the fit function of the estimator.
-
- Returns
- -------
- score : float
- Score of this parameter setting on given training / test split.
-
- parameters : dict
- The parameters that have been evaluated.
-
- n_samples_test : int
- Number of test samples in this split.
- """
- score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train,
- test, verbose, parameters,
- fit_params, error_score)
- return score, parameters, n_samples_test
-
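A minimal usage sketch of the model_selection replacement named in the deprecation note; the estimator, split and scorer below are illustrative assumptions:

    import numpy as np
    from sklearn.metrics import make_scorer, accuracy_score
    from sklearn.model_selection import fit_grid_point
    from sklearn.svm import LinearSVC

    X = np.random.RandomState(0).randn(20, 3)
    y = np.arange(20) % 2
    train, test = np.arange(15), np.arange(15, 20)
    score, params, n_test = fit_grid_point(
        X, y, LinearSVC(random_state=0), {'C': 1.0}, train, test,
        make_scorer(accuracy_score), verbose=0)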
-
-def _check_param_grid(param_grid):
- if hasattr(param_grid, 'items'):
- param_grid = [param_grid]
-
- for p in param_grid:
- for name, v in p.items():
- if isinstance(v, np.ndarray) and v.ndim > 1:
- raise ValueError("Parameter array should be one-dimensional.")
-
- check = [isinstance(v, k) for k in (list, tuple, np.ndarray)]
- if True not in check:
- raise ValueError("Parameter values for parameter ({0}) need "
- "to be a sequence.".format(name))
-
- if len(v) == 0:
- raise ValueError("Parameter values for parameter ({0}) need "
- "to be a non-empty sequence.".format(name))
-
-
-class _CVScoreTuple (namedtuple('_CVScoreTuple',
- ('parameters',
- 'mean_validation_score',
- 'cv_validation_scores'))):
- # A raw namedtuple is very memory efficient: it packs the attributes
- # into a struct and so avoids a per-instance __dict__; in particular it
- # does not copy the key strings onto each instance.
- # Deriving from the namedtuple just to introduce the __repr__ method
- # would reintroduce the __dict__, so we tell the Python interpreter
- # that this subclass uses static __slots__ instead of dynamic
- # attributes. We need no additional slot in the subclass, so __slots__
- # is set to the empty tuple.
- __slots__ = ()
-
- def __repr__(self):
- """Simple custom repr to summarize the main info"""
- return "mean: {0:.5f}, std: {1:.5f}, params: {2}".format(
- self.mean_validation_score,
- np.std(self.cv_validation_scores),
- self.parameters)
-
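The __slots__ trick explained in the comment above, in isolation: subclassing a namedtuple normally re-adds a per-instance __dict__, and an empty __slots__ prevents that (a standalone sketch):

    from collections import namedtuple

    Base = namedtuple('Base', ('a', 'b'))

    class WithRepr(Base):
        __slots__ = ()   # keep instances as compact as the raw namedtuple

        def __repr__(self):
            return 'a=%r, b=%r' % (self.a, self.b)

    t = WithRepr(1, 2)
    print(hasattr(t, '__dict__'))  # False: no per-instance dict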
-
-class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator,
- MetaEstimatorMixin)):
- """Base class for hyper parameter search with cross-validation."""
-
- @abstractmethod
- def __init__(self, estimator, scoring=None,
- fit_params=None, n_jobs=1, iid=True,
- refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
- error_score='raise'):
-
- self.scoring = scoring
- self.estimator = estimator
- self.n_jobs = n_jobs
- self.fit_params = fit_params if fit_params is not None else {}
- self.iid = iid
- self.refit = refit
- self.cv = cv
- self.verbose = verbose
- self.pre_dispatch = pre_dispatch
- self.error_score = error_score
-
- @property
- def _estimator_type(self):
- return self.estimator._estimator_type
-
- @property
- def classes_(self):
- return self.best_estimator_.classes_
-
- def score(self, X, y=None):
- """Returns the score on the given data, if the estimator has been refit.
-
- This uses the score defined by ``scoring`` where provided, and the
- ``best_estimator_.score`` method otherwise.
-
- Parameters
- ----------
- X : array-like, shape = [n_samples, n_features]
- Input data, where n_samples is the number of samples and
- n_features is the number of features.
-
- y : array-like, shape = [n_samples] or [n_samples, n_output], optional
- Target relative to X for classification or regression;
- None for unsupervised learning.
-
- Returns
- -------
- score : float
-
- Notes
- -----
- The long-standing behavior of this method changed in version 0.16: it
- no longer uses the metric provided by ``estimator.score`` if the
- ``scoring`` parameter was set when fitting.
-
- """
- if self.scorer_ is None:
- raise ValueError("No score function explicitly defined, "
- "and the estimator doesn't provide one %s"
- % self.best_estimator_)
- return self.scorer_(self.best_estimator_, X, y)
-
- @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
- def predict(self, X):
- """Call predict on the estimator with the best found parameters.
-
- Only available if ``refit=True`` and the underlying estimator supports
- ``predict``.
-
- Parameters
- -----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
-
- """
- return self.best_estimator_.predict(X)
-
- @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
- def predict_proba(self, X):
- """Call predict_proba on the estimator with the best found parameters.
-
- Only available if ``refit=True`` and the underlying estimator supports
- ``predict_proba``.
-
- Parameters
- -----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
-
- """
- return self.best_estimator_.predict_proba(X)
-
- @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
- def predict_log_proba(self, X):
- """Call predict_log_proba on the estimator with the best found parameters.
-
- Only available if ``refit=True`` and the underlying estimator supports
- ``predict_log_proba``.
-
- Parameters
- -----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
-
- """
- return self.best_estimator_.predict_log_proba(X)
-
- @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
- def decision_function(self, X):
- """Call decision_function on the estimator with the best found parameters.
-
- Only available if ``refit=True`` and the underlying estimator supports
- ``decision_function``.
-
- Parameters
- -----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
-
- """
- return self.best_estimator_.decision_function(X)
-
- @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
- def transform(self, X):
- """Call transform on the estimator with the best found parameters.
-
- Only available if the underlying estimator supports ``transform`` and
- ``refit=True``.
-
- Parameters
- -----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
-
- """
- return self.best_estimator_.transform(X)
-
- @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
- def inverse_transform(self, Xt):
- """Call inverse_transform on the estimator with the best found parameters.
-
- Only available if the underlying estimator implements ``inverse_transform`` and
- ``refit=True``.
-
- Parameters
- -----------
- Xt : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
-
- """
- return self.best_estimator_.inverse_transform(Xt)
-
- def _fit(self, X, y, parameter_iterable):
- """Actual fitting, performing the search over parameters."""
-
- estimator = self.estimator
- cv = self.cv
- self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
-
- n_samples = _num_samples(X)
- X, y = indexable(X, y)
-
- if y is not None:
- if len(y) != n_samples:
- raise ValueError('Target variable (y) has a different number '
- 'of samples (%i) than data (X: %i samples)'
- % (len(y), n_samples))
- cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
-
- if self.verbose > 0:
- if isinstance(parameter_iterable, Sized):
- n_candidates = len(parameter_iterable)
- print("Fitting {0} folds for each of {1} candidates, totalling"
- " {2} fits".format(len(cv), n_candidates,
- n_candidates * len(cv)))
-
- base_estimator = clone(self.estimator)
-
- pre_dispatch = self.pre_dispatch
-
- out = Parallel(
- n_jobs=self.n_jobs, verbose=self.verbose,
- pre_dispatch=pre_dispatch
- )(
- delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
- train, test, self.verbose, parameters,
- self.fit_params, return_parameters=True,
- error_score=self.error_score)
- for parameters in parameter_iterable
- for train, test in cv)
-
- # out is a list of quadruplets: score, n_test_samples, time, parameters
- n_fits = len(out)
- n_folds = len(cv)
-
- scores = list()
- grid_scores = list()
- for grid_start in range(0, n_fits, n_folds):
- n_test_samples = 0
- score = 0
- all_scores = []
- for this_score, this_n_test_samples, _, parameters in \
- out[grid_start:grid_start + n_folds]:
- all_scores.append(this_score)
- if self.iid:
- this_score *= this_n_test_samples
- n_test_samples += this_n_test_samples
- score += this_score
- if self.iid:
- score /= float(n_test_samples)
- else:
- score /= float(n_folds)
- scores.append((score, parameters))
- # TODO: shall we also store the test_fold_sizes?
- grid_scores.append(_CVScoreTuple(
- parameters,
- score,
- np.array(all_scores)))
- # Store the computed scores
- self.grid_scores_ = grid_scores
-
- # Find the best parameters by comparing on the mean validation score:
- # note that `sorted` is deterministic in the way it breaks ties
- best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
- reverse=True)[0]
- self.best_params_ = best.parameters
- self.best_score_ = best.mean_validation_score
-
- if self.refit:
- # fit the best estimator using the entire dataset
- # clone first to work around broken estimators
- best_estimator = clone(base_estimator).set_params(
- **best.parameters)
- if y is not None:
- best_estimator.fit(X, y, **self.fit_params)
- else:
- best_estimator.fit(X, **self.fit_params)
- self.best_estimator_ = best_estimator
- return self
-
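The iid weighting in _fit reduces to the following arithmetic (stand-in fold scores and sizes):

    fold_scores = [0.8, 0.9, 1.0]
    fold_sizes = [10, 20, 30]
    # iid=True: weight each fold's score by its test-set size.
    iid = sum(s * n for s, n in zip(fold_scores, fold_sizes)) / sum(fold_sizes)
    mean = sum(fold_scores) / len(fold_scores)
    print(iid, mean)  # 0.9333... vs 0.9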
-
-class GridSearchCV(BaseSearchCV):
- """Exhaustive search over specified parameter values for an estimator.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.GridSearchCV` instead.
-
- Important members are fit, predict.
-
- GridSearchCV implements a "fit" and a "score" method.
- It also implements "predict", "predict_proba", "decision_function",
- "transform" and "inverse_transform" if they are implemented in the
- estimator used.
-
- The parameters of the estimator used to apply these methods are optimized
- by cross-validated grid-search over a parameter grid.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- estimator : estimator object.
- An object of that type is instantiated for each grid point.
- This is assumed to implement the scikit-learn estimator interface.
- Either estimator needs to provide a ``score`` function,
- or ``scoring`` must be passed.
-
- param_grid : dict or list of dictionaries
- Dictionary with parameters names (string) as keys and lists of
- parameter settings to try as values, or a list of such
- dictionaries, in which case the grids spanned by each dictionary
- in the list are explored. This enables searching over any sequence
- of parameter settings.
-
- scoring : string, callable or None, default=None
- A string (see model evaluation documentation) or
- a scorer callable object / function with signature
- ``scorer(estimator, X, y)``.
- If ``None``, the ``score`` method of the estimator is used.
-
- fit_params : dict, optional
- Parameters to pass to the fit method.
-
- n_jobs : int, default: 1
- The maximum number of estimators fit in parallel.
-
- - If -1 all CPUs are used.
-
- - If 1 is given, no parallel computing code is used at all,
- which is useful for debugging.
-
- - For ``n_jobs`` below -1, ``(n_cpus + n_jobs + 1)`` are used.
- For example, with ``n_jobs = -2`` all CPUs but one are used.
-
- .. versionchanged:: 0.17
- Upgraded to joblib 0.9.3.
-
- pre_dispatch : int, or string, optional
- Controls the number of jobs that get dispatched during parallel
- execution. Reducing this number can be useful to avoid an
- explosion of memory consumption when more jobs get dispatched
- than CPUs can process. This parameter can be:
-
- - None, in which case all the jobs are immediately
- created and spawned. Use this for lightweight and
- fast-running jobs, to avoid delays due to on-demand
- spawning of the jobs
-
- - An int, giving the exact number of total jobs that are
- spawned
-
- - A string, giving an expression as a function of n_jobs,
- as in '2*n_jobs'
-
- iid : boolean, default=True
- If True, the data is assumed to be identically distributed across
- the folds, and the loss minimized is the total loss per sample,
- and not the mean loss across the folds.
-
- cv : int, cross-validation generator or an iterable, optional
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
-
- - None, to use the default 3-fold cross-validation,
- - integer, to specify the number of folds.
- - An object to be used as a cross-validation generator.
- - An iterable yielding train/test splits.
-
- For integer/None inputs, if the estimator is a classifier and ``y`` is
- either binary or multiclass,
- :class:`sklearn.model_selection.StratifiedKFold` is used. In all
- other cases, :class:`sklearn.model_selection.KFold` is used.
-
- Refer :ref:`User Guide ` for the various
- cross-validation strategies that can be used here.
-
- refit : boolean, default=True
- Refit the best estimator with the entire dataset.
- If "False", it is impossible to make predictions using
- this GridSearchCV instance after fitting.
-
- verbose : integer
- Controls the verbosity: the higher, the more messages.
-
- error_score : 'raise' (default) or numeric
- Value to assign to the score if an error occurs in estimator fitting.
- If set to 'raise', the error is raised. If a numeric value is given,
- FitFailedWarning is raised. This parameter does not affect the refit
- step, which will always raise the error.
-
-
- Examples
- --------
- >>> from sklearn import svm, grid_search, datasets
- >>> iris = datasets.load_iris()
- >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
- >>> svr = svm.SVC()
- >>> clf = grid_search.GridSearchCV(svr, parameters)
- >>> clf.fit(iris.data, iris.target)
- ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
- GridSearchCV(cv=None, error_score=...,
- estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=...,
- decision_function_shape='ovr', degree=..., gamma=...,
- kernel='rbf', max_iter=-1, probability=False,
- random_state=None, shrinking=True, tol=...,
- verbose=False),
- fit_params={}, iid=..., n_jobs=1,
- param_grid=..., pre_dispatch=..., refit=...,
- scoring=..., verbose=...)
-
-
- Attributes
- ----------
- grid_scores_ : list of named tuples
- Contains scores for all parameter combinations in param_grid.
- Each entry corresponds to one parameter setting.
- Each named tuple has the attributes:
-
- * ``parameters``, a dict of parameter settings
- * ``mean_validation_score``, the mean score over the
- cross-validation folds
- * ``cv_validation_scores``, the list of scores for each fold
-
- best_estimator_ : estimator
- Estimator that was chosen by the search, i.e. the estimator
- which gave the highest score (or smallest loss if specified)
- on the left-out data. Not available if refit=False.
-
- best_score_ : float
- Score of best_estimator on the left out data.
-
- best_params_ : dict
- Parameter setting that gave the best results on the hold out data.
-
- scorer_ : function
- Scorer function used on the held out data to choose the best
- parameters for the model.
-
- Notes
- ------
- The parameters selected are those that maximize the score of the left out
- data, unless an explicit score is passed in which case it is used instead.
-
- If `n_jobs` was set to a value higher than one, the data is copied for each
- point in the grid (and not `n_jobs` times). This is done for efficiency
- reasons if individual jobs take very little time, but may raise errors if
- the dataset is large and not enough memory is available. A workaround in
- this case is to set `pre_dispatch`. Then, the memory is copied only
- `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
- n_jobs`.
-
- See Also
- ---------
- :class:`ParameterGrid`:
- generates all the combinations of a hyperparameter grid.
-
- :func:`sklearn.cross_validation.train_test_split`:
- utility function to split the data into a development set usable
- for fitting a GridSearchCV instance and an evaluation set for
- its final evaluation.
-
- :func:`sklearn.metrics.make_scorer`:
- Make a scorer from a performance metric or loss function.
-
- """
-
- def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
- n_jobs=1, iid=True, refit=True, cv=None, verbose=0,
- pre_dispatch='2*n_jobs', error_score='raise'):
-
- super(GridSearchCV, self).__init__(
- estimator, scoring, fit_params, n_jobs, iid,
- refit, cv, verbose, pre_dispatch, error_score)
- self.param_grid = param_grid
- _check_param_grid(param_grid)
-
- def fit(self, X, y=None):
- """Run fit with all sets of parameters.
-
- Parameters
- ----------
-
- X : array-like, shape = [n_samples, n_features]
- Training vector, where n_samples is the number of samples and
- n_features is the number of features.
-
- y : array-like, shape = [n_samples] or [n_samples, n_output], optional
- Target relative to X for classification or regression;
- None for unsupervised learning.
-
- """
- return self._fit(X, y, ParameterGrid(self.param_grid))
-
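A hedged usage sketch of the model_selection.GridSearchCV replacement that the deprecation note points to, mirroring the doctest above:

    from sklearn import datasets, svm
    from sklearn.model_selection import GridSearchCV

    iris = datasets.load_iris()
    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
    clf = GridSearchCV(svm.SVC(), parameters)
    clf.fit(iris.data, iris.target)
    print(clf.best_params_)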
-
-class RandomizedSearchCV(BaseSearchCV):
- """Randomized search on hyper parameters.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.RandomizedSearchCV` instead.
-
- RandomizedSearchCV implements a "fit" and a "score" method.
- It also implements "predict", "predict_proba", "decision_function",
- "transform" and "inverse_transform" if they are implemented in the
- estimator used.
-
- The parameters of the estimator used to apply these methods are optimized
- by cross-validated search over parameter settings.
-
- In contrast to GridSearchCV, not all parameter values are tried out, but
- rather a fixed number of parameter settings is sampled from the specified
- distributions. The number of parameter settings that are tried is
- given by n_iter.
-
- If all parameters are presented as a list,
- sampling without replacement is performed. If at least one parameter
- is given as a distribution, sampling with replacement is used.
- It is highly recommended to use continuous distributions for continuous
- parameters.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- estimator : estimator object.
- An object of that type is instantiated for each grid point.
- This is assumed to implement the scikit-learn estimator interface.
- Either estimator needs to provide a ``score`` function,
- or ``scoring`` must be passed.
-
- param_distributions : dict
- Dictionary with parameters names (string) as keys and distributions
- or lists of parameters to try. Distributions must provide a ``rvs``
- method for sampling (such as those from scipy.stats.distributions).
- If a list is given, it is sampled uniformly.
-
- n_iter : int, default=10
- Number of parameter settings that are sampled. n_iter trades
- off runtime vs quality of the solution.
-
- scoring : string, callable or None, default=None
- A string (see model evaluation documentation) or
- a scorer callable object / function with signature
- ``scorer(estimator, X, y)``.
- If ``None``, the ``score`` method of the estimator is used.
-
- fit_params : dict, optional
- Parameters to pass to the fit method.
-
- n_jobs : int, default: 1
- The maximum number of estimators fit in parallel.
-
- - If -1 all CPUs are used.
-
- - If 1 is given, no parallel computing code is used at all,
- which is useful for debugging.
-
- - For ``n_jobs`` below -1, ``(n_cpus + n_jobs + 1)`` are used.
- For example, with ``n_jobs = -2`` all CPUs but one are used.
-
- pre_dispatch : int, or string, optional
- Controls the number of jobs that get dispatched during parallel
- execution. Reducing this number can be useful to avoid an
- explosion of memory consumption when more jobs get dispatched
- than CPUs can process. This parameter can be:
-
- - None, in which case all the jobs are immediately
- created and spawned. Use this for lightweight and
- fast-running jobs, to avoid delays due to on-demand
- spawning of the jobs
-
- - An int, giving the exact number of total jobs that are
- spawned
-
- - A string, giving an expression as a function of n_jobs,
- as in '2*n_jobs'
-
- iid : boolean, default=True
- If True, the data is assumed to be identically distributed across
- the folds, and the loss minimized is the total loss per sample,
- and not the mean loss across the folds.
-
- cv : int, cross-validation generator or an iterable, optional
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
-
- - None, to use the default 3-fold cross-validation,
- - integer, to specify the number of folds.
- - An object to be used as a cross-validation generator.
- - An iterable yielding train/test splits.
-
- For integer/None inputs, if the estimator is a classifier and ``y`` is
- either binary or multiclass,
- :class:`sklearn.model_selection.StratifiedKFold` is used. In all
- other cases, :class:`sklearn.model_selection.KFold` is used.
-
- Refer :ref:`User Guide ` for the various
- cross-validation strategies that can be used here.
-
- refit : boolean, default=True
- Refit the best estimator with the entire dataset.
- If "False", it is impossible to make predictions using
- this RandomizedSearchCV instance after fitting.
-
- verbose : integer
- Controls the verbosity: the higher, the more messages.
-
- random_state : int, RandomState instance or None, optional, default=None
- Pseudo random number generator state used for random uniform sampling
- from lists of possible values instead of scipy.stats distributions.
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- error_score : 'raise' (default) or numeric
- Value to assign to the score if an error occurs in estimator fitting.
- If set to 'raise', the error is raised. If a numeric value is given,
- FitFailedWarning is raised. This parameter does not affect the refit
- step, which will always raise the error.
-
-
- Attributes
- ----------
- grid_scores_ : list of named tuples
- Contains scores for all parameter combinations in param_grid.
- Each entry corresponds to one parameter setting.
- Each named tuple has the attributes:
-
- * ``parameters``, a dict of parameter settings
- * ``mean_validation_score``, the mean score over the
- cross-validation folds
- * ``cv_validation_scores``, the list of scores for each fold
-
- best_estimator_ : estimator
- Estimator that was chosen by the search, i.e. the estimator
- which gave the highest score (or smallest loss if specified)
- on the left-out data. Not available if refit=False.
-
- best_score_ : float
- Score of best_estimator on the left out data.
-
- best_params_ : dict
- Parameter setting that gave the best results on the hold out data.
-
- Notes
- -----
- The parameters selected are those that maximize the score of the held-out
- data, according to the scoring parameter.
-
- If `n_jobs` was set to a value higher than one, the data is copied for each
- parameter setting (and not `n_jobs` times). This is done for efficiency
- reasons if individual jobs take very little time, but may raise errors if
- the dataset is large and not enough memory is available. A workaround in
- this case is to set `pre_dispatch`. Then, the memory is copied only
- `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
- n_jobs`.
-
- See Also
- --------
- :class:`GridSearchCV`:
- Does exhaustive search over a grid of parameters.
-
- :class:`ParameterSampler`:
- A generator over parameter settings, constructed from
- param_distributions.
-
- """
-
- def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
- fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
- verbose=0, pre_dispatch='2*n_jobs', random_state=None,
- error_score='raise'):
-
- self.param_distributions = param_distributions
- self.n_iter = n_iter
- self.random_state = random_state
- super(RandomizedSearchCV, self).__init__(
- estimator=estimator, scoring=scoring, fit_params=fit_params,
- n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
- pre_dispatch=pre_dispatch, error_score=error_score)
-
- def fit(self, X, y=None):
- """Run fit on the estimator with randomly drawn parameters.
-
- Parameters
- ----------
- X : array-like, shape = [n_samples, n_features]
- Training vector, where n_samples is the number of samples and
- n_features is the number of features.
-
- y : array-like, shape = [n_samples] or [n_samples, n_output], optional
- Target relative to X for classification or regression;
- None for unsupervised learning.
-
- """
- sampled_params = ParameterSampler(self.param_distributions,
- self.n_iter,
- random_state=self.random_state)
- return self._fit(X, y, sampled_params)
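And the model_selection counterpart of the class removed above (a minimal sketch; the distribution and n_iter are arbitrary):

    from scipy.stats import expon
    from sklearn.datasets import load_iris
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.svm import SVC

    iris = load_iris()
    search = RandomizedSearchCV(SVC(), {'C': expon(scale=10)},
                                n_iter=5, random_state=0)
    search.fit(iris.data, iris.target)
    print(search.best_params_)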
diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py
deleted file mode 100644
index 0bb24046680ec..0000000000000
--- a/sklearn/learning_curve.py
+++ /dev/null
@@ -1,361 +0,0 @@
-"""Utilities to evaluate models with respect to a variable
-"""
-# Author: Alexander Fabisch
-#
-# License: BSD 3 clause
-
-import warnings
-
-import numpy as np
-
-from .base import is_classifier, clone
-from .model_selection import check_cv
-from .externals.joblib import Parallel, delayed
-from .utils.metaestimators import _safe_split
-from .model_selection._validation import _fit_and_score, _score
-from .metrics.scorer import check_scoring
-from .utils import indexable
-
-
-warnings.warn("This module was deprecated in version 0.18 in favor of the "
- "model_selection module into which all the functions are moved."
- " This module will be removed in 0.20",
- DeprecationWarning)
-
-
-__all__ = ['learning_curve', 'validation_curve']
-
-
-def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5),
- cv=None, scoring=None, exploit_incremental_learning=False,
- n_jobs=1, pre_dispatch="all", verbose=0,
- error_score='raise'):
- """Learning curve.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :func:`sklearn.model_selection.learning_curve` instead.
-
- Determines cross-validated training and test scores for different training
- set sizes.
-
- A cross-validation generator splits the whole dataset k times in training
- and test data. Subsets of the training set with varying sizes will be used
- to train the estimator and a score for each training subset size and the
- test set will be computed. Afterwards, the scores will be averaged over
- all k runs for each training subset size.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- estimator : object type that implements the "fit" and "predict" methods
- An object of that type which is cloned for each validation.
-
- X : array-like, shape (n_samples, n_features)
- Training vector, where n_samples is the number of samples and
- n_features is the number of features.
-
- y : array-like, shape (n_samples) or (n_samples, n_features), optional
- Target relative to X for classification or regression;
- None for unsupervised learning.
-
- train_sizes : array-like, shape (n_ticks,), dtype float or int
- Relative or absolute numbers of training examples that will be used to
- generate the learning curve. If the dtype is float, it is regarded as a
- fraction of the maximum size of the training set (that is determined
- by the selected validation method), i.e. it has to be within (0, 1].
- Otherwise it is interpreted as absolute sizes of the training sets.
- Note that for classification the number of samples usually has to
- be big enough to contain at least one sample from each class.
- (default: np.linspace(0.1, 1.0, 5))
-
- cv : int, cross-validation generator or an iterable, optional
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
-
- - None, to use the default 3-fold cross-validation,
- - integer, to specify the number of folds.
- - An object to be used as a cross-validation generator.
- - An iterable yielding train/test splits.
-
- For integer/None inputs, if the estimator is a classifier and ``y`` is
- either binary or multiclass,
- :class:`sklearn.model_selection.StratifiedKFold` is used. In all
- other cases, :class:`sklearn.model_selection.KFold` is used.
-
- Refer :ref:`User Guide ` for the various
- cross-validation strategies that can be used here.
-
- scoring : string, callable or None, optional, default: None
- A string (see model evaluation documentation) or
- a scorer callable object / function with signature
- ``scorer(estimator, X, y)``.
-
- exploit_incremental_learning : boolean, optional, default: False
- If the estimator supports incremental learning, this will be
- used to speed up fitting for different training set sizes.
-
- n_jobs : integer, optional
- Number of jobs to run in parallel (default 1).
-
- pre_dispatch : integer or string, optional
- Number of predispatched jobs for parallel execution (default is
- all). The option can reduce the allocated memory. The string can
- be an expression like '2*n_jobs'.
-
- verbose : integer, optional
- Controls the verbosity: the higher, the more messages.
-
- error_score : 'raise' (default) or numeric
- Value to assign to the score if an error occurs in estimator fitting.
- If set to 'raise', the error is raised. If a numeric value is given,
- FitFailedWarning is raised. This parameter does not affect the refit
- step, which will always raise the error.
-
- Returns
- -------
- train_sizes_abs : array, shape = (n_unique_ticks,), dtype int
- Numbers of training examples that have been used to generate the
- learning curve. Note that the number of ticks might be less
- than n_ticks because duplicate entries will be removed.
-
- train_scores : array, shape (n_ticks, n_cv_folds)
- Scores on training sets.
-
- test_scores : array, shape (n_ticks, n_cv_folds)
- Scores on test set.
-
- Notes
- -----
- See :ref:`examples/model_selection/plot_learning_curve.py
- `
- """
- if exploit_incremental_learning and not hasattr(estimator, "partial_fit"):
- raise ValueError("An estimator must support the partial_fit interface "
- "to exploit incremental learning")
-
- X, y = indexable(X, y)
- # Make a list since we will be iterating multiple times over the folds
- cv = list(check_cv(cv, X, y, classifier=is_classifier(estimator)))
- scorer = check_scoring(estimator, scoring=scoring)
-
- # HACK as long as boolean indices are allowed in cv generators
- if cv[0][0].dtype == bool:
- new_cv = []
- for i in range(len(cv)):
- new_cv.append((np.nonzero(cv[i][0])[0], np.nonzero(cv[i][1])[0]))
- cv = new_cv
-
- n_max_training_samples = len(cv[0][0])
- # Because the lengths of folds can be significantly different, it is
- # not guaranteed that we use all of the available training data when we
- # use the first 'n_max_training_samples' samples.
- train_sizes_abs = _translate_train_sizes(train_sizes,
- n_max_training_samples)
- n_unique_ticks = train_sizes_abs.shape[0]
- if verbose > 0:
- print("[learning_curve] Training set sizes: " + str(train_sizes_abs))
-
- parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch,
- verbose=verbose)
- if exploit_incremental_learning:
- classes = np.unique(y) if is_classifier(estimator) else None
- out = parallel(delayed(_incremental_fit_estimator)(
- clone(estimator), X, y, classes, train, test, train_sizes_abs,
- scorer, verbose) for train, test in cv)
- else:
- out = parallel(delayed(_fit_and_score)(
- clone(estimator), X, y, scorer, train[:n_train_samples], test,
- verbose, parameters=None, fit_params=None, return_train_score=True,
- error_score=error_score)
- for train, test in cv for n_train_samples in train_sizes_abs)
- out = np.array(out)[:, :2]
- n_cv_folds = out.shape[0] // n_unique_ticks
- out = out.reshape(n_cv_folds, n_unique_ticks, 2)
-
- out = np.asarray(out).transpose((2, 1, 0))
-
- return train_sizes_abs, out[0], out[1]
-
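The reshape/transpose bookkeeping at the end of learning_curve, in miniature: flat (train_score, test_score) pairs ordered fold-major become two (n_ticks, n_folds) arrays (stand-in numbers):

    import numpy as np

    n_folds, n_ticks = 3, 2
    flat = np.arange(n_folds * n_ticks * 2).reshape(-1, 2)  # fake score pairs
    out = flat.reshape(n_folds, n_ticks, 2).transpose((2, 1, 0))
    train_scores, test_scores = out[0], out[1]
    print(train_scores.shape)  # (2, 3), i.e. (n_ticks, n_folds)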
-
-def _translate_train_sizes(train_sizes, n_max_training_samples):
- """Determine absolute sizes of training subsets and validate 'train_sizes'.
-
- Examples:
- _translate_train_sizes([0.5, 1.0], 10) -> [5, 10]
- _translate_train_sizes([5, 10], 10) -> [5, 10]
-
- Parameters
- ----------
- train_sizes : array-like, shape (n_ticks,), dtype float or int
- Numbers of training examples that will be used to generate the
- learning curve. If the dtype is float, it is regarded as a
- fraction of 'n_max_training_samples', i.e. it has to be within (0, 1].
-
- n_max_training_samples : int
- Maximum number of training samples (upper bound of 'train_sizes').
-
- Returns
- -------
- train_sizes_abs : array, shape (n_unique_ticks,), dtype int
- Numbers of training examples that will be used to generate the
- learning curve. Note that the number of ticks might be less
- than n_ticks because duplicate entries will be removed.
- """
- train_sizes_abs = np.asarray(train_sizes)
- n_ticks = train_sizes_abs.shape[0]
- n_min_required_samples = np.min(train_sizes_abs)
- n_max_required_samples = np.max(train_sizes_abs)
- if np.issubdtype(train_sizes_abs.dtype, np.floating):
- if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0:
- raise ValueError("train_sizes has been interpreted as fractions "
- "of the maximum number of training samples and "
- "must be within (0, 1], but is within [%f, %f]."
- % (n_min_required_samples,
- n_max_required_samples))
- train_sizes_abs = (train_sizes_abs * n_max_training_samples).astype(
- dtype=np.int, copy=False)
- train_sizes_abs = np.clip(train_sizes_abs, 1,
- n_max_training_samples)
- else:
- if (n_min_required_samples <= 0 or
- n_max_required_samples > n_max_training_samples):
- raise ValueError("train_sizes has been interpreted as absolute "
- "numbers of training samples and must be within "
- "(0, %d], but is within [%d, %d]."
- % (n_max_training_samples,
- n_min_required_samples,
- n_max_required_samples))
-
- train_sizes_abs = np.unique(train_sizes_abs)
- if n_ticks > train_sizes_abs.shape[0]:
- warnings.warn("Removed duplicate entries from 'train_sizes'. Number "
- "of ticks will be less than the size of "
- "'train_sizes' %d instead of %d)."
- % (train_sizes_abs.shape[0], n_ticks), RuntimeWarning)
-
- return train_sizes_abs
-
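What _translate_train_sizes does, by example: floats in (0, 1] scale against the maximum training size, integers pass through, and duplicates are dropped (a standalone sketch):

    import numpy as np

    train_sizes, n_max = np.asarray([0.5, 1.0]), 10
    abs_sizes = np.unique(np.clip((train_sizes * n_max).astype(int), 1, n_max))
    print(abs_sizes)  # [ 5 10]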
-
-def _incremental_fit_estimator(estimator, X, y, classes, train, test,
- train_sizes, scorer, verbose):
- """Train estimator on training subsets incrementally and compute scores."""
- train_scores, test_scores = [], []
- partitions = zip(train_sizes, np.split(train, train_sizes)[:-1])
- for n_train_samples, partial_train in partitions:
- train_subset = train[:n_train_samples]
- X_train, y_train = _safe_split(estimator, X, y, train_subset)
- X_partial_train, y_partial_train = _safe_split(estimator, X, y,
- partial_train)
- X_test, y_test = _safe_split(estimator, X, y, test, train_subset)
- if y_partial_train is None:
- estimator.partial_fit(X_partial_train, classes=classes)
- else:
- estimator.partial_fit(X_partial_train, y_partial_train,
- classes=classes)
- train_scores.append(_score(estimator, X_train, y_train, scorer))
- test_scores.append(_score(estimator, X_test, y_test, scorer))
- return np.array((train_scores, test_scores)).T
-
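The incremental partitioning above relies on np.split cutting the training indices at each tick, so every partial_fit call sees only the newly added samples (illustrative indices):

    import numpy as np

    train = np.arange(10)
    train_sizes = np.array([3, 6, 10])
    partitions = list(zip(train_sizes, np.split(train, train_sizes)[:-1]))
    # [(3, [0 1 2]), (6, [3 4 5]), (10, [6 7 8 9])]
    print(partitions)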
-
-def validation_curve(estimator, X, y, param_name, param_range, cv=None,
- scoring=None, n_jobs=1, pre_dispatch="all", verbose=0):
- """Validation curve.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :func:`sklearn.model_selection.validation_curve` instead.
-
- Determine training and test scores for varying parameter values.
-
- Compute scores for an estimator with different values of a specified
- parameter. This is similar to grid search with one parameter. However, this
- will also compute training scores and is merely a utility for plotting the
- results.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- estimator : object type that implements the "fit" and "predict" methods
- An object of that type which is cloned for each validation.
-
- X : array-like, shape (n_samples, n_features)
- Training vector, where n_samples is the number of samples and
- n_features is the number of features.
-
- y : array-like, shape (n_samples) or (n_samples, n_features), optional
- Target relative to X for classification or regression;
- None for unsupervised learning.
-
- param_name : string
- Name of the parameter that will be varied.
-
- param_range : array-like, shape (n_values,)
- The values of the parameter that will be evaluated.
-
- cv : int, cross-validation generator or an iterable, optional
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
-
- - None, to use the default 3-fold cross-validation,
- - integer, to specify the number of folds.
- - An object to be used as a cross-validation generator.
- - An iterable yielding train/test splits.
-
- For integer/None inputs, if the estimator is a classifier and ``y`` is
- either binary or multiclass,
- :class:`sklearn.model_selection.StratifiedKFold` is used. In all
- other cases, :class:`sklearn.model_selection.KFold` is used.
-
- Refer :ref:`User Guide ` for the various
- cross-validation strategies that can be used here.
-
- scoring : string, callable or None, optional, default: None
- A string (see model evaluation documentation) or
- a scorer callable object / function with signature
- ``scorer(estimator, X, y)``.
-
- n_jobs : integer, optional
- Number of jobs to run in parallel (default 1).
-
- pre_dispatch : integer or string, optional
- Number of predispatched jobs for parallel execution (default is
- all). The option can reduce the allocated memory. The string can
- be an expression like '2*n_jobs'.
-
- verbose : integer, optional
- Controls the verbosity: the higher, the more messages.
-
- Returns
- -------
- train_scores : array, shape (n_ticks, n_cv_folds)
- Scores on training sets.
-
- test_scores : array, shape (n_ticks, n_cv_folds)
- Scores on test set.
-
- Notes
- -----
- See
- :ref:`examples/model_selection/plot_validation_curve.py
- `
- """
- X, y = indexable(X, y)
- cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
- scorer = check_scoring(estimator, scoring=scoring)
-
- parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch,
- verbose=verbose)
- out = parallel(delayed(_fit_and_score)(
- clone(estimator), X, y, scorer, train, test, verbose,
- parameters={param_name: v}, fit_params=None, return_train_score=True)
- for train, test in cv for v in param_range)
-
- out = np.asarray(out)[:, :2]
- n_params = len(param_range)
- n_cv_folds = out.shape[0] // n_params
- out = out.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0))
-
- return out[0], out[1]
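A hedged usage sketch of the model_selection.validation_curve replacement (synthetic regression data, arbitrary alpha grid):

    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import validation_curve

    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    y = X.dot([1.0, 2.0, 0.5]) + 0.1 * rng.randn(50)
    train_scores, test_scores = validation_curve(
        Ridge(), X, y, param_name='alpha',
        param_range=[0.1, 1.0, 10.0], cv=3)
    print(train_scores.shape)  # (3, 3), i.e. (n_params, n_folds)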
diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py
deleted file mode 100644
index 3605da1613e13..0000000000000
--- a/sklearn/tests/test_grid_search.py
+++ /dev/null
@@ -1,815 +0,0 @@
-"""
-Testing for grid search module (sklearn.grid_search)
-
-"""
-
-from collections import Iterable, Sized
-from sklearn.externals.six.moves import cStringIO as StringIO
-from sklearn.externals.six.moves import xrange
-from itertools import chain, product
-import pickle
-import warnings
-import sys
-
-import numpy as np
-import scipy.sparse as sp
-
-from sklearn.utils.testing import assert_equal
-from sklearn.utils.testing import assert_not_equal
-from sklearn.utils.testing import assert_raises
-from sklearn.utils.testing import assert_warns
-from sklearn.utils.testing import assert_raise_message
-from sklearn.utils.testing import assert_false, assert_true
-from sklearn.utils.testing import assert_array_equal
-from sklearn.utils.testing import assert_almost_equal
-from sklearn.utils.testing import assert_array_almost_equal
-from sklearn.utils.testing import assert_no_warnings
-from sklearn.utils.testing import ignore_warnings
-from sklearn.utils.mocking import CheckingClassifier, MockDataFrame
-
-from scipy.stats import bernoulli, expon, uniform
-
-from sklearn.externals.six.moves import zip
-from sklearn.base import BaseEstimator
-from sklearn.datasets import make_classification
-from sklearn.datasets import make_blobs
-from sklearn.datasets import make_multilabel_classification
-from sklearn.svm import LinearSVC, SVC
-from sklearn.tree import DecisionTreeRegressor
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.cluster import KMeans
-from sklearn.neighbors import KernelDensity
-from sklearn.metrics import f1_score
-from sklearn.metrics import make_scorer
-from sklearn.metrics import roc_auc_score
-from sklearn.linear_model import Ridge
-
-from sklearn.exceptions import FitFailedWarning
-from sklearn.model_selection import KFold, StratifiedKFold
-
-with warnings.catch_warnings():
- warnings.simplefilter('ignore')
- from sklearn.grid_search import (GridSearchCV, RandomizedSearchCV,
- ParameterGrid, ParameterSampler)
-
-from sklearn.preprocessing import Imputer
-from sklearn.pipeline import Pipeline
-
-
-# Neither of the following two estimators inherits from BaseEstimator,
-# to test hyperparameter search on user-defined classifiers.
-class MockClassifier(object):
- """Dummy classifier to test the cross-validation"""
- def __init__(self, foo_param=0):
- self.foo_param = foo_param
-
- def fit(self, X, Y):
- assert_true(len(X) == len(Y))
- return self
-
- def predict(self, T):
- return T.shape[0]
-
- def transform(self, X):
- return X - self.foo_param
-
- def inverse_transform(self, X):
- return X + self.foo_param
-
- predict_proba = predict
- decision_function = predict
-
- def score(self, X=None, Y=None):
- if self.foo_param > 1:
- score = 1.
- else:
- score = 0.
- return score
-
- def get_params(self, deep=False):
- return {'foo_param': self.foo_param}
-
- def set_params(self, **params):
- self.foo_param = params['foo_param']
- return self
-
-
-class LinearSVCNoScore(LinearSVC):
- """An LinearSVC classifier that has no score method."""
- @property
- def score(self):
- raise AttributeError
-
-X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
-y = np.array([1, 1, 2, 2])
-
-
-def assert_grid_iter_equals_getitem(grid):
- assert_equal(list(grid), [grid[i] for i in range(len(grid))])
-
-
-def test_parameter_grid():
- # Test basic properties of ParameterGrid.
- params1 = {"foo": [1, 2, 3]}
- grid1 = ParameterGrid(params1)
- assert_true(isinstance(grid1, Iterable))
- assert_true(isinstance(grid1, Sized))
- assert_equal(len(grid1), 3)
- assert_grid_iter_equals_getitem(grid1)
-
- params2 = {"foo": [4, 2],
- "bar": ["ham", "spam", "eggs"]}
- grid2 = ParameterGrid(params2)
- assert_equal(len(grid2), 6)
-
- # loop to assert we can iterate over the grid multiple times
- for i in xrange(2):
- # tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2)
- points = set(tuple(chain(*(sorted(p.items())))) for p in grid2)
- assert_equal(points,
- set(("bar", x, "foo", y)
- for x, y in product(params2["bar"], params2["foo"])))
-
- assert_grid_iter_equals_getitem(grid2)
-
- # Special case: empty grid (useful to get default estimator settings)
- empty = ParameterGrid({})
- assert_equal(len(empty), 1)
- assert_equal(list(empty), [{}])
- assert_grid_iter_equals_getitem(empty)
- assert_raises(IndexError, lambda: empty[1])
-
- has_empty = ParameterGrid([{'C': [1, 10]}, {}, {'C': [.5]}])
- assert_equal(len(has_empty), 4)
- assert_equal(list(has_empty), [{'C': 1}, {'C': 10}, {}, {'C': .5}])
- assert_grid_iter_equals_getitem(has_empty)
-
-
-def test_grid_search():
- # Test that the best estimator contains the right value for foo_param
- clf = MockClassifier()
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3)
- # make sure it selects the smallest parameter in case of ties
- old_stdout = sys.stdout
- sys.stdout = StringIO()
- grid_search.fit(X, y)
- sys.stdout = old_stdout
- assert_equal(grid_search.best_estimator_.foo_param, 2)
-
- for i, foo_i in enumerate([1, 2, 3]):
- assert_true(grid_search.grid_scores_[i][0]
- == {'foo_param': foo_i})
- # Smoke test the score etc:
- grid_search.score(X, y)
- grid_search.predict_proba(X)
- grid_search.decision_function(X)
- grid_search.transform(X)
-
- # Test exception handling on scoring
- grid_search.scoring = 'sklearn'
- assert_raises(ValueError, grid_search.fit, X, y)
-
-
-def test_transform_inverse_transform_round_trip():
- clf = MockClassifier()
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3)
- grid_search.fit(X, y)
- X_round_trip = grid_search.inverse_transform(grid_search.transform(X))
- assert_array_equal(X, X_round_trip)
-
-
-@ignore_warnings
-def test_grid_search_no_score():
- # Test grid-search on classifier that has no score function.
- clf = LinearSVC(random_state=0)
- X, y = make_blobs(random_state=0, centers=2)
- Cs = [.1, 1, 10]
- clf_no_score = LinearSVCNoScore(random_state=0)
- grid_search = GridSearchCV(clf, {'C': Cs}, scoring='accuracy')
- grid_search.fit(X, y)
-
- grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs},
- scoring='accuracy')
- # smoketest grid search
- grid_search_no_score.fit(X, y)
-
- # check that best params are equal
- assert_equal(grid_search_no_score.best_params_, grid_search.best_params_)
- # check that we can call score and that it gives the correct result
- assert_equal(grid_search.score(X, y), grid_search_no_score.score(X, y))
-
- # giving no scoring function raises an error
- grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs})
- assert_raise_message(TypeError, "no scoring", grid_search_no_score.fit,
- [[1]])
-
-
-def test_grid_search_score_method():
- X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2,
- random_state=0)
- clf = LinearSVC(random_state=0)
- grid = {'C': [.1]}
-
- search_no_scoring = GridSearchCV(clf, grid, scoring=None).fit(X, y)
- search_accuracy = GridSearchCV(clf, grid, scoring='accuracy').fit(X, y)
- search_no_score_method_auc = GridSearchCV(LinearSVCNoScore(), grid,
- scoring='roc_auc').fit(X, y)
- search_auc = GridSearchCV(clf, grid, scoring='roc_auc').fit(X, y)
-
- # ChangedBehaviourWarning occurred previously (prior to #9005)
- score_no_scoring = assert_no_warnings(search_no_scoring.score, X, y)
- score_accuracy = assert_no_warnings(search_accuracy.score, X, y)
- score_no_score_auc = assert_no_warnings(search_no_score_method_auc.score,
- X, y)
- score_auc = assert_no_warnings(search_auc.score, X, y)
-
- # ensure the test is sane
- assert_true(score_auc < 1.0)
- assert_true(score_accuracy < 1.0)
- assert_not_equal(score_auc, score_accuracy)
-
- assert_almost_equal(score_accuracy, score_no_scoring)
- assert_almost_equal(score_auc, score_no_score_auc)
-
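What these assertions pin down: with scoring=None the search falls back to the estimator's own score method (mean accuracy for classifiers), while a named scorer such as 'roc_auc' drives both parameter selection and search.score, even when the estimator itself has no score method. A short sketch of the same contract with the model_selection search (illustrative data, not this test's fixtures):

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, random_state=0)
# scoring=None -> delegates to best_estimator_.score (accuracy here)
acc = GridSearchCV(LinearSVC(random_state=0), {'C': [.1, 1]}).fit(X, y)
# a named scorer is used for both model selection and search.score
auc = GridSearchCV(LinearSVC(random_state=0), {'C': [.1, 1]},
                   scoring='roc_auc').fit(X, y)
print(acc.score(X, y), auc.score(X, y))  # accuracy vs. ROC AUC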
-
-def test_trivial_grid_scores():
- # Test search over a "grid" with only one point.
- # Non-regression test: grid_scores_ wouldn't be set by GridSearchCV.
- clf = MockClassifier()
- grid_search = GridSearchCV(clf, {'foo_param': [1]})
- grid_search.fit(X, y)
- assert_true(hasattr(grid_search, "grid_scores_"))
-
- random_search = RandomizedSearchCV(clf, {'foo_param': [0]}, n_iter=1)
- random_search.fit(X, y)
- assert_true(hasattr(random_search, "grid_scores_"))
-
-
-def test_no_refit():
- # Test that grid search can be used for model selection only
- clf = MockClassifier()
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=False)
- grid_search.fit(X, y)
- assert_true(hasattr(grid_search, "best_params_"))
-
-
-def test_grid_search_error():
- # Test that grid search raises an error when X and y have
- # mismatched lengths
- X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
-
- clf = LinearSVC()
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
- assert_raises(ValueError, cv.fit, X_[:180], y_)
-
-
-def test_grid_search_iid():
- # test the iid parameter
- # noise-free simple 2d-data
- X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]], random_state=0,
- cluster_std=0.1, shuffle=False, n_samples=80)
- # split dataset into two folds that are not iid
- # first one contains data of all 4 blobs, second only from two.
- mask = np.ones(X.shape[0], dtype=np.bool)
- mask[np.where(y == 1)[0][::2]] = 0
- mask[np.where(y == 2)[0][::2]] = 0
- # this leads to perfect classification on one fold and a score of 1/3 on
- # the other
- svm = SVC(kernel='linear')
- # create "cv" for splits
- cv = [[mask, ~mask], [~mask, mask]]
- # once with iid=True (default)
- grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv)
- grid_search.fit(X, y)
- first = grid_search.grid_scores_[0]
- assert_equal(first.parameters['C'], 1)
- assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.])
- # for first split, 1/4 of dataset is in test, for second 3/4.
- # take weighted average
- assert_almost_equal(first.mean_validation_score,
- 1 * 1. / 4. + 1. / 3. * 3. / 4.)
-
- # once with iid=False
- grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv,
- iid=False)
- grid_search.fit(X, y)
- first = grid_search.grid_scores_[0]
- assert_equal(first.parameters['C'], 1)
- # scores are the same as above
- assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.])
- # averaged score is just mean of scores
- assert_almost_equal(first.mean_validation_score,
- np.mean(first.cv_validation_scores))
-
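The expected value in the iid=True branch is simply the test-fold-size weighted mean of the fold scores; the arithmetic behind the assertions above, spelled out in plain Python:

# fold scores and test-fold sizes from the scenario above
scores = [1.0, 1.0 / 3.0]
test_sizes = [20, 60]                   # 1/4 and 3/4 of the 80 samples
iid_mean = sum(s * n for s, n in zip(scores, test_sizes)) / sum(test_sizes)
plain_mean = sum(scores) / len(scores)  # the iid=False behaviour
assert abs(iid_mean - (1. * 1. / 4. + 1. / 3. * 3. / 4.)) < 1e-12
assert abs(plain_mean - 2. / 3.) < 1e-12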
-
-def test_grid_search_one_grid_point():
- X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
- param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]}
-
- clf = SVC()
- cv = GridSearchCV(clf, param_dict)
- cv.fit(X_, y_)
-
- clf = SVC(C=1.0, kernel="rbf", gamma=0.1)
- clf.fit(X_, y_)
-
- assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_)
-
-
-def test_grid_search_bad_param_grid():
- param_dict = {"C": 1.0}
- clf = SVC()
- assert_raises(ValueError, GridSearchCV, clf, param_dict)
-
- param_dict = {"C": []}
- clf = SVC()
- assert_raises(ValueError, GridSearchCV, clf, param_dict)
-
- param_dict = {"C": np.ones(6).reshape(3, 2)}
- clf = SVC()
- assert_raises(ValueError, GridSearchCV, clf, param_dict)
-
-
-def test_grid_search_sparse():
- # Test that grid search works with both dense and sparse matrices
- X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
-
- clf = LinearSVC()
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
- cv.fit(X_[:180], y_[:180])
- y_pred = cv.predict(X_[180:])
- C = cv.best_estimator_.C
-
- X_ = sp.csr_matrix(X_)
- clf = LinearSVC()
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
- cv.fit(X_[:180].tocoo(), y_[:180])
- y_pred2 = cv.predict(X_[180:])
- C2 = cv.best_estimator_.C
-
- assert_true(np.mean(y_pred == y_pred2) >= .9)
- assert_equal(C, C2)
-
-
-def test_grid_search_sparse_scoring():
- X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
-
- clf = LinearSVC()
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
- cv.fit(X_[:180], y_[:180])
- y_pred = cv.predict(X_[180:])
- C = cv.best_estimator_.C
-
- X_ = sp.csr_matrix(X_)
- clf = LinearSVC()
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
- cv.fit(X_[:180], y_[:180])
- y_pred2 = cv.predict(X_[180:])
- C2 = cv.best_estimator_.C
-
- assert_array_equal(y_pred, y_pred2)
- assert_equal(C, C2)
- # Smoke test the score
- # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]),
- # cv.score(X_[:180], y[:180]))
-
- # test loss where greater is worse
- def f1_loss(y_true_, y_pred_):
- return -f1_score(y_true_, y_pred_)
- F1Loss = make_scorer(f1_loss, greater_is_better=False)
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss)
- cv.fit(X_[:180], y_[:180])
- y_pred3 = cv.predict(X_[180:])
- C3 = cv.best_estimator_.C
-
- assert_equal(C, C3)
- assert_array_equal(y_pred, y_pred3)
-
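The f1_loss round trip works because make_scorer(..., greater_is_better=False) negates the metric internally, so the search can always maximize; a loss written as smaller-is-better therefore selects the same model as the corresponding score. A minimal sketch of that sign convention:

from sklearn.metrics import f1_score, make_scorer

def f1_loss(y_true, y_pred):
    return -f1_score(y_true, y_pred)   # a loss: smaller is better

# the scorer flips the sign, so grid search keeps maximizing
neg_f1 = make_scorer(f1_loss, greater_is_better=False)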
-
-def test_grid_search_precomputed_kernel():
- # Test that grid search works when the input features are given in the
- # form of a precomputed kernel matrix
- X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
-
- # compute the training kernel matrix corresponding to the linear kernel
- K_train = np.dot(X_[:180], X_[:180].T)
- y_train = y_[:180]
-
- clf = SVC(kernel='precomputed')
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
- cv.fit(K_train, y_train)
-
- assert_true(cv.best_score_ >= 0)
-
- # compute the test kernel matrix
- K_test = np.dot(X_[180:], X_[:180].T)
- y_test = y_[180:]
-
- y_pred = cv.predict(K_test)
-
- assert_true(np.mean(y_pred == y_test) >= 0)
-
- # test error is raised when the precomputed kernel is not array-like
- # or sparse
- assert_raises(ValueError, cv.fit, K_train.tolist(), y_train)
-
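The shapes are the part worth remembering for kernel='precomputed': fit consumes an (n_train, n_train) Gram matrix, and predict consumes kernel evaluations of the test points against the training set, shape (n_test, n_train). A sketch on hypothetical random data, not this test's fixtures:

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X_train, X_test = rng.randn(180, 5), rng.randn(20, 5)
y_train = rng.randint(0, 2, 180)

K_train = np.dot(X_train, X_train.T)   # (n_train, n_train)
K_test = np.dot(X_test, X_train.T)     # (n_test, n_train), vs. training set

search = GridSearchCV(SVC(kernel='precomputed'), {'C': [0.1, 1.0]})
y_pred = search.fit(K_train, y_train).predict(K_test)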
-
-def test_grid_search_precomputed_kernel_error_nonsquare():
- # Test that grid search returns an error with a non-square precomputed
- # training kernel matrix
- K_train = np.zeros((10, 20))
- y_train = np.ones((10, ))
- clf = SVC(kernel='precomputed')
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
- assert_raises(ValueError, cv.fit, K_train, y_train)
-
-
-def test_grid_search_precomputed_kernel_error_kernel_function():
- # Test that grid search returns an error when using a kernel_function
- X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
- kernel_function = lambda x1, x2: np.dot(x1, x2.T)
- clf = SVC(kernel=kernel_function)
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
- assert_raises(ValueError, cv.fit, X_, y_)
-
-
-class BrokenClassifier(BaseEstimator):
- """Broken classifier that cannot be fit twice"""
-
- def __init__(self, parameter=None):
- self.parameter = parameter
-
- def fit(self, X, y):
- assert_true(not hasattr(self, 'has_been_fit_'))
- self.has_been_fit_ = True
-
- def predict(self, X):
- return np.zeros(X.shape[0])
-
-
-@ignore_warnings
-def test_refit():
- # Regression test for bug in refitting
- # Simulates re-fitting a broken estimator; this used to break with
- # sparse SVMs.
- X = np.arange(100).reshape(10, 10)
- y = np.array([0] * 5 + [1] * 5)
-
- clf = GridSearchCV(BrokenClassifier(), [{'parameter': [0, 1]}],
- scoring="precision", refit=True)
- clf.fit(X, y)
-
-
-def test_gridsearch_nd():
- # Pass X as list in GridSearchCV
- X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2)
- y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11)
- check_X = lambda x: x.shape[1:] == (5, 3, 2)
- check_y = lambda x: x.shape[1:] == (7, 11)
- clf = CheckingClassifier(check_X=check_X, check_y=check_y)
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]})
- grid_search.fit(X_4d, y_3d).score(X, y)
- assert_true(hasattr(grid_search, "grid_scores_"))
-
-
-def test_X_as_list():
- # Pass X as list in GridSearchCV
- X = np.arange(100).reshape(10, 10)
- y = np.array([0] * 5 + [1] * 5)
-
- clf = CheckingClassifier(check_X=lambda x: isinstance(x, list))
- cv = KFold(n=len(X), n_folds=3)
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv)
- grid_search.fit(X.tolist(), y).score(X, y)
- assert_true(hasattr(grid_search, "grid_scores_"))
-
-
-def test_y_as_list():
- # Pass y as list in GridSearchCV
- X = np.arange(100).reshape(10, 10)
- y = np.array([0] * 5 + [1] * 5)
-
- clf = CheckingClassifier(check_y=lambda x: isinstance(x, list))
- cv = KFold(n=len(X), n_folds=3)
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv)
- grid_search.fit(X, y.tolist()).score(X, y)
- assert_true(hasattr(grid_search, "grid_scores_"))
-
-
-def test_pandas_input():
- # check cross_val_score doesn't destroy pandas dataframe
- types = [(MockDataFrame, MockDataFrame)]
- try:
- from pandas import Series, DataFrame
- types.append((DataFrame, Series))
- except ImportError:
- pass
-
- X = np.arange(100).reshape(10, 10)
- y = np.array([0] * 5 + [1] * 5)
-
- for InputFeatureType, TargetType in types:
- # X dataframe, y series
- X_df, y_ser = InputFeatureType(X), TargetType(y)
- check_df = lambda x: isinstance(x, InputFeatureType)
- check_series = lambda x: isinstance(x, TargetType)
- clf = CheckingClassifier(check_X=check_df, check_y=check_series)
-
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]})
- grid_search.fit(X_df, y_ser).score(X_df, y_ser)
- grid_search.predict(X_df)
- assert_true(hasattr(grid_search, "grid_scores_"))
-
-
-def test_unsupervised_grid_search():
- # test grid-search with unsupervised estimator
- X, y = make_blobs(random_state=0)
- km = KMeans(random_state=0)
- grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]),
- scoring='adjusted_rand_score')
- grid_search.fit(X, y)
- # ARI can find the right number :)
- assert_equal(grid_search.best_params_["n_clusters"], 3)
-
- # Now without a score, and without y
- grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]))
- grid_search.fit(X)
- assert_equal(grid_search.best_params_["n_clusters"], 4)
-
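For unsupervised estimators the split of responsibilities is: y, when given, feeds only the (supervised) scorer and never the estimator's fit; without a scorer and without y, the estimator's own score decides, which for KMeans (negative inertia) monotonically prefers more clusters. A sketch of the first case, assuming the model_selection search:

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.model_selection import GridSearchCV

X, y = make_blobs(random_state=0)          # three blobs by default
search = GridSearchCV(KMeans(random_state=0), {'n_clusters': [2, 3, 4]},
                      scoring='adjusted_rand_score').fit(X, y)
print(search.best_params_)                 # ARI recovers n_clusters=3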
-
-def test_gridsearch_no_predict():
- # test grid-search with an estimator without predict.
- # slight duplication of a test from KDE
- def custom_scoring(estimator, X):
- return 42 if estimator.bandwidth == .1 else 0
- X, _ = make_blobs(cluster_std=.1, random_state=1,
- centers=[[0, 1], [1, 0], [0, 0]])
- search = GridSearchCV(KernelDensity(),
- param_grid=dict(bandwidth=[.01, .1, 1]),
- scoring=custom_scoring)
- search.fit(X)
- assert_equal(search.best_params_['bandwidth'], .1)
- assert_equal(search.best_score_, 42)
-
-
-def test_param_sampler():
- # test basic properties of param sampler
- param_distributions = {"kernel": ["rbf", "linear"],
- "C": uniform(0, 1)}
- sampler = ParameterSampler(param_distributions=param_distributions,
- n_iter=10, random_state=0)
- samples = [x for x in sampler]
- assert_equal(len(samples), 10)
- for sample in samples:
- assert_true(sample["kernel"] in ["rbf", "linear"])
- assert_true(0 <= sample["C"] <= 1)
-
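Anything exposing an rvs method works as a distribution here, while plain lists are sampled uniformly; the same contract holds for sklearn.model_selection.ParameterSampler. A sketch:

from scipy.stats import uniform
from sklearn.model_selection import ParameterSampler

sampler = ParameterSampler({'kernel': ['rbf', 'linear'], 'C': uniform(0, 1)},
                           n_iter=10, random_state=0)
for params in sampler:
    assert params['kernel'] in ('rbf', 'linear')
    assert 0 <= params['C'] <= 1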
-
-def test_randomized_search_grid_scores():
- # Make a dataset with a lot of noise to get various kinds of prediction
- # errors across CV folds and parameter settings
- X, y = make_classification(n_samples=200, n_features=100, n_informative=3,
- random_state=0)
-
- # XXX: as of today (scipy 0.12) it's not possible to set the random seed
- # of scipy.stats distributions: the assertions in this test should thus
- # not depend on the randomization
- params = dict(C=expon(scale=10),
- gamma=expon(scale=0.1))
- n_cv_iter = 3
- n_search_iter = 30
- search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter,
- param_distributions=params, iid=False)
- search.fit(X, y)
- assert_equal(len(search.grid_scores_), n_search_iter)
-
- # Check consistency of the structure of each cv_score item
- for cv_score in search.grid_scores_:
- assert_equal(len(cv_score.cv_validation_scores), n_cv_iter)
- # Because we set iid to False, the mean_validation score is the
- # mean of the fold mean scores instead of the aggregate sample-wise
- # mean score
- assert_almost_equal(np.mean(cv_score.cv_validation_scores),
- cv_score.mean_validation_score)
- assert_equal(list(sorted(cv_score.parameters.keys())),
- list(sorted(params.keys())))
-
- # Check the consistency with the best_score_ and best_params_ attributes
- sorted_grid_scores = list(sorted(search.grid_scores_,
- key=lambda x: x.mean_validation_score))
- best_score = sorted_grid_scores[-1].mean_validation_score
- assert_equal(search.best_score_, best_score)
-
- tied_best_params = [s.parameters for s in sorted_grid_scores
- if s.mean_validation_score == best_score]
- assert_true(search.best_params_ in tied_best_params,
- "best_params_={0} is not part of the"
- " tied best models: {1}".format(
- search.best_params_, tied_best_params))
-
-
-def test_grid_search_score_consistency():
- # test that correct scores are used
- clf = LinearSVC(random_state=0)
- X, y = make_blobs(random_state=0, centers=2)
- Cs = [.1, 1, 10]
- for score in ['f1', 'roc_auc']:
- grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score)
- grid_search.fit(X, y)
- cv = StratifiedKFold(n_folds=3, y=y)
- for C, scores in zip(Cs, grid_search.grid_scores_):
- clf.set_params(C=C)
- scores = scores[2] # get the separate runs from grid scores
- i = 0
- for train, test in cv:
- clf.fit(X[train], y[train])
- if score == "f1":
- correct_score = f1_score(y[test], clf.predict(X[test]))
- elif score == "roc_auc":
- dec = clf.decision_function(X[test])
- correct_score = roc_auc_score(y[test], dec)
- assert_almost_equal(correct_score, scores[i])
- i += 1
-
-
-def test_pickle():
- # Test that a fit search can be pickled
- clf = MockClassifier()
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True)
- grid_search.fit(X, y)
- pickle.dumps(grid_search) # smoke test
-
- random_search = RandomizedSearchCV(clf, {'foo_param': [1, 2, 3]},
- refit=True, n_iter=3)
- random_search.fit(X, y)
- pickle.dumps(random_search) # smoke test
-
-
-def test_grid_search_with_multioutput_data():
- # Test search with multi-output estimator
-
- X, y = make_multilabel_classification(random_state=0)
-
- est_parameters = {"max_depth": [1, 2, 3, 4]}
- cv = KFold(y.shape[0], random_state=0)
-
- estimators = [DecisionTreeRegressor(random_state=0),
- DecisionTreeClassifier(random_state=0)]
-
- # Test with grid search cv
- for est in estimators:
- grid_search = GridSearchCV(est, est_parameters, cv=cv)
- grid_search.fit(X, y)
- for parameters, _, cv_validation_scores in grid_search.grid_scores_:
- est.set_params(**parameters)
-
- for i, (train, test) in enumerate(cv):
- est.fit(X[train], y[train])
- correct_score = est.score(X[test], y[test])
- assert_almost_equal(correct_score,
- cv_validation_scores[i])
-
- # Test with a randomized search
- for est in estimators:
- random_search = RandomizedSearchCV(est, est_parameters,
- cv=cv, n_iter=3)
- random_search.fit(X, y)
- for parameters, _, cv_validation_scores in random_search.grid_scores_:
- est.set_params(**parameters)
-
- for i, (train, test) in enumerate(cv):
- est.fit(X[train], y[train])
- correct_score = est.score(X[test], y[test])
- assert_almost_equal(correct_score,
- cv_validation_scores[i])
-
-
-def test_predict_proba_disabled():
- # Test predict_proba when disabled on estimator.
- X = np.arange(20).reshape(5, -1)
- y = [0, 0, 1, 1, 1]
- clf = SVC(probability=False)
- gs = GridSearchCV(clf, {}, cv=2).fit(X, y)
- assert_false(hasattr(gs, "predict_proba"))
-
-
-def test_grid_search_allows_nans():
- # Test GridSearchCV with Imputer
- X = np.arange(20, dtype=np.float64).reshape(5, -1)
- X[2, :] = np.nan
- y = [0, 0, 1, 1, 1]
- p = Pipeline([
- ('imputer', Imputer(strategy='mean', missing_values='NaN')),
- ('classifier', MockClassifier()),
- ])
- GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
-
-
-class FailingClassifier(BaseEstimator):
- """Classifier that raises a ValueError on fit()"""
-
- FAILING_PARAMETER = 2
-
- def __init__(self, parameter=None):
- self.parameter = parameter
-
- def fit(self, X, y=None):
- if self.parameter == FailingClassifier.FAILING_PARAMETER:
- raise ValueError("Failing classifier failed as required")
-
- def predict(self, X):
- return np.zeros(X.shape[0])
-
-
-def test_grid_search_failing_classifier():
- # GridSearchCV with error_score != 'raise'
- # Ensures that a warning is raised and score reset where appropriate.
-
- X, y = make_classification(n_samples=20, n_features=10, random_state=0)
-
- clf = FailingClassifier()
-
- # refit=False because we only want to check that errors caused by fits
- # to individual folds will be caught and warnings raised instead. If
- # refit was done, then an exception would be raised on refit and not
- # caught by grid_search (expected behavior), and this would cause an
- # error in this test.
- gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy',
- refit=False, error_score=0.0)
-
- assert_warns(FitFailedWarning, gs.fit, X, y)
-
- # Ensure that grid scores were set to zero as required for those fits
- # that are expected to fail.
- assert all(np.all(this_point.cv_validation_scores == 0.0)
- for this_point in gs.grid_scores_
- if this_point.parameters['parameter'] ==
- FailingClassifier.FAILING_PARAMETER)
-
- gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy',
- refit=False, error_score=float('nan'))
- assert_warns(FitFailedWarning, gs.fit, X, y)
- assert all(np.all(np.isnan(this_point.cv_validation_scores))
- for this_point in gs.grid_scores_
- if this_point.parameters['parameter'] ==
- FailingClassifier.FAILING_PARAMETER)
-
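The error_score contract exercised above: any numeric value (including NaN) is recorded as the score of a failed fit and a FitFailedWarning is emitted, so one bad parameter combination cannot abort the whole search; only error_score='raise' lets the exception propagate. A sketch reusing the FailingClassifier defined above, with the model_selection search:

import numpy as np
from sklearn.model_selection import GridSearchCV

# a numeric error_score records failed fits (here as NaN) and emits a
# FitFailedWarning instead of aborting the search
gs = GridSearchCV(FailingClassifier(), {'parameter': [0, 1, 2]},
                  scoring='accuracy', refit=False, error_score=np.nan)
# error_score='raise' would let the ValueError from fit propagate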
-
-def test_grid_search_failing_classifier_raise():
- # GridSearchCV with error_score == 'raise' propagates the error
-
- X, y = make_classification(n_samples=20, n_features=10, random_state=0)
-
- clf = FailingClassifier()
-
- # refit=False because we want to test the behaviour of the grid search part
- gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy',
- refit=False, error_score='raise')
-
- # FailingClassifier issues a ValueError so this is what we look for.
- assert_raises(ValueError, gs.fit, X, y)
-
-
-def test_parameters_sampler_replacement():
- # raise error if n_iter too large
- params = {'first': [0, 1], 'second': ['a', 'b', 'c']}
- sampler = ParameterSampler(params, n_iter=7)
- assert_raises(ValueError, list, sampler)
- # degenerates to an exhaustive sweep of the grid when n_iter equals
- # the grid size
- sampler = ParameterSampler(params, n_iter=6)
- samples = list(sampler)
- assert_equal(len(samples), 6)
- for values in ParameterGrid(params):
- assert_true(values in samples)
-
- # test sampling without replacement in a large grid
- params = {'a': range(10), 'b': range(10), 'c': range(10)}
- sampler = ParameterSampler(params, n_iter=99, random_state=42)
- samples = list(sampler)
- assert_equal(len(samples), 99)
- hashable_samples = ["a%db%dc%d" % (p['a'], p['b'], p['c'])
- for p in samples]
- assert_equal(len(set(hashable_samples)), 99)
-
- # doesn't go into infinite loops
- params_distribution = {'first': bernoulli(.5), 'second': ['a', 'b', 'c']}
- sampler = ParameterSampler(params_distribution, n_iter=7)
- samples = list(sampler)
- assert_equal(len(samples), 7)
-
-
-def test_classes__property():
- # Test that classes_ property matches best_estimator_.classes_
- X = np.arange(100).reshape(10, 10)
- y = np.array([0] * 5 + [1] * 5)
- Cs = [.1, 1, 10]
-
- grid_search = GridSearchCV(LinearSVC(random_state=0), {'C': Cs})
- grid_search.fit(X, y)
- assert_array_equal(grid_search.best_estimator_.classes_,
- grid_search.classes_)
-
- # Test that regressors do not have a classes_ attribute
- grid_search = GridSearchCV(Ridge(), {'alpha': [1.0, 2.0]})
- grid_search.fit(X, y)
- assert_false(hasattr(grid_search, 'classes_'))
diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py
deleted file mode 100644
index d75e6bc82f6b3..0000000000000
--- a/sklearn/tests/test_learning_curve.py
+++ /dev/null
@@ -1,312 +0,0 @@
-# Author: Alexander Fabisch
-#
-# License: BSD 3 clause
-
-import sys
-from sklearn.externals.six.moves import cStringIO as StringIO
-import numpy as np
-import warnings
-from sklearn.base import BaseEstimator
-from sklearn.utils.testing import assert_raises
-from sklearn.utils.testing import assert_warns
-from sklearn.utils.testing import assert_equal
-from sklearn.utils.testing import assert_array_equal
-from sklearn.utils.testing import assert_array_almost_equal
-from sklearn.utils.testing import assert_false
-from sklearn.datasets import make_classification
-from sklearn.model_selection import KFold
-
-with warnings.catch_warnings():
- warnings.simplefilter('ignore')
- from sklearn.learning_curve import learning_curve, validation_curve
-
-from sklearn.linear_model import PassiveAggressiveClassifier
-
-
-class MockImprovingEstimator(BaseEstimator):
- """Dummy classifier to test the learning curve"""
- def __init__(self, n_max_train_sizes):
- self.n_max_train_sizes = n_max_train_sizes
- self.train_sizes = 0
- self.X_subset = None
-
- def fit(self, X_subset, y_subset=None):
- self.X_subset = X_subset
- self.train_sizes = X_subset.shape[0]
- return self
-
- def predict(self, X):
- raise NotImplementedError
-
- def score(self, X=None, Y=None):
- # training score decays (2 -> 1) while test score improves (0 -> 1)
- if self._is_training_data(X):
- return 2. - float(self.train_sizes) / self.n_max_train_sizes
- else:
- return float(self.train_sizes) / self.n_max_train_sizes
-
- def _is_training_data(self, X):
- return X is self.X_subset
-
-
-class MockIncrementalImprovingEstimator(MockImprovingEstimator):
- """Dummy classifier that provides partial_fit"""
- def __init__(self, n_max_train_sizes):
- super(MockIncrementalImprovingEstimator,
- self).__init__(n_max_train_sizes)
- self.x = None
-
- def _is_training_data(self, X):
- return self.x in X
-
- def partial_fit(self, X, y=None, **params):
- self.train_sizes += X.shape[0]
- self.x = X[0]
-
-
-class MockEstimatorWithParameter(BaseEstimator):
- """Dummy classifier to test the validation curve"""
- def __init__(self, param=0.5):
- self.X_subset = None
- self.param = param
-
- def fit(self, X_subset, y_subset):
- self.X_subset = X_subset
- self.train_sizes = X_subset.shape[0]
- return self
-
- def predict(self, X):
- raise NotImplementedError
-
- def score(self, X=None, y=None):
- return self.param if self._is_training_data(X) else 1 - self.param
-
- def _is_training_data(self, X):
- return X is self.X_subset
-
-
-class MockEstimatorFailing(BaseEstimator):
- """Dummy classifier to test error_score in learning curve"""
- def fit(self, X_subset, y_subset):
- raise ValueError()
-
- def score(self, X=None, y=None):
- return None
-
-
-class MockEstimatorWithSingleFitCallAllowed(MockEstimatorWithParameter):
- """Dummy classifier that disallows repeated calls of fit method"""
-
- def fit(self, X_subset, y_subset):
- assert_false(
- hasattr(self, 'fit_called_'),
- 'fit is called the second time'
- )
- self.fit_called_ = True
- return super(type(self), self).fit(X_subset, y_subset)
-
-
-def test_learning_curve():
- X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
- n_redundant=0, n_classes=2,
- n_clusters_per_class=1, random_state=0)
- estimator = MockImprovingEstimator(20)
- with warnings.catch_warnings(record=True) as w:
- train_sizes, train_scores, test_scores = learning_curve(
- estimator, X, y, cv=3, train_sizes=np.linspace(0.1, 1.0, 10))
- if len(w) > 0:
- raise RuntimeError("Unexpected warning: %r" % w[0].message)
- assert_equal(train_scores.shape, (10, 3))
- assert_equal(test_scores.shape, (10, 3))
- assert_array_equal(train_sizes, np.linspace(2, 20, 10))
- assert_array_almost_equal(train_scores.mean(axis=1),
- np.linspace(1.9, 1.0, 10))
- assert_array_almost_equal(test_scores.mean(axis=1),
- np.linspace(0.1, 1.0, 10))
-
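Fractional train_sizes are resolved against the largest training subset the CV scheme allows: with cv=3 on 30 samples each training split holds 20 samples, so np.linspace(0.1, 1.0, 10) becomes the absolute sizes 2, 4, ..., 20 asserted above. The same resolution applies to the model_selection version; a sketch:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import learning_curve

X, y = make_classification(n_samples=30, n_features=4, random_state=0)
sizes, train_scores, test_scores = learning_curve(
    PassiveAggressiveClassifier(max_iter=5, tol=None), X, y, cv=3,
    train_sizes=np.linspace(0.1, 1.0, 10))
print(sizes)                 # [ 2  4  6 ... 20]
print(train_scores.shape)    # (10, 3): one column per CV fold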
-
-def test_learning_curve_unsupervised():
- X, _ = make_classification(n_samples=30, n_features=1, n_informative=1,
- n_redundant=0, n_classes=2,
- n_clusters_per_class=1, random_state=0)
- estimator = MockImprovingEstimator(20)
- train_sizes, train_scores, test_scores = learning_curve(
- estimator, X, y=None, cv=3, train_sizes=np.linspace(0.1, 1.0, 10))
- assert_array_equal(train_sizes, np.linspace(2, 20, 10))
- assert_array_almost_equal(train_scores.mean(axis=1),
- np.linspace(1.9, 1.0, 10))
- assert_array_almost_equal(test_scores.mean(axis=1),
- np.linspace(0.1, 1.0, 10))
-
-
-def test_learning_curve_verbose():
- X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
- n_redundant=0, n_classes=2,
- n_clusters_per_class=1, random_state=0)
- estimator = MockImprovingEstimator(20)
-
- old_stdout = sys.stdout
- sys.stdout = StringIO()
- try:
- train_sizes, train_scores, test_scores = \
- learning_curve(estimator, X, y, cv=3, verbose=1)
- finally:
- out = sys.stdout.getvalue()
- sys.stdout.close()
- sys.stdout = old_stdout
-
- assert("[learning_curve]" in out)
-
-
-def test_learning_curve_error_score():
- X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
- n_redundant=0, n_classes=2,
- n_clusters_per_class=1, random_state=0)
- estimator = MockEstimatorFailing()
- _, _, test_scores = learning_curve(estimator, X, y, cv=3, error_score=0)
- all_zeros = not np.any(test_scores)
- assert(all_zeros)
-
-
-def test_learning_curve_error_score_default_raise():
- X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
- n_redundant=0, n_classes=2,
- n_clusters_per_class=1, random_state=0)
- estimator = MockEstimatorFailing()
- assert_raises(ValueError, learning_curve, estimator, X, y, cv=3)
-
-
-def test_learning_curve_incremental_learning_not_possible():
- X, y = make_classification(n_samples=2, n_features=1, n_informative=1,
- n_redundant=0, n_classes=2,
- n_clusters_per_class=1, random_state=0)
- # The mockup does not have partial_fit()
- estimator = MockImprovingEstimator(1)
- assert_raises(ValueError, learning_curve, estimator, X, y,
- exploit_incremental_learning=True)
-
-
-def test_learning_curve_incremental_learning():
- X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
- n_redundant=0, n_classes=2,
- n_clusters_per_class=1, random_state=0)
- estimator = MockIncrementalImprovingEstimator(20)
- train_sizes, train_scores, test_scores = learning_curve(
- estimator, X, y, cv=3, exploit_incremental_learning=True,
- train_sizes=np.linspace(0.1, 1.0, 10))
- assert_array_equal(train_sizes, np.linspace(2, 20, 10))
- assert_array_almost_equal(train_scores.mean(axis=1),
- np.linspace(1.9, 1.0, 10))
- assert_array_almost_equal(test_scores.mean(axis=1),
- np.linspace(0.1, 1.0, 10))
-
-
-def test_learning_curve_incremental_learning_unsupervised():
- X, _ = make_classification(n_samples=30, n_features=1, n_informative=1,
- n_redundant=0, n_classes=2,
- n_clusters_per_class=1, random_state=0)
- estimator = MockIncrementalImprovingEstimator(20)
- train_sizes, train_scores, test_scores = learning_curve(
- estimator, X, y=None, cv=3, exploit_incremental_learning=True,
- train_sizes=np.linspace(0.1, 1.0, 10))
- assert_array_equal(train_sizes, np.linspace(2, 20, 10))
- assert_array_almost_equal(train_scores.mean(axis=1),
- np.linspace(1.9, 1.0, 10))
- assert_array_almost_equal(test_scores.mean(axis=1),
- np.linspace(0.1, 1.0, 10))
-
-
-def test_learning_curve_batch_and_incremental_learning_are_equal():
- X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
- n_redundant=0, n_classes=2,
- n_clusters_per_class=1, random_state=0)
- train_sizes = np.linspace(0.2, 1.0, 5)
- estimator = PassiveAggressiveClassifier(max_iter=1, tol=None,
- shuffle=False)
-
- train_sizes_inc, train_scores_inc, test_scores_inc = \
- learning_curve(
- estimator, X, y, train_sizes=train_sizes,
- cv=3, exploit_incremental_learning=True)
- train_sizes_batch, train_scores_batch, test_scores_batch = \
- learning_curve(
- estimator, X, y, cv=3, train_sizes=train_sizes,
- exploit_incremental_learning=False)
-
- assert_array_equal(train_sizes_inc, train_sizes_batch)
- assert_array_almost_equal(train_scores_inc.mean(axis=1),
- train_scores_batch.mean(axis=1))
- assert_array_almost_equal(test_scores_inc.mean(axis=1),
- test_scores_batch.mean(axis=1))
-
-
-def test_learning_curve_n_sample_range_out_of_bounds():
- X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
- n_redundant=0, n_classes=2,
- n_clusters_per_class=1, random_state=0)
- estimator = MockImprovingEstimator(20)
- assert_raises(ValueError, learning_curve, estimator, X, y, cv=3,
- train_sizes=[0, 1])
- assert_raises(ValueError, learning_curve, estimator, X, y, cv=3,
- train_sizes=[0.0, 1.0])
- assert_raises(ValueError, learning_curve, estimator, X, y, cv=3,
- train_sizes=[0.1, 1.1])
- assert_raises(ValueError, learning_curve, estimator, X, y, cv=3,
- train_sizes=[0, 20])
- assert_raises(ValueError, learning_curve, estimator, X, y, cv=3,
- train_sizes=[1, 21])
-
-
-def test_learning_curve_remove_duplicate_sample_sizes():
- X, y = make_classification(n_samples=3, n_features=1, n_informative=1,
- n_redundant=0, n_classes=2,
- n_clusters_per_class=1, random_state=0)
- estimator = MockImprovingEstimator(2)
- train_sizes, _, _ = assert_warns(
- RuntimeWarning, learning_curve, estimator, X, y, cv=3,
- train_sizes=np.linspace(0.33, 1.0, 3))
- assert_array_equal(train_sizes, [1, 2])
-
-
-def test_learning_curve_with_boolean_indices():
- X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
- n_redundant=0, n_classes=2,
- n_clusters_per_class=1, random_state=0)
- estimator = MockImprovingEstimator(20)
- cv = KFold(n=30, n_folds=3)
- train_sizes, train_scores, test_scores = learning_curve(
- estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10))
- assert_array_equal(train_sizes, np.linspace(2, 20, 10))
- assert_array_almost_equal(train_scores.mean(axis=1),
- np.linspace(1.9, 1.0, 10))
- assert_array_almost_equal(test_scores.mean(axis=1),
- np.linspace(0.1, 1.0, 10))
-
-
-def test_validation_curve():
- X, y = make_classification(n_samples=2, n_features=1, n_informative=1,
- n_redundant=0, n_classes=2,
- n_clusters_per_class=1, random_state=0)
- param_range = np.linspace(0, 1, 10)
- with warnings.catch_warnings(record=True) as w:
- train_scores, test_scores = validation_curve(
- MockEstimatorWithParameter(), X, y, param_name="param",
- param_range=param_range, cv=2
- )
- if len(w) > 0:
- raise RuntimeError("Unexpected warning: %r" % w[0].message)
-
- assert_array_almost_equal(train_scores.mean(axis=1), param_range)
- assert_array_almost_equal(test_scores.mean(axis=1), 1 - param_range)
-
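validation_curve sweeps a single hyperparameter and returns two arrays of shape (n_params, n_folds); the mean over axis 1 is what the assertions above compare against param_range. The model_selection version keeps the same signature; a sketch on illustrative data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import validation_curve
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
train_scores, test_scores = validation_curve(
    SVC(), X, y, param_name='gamma',
    param_range=np.logspace(-5, 1, 7), cv=3)
print(train_scores.shape)    # (7, 3): one row per parameter value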
-
-def test_validation_curve_clone_estimator():
- X, y = make_classification(n_samples=2, n_features=1, n_informative=1,
- n_redundant=0, n_classes=2,
- n_clusters_per_class=1, random_state=0)
-
- param_range = np.linspace(1, 0, 10)
- _, _ = validation_curve(
- MockEstimatorWithSingleFitCallAllowed(), X, y,
- param_name="param", param_range=param_range, cv=2
- )
From 776bba1248ebfa98edcf5eebbb35b5e6fa79ecd1 Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Thu, 9 Nov 2017 17:30:14 +0100
Subject: [PATCH 05/36] Remove gaussian_process
---
sklearn/gaussian_process/__init__.py | 3 +-
sklearn/gaussian_process/gaussian_process.py | 882 ------------------
.../tests/test_gaussian_process.py | 175 ----
sklearn/utils/estimator_checks.py | 4 +-
4 files changed, 2 insertions(+), 1062 deletions(-)
delete mode 100644 sklearn/gaussian_process/gaussian_process.py
delete mode 100644 sklearn/gaussian_process/tests/test_gaussian_process.py
diff --git a/sklearn/gaussian_process/__init__.py b/sklearn/gaussian_process/__init__.py
index 48d9aa05aaf84..377f15795ee58 100644
--- a/sklearn/gaussian_process/__init__.py
+++ b/sklearn/gaussian_process/__init__.py
@@ -14,10 +14,9 @@
from .gpc import GaussianProcessClassifier
from . import kernels
-from .gaussian_process import GaussianProcess
from . import correlation_models
from . import regression_models
-__all__ = ['GaussianProcess', 'correlation_models', 'regression_models',
+__all__ = ['correlation_models', 'regression_models',
'GaussianProcessRegressor', 'GaussianProcessClassifier',
'kernels']
diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py
deleted file mode 100644
index 8c7491e648d31..0000000000000
--- a/sklearn/gaussian_process/gaussian_process.py
+++ /dev/null
@@ -1,882 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Author: Vincent Dubourg
-# (mostly translation, see implementation details)
-# License: BSD 3 clause
-
-from __future__ import print_function
-
-import numpy as np
-from scipy import linalg, optimize
-
-from ..base import BaseEstimator, RegressorMixin
-from ..metrics.pairwise import manhattan_distances
-from ..utils import check_random_state, check_array, check_X_y
-from ..utils.validation import check_is_fitted
-from . import regression_models as regression
-from . import correlation_models as correlation
-from ..utils import deprecated
-
-MACHINE_EPSILON = np.finfo(np.double).eps
-
-
-@deprecated("l1_cross_distances was deprecated in version 0.18 "
- "and will be removed in 0.20.")
-def l1_cross_distances(X):
- """
- Computes the nonzero componentwise L1 cross-distances between the vectors
- in X.
-
- Parameters
- ----------
-
- X : array_like
- An array with shape (n_samples, n_features)
-
- Returns
- -------
-
- D : array with shape (n_samples * (n_samples - 1) / 2, n_features)
- The array of componentwise L1 cross-distances.
-
- ij : arrays with shape (n_samples * (n_samples - 1) / 2, 2)
- The indices i and j of the vectors in X associated with the cross-
- distances in D: D[k] = np.abs(X[ij[k, 0]] - X[ij[k, 1]]).
- """
- X = check_array(X)
- n_samples, n_features = X.shape
- n_nonzero_cross_dist = n_samples * (n_samples - 1) // 2
- ij = np.zeros((n_nonzero_cross_dist, 2), dtype=np.int)
- D = np.zeros((n_nonzero_cross_dist, n_features))
- ll_1 = 0
- for k in range(n_samples - 1):
- ll_0 = ll_1
- ll_1 = ll_0 + n_samples - k - 1
- ij[ll_0:ll_1, 0] = k
- ij[ll_0:ll_1, 1] = np.arange(k + 1, n_samples)
- D[ll_0:ll_1] = np.abs(X[k] - X[(k + 1):n_samples])
-
- return D, ij
-
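The loop fills the strictly upper-triangular index pairs in row-major order; an equivalent vectorized sketch (a hypothetical helper, not part of this module) makes the output layout explicit:

import numpy as np

def l1_cross_distances_sketch(X):
    # componentwise |x_i - x_j| for all pairs i < j, same order as above
    X = np.asarray(X)
    i, j = np.triu_indices(X.shape[0], k=1)
    return np.abs(X[i] - X[j]), np.column_stack([i, j])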
-
-@deprecated("GaussianProcess was deprecated in version 0.18 and will be "
- "removed in 0.20. Use the GaussianProcessRegressor instead.")
-class GaussianProcess(BaseEstimator, RegressorMixin):
- """The legacy Gaussian Process model class.
-
- .. deprecated:: 0.18
- This class will be removed in 0.20.
- Use the :class:`GaussianProcessRegressor` instead.
-
- Read more in the :ref:`User Guide <gaussian_process>`.
-
- Parameters
- ----------
- regr : string or callable, optional
- A regression function returning an array of outputs of the linear
- regression functional basis. The number of observations n_samples
- should be greater than the size p of this basis.
- Default assumes a simple constant regression trend.
- Available built-in regression models are::
-
- 'constant', 'linear', 'quadratic'
-
- corr : string or callable, optional
- A stationary autocorrelation function returning the autocorrelation
- between two points x and x'.
- Default assumes a squared-exponential autocorrelation model.
- Built-in correlation models are::
-
- 'absolute_exponential', 'squared_exponential',
- 'generalized_exponential', 'cubic', 'linear'
-
- beta0 : double array_like, optional
- The regression weight vector to perform Ordinary Kriging (OK).
- Default assumes Universal Kriging (UK) so that the vector beta of
- regression weights is estimated using the maximum likelihood
- principle.
-
- storage_mode : string, optional
- A string specifying whether the Cholesky decomposition of the
- correlation matrix should be stored in the class (storage_mode =
- 'full') or not (storage_mode = 'light').
- Default assumes storage_mode = 'full', so that the
- Cholesky decomposition of the correlation matrix is stored.
- This might be a useful parameter when one is not interested in the
- MSE and only plans to estimate the BLUP, for which the correlation
- matrix is not required.
-
- verbose : boolean, optional
- A boolean specifying the verbose level.
- Default is verbose = False.
-
- theta0 : double array_like, optional
- An array with shape (n_features, ) or (1, ).
- The parameters in the autocorrelation model.
- If thetaL and thetaU are also specified, theta0 is considered as
- the starting point for the maximum likelihood estimation of the
- best set of parameters.
- Default assumes isotropic autocorrelation model with theta0 = 1e-1.
-
- thetaL : double array_like, optional
- An array with shape matching theta0's.
- Lower bound on the autocorrelation parameters for maximum
- likelihood estimation.
- Default is None, so that it skips maximum likelihood estimation and
- it uses theta0.
-
- thetaU : double array_like, optional
- An array with shape matching theta0's.
- Upper bound on the autocorrelation parameters for maximum
- likelihood estimation.
- Default is None, so that it skips maximum likelihood estimation and
- it uses theta0.
-
- normalize : boolean, optional
- Input X and observations y are centered and reduced wrt
- means and standard deviations estimated from the n_samples
- observations provided.
- Default is normalize = True so that data is normalized to ease
- maximum likelihood estimation.
-
- nugget : double or ndarray, optional
- Introduce a nugget effect to allow smooth predictions from noisy
- data. If nugget is an ndarray, it must be the same length as the
- number of data points used for the fit.
- The nugget is added to the diagonal of the assumed training covariance;
- in this way it acts as a Tikhonov regularization in the problem. In
- the special case of the squared exponential correlation function, the
- nugget mathematically represents the variance of the input values.
- Default assumes a nugget close to machine precision for the sake of
- robustness (nugget = 10. * MACHINE_EPSILON).
-
- optimizer : string, optional
- A string specifying the optimization algorithm to be used.
- Default uses 'fmin_cobyla' algorithm from scipy.optimize.
- Available optimizers are::
-
- 'fmin_cobyla', 'Welch'
-
- The 'Welch' optimizer is due to Welch et al., see reference [WBSWM1992]_.
- It consists of iterating over several one-dimensional optimizations
- instead of running one single multi-dimensional optimization.
-
- random_start : int, optional
- The number of times the Maximum Likelihood Estimation should be
- performed from a random starting point.
- The first MLE always uses the specified starting point (theta0),
- the next starting points are picked at random according to an
- exponential distribution (log-uniform on [thetaL, thetaU]).
- Default does not use random starting point (random_start = 1).
-
- random_state : int, RandomState instance or None, optional (default=None)
- The generator used to shuffle the sequence of coordinates of theta in
- the Welch optimizer. If int, random_state is the seed used by the
- random number generator; If RandomState instance, random_state is the
- random number generator; If None, the random number generator is the
- RandomState instance used by `np.random`.
-
- Attributes
- ----------
- theta_ : array
- Specified theta OR the best set of autocorrelation parameters (the \
- sought maximizer of the reduced likelihood function).
-
- reduced_likelihood_function_value_ : array
- The optimal reduced likelihood function value.
-
- Examples
- --------
- >>> import numpy as np
- >>> from sklearn.gaussian_process import GaussianProcess
- >>> X = np.array([[1., 3., 5., 6., 7., 8.]]).T
- >>> y = (X * np.sin(X)).ravel()
- >>> gp = GaussianProcess(theta0=0.1, thetaL=.001, thetaU=1.)
- >>> gp.fit(X, y) # doctest: +ELLIPSIS
- GaussianProcess(beta0=None...
- ...
-
- Notes
- -----
- The present implementation is based on a translation of the DACE
- Matlab toolbox, see reference [NLNS2002]_.
-
- References
- ----------
-
- .. [NLNS2002] `H.B. Nielsen, S.N. Lophaven, H. B. Nielsen and J.
- Sondergaard. DACE - A MATLAB Kriging Toolbox.` (2002)
- http://imedea.uib-csic.es/master/cambioglobal/Modulo_V_cod101615/Lab/lab_maps/krigging/DACE-krigingsoft/dace/dace.pdf
-
- .. [WBSWM1992] `W.J. Welch, R.J. Buck, J. Sacks, H.P. Wynn, T.J. Mitchell,
- and M.D. Morris (1992). Screening, predicting, and computer
- experiments. Technometrics, 34(1) 15--25.`
- http://www.jstor.org/stable/1269548
- """
-
- _regression_types = {
- 'constant': regression.constant,
- 'linear': regression.linear,
- 'quadratic': regression.quadratic}
-
- _correlation_types = {
- 'absolute_exponential': correlation.absolute_exponential,
- 'squared_exponential': correlation.squared_exponential,
- 'generalized_exponential': correlation.generalized_exponential,
- 'cubic': correlation.cubic,
- 'linear': correlation.linear}
-
- _optimizer_types = [
- 'fmin_cobyla',
- 'Welch']
-
- def __init__(self, regr='constant', corr='squared_exponential', beta0=None,
- storage_mode='full', verbose=False, theta0=1e-1,
- thetaL=None, thetaU=None, optimizer='fmin_cobyla',
- random_start=1, normalize=True,
- nugget=10. * MACHINE_EPSILON, random_state=None):
-
- self.regr = regr
- self.corr = corr
- self.beta0 = beta0
- self.storage_mode = storage_mode
- self.verbose = verbose
- self.theta0 = theta0
- self.thetaL = thetaL
- self.thetaU = thetaU
- self.normalize = normalize
- self.nugget = nugget
- self.optimizer = optimizer
- self.random_start = random_start
- self.random_state = random_state
-
- def fit(self, X, y):
- """
- The Gaussian Process model fitting method.
-
- Parameters
- ----------
- X : double array_like
- An array with shape (n_samples, n_features) with the input at which
- observations were made.
-
- y : double array_like
- An array with shape (n_samples, ) or shape (n_samples, n_targets)
- with the observations of the output to be predicted.
-
- Returns
- -------
- gp : self
- A fitted Gaussian Process model object awaiting data to perform
- predictions.
- """
- # Run input checks
- self._check_params()
-
- self.random_state = check_random_state(self.random_state)
-
- # Force data to 2D numpy.array
- X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
- self.y_ndim_ = y.ndim
- if y.ndim == 1:
- y = y[:, np.newaxis]
-
- # Check shapes of DOE & observations
- n_samples, n_features = X.shape
- _, n_targets = y.shape
-
- # Run input checks
- self._check_params(n_samples)
-
- # Normalize data or don't
- if self.normalize:
- X_mean = np.mean(X, axis=0)
- X_std = np.std(X, axis=0)
- y_mean = np.mean(y, axis=0)
- y_std = np.std(y, axis=0)
- X_std[X_std == 0.] = 1.
- y_std[y_std == 0.] = 1.
- # center and scale X if necessary
- X = (X - X_mean) / X_std
- y = (y - y_mean) / y_std
- else:
- X_mean = np.zeros(1)
- X_std = np.ones(1)
- y_mean = np.zeros(1)
- y_std = np.ones(1)
-
- # Calculate matrix of distances D between samples
- D, ij = l1_cross_distances(X)
- if (np.min(np.sum(D, axis=1)) == 0.
- and self.corr != correlation.pure_nugget):
- raise Exception("Multiple input features cannot have the same"
- " target value.")
-
- # Regression matrix and parameters
- F = self.regr(X)
- n_samples_F = F.shape[0]
- if F.ndim > 1:
- p = F.shape[1]
- else:
- p = 1
- if n_samples_F != n_samples:
- raise Exception("Number of rows in F and X do not match. Most "
- "likely something is going wrong with the "
- "regression model.")
- if p > n_samples_F:
- raise Exception(("Ordinary least squares problem is undetermined "
- "n_samples=%d must be greater than the "
- "regression model size p=%d.") % (n_samples, p))
- if self.beta0 is not None:
- if self.beta0.shape[0] != p:
- raise Exception("Shapes of beta0 and F do not match.")
-
- # Set attributes
- self.X = X
- self.y = y
- self.D = D
- self.ij = ij
- self.F = F
- self.X_mean, self.X_std = X_mean, X_std
- self.y_mean, self.y_std = y_mean, y_std
-
- # Determine Gaussian Process model parameters
- if self.thetaL is not None and self.thetaU is not None:
- # Maximum Likelihood Estimation of the parameters
- if self.verbose:
- print("Performing Maximum Likelihood Estimation of the "
- "autocorrelation parameters...")
- self.theta_, self.reduced_likelihood_function_value_, par = \
- self._arg_max_reduced_likelihood_function()
- if np.isinf(self.reduced_likelihood_function_value_):
- raise Exception("Bad parameter region. "
- "Try increasing upper bound")
-
- else:
- # Given parameters
- if self.verbose:
- print("Given autocorrelation parameters. "
- "Computing Gaussian Process model parameters...")
- self.theta_ = self.theta0
- self.reduced_likelihood_function_value_, par = \
- self.reduced_likelihood_function()
- if np.isinf(self.reduced_likelihood_function_value_):
- raise Exception("Bad point. Try increasing theta0.")
-
- self.beta = par['beta']
- self.gamma = par['gamma']
- self.sigma2 = par['sigma2']
- self.C = par['C']
- self.Ft = par['Ft']
- self.G = par['G']
-
- if self.storage_mode == 'light':
- # Delete heavy data (it will be computed again if required)
- # (it is required only when MSE is wanted in self.predict)
- if self.verbose:
- print("Light storage mode specified. "
- "Flushing autocorrelation matrix...")
- self.D = None
- self.ij = None
- self.F = None
- self.C = None
- self.Ft = None
- self.G = None
-
- return self
-
- def predict(self, X, eval_MSE=False, batch_size=None):
- """
- This function evaluates the Gaussian Process model at x.
-
- Parameters
- ----------
- X : array_like
- An array with shape (n_eval, n_features) giving the point(s) at
- which the prediction(s) should be made.
-
- eval_MSE : boolean, optional
- A boolean specifying whether the Mean Squared Error should be
- evaluated or not.
- Default assumes eval_MSE = False and evaluates only the BLUP (mean
- prediction).
-
- batch_size : integer, optional
- An integer giving the maximum number of points that can be
- evaluated simultaneously (depending on the available memory).
- Default is None so that all given points are evaluated at the same
- time.
-
- Returns
- -------
- y : array_like, shape (n_samples, ) or (n_samples, n_targets)
- An array with shape (n_eval, ) if the Gaussian Process was trained
- on an array of shape (n_samples, ) or an array with shape
- (n_eval, n_targets) if the Gaussian Process was trained on an array
- of shape (n_samples, n_targets) with the Best Linear Unbiased
- Prediction at x.
-
- MSE : array_like, optional (if eval_MSE == True)
- An array with shape (n_eval, ) or (n_eval, n_targets) as with y,
- with the Mean Squared Error at x.
- """
- check_is_fitted(self, "X")
-
- # Check input shapes
- X = check_array(X)
- n_eval, _ = X.shape
- n_samples, n_features = self.X.shape
- n_samples_y, n_targets = self.y.shape
-
- # Run input checks
- self._check_params(n_samples)
-
- if X.shape[1] != n_features:
- raise ValueError(("The number of features in X (X.shape[1] = %d) "
- "should match the number of features used "
- "for fit() "
- "which is %d.") % (X.shape[1], n_features))
-
- if batch_size is None:
- # No memory management
- # (evaluates all given points in a single batch run)
-
- # Normalize input
- X = (X - self.X_mean) / self.X_std
-
- # Get pairwise componentwise L1-distances to the input training set
- dx = manhattan_distances(X, Y=self.X, sum_over_features=False)
- # Get regression function and correlation
- f = self.regr(X)
- r = self.corr(self.theta_, dx).reshape(n_eval, n_samples)
-
- # Scaled predictor
- y_ = np.dot(f, self.beta) + np.dot(r, self.gamma)
-
- # Predictor
- y = (self.y_mean + self.y_std * y_).reshape(n_eval, n_targets)
-
- if self.y_ndim_ == 1:
- y = y.ravel()
-
- # Mean Squared Error
- if eval_MSE:
- C = self.C
- if C is None:
- # Light storage mode (need to recompute C, F, Ft and G)
- if self.verbose:
- print("This GaussianProcess used 'light' storage mode "
- "at instantiation. Need to recompute "
- "autocorrelation matrix...")
- reduced_likelihood_function_value, par = \
- self.reduced_likelihood_function()
- self.C = par['C']
- self.Ft = par['Ft']
- self.G = par['G']
-
- rt = linalg.solve_triangular(self.C, r.T, lower=True)
-
- if self.beta0 is None:
- # Universal Kriging
- u = linalg.solve_triangular(self.G.T,
- np.dot(self.Ft.T, rt) - f.T,
- lower=True)
- else:
- # Ordinary Kriging
- u = np.zeros((n_targets, n_eval))
-
- MSE = np.dot(self.sigma2.reshape(n_targets, 1),
- (1. - (rt ** 2.).sum(axis=0)
- + (u ** 2.).sum(axis=0))[np.newaxis, :])
- MSE = np.sqrt((MSE ** 2.).sum(axis=0) / n_targets)
-
- # Mean Squared Error might be slightly negative depending on
- # machine precision: force to zero!
- MSE[MSE < 0.] = 0.
-
- if self.y_ndim_ == 1:
- MSE = MSE.ravel()
-
- return y, MSE
-
- else:
-
- return y
-
- else:
- # Memory management
-
- if type(batch_size) is not int or batch_size <= 0:
- raise Exception("batch_size must be a positive integer")
-
- if eval_MSE:
-
- y, MSE = np.zeros(n_eval), np.zeros(n_eval)
- for k in range(max(1, int(n_eval / batch_size))):
- batch_from = k * batch_size
- batch_to = min([(k + 1) * batch_size + 1, n_eval + 1])
- y[batch_from:batch_to], MSE[batch_from:batch_to] = \
- self.predict(X[batch_from:batch_to],
- eval_MSE=eval_MSE, batch_size=None)
-
- return y, MSE
-
- else:
-
- y = np.zeros(n_eval)
- for k in range(max(1, int(n_eval / batch_size))):
- batch_from = k * batch_size
- batch_to = min([(k + 1) * batch_size + 1, n_eval + 1])
- y[batch_from:batch_to] = \
- self.predict(X[batch_from:batch_to],
- eval_MSE=eval_MSE, batch_size=None)
-
- return y
-
- def reduced_likelihood_function(self, theta=None):
- """
- This function determines the BLUP parameters and evaluates the reduced
- likelihood function for the given autocorrelation parameters theta.
-
- Maximizing this function wrt the autocorrelation parameters theta is
- equivalent to maximizing the likelihood of the assumed joint Gaussian
- distribution of the observations y evaluated onto the design of
- experiments X.
-
- Parameters
- ----------
- theta : array_like, optional
- An array containing the autocorrelation parameters at which the
- Gaussian Process model parameters should be determined.
- Default uses the built-in autocorrelation parameters
- (i.e. ``theta = self.theta_``).
-
- Returns
- -------
- reduced_likelihood_function_value : double
- The value of the reduced likelihood function associated with the
- given autocorrelation parameters theta.
-
- par : dict
- A dictionary containing the requested Gaussian Process model
- parameters:
-
- - ``sigma2`` is the Gaussian Process variance.
- - ``beta`` is the generalized least-squares regression weights for
- Universal Kriging or given beta0 for Ordinary Kriging.
- - ``gamma`` is the Gaussian Process weights.
- - ``C`` is the Cholesky decomposition of the correlation
- matrix [R].
- - ``Ft`` is the solution of the linear equation system
- [R] x Ft = F
- - ``G`` is the QR decomposition of the matrix Ft.
- """
- check_is_fitted(self, "X")
-
- if theta is None:
- # Use built-in autocorrelation parameters
- theta = self.theta_
-
- # Initialize output
- reduced_likelihood_function_value = - np.inf
- par = {}
-
- # Retrieve data
- n_samples = self.X.shape[0]
- D = self.D
- ij = self.ij
- F = self.F
-
- if D is None:
- # Light storage mode (need to recompute D, ij and F)
- D, ij = l1_cross_distances(self.X)
- if (np.min(np.sum(D, axis=1)) == 0.
- and self.corr != correlation.pure_nugget):
- raise Exception("Multiple X are not allowed")
- F = self.regr(self.X)
-
- # Set up R
- r = self.corr(theta, D)
- R = np.eye(n_samples) * (1. + self.nugget)
- R[ij[:, 0], ij[:, 1]] = r
- R[ij[:, 1], ij[:, 0]] = r
-
- # Cholesky decomposition of R
- try:
- C = linalg.cholesky(R, lower=True)
- except linalg.LinAlgError:
- return reduced_likelihood_function_value, par
-
- # Get generalized least squares solution
- Ft = linalg.solve_triangular(C, F, lower=True)
- Q, G = linalg.qr(Ft, mode='economic')
-
- sv = linalg.svd(G, compute_uv=False)
- rcondG = sv[-1] / sv[0]
- if rcondG < 1e-10:
- # Check F
- sv = linalg.svd(F, compute_uv=False)
- condF = sv[0] / sv[-1]
- if condF > 1e15:
- raise Exception("F is too ill conditioned. Poor combination "
- "of regression model and observations.")
- else:
- # Ft is too ill conditioned, get out (try different theta)
- return reduced_likelihood_function_value, par
-
- Yt = linalg.solve_triangular(C, self.y, lower=True)
- if self.beta0 is None:
- # Universal Kriging
- beta = linalg.solve_triangular(G, np.dot(Q.T, Yt))
- else:
- # Ordinary Kriging
- beta = np.array(self.beta0)
-
- rho = Yt - np.dot(Ft, beta)
- sigma2 = (rho ** 2.).sum(axis=0) / n_samples
- # The determinant of R equals the squared product of the diagonal
- # elements of its Cholesky factor C; note that detR below stores
- # det(R) ** (1 / n_samples) to avoid overflow for large n_samples
- detR = (np.diag(C) ** (2. / n_samples)).prod()
-
- # Compute/Organize output
- reduced_likelihood_function_value = - sigma2.sum() * detR
- par['sigma2'] = sigma2 * self.y_std ** 2.
- par['beta'] = beta
- par['gamma'] = linalg.solve_triangular(C.T, rho)
- par['C'] = C
- par['Ft'] = Ft
- par['G'] = G
-
- return reduced_likelihood_function_value, par
-
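Reading the method backwards, with C the Cholesky factor of R, Yt = C^{-1} y and Ft = C^{-1} F obtained by the triangular solves above, the returned value is a monotone transform of the concentrated (profile) likelihood. A sketch of the algebra in LaTeX notation, writing \tilde{y} for Yt and \tilde{F} for Ft:

\hat{\beta} = \arg\min_{\beta} \|\tilde{y} - \tilde{F}\beta\|^{2},
\qquad \rho = \tilde{y} - \tilde{F}\hat{\beta},
\qquad \hat{\sigma}^{2} = \tfrac{1}{n}\,\rho^{\top}\rho,

\psi(\theta) = -\,\hat{\sigma}^{2}\,\det(R)^{1/n},
\qquad \det(R)^{1/n} = \Big(\prod_{i} C_{ii}\Big)^{2/n}.

Maximizing \psi over theta trades a small residual variance against a well-conditioned correlation matrix; the 1/n power is the same rescaling applied in the detR line above.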
- def _arg_max_reduced_likelihood_function(self):
- """
- This function estimates the autocorrelation parameters theta as the
- maximizer of the reduced likelihood function.
- (Minimization of the opposite reduced likelihood function is used for
- convenience)
-
- Parameters
- ----------
- self : All parameters are stored in the Gaussian Process model object.
-
- Returns
- -------
- optimal_theta : array_like
- The best set of autocorrelation parameters (the sought maximizer of
- the reduced likelihood function).
-
- optimal_reduced_likelihood_function_value : double
- The optimal reduced likelihood function value.
-
- optimal_par : dict
- The BLUP parameters associated with ``optimal_theta``.
- """
-
- # Initialize output
- best_optimal_theta = []
- best_optimal_rlf_value = []
- best_optimal_par = []
-
- if self.verbose:
- print("The chosen optimizer is: " + str(self.optimizer))
- if self.random_start > 1:
- print(str(self.random_start) + " random starts are required.")
-
- percent_completed = 0.
-
- # Force optimizer to fmin_cobyla if the model is meant to be isotropic
- if self.optimizer == 'Welch' and self.theta0.size == 1:
- self.optimizer = 'fmin_cobyla'
-
- if self.optimizer == 'fmin_cobyla':
-
- def minus_reduced_likelihood_function(log10t):
- return - self.reduced_likelihood_function(
- theta=10. ** log10t)[0]
-
- constraints = []
- for i in range(self.theta0.size):
- constraints.append(lambda log10t, i=i:
- log10t[i] - np.log10(self.thetaL[0, i]))
- constraints.append(lambda log10t, i=i:
- np.log10(self.thetaU[0, i]) - log10t[i])
-
- for k in range(self.random_start):
-
- if k == 0:
- # Use specified starting point as first guess
- theta0 = self.theta0
- else:
- # Generate a random starting point log10-uniformly
- # distributed between bounds
- log10theta0 = (np.log10(self.thetaL)
- + self.random_state.rand(*self.theta0.shape)
- * np.log10(self.thetaU / self.thetaL))
- theta0 = 10. ** log10theta0
-
- # Run Cobyla
- try:
- log10_optimal_theta = \
- optimize.fmin_cobyla(minus_reduced_likelihood_function,
- np.log10(theta0).ravel(),
- constraints, disp=0)
- except ValueError as ve:
- print("Optimization failed. Try increasing the ``nugget``")
- raise ve
-
- optimal_theta = 10. ** log10_optimal_theta
- optimal_rlf_value, optimal_par = \
- self.reduced_likelihood_function(theta=optimal_theta)
-
- # Compare the new optimizer to the best previous one
- if k > 0:
- if optimal_rlf_value > best_optimal_rlf_value:
- best_optimal_rlf_value = optimal_rlf_value
- best_optimal_par = optimal_par
- best_optimal_theta = optimal_theta
- else:
- best_optimal_rlf_value = optimal_rlf_value
- best_optimal_par = optimal_par
- best_optimal_theta = optimal_theta
- if self.verbose and self.random_start > 1:
- if (20 * k) / self.random_start > percent_completed:
- percent_completed = (20 * k) / self.random_start
- print("%s completed" % (5 * percent_completed))
-
- optimal_rlf_value = best_optimal_rlf_value
- optimal_par = best_optimal_par
- optimal_theta = best_optimal_theta
-
- elif self.optimizer == 'Welch':
-
- # Backup of the given attributes
- theta0, thetaL, thetaU = self.theta0, self.thetaL, self.thetaU
- corr = self.corr
- verbose = self.verbose
-
- # This will iterate over fmin_cobyla optimizer
- self.optimizer = 'fmin_cobyla'
- self.verbose = False
-
- # Initialize under isotropy assumption
- if verbose:
- print("Initialize under isotropy assumption...")
- self.theta0 = check_array(self.theta0.min())
- self.thetaL = check_array(self.thetaL.min())
- self.thetaU = check_array(self.thetaU.max())
- theta_iso, optimal_rlf_value_iso, par_iso = \
- self._arg_max_reduced_likelihood_function()
- optimal_theta = theta_iso + np.zeros(theta0.shape)
-
- # Iterate over all dimensions of theta allowing for anisotropy
- if verbose:
- print("Now improving allowing for anisotropy...")
- for i in self.random_state.permutation(theta0.size):
- if verbose:
- print("Proceeding along dimension %d..." % (i + 1))
- self.theta0 = check_array(theta_iso)
- self.thetaL = check_array(thetaL[0, i])
- self.thetaU = check_array(thetaU[0, i])
-
- def corr_cut(t, d):
- return corr(check_array(np.hstack([optimal_theta[0][0:i],
- t[0],
- optimal_theta[0][(i +
- 1)::]])),
- d)
-
- self.corr = corr_cut
- optimal_theta[0, i], optimal_rlf_value, optimal_par = \
- self._arg_max_reduced_likelihood_function()
-
- # Restore the given attributes
- self.theta0, self.thetaL, self.thetaU = theta0, thetaL, thetaU
- self.corr = corr
- self.optimizer = 'Welch'
- self.verbose = verbose
-
- else:
-
- raise NotImplementedError("This optimizer ('%s') is not "
- "implemented yet. Please contribute!"
- % self.optimizer)
-
- return optimal_theta, optimal_rlf_value, optimal_par
-
- def _check_params(self, n_samples=None):
-
- # Check regression model
- if not callable(self.regr):
- if self.regr in self._regression_types:
- self.regr = self._regression_types[self.regr]
- else:
- raise ValueError("regr should be one of %s or callable, "
- "%s was given."
- % (self._regression_types.keys(), self.regr))
-
- # Check regression weights if given (Ordinary Kriging)
- if self.beta0 is not None:
- self.beta0 = np.atleast_2d(self.beta0)
- if self.beta0.shape[1] != 1:
- # Force to column vector
- self.beta0 = self.beta0.T
-
- # Check correlation model
- if not callable(self.corr):
- if self.corr in self._correlation_types:
- self.corr = self._correlation_types[self.corr]
- else:
- raise ValueError("corr should be one of %s or callable, "
- "%s was given."
- % (self._correlation_types.keys(), self.corr))
-
- # Check storage mode
- if self.storage_mode != 'full' and self.storage_mode != 'light':
- raise ValueError("Storage mode should either be 'full' or "
- "'light', %s was given." % self.storage_mode)
-
- # Check correlation parameters
- self.theta0 = np.atleast_2d(self.theta0)
- lth = self.theta0.size
-
- if self.thetaL is not None and self.thetaU is not None:
- self.thetaL = np.atleast_2d(self.thetaL)
- self.thetaU = np.atleast_2d(self.thetaU)
- if self.thetaL.size != lth or self.thetaU.size != lth:
- raise ValueError("theta0, thetaL and thetaU must have the "
- "same length.")
- if np.any(self.thetaL <= 0) or np.any(self.thetaU < self.thetaL):
- raise ValueError("The bounds must satisfy O < thetaL <= "
- "thetaU.")
-
- elif self.thetaL is None and self.thetaU is None:
- if np.any(self.theta0 <= 0):
- raise ValueError("theta0 must be strictly positive.")
-
- elif self.thetaL is None or self.thetaU is None:
- raise ValueError("thetaL and thetaU should either be both or "
- "neither specified.")
-
- # Force verbose type to bool
- self.verbose = bool(self.verbose)
-
- # Force normalize type to bool
- self.normalize = bool(self.normalize)
-
- # Check nugget value
- self.nugget = np.asarray(self.nugget)
- if np.any(self.nugget < 0.):
- raise ValueError("nugget must be positive or zero.")
- if (n_samples is not None
- and self.nugget.shape not in [(), (n_samples,)]):
- raise ValueError("nugget must be either a scalar "
- "or array of length n_samples.")
-
- # Check optimizer
- if self.optimizer not in self._optimizer_types:
- raise ValueError("optimizer should be one of %s"
- % self._optimizer_types)
-
- # Force random_start type to int
- self.random_start = int(self.random_start)
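For readers tracing what the deleted ``reduced_likelihood_function`` computed, the generalized least squares steps can be sketched in plain NumPy/SciPy; the toy correlation matrix ``R``, regression matrix ``F``, and targets ``y`` below are illustrative values, not anything from the library::

    import numpy as np
    from scipy import linalg

    rng = np.random.RandomState(0)
    n = 5
    A = rng.rand(n, n)
    R = np.eye(n) + 0.01 * np.dot(A, A.T)   # toy SPD correlation matrix
    F = np.ones((n, 1))                     # constant regression model
    y = rng.rand(n)

    C = linalg.cholesky(R, lower=True)              # R = C C.T
    Ft = linalg.solve_triangular(C, F, lower=True)  # solves C Ft = F
    Q, G = linalg.qr(Ft, mode='economic')
    Yt = linalg.solve_triangular(C, y, lower=True)
    beta = linalg.solve_triangular(G, np.dot(Q.T, Yt))  # GLS estimate
    rho = Yt - np.dot(Ft, beta)
    sigma2 = (rho ** 2.).sum(axis=0) / n
    detR = (np.diag(C) ** (2. / n)).prod()  # det(R) ** (1 / n) via Cholesky
    reduced_likelihood = - sigma2.sum() * detR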
diff --git a/sklearn/gaussian_process/tests/test_gaussian_process.py b/sklearn/gaussian_process/tests/test_gaussian_process.py
deleted file mode 100644
index 37d872fc99fb5..0000000000000
--- a/sklearn/gaussian_process/tests/test_gaussian_process.py
+++ /dev/null
@@ -1,175 +0,0 @@
-"""
-Testing for Gaussian Process module (sklearn.gaussian_process)
-"""
-
-# Author: Vincent Dubourg
-# License: BSD 3 clause
-
-import numpy as np
-
-from sklearn.gaussian_process import GaussianProcess
-from sklearn.gaussian_process import regression_models as regression
-from sklearn.gaussian_process import correlation_models as correlation
-from sklearn.datasets import make_regression
-from sklearn.utils.testing import assert_greater, assert_true, assert_raises
-
-
-f = lambda x: x * np.sin(x)
-X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T
-X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T
-y = f(X).ravel()
-
-
-def test_1d(regr=regression.constant, corr=correlation.squared_exponential,
- random_start=10, beta0=None):
- # MLE estimation of a one-dimensional Gaussian Process model.
- # Check random start optimization.
- # Test the interpolating property.
- gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0,
- theta0=1e-2, thetaL=1e-4, thetaU=1e-1,
- random_start=random_start, verbose=False).fit(X, y)
- y_pred, MSE = gp.predict(X, eval_MSE=True)
- y2_pred, MSE2 = gp.predict(X2, eval_MSE=True)
-
- assert_true(np.allclose(y_pred, y) and np.allclose(MSE, 0.)
- and np.allclose(MSE2, 0., atol=10))
-
-
-def test_2d(regr=regression.constant, corr=correlation.squared_exponential,
- random_start=10, beta0=None):
- # MLE estimation of a two-dimensional Gaussian Process model accounting for
- # anisotropy. Check random start optimization.
- # Test the interpolating property.
- b, kappa, e = 5., .5, .1
- g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2.
- X = np.array([[-4.61611719, -6.00099547],
- [4.10469096, 5.32782448],
- [0.00000000, -0.50000000],
- [-6.17289014, -4.6984743],
- [1.3109306, -6.93271427],
- [-5.03823144, 3.10584743],
- [-2.87600388, 6.74310541],
- [5.21301203, 4.26386883]])
- y = g(X).ravel()
-
- thetaL = [1e-4] * 2
- thetaU = [1e-1] * 2
- gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0,
- theta0=[1e-2] * 2, thetaL=thetaL,
- thetaU=thetaU,
- random_start=random_start, verbose=False)
- gp.fit(X, y)
- y_pred, MSE = gp.predict(X, eval_MSE=True)
-
- assert_true(np.allclose(y_pred, y) and np.allclose(MSE, 0.))
-
- eps = np.finfo(gp.theta_.dtype).eps
- assert_true(np.all(gp.theta_ >= thetaL - eps)) # Lower bounds of hyperparameters
- assert_true(np.all(gp.theta_ <= thetaU + eps)) # Upper bounds of hyperparameters
-
-
-def test_2d_2d(regr=regression.constant, corr=correlation.squared_exponential,
- random_start=10, beta0=None):
- # MLE estimation of a two-dimensional Gaussian Process model accounting for
- # anisotropy. Check random start optimization.
- # Test the GP interpolation for 2D output
- b, kappa, e = 5., .5, .1
- g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2.
- f = lambda x: np.vstack((g(x), g(x))).T
- X = np.array([[-4.61611719, -6.00099547],
- [4.10469096, 5.32782448],
- [0.00000000, -0.50000000],
- [-6.17289014, -4.6984743],
- [1.3109306, -6.93271427],
- [-5.03823144, 3.10584743],
- [-2.87600388, 6.74310541],
- [5.21301203, 4.26386883]])
- y = f(X)
- gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0,
- theta0=[1e-2] * 2, thetaL=[1e-4] * 2,
- thetaU=[1e-1] * 2,
- random_start=random_start, verbose=False)
- gp.fit(X, y)
- y_pred, MSE = gp.predict(X, eval_MSE=True)
-
- assert_true(np.allclose(y_pred, y) and np.allclose(MSE, 0.))
-
-
-def test_wrong_number_of_outputs():
- gp = GaussianProcess()
- assert_raises(ValueError, gp.fit, [[1, 2, 3], [4, 5, 6]], [1, 2, 3])
-
-
-def test_more_builtin_correlation_models(random_start=1):
- # Repeat test_1d and test_2d for several built-in correlation
- # models specified as strings.
- all_corr = ['absolute_exponential', 'squared_exponential', 'cubic',
- 'linear']
-
- for corr in all_corr:
- test_1d(regr='constant', corr=corr, random_start=random_start)
- test_2d(regr='constant', corr=corr, random_start=random_start)
- test_2d_2d(regr='constant', corr=corr, random_start=random_start)
-
-
-def test_ordinary_kriging():
- # Repeat test_1d and test_2d with given regression weights (beta0) for
- # different regression models (Ordinary Kriging).
- test_1d(regr='linear', beta0=[0., 0.5])
- test_1d(regr='quadratic', beta0=[0., 0.5, 0.5])
- test_2d(regr='linear', beta0=[0., 0.5, 0.5])
- test_2d(regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5])
- test_2d_2d(regr='linear', beta0=[0., 0.5, 0.5])
- test_2d_2d(regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5])
-
-
-def test_no_normalize():
- gp = GaussianProcess(normalize=False).fit(X, y)
- y_pred = gp.predict(X)
- assert_true(np.allclose(y_pred, y))
-
-
-def test_batch_size():
- # TypeError when using batch_size on Python 3, see
- # https://github.com/scikit-learn/scikit-learn/issues/7329 for more
- # details
- gp = GaussianProcess()
- gp.fit(X, y)
- gp.predict(X, batch_size=1)
- gp.predict(X, batch_size=1, eval_MSE=True)
-
-
-def test_random_starts():
- # Test that an increasing number of random-starts of GP fitting only
- # increases the reduced likelihood function of the optimal theta.
- n_samples, n_features = 50, 3
- rng = np.random.RandomState(0)
- X = rng.randn(n_samples, n_features) * 2 - 1
- y = np.sin(X).sum(axis=1) + np.sin(3 * X).sum(axis=1)
- best_likelihood = -np.inf
- for random_start in range(1, 5):
- gp = GaussianProcess(regr="constant", corr="squared_exponential",
- theta0=[1e-0] * n_features,
- thetaL=[1e-4] * n_features,
- thetaU=[1e+1] * n_features,
- random_start=random_start, random_state=0,
- verbose=False).fit(X, y)
- rlf = gp.reduced_likelihood_function()[0]
- assert_greater(rlf, best_likelihood - np.finfo(np.float32).eps)
- best_likelihood = rlf
-
-
-def test_mse_solving():
- # test the MSE estimate to be sane.
- # non-regression test for ignoring off-diagonals of feature covariance,
- # testing with nugget that renders covariance useless, only
- # using the mean function, with low effective rank of data
- gp = GaussianProcess(corr='absolute_exponential', theta0=1e-4,
- thetaL=1e-12, thetaU=1e-2, nugget=1e-2,
- optimizer='Welch', regr="linear", random_state=0)
-
- X, y = make_regression(n_informative=3, n_features=60, noise=50,
- random_state=0, effective_rank=1)
-
- gp.fit(X, y)
- assert_greater(1000, gp.predict(X, eval_MSE=True)[1].mean())
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index fdbecc358be35..708fb8030de38 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -223,9 +223,7 @@ def _yield_all_checks(name, estimator):
for check in _yield_clustering_checks(name, estimator):
yield check
yield check_fit2d_predict1d
- if name != 'GaussianProcess': # FIXME
- # XXX GaussianProcess deprecated in 0.20
- yield check_fit2d_1sample
+ yield check_fit2d_1sample
yield check_fit2d_1feature
yield check_fit1d
yield check_get_params_invariance
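Downstream code that used the removed ``GaussianProcess`` estimator can usually be ported to ``GaussianProcessRegressor``, where kernel hyperparameter bounds take over the role of ``theta0``/``thetaL``/``thetaU`` and ``return_std=True`` replaces ``eval_MSE=True``. A minimal sketch, with illustrative bounds (the mapping from theta to length scales is only approximate)::

    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF

    X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T
    y = (X * np.sin(X)).ravel()

    # length_scale bounds stand in for the old thetaL/thetaU search box.
    kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
    gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10,
                                   random_state=0).fit(X, y)
    y_pred, y_std = gpr.predict(X, return_std=True)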
From 59e3f7d61852e255efdeaea16f088bffd293f7a2 Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Thu, 9 Nov 2017 17:59:44 +0100
Subject: [PATCH 06/36] remove code to be removed in 0.19
---
sklearn/multioutput.py | 37 -------------------------------------
1 file changed, 37 deletions(-)
diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py
index 5b4389fd0f31b..1e0285db2f737 100644
--- a/sklearn/multioutput.py
+++ b/sklearn/multioutput.py
@@ -247,43 +247,6 @@ def partial_fit(self, X, y, sample_weight=None):
super(MultiOutputRegressor, self).partial_fit(
X, y, sample_weight=sample_weight)
- def score(self, X, y, sample_weight=None):
- """Returns the coefficient of determination R^2 of the prediction.
-
- The coefficient R^2 is defined as (1 - u/v), where u is the residual
- sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
- sum of squares ((y_true - y_true.mean()) ** 2).sum().
- Best possible score is 1.0 and it can be negative (because the
- model can be arbitrarily worse). A constant model that always
- predicts the expected value of y, disregarding the input features,
- would get an R^2 score of 0.0.
-
- Notes
- -----
- R^2 is calculated by weighting all the targets equally using
- `multioutput='uniform_average'`.
-
- Parameters
- ----------
- X : array-like, shape (n_samples, n_features)
- Test samples.
-
- y : array-like, shape (n_samples) or (n_samples, n_outputs)
- True values for X.
-
- sample_weight : array-like, shape [n_samples], optional
- Sample weights.
-
- Returns
- -------
- score : float
- R^2 of self.predict(X) wrt. y.
- """
- # XXX remove in 0.19 when r2_score default for multioutput changes
- from .metrics import r2_score
- return r2_score(y, self.predict(X), sample_weight=sample_weight,
- multioutput='uniform_average')
-
class MultiOutputClassifier(MultiOutputEstimator, ClassifierMixin):
"""Multi target classification
From 2ec39c0b8ff31441d575055fc28095b0f11698bd Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Thu, 9 Nov 2017 18:04:39 +0100
Subject: [PATCH 07/36] remove ransac's residual_metric
---
sklearn/linear_model/ransac.py | 32 ++------------------
sklearn/linear_model/tests/test_ransac.py | 33 --------------------
sklearn/multioutput.py | 37 +++++++++++++++++++++++
3 files changed, 40 insertions(+), 62 deletions(-)
diff --git a/sklearn/linear_model/ransac.py b/sklearn/linear_model/ransac.py
index fa3923dbebb14..322f9923b4925 100644
--- a/sklearn/linear_model/ransac.py
+++ b/sklearn/linear_model/ransac.py
@@ -135,17 +135,6 @@ class RANSACRegressor(BaseEstimator, MetaEstimatorMixin, RegressorMixin):
as 0.99 (the default) and e is the current fraction of inliers w.r.t.
the total number of samples.
- residual_metric : callable, optional
- Metric to reduce the dimensionality of the residuals to 1 for
- multi-dimensional target values ``y.shape[1] > 1``. By default the sum
- of absolute differences is used::
-
- lambda dy: np.sum(np.abs(dy), axis=1)
-
- .. deprecated:: 0.18
- ``residual_metric`` is deprecated from 0.18 and will be removed in
- 0.20. Use ``loss`` instead.
-
loss : string, callable, optional, default "absolute_loss"
String inputs, "absolute_loss" and "squared_loss" are supported which
find the absolute loss and squared loss per sample
@@ -205,8 +194,8 @@ def __init__(self, base_estimator=None, min_samples=None,
residual_threshold=None, is_data_valid=None,
is_model_valid=None, max_trials=100, max_skips=np.inf,
stop_n_inliers=np.inf, stop_score=np.inf,
- stop_probability=0.99, residual_metric=None,
- loss='absolute_loss', random_state=None):
+ stop_probability=0.99, loss='absolute_loss',
+ random_state=None):
self.base_estimator = base_estimator
self.min_samples = min_samples
@@ -218,7 +207,6 @@ def __init__(self, base_estimator=None, min_samples=None,
self.stop_n_inliers = stop_n_inliers
self.stop_score = stop_score
self.stop_probability = stop_probability
- self.residual_metric = residual_metric
self.random_state = random_state
self.loss = loss
@@ -281,12 +269,6 @@ def fit(self, X, y, sample_weight=None):
else:
residual_threshold = self.residual_threshold
- if self.residual_metric is not None:
- warnings.warn(
- "'residual_metric' was deprecated in version 0.18 and "
- "will be removed in version 0.20. Use 'loss' instead.",
- DeprecationWarning)
-
if self.loss == "absolute_loss":
if y.ndim == 1:
loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred)
@@ -379,15 +361,7 @@ def fit(self, X, y, sample_weight=None):
# residuals of all data for current random sample model
y_pred = base_estimator.predict(X)
-
- # XXX: Deprecation: Remove this if block in 0.20
- if self.residual_metric is not None:
- diff = y_pred - y
- if diff.ndim == 1:
- diff = diff.reshape(-1, 1)
- residuals_subset = self.residual_metric(diff)
- else:
- residuals_subset = loss_function(y, y_pred)
+ residuals_subset = loss_function(y, y_pred)
# classify data into inliers and outliers
inlier_mask_subset = residuals_subset < residual_threshold
diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py
index 6f8e716f9ad19..176d3348246be 100644
--- a/sklearn/linear_model/tests/test_ransac.py
+++ b/sklearn/linear_model/tests/test_ransac.py
@@ -352,39 +352,6 @@ def test_ransac_multi_dimensional_targets():
assert_equal(ransac_estimator.inlier_mask_, ref_inlier_mask)
-# XXX: Remove in 0.20
-def test_ransac_residual_metric():
- residual_metric1 = lambda dy: np.sum(np.abs(dy), axis=1)
- residual_metric2 = lambda dy: np.sum(dy ** 2, axis=1)
-
- yyy = np.column_stack([y, y, y])
-
- base_estimator = LinearRegression()
- ransac_estimator0 = RANSACRegressor(base_estimator, min_samples=2,
- residual_threshold=5, random_state=0)
- ransac_estimator1 = RANSACRegressor(base_estimator, min_samples=2,
- residual_threshold=5, random_state=0,
- residual_metric=residual_metric1)
- ransac_estimator2 = RANSACRegressor(base_estimator, min_samples=2,
- residual_threshold=5, random_state=0,
- residual_metric=residual_metric2)
-
- # multi-dimensional
- ransac_estimator0.fit(X, yyy)
- assert_warns(DeprecationWarning, ransac_estimator1.fit, X, yyy)
- assert_warns(DeprecationWarning, ransac_estimator2.fit, X, yyy)
- assert_array_almost_equal(ransac_estimator0.predict(X),
- ransac_estimator1.predict(X))
- assert_array_almost_equal(ransac_estimator0.predict(X),
- ransac_estimator2.predict(X))
-
- # one-dimensional
- ransac_estimator0.fit(X, y)
- assert_warns(DeprecationWarning, ransac_estimator2.fit, X, y)
- assert_array_almost_equal(ransac_estimator0.predict(X),
- ransac_estimator2.predict(X))
-
-
def test_ransac_residual_loss():
loss_multi1 = lambda y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1)
loss_multi2 = lambda y_true, y_pred: np.sum((y_true - y_pred) ** 2, axis=1)
diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py
index 1e0285db2f737..5b4389fd0f31b 100644
--- a/sklearn/multioutput.py
+++ b/sklearn/multioutput.py
@@ -247,6 +247,43 @@ def partial_fit(self, X, y, sample_weight=None):
super(MultiOutputRegressor, self).partial_fit(
X, y, sample_weight=sample_weight)
+ def score(self, X, y, sample_weight=None):
+ """Returns the coefficient of determination R^2 of the prediction.
+
+ The coefficient R^2 is defined as (1 - u/v), where u is the residual
+ sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
+ sum of squares ((y_true - y_true.mean()) ** 2).sum().
+ Best possible score is 1.0 and it can be negative (because the
+ model can be arbitrarily worse). A constant model that always
+ predicts the expected value of y, disregarding the input features,
+ would get an R^2 score of 0.0.
+
+ Notes
+ -----
+ R^2 is calculated by weighting all the targets equally using
+ `multioutput='uniform_average'`.
+
+ Parameters
+ ----------
+ X : array-like, shape (n_samples, n_features)
+ Test samples.
+
+ y : array-like, shape (n_samples) or (n_samples, n_outputs)
+ True values for X.
+
+ sample_weight : array-like, shape [n_samples], optional
+ Sample weights.
+
+ Returns
+ -------
+ score : float
+ R^2 of self.predict(X) wrt. y.
+ """
+ # XXX remove in 0.19 when r2_score default for multioutput changes
+ from .metrics import r2_score
+ return r2_score(y, self.predict(X), sample_weight=sample_weight,
+ multioutput='uniform_average')
+
class MultiOutputClassifier(MultiOutputEstimator, ClassifierMixin):
"""Multi target classification
From c444763e9139b3f4cb2ca976dd8e474ec3a22c4f Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Fri, 8 Sep 2017 12:10:59 -0400
Subject: [PATCH 08/36] remove RandomizedPCA (also from docs references etc)
fixup! remove RandomizedPCA from docs references etc
---
benchmarks/bench_plot_incremental_pca.py | 15 +-
doc/modules/preprocessing.rst | 5 +-
sklearn/decomposition/__init__.py | 3 +-
sklearn/decomposition/incremental_pca.py | 1 -
sklearn/decomposition/pca.py | 245 -----------------------
sklearn/decomposition/tests/test_pca.py | 21 --
sklearn/decomposition/truncated_svd.py | 1 -
7 files changed, 6 insertions(+), 285 deletions(-)
diff --git a/benchmarks/bench_plot_incremental_pca.py b/benchmarks/bench_plot_incremental_pca.py
index 495d58f0f43ee..43b6ff9452c78 100644
--- a/benchmarks/bench_plot_incremental_pca.py
+++ b/benchmarks/bench_plot_incremental_pca.py
@@ -13,7 +13,7 @@
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_lfw_people
-from sklearn.decomposition import IncrementalPCA, RandomizedPCA, PCA
+from sklearn.decomposition import IncrementalPCA, PCA
def plot_results(X, y, label):
@@ -37,7 +37,6 @@ def plot_feature_times(all_times, batch_size, all_components, data):
plot_results(all_components, all_times['pca'], label="PCA")
plot_results(all_components, all_times['ipca'],
label="IncrementalPCA, bsize=%i" % batch_size)
- plot_results(all_components, all_times['rpca'], label="RandomizedPCA")
plt.legend(loc="upper left")
plt.suptitle("Algorithm runtime vs. n_components\n \
LFW, size %i x %i" % data.shape)
@@ -50,7 +49,6 @@ def plot_feature_errors(all_errors, batch_size, all_components, data):
plot_results(all_components, all_errors['pca'], label="PCA")
plot_results(all_components, all_errors['ipca'],
label="IncrementalPCA, bsize=%i" % batch_size)
- plot_results(all_components, all_errors['rpca'], label="RandomizedPCA")
plt.legend(loc="lower left")
plt.suptitle("Algorithm error vs. n_components\n"
"LFW, size %i x %i" % data.shape)
@@ -61,7 +59,6 @@ def plot_feature_errors(all_errors, batch_size, all_components, data):
def plot_batch_times(all_times, n_features, all_batch_sizes, data):
plt.figure()
plot_results(all_batch_sizes, all_times['pca'], label="PCA")
- plot_results(all_batch_sizes, all_times['rpca'], label="RandomizedPCA")
plot_results(all_batch_sizes, all_times['ipca'], label="IncrementalPCA")
plt.legend(loc="lower left")
plt.suptitle("Algorithm runtime vs. batch_size for n_components %i\n \
@@ -92,11 +89,9 @@ def fixed_batch_size_comparison(data):
all_errors = defaultdict(list)
for n_components in all_features:
pca = PCA(n_components=n_components)
- rpca = RandomizedPCA(n_components=n_components, random_state=1999)
ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
- ('ipca', ipca),
- ('rpca', rpca)]}
+ ('ipca', ipca)]}
for k in sorted(results_dict.keys()):
all_times[k].append(results_dict[k]['time'])
@@ -116,9 +111,7 @@ def variable_batch_size_comparison(data):
all_times = defaultdict(list)
all_errors = defaultdict(list)
pca = PCA(n_components=n_components)
- rpca = RandomizedPCA(n_components=n_components, random_state=1999)
- results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
- ('rpca', rpca)]}
+ results_dict = {k: benchmark(est, data) for k, est in [('pca', pca)]}
# Create flat baselines to compare the variation over batch size
all_times['pca'].extend([results_dict['pca']['time']] *
@@ -138,8 +131,6 @@ def variable_batch_size_comparison(data):
all_errors['ipca'].append(results_dict['ipca']['error'])
plot_batch_times(all_times, n_components, batch_sizes, data)
- # RandomizedPCA error is always worse (approx 100x) than other PCA
- # tests
plot_batch_errors(all_errors, n_components, batch_sizes, data)
faces = fetch_lfw_people(resize=.2, min_faces_per_person=5)
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 8bcb14363d69c..29c77f5c32851 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -235,9 +235,8 @@ data.
independently, since a downstream model can further make some assumption
on the linear independence of the features.
- To address this issue you can use :class:`sklearn.decomposition.PCA`
- or :class:`sklearn.decomposition.RandomizedPCA` with ``whiten=True``
- to further remove the linear correlation across features.
+ To address this issue you can use :class:`sklearn.decomposition.PCA` with
+ ``whiten=True`` to further remove the linear correlation across features.
.. topic:: Scaling a 1D array
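The whitening route the updated paragraph recommends, as a short sketch on synthetic correlated features (values illustrative only)::

    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.RandomState(0)
    X = np.dot(rng.randn(100, 3), rng.randn(3, 3))  # correlated features
    X_white = PCA(whiten=True).fit_transform(X)

    # The transformed components are uncorrelated with unit variance.
    print(np.round(np.cov(X_white.T), 6))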
diff --git a/sklearn/decomposition/__init__.py b/sklearn/decomposition/__init__.py
index faca56b91b1d8..34ad76ca46074 100644
--- a/sklearn/decomposition/__init__.py
+++ b/sklearn/decomposition/__init__.py
@@ -5,7 +5,7 @@
"""
from .nmf import NMF, non_negative_factorization
-from .pca import PCA, RandomizedPCA
+from .pca import PCA
from .incremental_pca import IncrementalPCA
from .kernel_pca import KernelPCA
from .sparse_pca import SparsePCA, MiniBatchSparsePCA
@@ -26,7 +26,6 @@
'MiniBatchSparsePCA',
'NMF',
'PCA',
- 'RandomizedPCA',
'SparseCoder',
'SparsePCA',
'dict_learning',
diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py
index 13e51090dd82e..9ed75928cf90c 100644
--- a/sklearn/decomposition/incremental_pca.py
+++ b/sklearn/decomposition/incremental_pca.py
@@ -136,7 +136,6 @@ class IncrementalPCA(_BasePCA):
See also
--------
PCA
- RandomizedPCA
KernelPCA
SparsePCA
TruncatedSVD
diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py
index 2b715b7e06824..4d528e5994a58 100644
--- a/sklearn/decomposition/pca.py
+++ b/sklearn/decomposition/pca.py
@@ -591,248 +591,3 @@ def score(self, X, y=None):
Average log-likelihood of the samples under the current model
"""
return np.mean(self.score_samples(X))
-
-
-@deprecated("RandomizedPCA was deprecated in 0.18 and will be removed in "
- "0.20. "
- "Use PCA(svd_solver='randomized') instead. The new implementation "
- "DOES NOT store whiten ``components_``. Apply transform to get "
- "them.")
-class RandomizedPCA(BaseEstimator, TransformerMixin):
- """Principal component analysis (PCA) using randomized SVD
-
- .. deprecated:: 0.18
- This class will be removed in 0.20.
- Use :class:`PCA` with parameter svd_solver 'randomized' instead.
- The new implementation DOES NOT store whiten ``components_``.
- Apply transform to get them.
-
- Linear dimensionality reduction using approximated Singular Value
- Decomposition of the data and keeping only the most significant
- singular vectors to project the data to a lower dimensional space.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- n_components : int, optional
- Maximum number of components to keep. When not given or None, this
- is set to n_features (the second dimension of the training data).
-
- copy : bool
- If False, data passed to fit are overwritten and running
- fit(X).transform(X) will not yield the expected results;
- use fit_transform(X) instead.
-
- iterated_power : int, default=2
- Number of iterations for the power method.
-
- .. versionchanged:: 0.18
-
- whiten : bool, optional
- When True (False by default) the `components_` vectors are multiplied
- by the square root of (n_samples) and divided by the singular values to
- ensure uncorrelated outputs with unit component-wise variances.
-
- Whitening will remove some information from the transformed signal
- (the relative variance scales of the components) but can sometimes
- improve the predictive accuracy of the downstream estimators by
- making their data respect some hard-wired assumptions.
-
- random_state : int, RandomState instance or None, optional, default=None
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- Attributes
- ----------
- components_ : array, shape (n_components, n_features)
- Components with maximum variance.
-
- explained_variance_ratio_ : array, shape (n_components,)
- Percentage of variance explained by each of the selected components.
- If n_components is not set then all components are stored and the sum of explained
- variances is equal to 1.0.
-
- singular_values_ : array, shape (n_components,)
- The singular values corresponding to each of the selected components.
- The singular values are equal to the 2-norms of the ``n_components``
- variables in the lower-dimensional space.
-
- mean_ : array, shape (n_features,)
- Per-feature empirical mean, estimated from the training set.
-
- Examples
- --------
- >>> import numpy as np
- >>> from sklearn.decomposition import RandomizedPCA
- >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
- >>> pca = RandomizedPCA(n_components=2)
- >>> pca.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- RandomizedPCA(copy=True, iterated_power=2, n_components=2,
- random_state=None, whiten=False)
- >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
- [ 0.99244... 0.00755...]
- >>> print(pca.singular_values_) # doctest: +ELLIPSIS
- [ 6.30061... 0.54980...]
-
- See also
- --------
- PCA
- TruncatedSVD
-
- References
- ----------
-
- .. [Halko2009] `Finding structure with randomness: Stochastic algorithms
- for constructing approximate matrix decompositions Halko, et al., 2009
- (arXiv:0909.4061)`
-
- .. [MRT] `A randomized algorithm for the decomposition of matrices
- Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert`
-
- """
-
- def __init__(self, n_components=None, copy=True, iterated_power=2,
- whiten=False, random_state=None):
- self.n_components = n_components
- self.copy = copy
- self.iterated_power = iterated_power
- self.whiten = whiten
- self.random_state = random_state
-
- def fit(self, X, y=None):
- """Fit the model with X by extracting the first principal components.
-
- Parameters
- ----------
- X : array-like, shape (n_samples, n_features)
- Training data, where n_samples is the number of samples
- and n_features is the number of features.
-
- y : Ignored
-
- Returns
- -------
- self : object
- Returns the instance itself.
- """
- self._fit(check_array(X))
- return self
-
- def _fit(self, X):
- """Fit the model to the data X.
-
- Parameters
- ----------
- X : array-like, shape (n_samples, n_features)
- Training vector, where n_samples is the number of samples and
- n_features is the number of features.
-
- Returns
- -------
- X : ndarray, shape (n_samples, n_features)
- The input data, copied, centered and whitened when requested.
- """
- random_state = check_random_state(self.random_state)
- X = np.atleast_2d(as_float_array(X, copy=self.copy))
-
- n_samples = X.shape[0]
-
- # Center data
- self.mean_ = np.mean(X, axis=0)
- X -= self.mean_
- if self.n_components is None:
- n_components = X.shape[1]
- else:
- n_components = self.n_components
-
- U, S, V = randomized_svd(X, n_components,
- n_iter=self.iterated_power,
- random_state=random_state)
-
- self.explained_variance_ = exp_var = (S ** 2) / (n_samples - 1)
- full_var = np.var(X, ddof=1, axis=0).sum()
- self.explained_variance_ratio_ = exp_var / full_var
- self.singular_values_ = S # Store the singular values.
-
- if self.whiten:
- self.components_ = V / S[:, np.newaxis] * sqrt(n_samples)
- else:
- self.components_ = V
-
- return X
-
- def transform(self, X):
- """Apply dimensionality reduction on X.
-
- X is projected on the first principal components previously extracted
- from a training set.
-
- Parameters
- ----------
- X : array-like, shape (n_samples, n_features)
- New data, where n_samples is the number of samples
- and n_features is the number of features.
-
- Returns
- -------
- X_new : array-like, shape (n_samples, n_components)
-
- """
- check_is_fitted(self, 'mean_')
-
- X = check_array(X)
- if self.mean_ is not None:
- X = X - self.mean_
-
- X = np.dot(X, self.components_.T)
- return X
-
- def fit_transform(self, X, y=None):
- """Fit the model with X and apply the dimensionality reduction on X.
-
- Parameters
- ----------
- X : array-like, shape (n_samples, n_features)
- New data, where n_samples is the number of samples
- and n_features is the number of features.
-
- y : Ignored
-
- Returns
- -------
- X_new : array-like, shape (n_samples, n_components)
-
- """
- X = check_array(X)
- X = self._fit(X)
- return np.dot(X, self.components_.T)
-
- def inverse_transform(self, X):
- """Transform data back to its original space.
-
- Returns an array X_original whose transform would be X.
-
- Parameters
- ----------
- X : array-like, shape (n_samples, n_components)
- New data, where n_samples is the number of samples
- and n_components is the number of components.
-
- Returns
- -------
- X_original : array-like, shape (n_samples, n_features)
-
- Notes
- -----
- If whitening is enabled, inverse_transform does not compute the
- exact inverse operation of transform.
- """
- check_is_fitted(self, 'mean_')
-
- X_original = np.dot(X, self.components_)
- if self.mean_ is not None:
- X_original = X_original + self.mean_
- return X_original
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index f1889d1462d2b..b3cf33a4b2176 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -17,7 +17,6 @@
from sklearn import datasets
from sklearn.decomposition import PCA
-from sklearn.decomposition import RandomizedPCA
from sklearn.decomposition.pca import _assess_dimension_
from sklearn.decomposition.pca import _infer_dimension_
@@ -684,26 +683,6 @@ def test_svd_solver_auto():
assert_array_almost_equal(pca.components_, pca_test.components_)
-def test_deprecation_randomized_pca():
- rng = np.random.RandomState(0)
- X = rng.random_sample((5, 4))
-
- depr_message = ("Class RandomizedPCA is deprecated; RandomizedPCA was "
- "deprecated in 0.18 and will be "
- "removed in 0.20. Use PCA(svd_solver='randomized') "
- "instead. The new implementation DOES NOT store "
- "whiten ``components_``. Apply transform to get them.")
-
- def fit_deprecated(X):
- global Y
- rpca = RandomizedPCA(random_state=0)
- Y = rpca.fit_transform(X)
-
- assert_warns_message(DeprecationWarning, depr_message, fit_deprecated, X)
- Y_pca = PCA(svd_solver='randomized', random_state=0).fit_transform(X)
- assert_array_almost_equal(Y, Y_pca)
-
-
def test_pca_sparse_input():
X = np.random.RandomState(0).rand(5, 4)
X = sp.sparse.csr_matrix(X)
diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py
index 726f9162eb925..268f8479f7a92 100644
--- a/sklearn/decomposition/truncated_svd.py
+++ b/sklearn/decomposition/truncated_svd.py
@@ -100,7 +100,6 @@ class TruncatedSVD(BaseEstimator, TransformerMixin):
See also
--------
PCA
- RandomizedPCA
References
----------
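Per the deprecation message above, the replacement for the removed class is ``PCA`` with ``svd_solver='randomized'``; whitened components are no longer stored, so whitening is applied through ``transform``. A minimal sketch reusing the old docstring's example data::

    import numpy as np
    from sklearn.decomposition import PCA

    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])

    # Old: RandomizedPCA(n_components=2, iterated_power=2, random_state=0)
    pca = PCA(n_components=2, svd_solver='randomized', iterated_power=2,
              random_state=0).fit(X)
    X_new = pca.transform(X)  # whitening, when enabled, happens here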
From a2e40d78eb088f3deff4ce099d0f45f781ce9665 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Fri, 8 Sep 2017 12:17:25 -0400
Subject: [PATCH 09/36] remove references to old GP, GMM and sparse_center_data
Remove mixture/gmm
---
sklearn/linear_model/tests/test_base.py | 74 --
sklearn/mixture/__init__.py | 14 +-
sklearn/mixture/dpgmm.py | 869 ------------------------
sklearn/mixture/gmm.py | 853 -----------------------
sklearn/mixture/tests/test_dpgmm.py | 237 -------
sklearn/mixture/tests/test_gmm.py | 534 ---------------
6 files changed, 1 insertion(+), 2580 deletions(-)
delete mode 100644 sklearn/mixture/dpgmm.py
delete mode 100644 sklearn/mixture/gmm.py
delete mode 100644 sklearn/mixture/tests/test_dpgmm.py
delete mode 100644 sklearn/mixture/tests/test_gmm.py
diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py
index ed53e1fbb4aa5..30e4cfdcced42 100644
--- a/sklearn/linear_model/tests/test_base.py
+++ b/sklearn/linear_model/tests/test_base.py
@@ -6,17 +6,14 @@
import numpy as np
from scipy import sparse
from scipy import linalg
-from itertools import product
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_equal
-from sklearn.utils.testing import ignore_warnings
from sklearn.linear_model.base import LinearRegression
from sklearn.linear_model.base import _preprocess_data
-from sklearn.linear_model.base import sparse_center_data, center_data
from sklearn.linear_model.base import _rescale_data
from sklearn.utils import check_random_state
from sklearn.utils.testing import assert_greater
@@ -402,74 +399,3 @@ def test_rescale_data():
rescaled_y2 = y * np.sqrt(sample_weight)
assert_array_almost_equal(rescaled_X, rescaled_X2)
assert_array_almost_equal(rescaled_y, rescaled_y2)
-
-
-@ignore_warnings # all deprecation warnings
-def test_deprecation_center_data():
- n_samples = 200
- n_features = 2
-
- w = 1.0 + rng.rand(n_samples)
- X = rng.rand(n_samples, n_features)
- y = rng.rand(n_samples)
-
- param_grid = product([True, False], [True, False], [True, False],
- [None, w])
-
- for (fit_intercept, normalize, copy, sample_weight) in param_grid:
-
- XX = X.copy() # such that we can try copy=False as well
-
- X1, y1, X1_mean, X1_var, y1_mean = \
- center_data(XX, y, fit_intercept=fit_intercept,
- normalize=normalize, copy=copy,
- sample_weight=sample_weight)
-
- XX = X.copy()
-
- X2, y2, X2_mean, X2_var, y2_mean = \
- _preprocess_data(XX, y, fit_intercept=fit_intercept,
- normalize=normalize, copy=copy,
- sample_weight=sample_weight)
-
- assert_array_almost_equal(X1, X2)
- assert_array_almost_equal(y1, y2)
- assert_array_almost_equal(X1_mean, X2_mean)
- assert_array_almost_equal(X1_var, X2_var)
- assert_array_almost_equal(y1_mean, y2_mean)
-
- # Sparse cases
- X = sparse.csr_matrix(X)
-
- for (fit_intercept, normalize, copy, sample_weight) in param_grid:
-
- X1, y1, X1_mean, X1_var, y1_mean = \
- center_data(X, y, fit_intercept=fit_intercept, normalize=normalize,
- copy=copy, sample_weight=sample_weight)
-
- X2, y2, X2_mean, X2_var, y2_mean = \
- _preprocess_data(X, y, fit_intercept=fit_intercept,
- normalize=normalize, copy=copy,
- sample_weight=sample_weight, return_mean=False)
-
- assert_array_almost_equal(X1.toarray(), X2.toarray())
- assert_array_almost_equal(y1, y2)
- assert_array_almost_equal(X1_mean, X2_mean)
- assert_array_almost_equal(X1_var, X2_var)
- assert_array_almost_equal(y1_mean, y2_mean)
-
- for (fit_intercept, normalize) in product([True, False], [True, False]):
-
- X1, y1, X1_mean, X1_var, y1_mean = \
- sparse_center_data(X, y, fit_intercept=fit_intercept,
- normalize=normalize)
-
- X2, y2, X2_mean, X2_var, y2_mean = \
- _preprocess_data(X, y, fit_intercept=fit_intercept,
- normalize=normalize, return_mean=True)
-
- assert_array_almost_equal(X1.toarray(), X2.toarray())
- assert_array_almost_equal(y1, y2)
- assert_array_almost_equal(X1_mean, X2_mean)
- assert_array_almost_equal(X1_var, X2_var)
- assert_array_almost_equal(y1_mean, y2_mean)
diff --git a/sklearn/mixture/__init__.py b/sklearn/mixture/__init__.py
index 3622518352cae..08f55802e201e 100644
--- a/sklearn/mixture/__init__.py
+++ b/sklearn/mixture/__init__.py
@@ -2,21 +2,9 @@
The :mod:`sklearn.mixture` module implements mixture modeling algorithms.
"""
-from .gmm import sample_gaussian, log_multivariate_normal_density
-from .gmm import GMM, distribute_covar_matrix_to_match_covariance_type
-from .gmm import _validate_covars
-from .dpgmm import DPGMM, VBGMM
-
from .gaussian_mixture import GaussianMixture
from .bayesian_mixture import BayesianGaussianMixture
-__all__ = ['DPGMM',
- 'GMM',
- 'VBGMM',
- '_validate_covars',
- 'distribute_covar_matrix_to_match_covariance_type',
- 'log_multivariate_normal_density',
- 'sample_gaussian',
- 'GaussianMixture',
+__all__ = ['GaussianMixture',
'BayesianGaussianMixture']
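The estimators kept in ``__all__`` cover the removed ones: ``GaussianMixture`` replaces ``GMM``, while ``BayesianGaussianMixture`` subsumes ``VBGMM`` (``'dirichlet_distribution'`` prior) and ``DPGMM`` (``'dirichlet_process'`` prior). A sketch on toy two-cluster data (values illustrative only)::

    import numpy as np
    from sklearn.mixture import BayesianGaussianMixture, GaussianMixture

    rng = np.random.RandomState(0)
    X = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 5])

    # Old: GMM(n_components=2, covariance_type='full')
    gm = GaussianMixture(n_components=2, covariance_type='full',
                         random_state=0).fit(X)

    # Old: DPGMM(n_components=10, alpha=1.0)
    dp = BayesianGaussianMixture(
        n_components=10,
        weight_concentration_prior_type='dirichlet_process',
        weight_concentration_prior=1.0, random_state=0).fit(X)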
diff --git a/sklearn/mixture/dpgmm.py b/sklearn/mixture/dpgmm.py
deleted file mode 100644
index ddc861b4c19f0..0000000000000
--- a/sklearn/mixture/dpgmm.py
+++ /dev/null
@@ -1,869 +0,0 @@
-"""Bayesian Gaussian Mixture Models and
-Dirichlet Process Gaussian Mixture Models"""
-from __future__ import print_function
-
-# Author: Alexandre Passos (alexandre.tp@gmail.com)
-# Bertrand Thirion
-#
-# Based on mixture.py by:
-# Ron Weiss
-# Fabian Pedregosa
-#
-
-# Important note for the deprecation cleaning of 0.20 :
-# All the function and classes of this file have been deprecated in 0.18.
-# When you remove this file please also remove the related files
-# - 'sklearn/mixture/gmm.py'
-# - 'sklearn/mixture/test_dpgmm.py'
-# - 'sklearn/mixture/test_gmm.py'
-
-import numpy as np
-from scipy.special import digamma as _digamma, gammaln as _gammaln
-from scipy import linalg
-from scipy.linalg import pinvh
-from scipy.spatial.distance import cdist
-
-from ..externals.six.moves import xrange
-from ..utils import check_random_state, check_array, deprecated
-from ..utils.fixes import logsumexp
-from ..utils.extmath import squared_norm, stable_cumsum
-from ..utils.validation import check_is_fitted
-from .. import cluster
-from .gmm import _GMMBase
-
-
-@deprecated("The function digamma is deprecated in 0.18 and "
- "will be removed in 0.20. Use scipy.special.digamma instead.")
-def digamma(x):
- return _digamma(x + np.finfo(np.float32).eps)
-
-
-@deprecated("The function gammaln is deprecated in 0.18 and "
- "will be removed in 0.20. Use scipy.special.gammaln instead.")
-def gammaln(x):
- return _gammaln(x + np.finfo(np.float32).eps)
-
-
-@deprecated("The function log_normalize is deprecated in 0.18 and "
- "will be removed in 0.20.")
-def log_normalize(v, axis=0):
- """Normalized probabilities from unnormalized log-probabilities"""
- v = np.rollaxis(v, axis)
- v = v.copy()
- v -= v.max(axis=0)
- out = logsumexp(v)
- v = np.exp(v - out)
- v += np.finfo(np.float32).eps
- v /= np.sum(v, axis=0)
- return np.swapaxes(v, 0, axis)
-
-
-@deprecated("The function wishart_log_det is deprecated in 0.18 and "
- "will be removed in 0.20.")
-def wishart_log_det(a, b, detB, n_features):
- """Expected value of the log of the determinant of a Wishart
-
- The expected value of the logarithm of the determinant of a
- wishart-distributed random variable with the specified parameters."""
- l = np.sum(digamma(0.5 * (a - np.arange(-1, n_features - 1))))
- l += n_features * np.log(2)
- return l + detB
-
-
-@deprecated("The function wishart_logz is deprecated in 0.18 and "
- "will be removed in 0.20.")
-def wishart_logz(v, s, dets, n_features):
- "The logarithm of the normalization constant for the wishart distribution"
- z = 0.
- z += 0.5 * v * n_features * np.log(2)
- z += (0.25 * (n_features * (n_features - 1)) * np.log(np.pi))
- z += 0.5 * v * np.log(dets)
- z += np.sum(gammaln(0.5 * (v - np.arange(n_features) + 1)))
- return z
-
-
-def _bound_wishart(a, B, detB):
- """Returns a function of the dof, scale matrix and its determinant
- used as an upper bound in variational approximation of the evidence"""
- n_features = B.shape[0]
- logprior = wishart_logz(a, B, detB, n_features)
- logprior -= wishart_logz(n_features,
- np.identity(n_features),
- 1, n_features)
- logprior += 0.5 * (a - 1) * wishart_log_det(a, B, detB, n_features)
- logprior += 0.5 * a * np.trace(B)
- return logprior
-
-
-##############################################################################
-# Variational bound on the log likelihood of each class
-##############################################################################
-
-
-def _sym_quad_form(x, mu, A):
- """helper function to calculate symmetric quadratic form x.T * A * x"""
- q = (cdist(x, mu[np.newaxis], "mahalanobis", VI=A) ** 2).reshape(-1)
- return q
-
-
-def _bound_state_log_lik(X, initial_bound, precs, means, covariance_type):
- """Update the bound with likelihood terms, for standard covariance types"""
- n_components, n_features = means.shape
- n_samples = X.shape[0]
- bound = np.empty((n_samples, n_components))
- bound[:] = initial_bound
- if covariance_type in ['diag', 'spherical']:
- for k in range(n_components):
- d = X - means[k]
- bound[:, k] -= 0.5 * np.sum(d * d * precs[k], axis=1)
- elif covariance_type == 'tied':
- for k in range(n_components):
- bound[:, k] -= 0.5 * _sym_quad_form(X, means[k], precs)
- elif covariance_type == 'full':
- for k in range(n_components):
- bound[:, k] -= 0.5 * _sym_quad_form(X, means[k], precs[k])
- return bound
-
-
-class _DPGMMBase(_GMMBase):
- """Variational Inference for the Infinite Gaussian Mixture Model.
-
- DPGMM stands for Dirichlet Process Gaussian Mixture Model, and it
- is an infinite mixture model with the Dirichlet Process as a prior
- distribution on the number of clusters. In practice the
- approximate inference algorithm uses a truncated distribution with
- a fixed maximum number of components, but almost always the number
- of components actually used depends on the data.
-
- Stick-breaking Representation of a Gaussian mixture model
- probability distribution. This class allows for easy and efficient
- inference of an approximate posterior distribution over the
- parameters of a Gaussian mixture model with a variable number of
- components (smaller than the truncation parameter n_components).
-
- Initialization is with normally-distributed means and identity
- covariance, for proper convergence.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- n_components : int, default 1
- Number of mixture components.
-
- covariance_type : string, default 'diag'
- String describing the type of covariance parameters to
- use. Must be one of 'spherical', 'tied', 'diag', 'full'.
-
- alpha : float, default 1
- Real number representing the concentration parameter of
- the dirichlet process. Intuitively, the Dirichlet Process
- is as likely to start a new cluster for a point as it is
- to add that point to a cluster with alpha elements. A
- higher alpha means more clusters, as the expected number
- of clusters is ``alpha*log(N)``.
-
- tol : float, default 1e-3
- Convergence threshold.
-
- n_iter : int, default 10
- Maximum number of iterations to perform before convergence.
-
- params : string, default 'wmc'
- Controls which parameters are updated in the training
- process. Can contain any combination of 'w' for weights,
- 'm' for means, and 'c' for covars.
-
- init_params : string, default 'wmc'
- Controls which parameters are updated in the initialization
- process. Can contain any combination of 'w' for weights,
- 'm' for means, and 'c' for covars. Defaults to 'wmc'.
-
- verbose : int, default 0
- Controls output verbosity.
-
- Attributes
- ----------
- covariance_type : string
- String describing the type of covariance parameters used by
- the DP-GMM. Must be one of 'spherical', 'tied', 'diag', 'full'.
-
- n_components : int
- Number of mixture components.
-
- weights_ : array, shape (`n_components`,)
- Mixing weights for each mixture component.
-
- means_ : array, shape (`n_components`, `n_features`)
- Mean parameters for each mixture component.
-
- precs_ : array
- Precision (inverse covariance) parameters for each mixture
- component. The shape depends on `covariance_type`::
-
- (`n_components`, 'n_features') if 'spherical',
- (`n_features`, `n_features`) if 'tied',
- (`n_components`, `n_features`) if 'diag',
- (`n_components`, `n_features`, `n_features`) if 'full'
-
- converged_ : bool
- True when convergence was reached in fit(), False otherwise.
-
- See Also
- --------
- GMM : Finite Gaussian mixture model fit with EM
-
- VBGMM : Finite Gaussian mixture model fit with a variational
- algorithm, better for situations where there might be too little
- data to get a good estimate of the covariance matrix.
- """
- def __init__(self, n_components=1, covariance_type='diag', alpha=1.0,
- random_state=None, tol=1e-3, verbose=0, min_covar=None,
- n_iter=10, params='wmc', init_params='wmc'):
- self.alpha = alpha
- super(_DPGMMBase, self).__init__(n_components, covariance_type,
- random_state=random_state,
- tol=tol, min_covar=min_covar,
- n_iter=n_iter, params=params,
- init_params=init_params,
- verbose=verbose)
-
- def _get_precisions(self):
- """Return precisions as a full matrix."""
- if self.covariance_type == 'full':
- return self.precs_
- elif self.covariance_type in ['diag', 'spherical']:
- return [np.diag(cov) for cov in self.precs_]
- elif self.covariance_type == 'tied':
- return [self.precs_] * self.n_components
-
- def _get_covars(self):
- return [pinvh(c) for c in self._get_precisions()]
-
- def _set_covars(self, covars):
- raise NotImplementedError("""The variational algorithm does
- not support setting the covariance parameters.""")
-
- def score_samples(self, X):
- """Return the likelihood of the data under the model.
-
- Compute the bound on log probability of X under the model
- and return the posterior distribution (responsibilities) of
- each mixture component for each element of X.
-
- This is done by computing the parameters for the mean-field of
- z for each observation.
-
- Parameters
- ----------
- X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- logprob : array_like, shape (n_samples,)
- Log probabilities of each data point in X
- responsibilities : array_like, shape (n_samples, n_components)
- Posterior probabilities of each mixture component for each
- observation
- """
- check_is_fitted(self, 'gamma_')
-
- X = check_array(X)
- if X.ndim == 1:
- X = X[:, np.newaxis]
- sd = digamma(self.gamma_.T[1] + self.gamma_.T[2])
- dgamma1 = digamma(self.gamma_.T[1]) - sd
- dgamma2 = np.zeros(self.n_components)
- dgamma2[0] = digamma(self.gamma_[0, 2]) - digamma(self.gamma_[0, 1] +
- self.gamma_[0, 2])
- for j in range(1, self.n_components):
- dgamma2[j] = dgamma2[j - 1] + digamma(self.gamma_[j - 1, 2])
- dgamma2[j] -= sd[j - 1]
- dgamma = dgamma1 + dgamma2
- # Free memory and developers' cognitive load:
- del dgamma1, dgamma2, sd
-
- if self.covariance_type not in ['full', 'tied', 'diag', 'spherical']:
- raise NotImplementedError("This ctype is not implemented: %s"
- % self.covariance_type)
- p = _bound_state_log_lik(X, self._initial_bound + self.bound_prec_,
- self.precs_, self.means_,
- self.covariance_type)
- z = p + dgamma
- z = log_normalize(z, axis=-1)
- bound = np.sum(z * p, axis=-1)
- return bound, z
-
- def _update_concentration(self, z):
- """Update the concentration parameters for each cluster"""
- sz = np.sum(z, axis=0)
- self.gamma_.T[1] = 1. + sz
- self.gamma_.T[2].fill(0)
- for i in range(self.n_components - 2, -1, -1):
- self.gamma_[i, 2] = self.gamma_[i + 1, 2] + sz[i]
- self.gamma_.T[2] += self.alpha
-
- def _update_means(self, X, z):
- """Update the variational distributions for the means"""
- n_features = X.shape[1]
- for k in range(self.n_components):
- if self.covariance_type in ['spherical', 'diag']:
- num = np.sum(z.T[k].reshape((-1, 1)) * X, axis=0)
- num *= self.precs_[k]
- den = 1. + self.precs_[k] * np.sum(z.T[k])
- self.means_[k] = num / den
- elif self.covariance_type in ['tied', 'full']:
- if self.covariance_type == 'tied':
- cov = self.precs_
- else:
- cov = self.precs_[k]
- den = np.identity(n_features) + cov * np.sum(z.T[k])
- num = np.sum(z.T[k].reshape((-1, 1)) * X, axis=0)
- num = np.dot(cov, num)
- self.means_[k] = linalg.lstsq(den, num)[0]
-
- def _update_precisions(self, X, z):
- """Update the variational distributions for the precisions"""
- n_features = X.shape[1]
- if self.covariance_type == 'spherical':
- self.dof_ = 0.5 * n_features * np.sum(z, axis=0)
- for k in range(self.n_components):
- # could be more memory efficient ?
- sq_diff = np.sum((X - self.means_[k]) ** 2, axis=1)
- self.scale_[k] = 1.
- self.scale_[k] += 0.5 * np.sum(z.T[k] * (sq_diff + n_features))
- self.bound_prec_[k] = (
- 0.5 * n_features * (
- digamma(self.dof_[k]) - np.log(self.scale_[k])))
- self.precs_ = np.tile(self.dof_ / self.scale_, [n_features, 1]).T
-
- elif self.covariance_type == 'diag':
- for k in range(self.n_components):
- self.dof_[k].fill(1. + 0.5 * np.sum(z.T[k], axis=0))
- sq_diff = (X - self.means_[k]) ** 2 # see comment above
- self.scale_[k] = np.ones(n_features) + 0.5 * np.dot(
- z.T[k], (sq_diff + 1))
- self.precs_[k] = self.dof_[k] / self.scale_[k]
- self.bound_prec_[k] = 0.5 * np.sum(digamma(self.dof_[k])
- - np.log(self.scale_[k]))
- self.bound_prec_[k] -= 0.5 * np.sum(self.precs_[k])
-
- elif self.covariance_type == 'tied':
- self.dof_ = 2 + X.shape[0] + n_features
- self.scale_ = (X.shape[0] + 1) * np.identity(n_features)
- for k in range(self.n_components):
- diff = X - self.means_[k]
- self.scale_ += np.dot(diff.T, z[:, k:k + 1] * diff)
- self.scale_ = pinvh(self.scale_)
- self.precs_ = self.dof_ * self.scale_
- self.det_scale_ = linalg.det(self.scale_)
- self.bound_prec_ = 0.5 * wishart_log_det(
- self.dof_, self.scale_, self.det_scale_, n_features)
- self.bound_prec_ -= 0.5 * self.dof_ * np.trace(self.scale_)
-
- elif self.covariance_type == 'full':
- for k in range(self.n_components):
- sum_resp = np.sum(z.T[k])
- self.dof_[k] = 2 + sum_resp + n_features
- self.scale_[k] = (sum_resp + 1) * np.identity(n_features)
- diff = X - self.means_[k]
- self.scale_[k] += np.dot(diff.T, z[:, k:k + 1] * diff)
- self.scale_[k] = pinvh(self.scale_[k])
- self.precs_[k] = self.dof_[k] * self.scale_[k]
- self.det_scale_[k] = linalg.det(self.scale_[k])
- self.bound_prec_[k] = 0.5 * wishart_log_det(
- self.dof_[k], self.scale_[k], self.det_scale_[k],
- n_features)
- self.bound_prec_[k] -= 0.5 * self.dof_[k] * np.trace(
- self.scale_[k])
-
- def _monitor(self, X, z, n, end=False):
- """Monitor the lower bound during iteration
-
- Debug method to help see exactly when it is failing to converge as
- expected.
-
- Note: this is very expensive and should not be used by default."""
- if self.verbose > 0:
- print("Bound after updating %8s: %f" % (n, self.lower_bound(X, z)))
- if end:
- print("Cluster proportions:", self.gamma_.T[1])
- print("covariance_type:", self.covariance_type)
-
- def _do_mstep(self, X, z, params):
- """Maximize the variational lower bound
-
- Update each of the parameters to maximize the lower bound."""
- self._monitor(X, z, "z")
- self._update_concentration(z)
- self._monitor(X, z, "gamma")
- if 'm' in params:
- self._update_means(X, z)
- self._monitor(X, z, "mu")
- if 'c' in params:
- self._update_precisions(X, z)
- self._monitor(X, z, "a and b", end=True)
-
- def _initialize_gamma(self):
- "Initializes the concentration parameters"
- self.gamma_ = self.alpha * np.ones((self.n_components, 3))
-
- def _bound_concentration(self):
- """The variational lower bound for the concentration parameter."""
- logprior = gammaln(self.alpha) * self.n_components
- logprior += np.sum((self.alpha - 1) * (
- digamma(self.gamma_.T[2]) - digamma(self.gamma_.T[1] +
- self.gamma_.T[2])))
- logprior += np.sum(- gammaln(self.gamma_.T[1] + self.gamma_.T[2]))
- logprior += np.sum(gammaln(self.gamma_.T[1]) +
- gammaln(self.gamma_.T[2]))
- logprior -= np.sum((self.gamma_.T[1] - 1) * (
- digamma(self.gamma_.T[1]) - digamma(self.gamma_.T[1] +
- self.gamma_.T[2])))
- logprior -= np.sum((self.gamma_.T[2] - 1) * (
- digamma(self.gamma_.T[2]) - digamma(self.gamma_.T[1] +
- self.gamma_.T[2])))
- return logprior
-
- def _bound_means(self):
- "The variational lower bound for the mean parameters"
- logprior = 0.
- logprior -= 0.5 * squared_norm(self.means_)
- logprior -= 0.5 * self.means_.shape[1] * self.n_components
- return logprior
-
- def _bound_precisions(self):
- """Returns the bound term related to precisions"""
- logprior = 0.
- if self.covariance_type == 'spherical':
- logprior += np.sum(gammaln(self.dof_))
- logprior -= np.sum(
- (self.dof_ - 1) * digamma(np.maximum(0.5, self.dof_)))
- logprior += np.sum(- np.log(self.scale_) + self.dof_
- - self.precs_[:, 0])
- elif self.covariance_type == 'diag':
- logprior += np.sum(gammaln(self.dof_))
- logprior -= np.sum(
- (self.dof_ - 1) * digamma(np.maximum(0.5, self.dof_)))
- logprior += np.sum(- np.log(self.scale_) + self.dof_ - self.precs_)
- elif self.covariance_type == 'tied':
- logprior += _bound_wishart(self.dof_, self.scale_, self.det_scale_)
- elif self.covariance_type == 'full':
- for k in range(self.n_components):
- logprior += _bound_wishart(self.dof_[k],
- self.scale_[k],
- self.det_scale_[k])
- return logprior
-
- def _bound_proportions(self, z):
- """Returns the bound term related to proportions"""
- dg12 = digamma(self.gamma_.T[1] + self.gamma_.T[2])
- dg1 = digamma(self.gamma_.T[1]) - dg12
- dg2 = digamma(self.gamma_.T[2]) - dg12
-
- cz = stable_cumsum(z[:, ::-1], axis=-1)[:, -2::-1]
- logprior = np.sum(cz * dg2[:-1]) + np.sum(z * dg1)
- del cz # Save memory
- z_non_zeros = z[z > np.finfo(np.float32).eps]
- logprior -= np.sum(z_non_zeros * np.log(z_non_zeros))
- return logprior
-
- def _logprior(self, z):
- logprior = self._bound_concentration()
- logprior += self._bound_means()
- logprior += self._bound_precisions()
- logprior += self._bound_proportions(z)
- return logprior
-
- def lower_bound(self, X, z):
- """returns a lower bound on model evidence based on X and membership"""
- check_is_fitted(self, 'means_')
-
- if self.covariance_type not in ['full', 'tied', 'diag', 'spherical']:
- raise NotImplementedError("This ctype is not implemented: %s"
- % self.covariance_type)
- X = np.asarray(X)
- if X.ndim == 1:
- X = X[:, np.newaxis]
- c = np.sum(z * _bound_state_log_lik(X, self._initial_bound +
- self.bound_prec_, self.precs_,
- self.means_, self.covariance_type))
-
- return c + self._logprior(z)
-
- def _set_weights(self):
-        for i in range(self.n_components):
- self.weights_[i] = self.gamma_[i, 1] / (self.gamma_[i, 1]
- + self.gamma_[i, 2])
- self.weights_ /= np.sum(self.weights_)
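# Illustrative aside (not part of the deleted file): _set_weights above turns
# the Beta posterior parameters in columns 1 and 2 of gamma_ into mixing
# weights by taking each component's Beta mean and renormalizing. The gamma
# values below are made up.
import numpy as np

gamma = np.array([[1., 5., 1.],
                  [1., 3., 2.],
                  [1., 1., 6.]])
weights = gamma[:, 1] / (gamma[:, 1] + gamma[:, 2])
weights /= weights.sum()  # mirrors the renormalization in _set_weights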
-
- def _fit(self, X, y=None):
- """Estimate model parameters with the variational
- algorithm.
-
- For a full derivation and description of the algorithm see
- doc/modules/dp-derivation.rst
- or
- http://scikit-learn.org/stable/modules/dp-derivation.html
-
-        An initialization step is performed before entering the EM
-        algorithm. If you want to avoid this step, set the keyword
-        argument init_params to the empty string '' when creating
-        the object. Likewise, if you would just like to do an
-        initialization, set n_iter=0.
-
- Parameters
- ----------
-        X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- responsibilities : array, shape (n_samples, n_components)
- Posterior probabilities of each mixture component for each
- observation.
- """
- self.random_state_ = check_random_state(self.random_state)
-
- # initialization step
- X = check_array(X)
- if X.ndim == 1:
- X = X[:, np.newaxis]
-
- n_samples, n_features = X.shape
- z = np.ones((n_samples, self.n_components))
- z /= self.n_components
-
- self._initial_bound = - 0.5 * n_features * np.log(2 * np.pi)
- self._initial_bound -= np.log(2 * np.pi * np.e)
-
- if (self.init_params != '') or not hasattr(self, 'gamma_'):
- self._initialize_gamma()
-
- if 'm' in self.init_params or not hasattr(self, 'means_'):
- self.means_ = cluster.KMeans(
- n_clusters=self.n_components,
- random_state=self.random_state_).fit(X).cluster_centers_[::-1]
-
- if 'w' in self.init_params or not hasattr(self, 'weights_'):
- self.weights_ = np.tile(1.0 / self.n_components, self.n_components)
-
- if 'c' in self.init_params or not hasattr(self, 'precs_'):
- if self.covariance_type == 'spherical':
- self.dof_ = np.ones(self.n_components)
- self.scale_ = np.ones(self.n_components)
- self.precs_ = np.ones((self.n_components, n_features))
- self.bound_prec_ = 0.5 * n_features * (
- digamma(self.dof_) - np.log(self.scale_))
- elif self.covariance_type == 'diag':
- self.dof_ = 1 + 0.5 * n_features
- self.dof_ *= np.ones((self.n_components, n_features))
- self.scale_ = np.ones((self.n_components, n_features))
- self.precs_ = np.ones((self.n_components, n_features))
- self.bound_prec_ = 0.5 * (np.sum(digamma(self.dof_) -
- np.log(self.scale_), 1))
- self.bound_prec_ -= 0.5 * np.sum(self.precs_, 1)
- elif self.covariance_type == 'tied':
- self.dof_ = 1.
- self.scale_ = np.identity(n_features)
- self.precs_ = np.identity(n_features)
- self.det_scale_ = 1.
- self.bound_prec_ = 0.5 * wishart_log_det(
- self.dof_, self.scale_, self.det_scale_, n_features)
- self.bound_prec_ -= 0.5 * self.dof_ * np.trace(self.scale_)
- elif self.covariance_type == 'full':
- self.dof_ = (1 + self.n_components + n_samples)
- self.dof_ *= np.ones(self.n_components)
- self.scale_ = [2 * np.identity(n_features)
- for _ in range(self.n_components)]
- self.precs_ = [np.identity(n_features)
- for _ in range(self.n_components)]
- self.det_scale_ = np.ones(self.n_components)
- self.bound_prec_ = np.zeros(self.n_components)
- for k in range(self.n_components):
- self.bound_prec_[k] = wishart_log_det(
- self.dof_[k], self.scale_[k], self.det_scale_[k],
- n_features)
- self.bound_prec_[k] -= (self.dof_[k] *
- np.trace(self.scale_[k]))
- self.bound_prec_ *= 0.5
-
- # EM algorithms
- current_log_likelihood = None
- # reset self.converged_ to False
- self.converged_ = False
-
- for i in range(self.n_iter):
- prev_log_likelihood = current_log_likelihood
- # Expectation step
- curr_logprob, z = self.score_samples(X)
-
- current_log_likelihood = (
- curr_logprob.mean() + self._logprior(z) / n_samples)
-
- # Check for convergence.
- if prev_log_likelihood is not None:
- change = abs(current_log_likelihood - prev_log_likelihood)
- if change < self.tol:
- self.converged_ = True
- break
-
- # Maximization step
- self._do_mstep(X, z, self.params)
-
- if self.n_iter == 0:
- # Need to make sure that there is a z value to output
- # Output zeros because it was just a quick initialization
- z = np.zeros((X.shape[0], self.n_components))
-
- self._set_weights()
-
- return z
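# Hedged usage sketch for the init/iteration switches documented in the _fit
# docstring above, mirroring the pattern used by GMMTester.test_train in the
# deleted tests below (runs on scikit-learn 0.18/0.19, with a
# DeprecationWarning):
import numpy as np
from sklearn.mixture import DPGMM

X = np.random.RandomState(0).randn(100, 2)
dpgmm = DPGMM(n_components=5, n_iter=10)
dpgmm.fit(X)            # regular fit, including the initialization step
dpgmm.init_params = ''  # keep the fitted parameters on the next call
dpgmm.fit(X)            # continue from the previous solution
init_only = DPGMM(n_components=5, n_iter=0).fit(X)  # initialization only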
-
-
-@deprecated("The `DPGMM` class is not working correctly and it's better "
- "to use `sklearn.mixture.BayesianGaussianMixture` class with "
- "parameter `weight_concentration_prior_type='dirichlet_process'` "
- "instead. DPGMM is deprecated in 0.18 and will be "
- "removed in 0.20.")
-class DPGMM(_DPGMMBase):
- """Dirichlet Process Gaussian Mixture Models
-
- .. deprecated:: 0.18
- This class will be removed in 0.20.
- Use :class:`sklearn.mixture.BayesianGaussianMixture` with
- parameter ``weight_concentration_prior_type='dirichlet_process'``
- instead.
-
- """
-
- def __init__(self, n_components=1, covariance_type='diag', alpha=1.0,
- random_state=None, tol=1e-3, verbose=0, min_covar=None,
- n_iter=10, params='wmc', init_params='wmc'):
- super(DPGMM, self).__init__(
- n_components=n_components, covariance_type=covariance_type,
- alpha=alpha, random_state=random_state, tol=tol, verbose=verbose,
- min_covar=min_covar, n_iter=n_iter, params=params,
- init_params=init_params)
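# The replacement recommended by the deprecation message above, sketched with
# the current API (all parameters other than the prior type left at their
# defaults):
from sklearn.mixture import BayesianGaussianMixture

bgm = BayesianGaussianMixture(
    n_components=5, weight_concentration_prior_type='dirichlet_process')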
-
-
-@deprecated("The `VBGMM` class is not working correctly and it's better "
- "to use `sklearn.mixture.BayesianGaussianMixture` class with "
- "parameter `weight_concentration_prior_type="
- "'dirichlet_distribution'` instead. "
- "VBGMM is deprecated in 0.18 and will be removed in 0.20.")
-class VBGMM(_DPGMMBase):
- """Variational Inference for the Gaussian Mixture Model
-
- .. deprecated:: 0.18
- This class will be removed in 0.20.
- Use :class:`sklearn.mixture.BayesianGaussianMixture` with parameter
- ``weight_concentration_prior_type='dirichlet_distribution'`` instead.
-
- Variational inference for a Gaussian mixture model probability
- distribution. This class allows for easy and efficient inference
- of an approximate posterior distribution over the parameters of a
- Gaussian mixture model with a fixed number of components.
-
-    The model is initialized with normally-distributed means and
-    identity covariance, for proper convergence.
-
-    Read more in the :ref:`User Guide <vbgmm>`.
-
- Parameters
- ----------
- n_components : int, default 1
- Number of mixture components.
-
- covariance_type : string, default 'diag'
- String describing the type of covariance parameters to
- use. Must be one of 'spherical', 'tied', 'diag', 'full'.
-
- alpha : float, default 1
-        Real number representing the concentration parameter of
-        the Dirichlet distribution. Intuitively, the higher the
-        value of alpha, the more likely it is that the variational
-        mixture of Gaussians model will use all the components it can.
-
- tol : float, default 1e-3
- Convergence threshold.
-
- n_iter : int, default 10
- Maximum number of iterations to perform before convergence.
-
- params : string, default 'wmc'
- Controls which parameters are updated in the training
- process. Can contain any combination of 'w' for weights,
- 'm' for means, and 'c' for covars.
-
- init_params : string, default 'wmc'
- Controls which parameters are updated in the initialization
- process. Can contain any combination of 'w' for weights,
- 'm' for means, and 'c' for covars. Defaults to 'wmc'.
-
- verbose : int, default 0
- Controls output verbosity.
-
- Attributes
- ----------
- covariance_type : string
-        String describing the type of covariance parameters used by
-        the model. Must be one of 'spherical', 'tied', 'diag', 'full'.
-
- n_features : int
- Dimensionality of the Gaussians.
-
- n_components : int (read-only)
- Number of mixture components.
-
- weights_ : array, shape (`n_components`,)
- Mixing weights for each mixture component.
-
- means_ : array, shape (`n_components`, `n_features`)
- Mean parameters for each mixture component.
-
- precs_ : array
- Precision (inverse covariance) parameters for each mixture
- component. The shape depends on `covariance_type`::
-
-            (`n_components`, `n_features`) if 'spherical',
- (`n_features`, `n_features`) if 'tied',
- (`n_components`, `n_features`) if 'diag',
- (`n_components`, `n_features`, `n_features`) if 'full'
-
- converged_ : bool
- True when convergence was reached in fit(), False
- otherwise.
-
- See Also
- --------
- GMM : Finite Gaussian mixture model fit with EM
- DPGMM : Infinite Gaussian mixture model, using the dirichlet
- process, fit with a variational algorithm
- """
-
- def __init__(self, n_components=1, covariance_type='diag', alpha=1.0,
- random_state=None, tol=1e-3, verbose=0,
- min_covar=None, n_iter=10, params='wmc', init_params='wmc'):
- super(VBGMM, self).__init__(
- n_components, covariance_type, random_state=random_state,
- tol=tol, verbose=verbose, min_covar=min_covar,
- n_iter=n_iter, params=params, init_params=init_params)
- self.alpha = alpha
-
- def _fit(self, X, y=None):
- """Estimate model parameters with the variational algorithm.
-
- For a full derivation and description of the algorithm see
- doc/modules/dp-derivation.rst
- or
- http://scikit-learn.org/stable/modules/dp-derivation.html
-
-        An initialization step is performed before entering the EM
-        algorithm. If you want to avoid this step, set the keyword
-        argument init_params to the empty string '' when creating
-        the object. Likewise, if you would just like to do an
-        initialization, set n_iter=0.
-
- Parameters
- ----------
-        X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- responsibilities : array, shape (n_samples, n_components)
- Posterior probabilities of each mixture component for each
- observation.
- """
- self.alpha_ = float(self.alpha) / self.n_components
- return super(VBGMM, self)._fit(X, y)
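# The alpha scaling above is pinned down by test_vbgmm_no_modify_alpha in the
# deleted tests below: fit() leaves the `alpha` constructor argument untouched
# and stores the scaled value in `alpha_`. A minimal sketch (0.18/0.19):
import numpy as np
from sklearn.mixture import VBGMM

X = np.random.RandomState(1).randn(100, 2)
vbgmm = VBGMM(n_components=3, alpha=2.0, n_iter=1).fit(X)
assert vbgmm.alpha == 2.0
assert vbgmm.alpha_ == 2.0 / 3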
-
- def score_samples(self, X):
- """Return the likelihood of the data under the model.
-
- Compute the bound on log probability of X under the model
- and return the posterior distribution (responsibilities) of
- each mixture component for each element of X.
-
- This is done by computing the parameters for the mean-field of
- z for each observation.
-
- Parameters
- ----------
- X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- logprob : array_like, shape (n_samples,)
- Log probabilities of each data point in X
- responsibilities : array_like, shape (n_samples, n_components)
- Posterior probabilities of each mixture component for each
- observation
- """
- check_is_fitted(self, 'gamma_')
-
- X = check_array(X)
- if X.ndim == 1:
- X = X[:, np.newaxis]
- dg = digamma(self.gamma_) - digamma(np.sum(self.gamma_))
-
- if self.covariance_type not in ['full', 'tied', 'diag', 'spherical']:
- raise NotImplementedError("This ctype is not implemented: %s"
- % self.covariance_type)
- p = _bound_state_log_lik(X, self._initial_bound + self.bound_prec_,
- self.precs_, self.means_,
- self.covariance_type)
-
- z = p + dg
- z = log_normalize(z, axis=-1)
- bound = np.sum(z * p, axis=-1)
- return bound, z
-
- def _update_concentration(self, z):
- for i in range(self.n_components):
- self.gamma_[i] = self.alpha_ + np.sum(z.T[i])
-
- def _initialize_gamma(self):
- self.gamma_ = self.alpha_ * np.ones(self.n_components)
-
- def _bound_proportions(self, z):
- logprior = 0.
- dg = digamma(self.gamma_)
- dg -= digamma(np.sum(self.gamma_))
- logprior += np.sum(dg.reshape((-1, 1)) * z.T)
- z_non_zeros = z[z > np.finfo(np.float32).eps]
- logprior -= np.sum(z_non_zeros * np.log(z_non_zeros))
- return logprior
-
- def _bound_concentration(self):
- logprior = gammaln(np.sum(self.gamma_)) - gammaln(self.n_components
- * self.alpha_)
- logprior -= np.sum(gammaln(self.gamma_) - gammaln(self.alpha_))
- sg = digamma(np.sum(self.gamma_))
- logprior += np.sum((self.gamma_ - self.alpha_)
- * (digamma(self.gamma_) - sg))
- return logprior
-
- def _monitor(self, X, z, n, end=False):
- """Monitor the lower bound during iteration
-
- Debug method to help see exactly when it is failing to converge as
- expected.
-
- Note: this is very expensive and should not be used by default."""
- if self.verbose > 0:
- print("Bound after updating %8s: %f" % (n, self.lower_bound(X, z)))
- if end:
- print("Cluster proportions:", self.gamma_)
- print("covariance_type:", self.covariance_type)
-
- def _set_weights(self):
- self.weights_[:] = self.gamma_
- self.weights_ /= np.sum(self.weights_)
diff --git a/sklearn/mixture/gmm.py b/sklearn/mixture/gmm.py
deleted file mode 100644
index 207eff9f1502a..0000000000000
--- a/sklearn/mixture/gmm.py
+++ /dev/null
@@ -1,853 +0,0 @@
-"""
-Gaussian Mixture Models.
-
-This implementation corresponds to frequentist (non-Bayesian) formulation
-of Gaussian Mixture Models.
-"""
-
-# Author: Ron Weiss
-# Fabian Pedregosa
-# Bertrand Thirion
-
-# Important note for the deprecation cleaning of 0.20:
-# All the functions and classes of this file have been deprecated in 0.18.
-# When you remove this file please also remove the related files
-# - 'sklearn/mixture/dpgmm.py'
-# - 'sklearn/mixture/tests/test_dpgmm.py'
-# - 'sklearn/mixture/tests/test_gmm.py'
-from time import time
-
-import numpy as np
-from scipy import linalg
-
-from ..base import BaseEstimator
-from ..utils import check_random_state, check_array, deprecated
-from ..utils.fixes import logsumexp
-from ..utils.validation import check_is_fitted
-from .. import cluster
-
-from sklearn.externals.six.moves import zip
-
-EPS = np.finfo(float).eps
-
-
-@deprecated("The function log_multivariate_normal_density is deprecated in 0.18"
- " and will be removed in 0.20.")
-def log_multivariate_normal_density(X, means, covars, covariance_type='diag'):
- """Compute the log probability under a multivariate Gaussian distribution.
-
- Parameters
- ----------
- X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row corresponds to a
- single data point.
-
- means : array_like, shape (n_components, n_features)
- List of n_features-dimensional mean vectors for n_components Gaussians.
- Each row corresponds to a single mean vector.
-
- covars : array_like
- List of n_components covariance parameters for each Gaussian. The shape
- depends on `covariance_type`:
- (n_components, n_features) if 'spherical',
- (n_features, n_features) if 'tied',
- (n_components, n_features) if 'diag',
- (n_components, n_features, n_features) if 'full'
-
- covariance_type : string
- Type of the covariance parameters. Must be one of
- 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'.
-
- Returns
- -------
- lpr : array_like, shape (n_samples, n_components)
- Array containing the log probabilities of each data point in
- X under each of the n_components multivariate Gaussian distributions.
- """
- log_multivariate_normal_density_dict = {
- 'spherical': _log_multivariate_normal_density_spherical,
- 'tied': _log_multivariate_normal_density_tied,
- 'diag': _log_multivariate_normal_density_diag,
- 'full': _log_multivariate_normal_density_full}
- return log_multivariate_normal_density_dict[covariance_type](
- X, means, covars)
-
-
-@deprecated("The function sample_gaussian is deprecated in 0.18"
- " and will be removed in 0.20."
- " Use numpy.random.multivariate_normal instead.")
-def sample_gaussian(mean, covar, covariance_type='diag', n_samples=1,
- random_state=None):
- """Generate random samples from a Gaussian distribution.
-
- Parameters
- ----------
- mean : array_like, shape (n_features,)
- Mean of the distribution.
-
- covar : array_like
- Covariance of the distribution. The shape depends on `covariance_type`:
- scalar if 'spherical',
- (n_features) if 'diag',
- (n_features, n_features) if 'tied', or 'full'
-
- covariance_type : string, optional
- Type of the covariance parameters. Must be one of
- 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'.
-
- n_samples : int, optional
- Number of samples to generate. Defaults to 1.
-
- Returns
- -------
- X : array
- Randomly generated sample. The shape depends on `n_samples`:
- (n_features,) if `1`
- (n_features, n_samples) otherwise
- """
-    return _sample_gaussian(mean, covar, covariance_type=covariance_type,
-                            n_samples=n_samples, random_state=random_state)
-
-
-def _sample_gaussian(mean, covar, covariance_type='diag', n_samples=1,
- random_state=None):
- rng = check_random_state(random_state)
- n_dim = len(mean)
- rand = rng.randn(n_dim, n_samples)
- if n_samples == 1:
- rand.shape = (n_dim,)
-
- if covariance_type == 'spherical':
- rand *= np.sqrt(covar)
- elif covariance_type == 'diag':
- rand = np.dot(np.diag(np.sqrt(covar)), rand)
- else:
- s, U = linalg.eigh(covar)
- s.clip(0, out=s) # get rid of tiny negatives
- np.sqrt(s, out=s)
- U *= s
- rand = np.dot(U, rand)
-
- return (rand.T + mean).T
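# The deprecation message on sample_gaussian above points to numpy instead; a
# minimal equivalent for the 'full' covariance case:
import numpy as np

rng = np.random.RandomState(42)
samples = rng.multivariate_normal(mean=[0.0, 0.0],
                                  cov=[[1.0, 0.3], [0.3, 2.0]],
                                  size=10)  # shape (10, 2)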
-
-
-class _GMMBase(BaseEstimator):
- """Gaussian Mixture Model.
-
- Representation of a Gaussian mixture model probability distribution.
- This class allows for easy evaluation of, sampling from, and
- maximum-likelihood estimation of the parameters of a GMM distribution.
-
- Initializes parameters such that every mixture component has zero
- mean and identity covariance.
-
-    Read more in the :ref:`User Guide <gmm>`.
-
- Parameters
- ----------
- n_components : int, optional
- Number of mixture components. Defaults to 1.
-
- covariance_type : string, optional
- String describing the type of covariance parameters to
- use. Must be one of 'spherical', 'tied', 'diag', 'full'.
- Defaults to 'diag'.
-
- random_state : int, RandomState instance or None, optional (default=None)
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- min_covar : float, optional
- Floor on the diagonal of the covariance matrix to prevent
- overfitting. Defaults to 1e-3.
-
- tol : float, optional
- Convergence threshold. EM iterations will stop when average
- gain in log-likelihood is below this threshold. Defaults to 1e-3.
-
- n_iter : int, optional
- Number of EM iterations to perform.
-
- n_init : int, optional
-        Number of initializations to perform. The best result is kept.
-
- params : string, optional
- Controls which parameters are updated in the training
- process. Can contain any combination of 'w' for weights,
- 'm' for means, and 'c' for covars. Defaults to 'wmc'.
-
- init_params : string, optional
- Controls which parameters are updated in the initialization
- process. Can contain any combination of 'w' for weights,
- 'm' for means, and 'c' for covars. Defaults to 'wmc'.
-
- verbose : int, default: 0
- Enable verbose output. If 1 then it always prints the current
- initialization and iteration step. If greater than 1 then
- it prints additionally the change and time needed for each step.
-
- Attributes
- ----------
- weights_ : array, shape (`n_components`,)
- This attribute stores the mixing weights for each mixture component.
-
- means_ : array, shape (`n_components`, `n_features`)
- Mean parameters for each mixture component.
-
- covars_ : array
- Covariance parameters for each mixture component. The shape
- depends on `covariance_type`::
-
- (n_components, n_features) if 'spherical',
- (n_features, n_features) if 'tied',
- (n_components, n_features) if 'diag',
- (n_components, n_features, n_features) if 'full'
-
- converged_ : bool
- True when convergence was reached in fit(), False otherwise.
-
- See Also
- --------
-
-    DPGMM : Infinite Gaussian mixture model, using the Dirichlet
-        process, fit with a variational algorithm
-
-    VBGMM : Finite Gaussian mixture model fit with a variational
-        algorithm, better for situations where there might be too little
-        data to get a good estimate of the covariance matrix.
-
- Examples
- --------
-
- >>> import numpy as np
- >>> from sklearn import mixture
- >>> np.random.seed(1)
- >>> g = mixture.GMM(n_components=2)
- >>> # Generate random observations with two modes centered on 0
- >>> # and 10 to use for training.
- >>> obs = np.concatenate((np.random.randn(100, 1),
- ... 10 + np.random.randn(300, 1)))
- >>> g.fit(obs) # doctest: +NORMALIZE_WHITESPACE
- GMM(covariance_type='diag', init_params='wmc', min_covar=0.001,
- n_components=2, n_init=1, n_iter=100, params='wmc',
- random_state=None, tol=0.001, verbose=0)
- >>> np.round(g.weights_, 2)
- array([ 0.75, 0.25])
- >>> np.round(g.means_, 2)
- array([[ 10.05],
- [ 0.06]])
- >>> np.round(g.covars_, 2) # doctest: +SKIP
- array([[[ 1.02]],
- [[ 0.96]]])
- >>> g.predict([[0], [2], [9], [10]]) # doctest: +ELLIPSIS
- array([1, 1, 0, 0]...)
- >>> np.round(g.score([[0], [2], [9], [10]]), 2)
- array([-2.19, -4.58, -1.75, -1.21])
- >>> # Refit the model on new data (initial parameters remain the
- >>> # same), this time with an even split between the two modes.
- >>> g.fit(20 * [[0]] + 20 * [[10]]) # doctest: +NORMALIZE_WHITESPACE
- GMM(covariance_type='diag', init_params='wmc', min_covar=0.001,
- n_components=2, n_init=1, n_iter=100, params='wmc',
- random_state=None, tol=0.001, verbose=0)
- >>> np.round(g.weights_, 2)
- array([ 0.5, 0.5])
-
- """
-
- def __init__(self, n_components=1, covariance_type='diag',
- random_state=None, tol=1e-3, min_covar=1e-3,
- n_iter=100, n_init=1, params='wmc', init_params='wmc',
- verbose=0):
- self.n_components = n_components
- self.covariance_type = covariance_type
- self.tol = tol
- self.min_covar = min_covar
- self.random_state = random_state
- self.n_iter = n_iter
- self.n_init = n_init
- self.params = params
- self.init_params = init_params
- self.verbose = verbose
-
- if covariance_type not in ['spherical', 'tied', 'diag', 'full']:
- raise ValueError('Invalid value for covariance_type: %s' %
- covariance_type)
-
- if n_init < 1:
- raise ValueError('GMM estimation requires at least one run')
-
- def _get_covars(self):
- """Covariance parameters for each mixture component.
-
-        The shape depends on ``covariance_type``::
-
-            (n_components, n_features) if 'spherical',
-            (n_features, n_features) if 'tied',
-            (n_components, n_features) if 'diag',
-            (n_components, n_features, n_features) if 'full'
-
- """
- if self.covariance_type == 'full':
- return self.covars_
- elif self.covariance_type == 'diag':
- return [np.diag(cov) for cov in self.covars_]
- elif self.covariance_type == 'tied':
- return [self.covars_] * self.n_components
- elif self.covariance_type == 'spherical':
- return [np.diag(cov) for cov in self.covars_]
-
- def _set_covars(self, covars):
- """Provide values for covariance."""
- covars = np.asarray(covars)
- _validate_covars(covars, self.covariance_type, self.n_components)
- self.covars_ = covars
-
- def score_samples(self, X):
- """Return the per-sample likelihood of the data under the model.
-
- Compute the log probability of X under the model and
- return the posterior distribution (responsibilities) of each
- mixture component for each element of X.
-
- Parameters
- ----------
- X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- logprob : array_like, shape (n_samples,)
- Log probabilities of each data point in X.
-
- responsibilities : array_like, shape (n_samples, n_components)
- Posterior probabilities of each mixture component for each
- observation
- """
- check_is_fitted(self, 'means_')
-
- X = check_array(X)
- if X.ndim == 1:
- X = X[:, np.newaxis]
- if X.size == 0:
- return np.array([]), np.empty((0, self.n_components))
- if X.shape[1] != self.means_.shape[1]:
- raise ValueError('The shape of X is not compatible with self')
-
- lpr = (log_multivariate_normal_density(X, self.means_, self.covars_,
- self.covariance_type) +
- np.log(self.weights_))
- logprob = logsumexp(lpr, axis=1)
- responsibilities = np.exp(lpr - logprob[:, np.newaxis])
- return logprob, responsibilities
-
- def score(self, X, y=None):
- """Compute the log probability under the model.
-
- Parameters
- ----------
- X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- logprob : array_like, shape (n_samples,)
- Log probabilities of each data point in X
- """
- logprob, _ = self.score_samples(X)
- return logprob
-
- def predict(self, X):
- """Predict label for data.
-
- Parameters
- ----------
- X : array-like, shape = [n_samples, n_features]
-
- Returns
- -------
-        C : array, shape = (n_samples,)
-            Component memberships.
- """
- logprob, responsibilities = self.score_samples(X)
- return responsibilities.argmax(axis=1)
-
- def predict_proba(self, X):
- """Predict posterior probability of data under each Gaussian
- in the model.
-
- Parameters
- ----------
- X : array-like, shape = [n_samples, n_features]
-
- Returns
- -------
- responsibilities : array-like, shape = (n_samples, n_components)
- Returns the probability of the sample for each Gaussian
- (state) in the model.
- """
- logprob, responsibilities = self.score_samples(X)
- return responsibilities
-
- def sample(self, n_samples=1, random_state=None):
- """Generate random samples from the model.
-
- Parameters
- ----------
- n_samples : int, optional
- Number of samples to generate. Defaults to 1.
-
- Returns
- -------
- X : array_like, shape (n_samples, n_features)
- List of samples
- """
- check_is_fitted(self, 'means_')
-
- if random_state is None:
- random_state = self.random_state
- random_state = check_random_state(random_state)
- weight_cdf = np.cumsum(self.weights_)
-
- X = np.empty((n_samples, self.means_.shape[1]))
- rand = random_state.rand(n_samples)
- # decide which component to use for each sample
- comps = weight_cdf.searchsorted(rand)
- # for each component, generate all needed samples
- for comp in range(self.n_components):
- # occurrences of current component in X
- comp_in_X = (comp == comps)
- # number of those occurrences
- num_comp_in_X = comp_in_X.sum()
- if num_comp_in_X > 0:
- if self.covariance_type == 'tied':
- cv = self.covars_
- elif self.covariance_type == 'spherical':
- cv = self.covars_[comp][0]
- else:
- cv = self.covars_[comp]
- X[comp_in_X] = _sample_gaussian(
- self.means_[comp], cv, self.covariance_type,
- num_comp_in_X, random_state=random_state).T
- return X
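# The component draw in sample() above is inverse-CDF sampling over the
# mixing weights; a standalone sketch with made-up weights:
import numpy as np

weights = np.array([0.2, 0.5, 0.3])
rand = np.random.RandomState(0).rand(8)
comps = np.cumsum(weights).searchsorted(rand)  # component index per sample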
-
- def fit_predict(self, X, y=None):
- """Fit and then predict labels for data.
-
-        Warning: Due to the final maximization step in the EM algorithm,
-        with a low number of iterations the labels returned here may
-        differ from those of a later call to ``predict``.
-
- .. versionadded:: 0.17
- *fit_predict* method in Gaussian Mixture Model.
-
- Parameters
- ----------
- X : array-like, shape = [n_samples, n_features]
-
- Returns
- -------
-        C : array, shape = (n_samples,)
-            Component memberships.
- """
- return self._fit(X, y).argmax(axis=1)
-
- def _fit(self, X, y=None, do_prediction=False):
- """Estimate model parameters with the EM algorithm.
-
-        An initialization step is performed before entering the
-        expectation-maximization (EM) algorithm. If you want to avoid
-        this step, set the keyword argument init_params to the empty
-        string '' when creating the GMM object. Likewise, if you would
-        just like to do an initialization, set n_iter=0.
-
- Parameters
- ----------
-        X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- responsibilities : array, shape (n_samples, n_components)
- Posterior probabilities of each mixture component for each
- observation.
- """
-
- # initialization step
- X = check_array(X, dtype=np.float64, ensure_min_samples=2,
- estimator=self)
- if X.shape[0] < self.n_components:
- raise ValueError(
- 'GMM estimation with %s components, but got only %s samples' %
- (self.n_components, X.shape[0]))
-
- max_log_prob = -np.infty
-
- if self.verbose > 0:
- print('Expectation-maximization algorithm started.')
-
- for init in range(self.n_init):
- if self.verbose > 0:
- print('Initialization ' + str(init + 1))
- start_init_time = time()
-
- if 'm' in self.init_params or not hasattr(self, 'means_'):
- self.means_ = cluster.KMeans(
- n_clusters=self.n_components,
- random_state=self.random_state).fit(X).cluster_centers_
- if self.verbose > 1:
- print('\tMeans have been initialized.')
-
- if 'w' in self.init_params or not hasattr(self, 'weights_'):
- self.weights_ = np.tile(1.0 / self.n_components,
- self.n_components)
- if self.verbose > 1:
- print('\tWeights have been initialized.')
-
- if 'c' in self.init_params or not hasattr(self, 'covars_'):
- cv = np.cov(X.T) + self.min_covar * np.eye(X.shape[1])
- if not cv.shape:
- cv.shape = (1, 1)
- self.covars_ = \
- distribute_covar_matrix_to_match_covariance_type(
- cv, self.covariance_type, self.n_components)
- if self.verbose > 1:
- print('\tCovariance matrices have been initialized.')
-
- # EM algorithms
- current_log_likelihood = None
- # reset self.converged_ to False
- self.converged_ = False
-
- for i in range(self.n_iter):
- if self.verbose > 0:
- print('\tEM iteration ' + str(i + 1))
- start_iter_time = time()
- prev_log_likelihood = current_log_likelihood
- # Expectation step
- log_likelihoods, responsibilities = self.score_samples(X)
- current_log_likelihood = log_likelihoods.mean()
-
- # Check for convergence.
- if prev_log_likelihood is not None:
- change = abs(current_log_likelihood - prev_log_likelihood)
- if self.verbose > 1:
- print('\t\tChange: ' + str(change))
- if change < self.tol:
- self.converged_ = True
- if self.verbose > 0:
- print('\t\tEM algorithm converged.')
- break
-
- # Maximization step
- self._do_mstep(X, responsibilities, self.params,
- self.min_covar)
- if self.verbose > 1:
- print('\t\tEM iteration ' + str(i + 1) + ' took {0:.5f}s'.format(
- time() - start_iter_time))
-
- # if the results are better, keep it
- if self.n_iter:
- if current_log_likelihood > max_log_prob:
- max_log_prob = current_log_likelihood
- best_params = {'weights': self.weights_,
- 'means': self.means_,
- 'covars': self.covars_}
- if self.verbose > 1:
- print('\tBetter parameters were found.')
-
- if self.verbose > 1:
- print('\tInitialization ' + str(init + 1) + ' took {0:.5f}s'.format(
- time() - start_init_time))
-
-        # Check that at least one initialization produced a valid
-        # likelihood; otherwise every run failed numerically.
- if np.isneginf(max_log_prob) and self.n_iter:
- raise RuntimeError(
- "EM algorithm was never able to compute a valid likelihood " +
- "given initial parameters. Try different init parameters " +
- "(or increasing n_init) or check for degenerate data.")
-
- if self.n_iter:
- self.covars_ = best_params['covars']
- self.means_ = best_params['means']
- self.weights_ = best_params['weights']
- else: # self.n_iter == 0 occurs when using GMM within HMM
- # Need to make sure that there are responsibilities to output
- # Output zeros because it was just a quick initialization
- responsibilities = np.zeros((X.shape[0], self.n_components))
-
- return responsibilities
-
- def fit(self, X, y=None):
- """Estimate model parameters with the EM algorithm.
-
-        An initialization step is performed before entering the
-        expectation-maximization (EM) algorithm. If you want to avoid
-        this step, set the keyword argument init_params to the empty
-        string '' when creating the GMM object. Likewise, if you would
-        just like to do an initialization, set n_iter=0.
-
- Parameters
- ----------
-        X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- self
- """
- self._fit(X, y)
- return self
-
- def _do_mstep(self, X, responsibilities, params, min_covar=0):
- """Perform the Mstep of the EM algorithm and return the cluster weights.
- """
- weights = responsibilities.sum(axis=0)
- weighted_X_sum = np.dot(responsibilities.T, X)
- inverse_weights = 1.0 / (weights[:, np.newaxis] + 10 * EPS)
-
- if 'w' in params:
- self.weights_ = (weights / (weights.sum() + 10 * EPS) + EPS)
- if 'm' in params:
- self.means_ = weighted_X_sum * inverse_weights
- if 'c' in params:
- covar_mstep_func = _covar_mstep_funcs[self.covariance_type]
- self.covars_ = covar_mstep_func(
- self, X, responsibilities, weighted_X_sum, inverse_weights,
- min_covar)
- return weights
-
- def _n_parameters(self):
- """Return the number of free parameters in the model."""
- ndim = self.means_.shape[1]
- if self.covariance_type == 'full':
- cov_params = self.n_components * ndim * (ndim + 1) / 2.
- elif self.covariance_type == 'diag':
- cov_params = self.n_components * ndim
- elif self.covariance_type == 'tied':
- cov_params = ndim * (ndim + 1) / 2.
- elif self.covariance_type == 'spherical':
- cov_params = self.n_components
- mean_params = ndim * self.n_components
- return int(cov_params + mean_params + self.n_components - 1)
-
- def bic(self, X):
- """Bayesian information criterion for the current model fit
- and the proposed data.
-
- Parameters
- ----------
-        X : array of shape (n_samples, n_dimensions)
-
- Returns
- -------
- bic : float (the lower the better)
- """
- return (-2 * self.score(X).sum() +
- self._n_parameters() * np.log(X.shape[0]))
-
- def aic(self, X):
- """Akaike information criterion for the current model fit
- and the proposed data.
-
- Parameters
- ----------
-        X : array of shape (n_samples, n_dimensions)
-
- Returns
- -------
- aic : float (the lower the better)
- """
- return - 2 * self.score(X).sum() + 2 * self._n_parameters()
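# Model-selection sketch for the two criteria above, shown with the modern
# GaussianMixture replacement, whose bic/aic implement the same definitions
# (-2 * logL + p * log(N) and -2 * logL + 2 * p):
import numpy as np
from sklearn.mixture import GaussianMixture

X = np.random.RandomState(0).randn(200, 2)
best = min((GaussianMixture(n_components=k, random_state=0).fit(X)
            for k in range(1, 6)),
           key=lambda g: g.bic(X))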
-
-
-@deprecated("The class GMM is deprecated in 0.18 and will be "
- " removed in 0.20. Use class GaussianMixture instead.")
-class GMM(_GMMBase):
- """
- Legacy Gaussian Mixture Model
-
- .. deprecated:: 0.18
- This class will be removed in 0.20.
- Use :class:`sklearn.mixture.GaussianMixture` instead.
-
- """
-
- def __init__(self, n_components=1, covariance_type='diag',
- random_state=None, tol=1e-3, min_covar=1e-3,
- n_iter=100, n_init=1, params='wmc', init_params='wmc',
- verbose=0):
- super(GMM, self).__init__(
- n_components=n_components, covariance_type=covariance_type,
- random_state=random_state, tol=tol, min_covar=min_covar,
- n_iter=n_iter, n_init=n_init, params=params,
- init_params=init_params, verbose=verbose)
-
-#########################################################################
-# some helper routines
-#########################################################################
-
-
-def _log_multivariate_normal_density_diag(X, means, covars):
- """Compute Gaussian log-density at X for a diagonal model."""
- n_samples, n_dim = X.shape
- lpr = -0.5 * (n_dim * np.log(2 * np.pi) + np.sum(np.log(covars), 1)
- + np.sum((means ** 2) / covars, 1)
- - 2 * np.dot(X, (means / covars).T)
- + np.dot(X ** 2, (1.0 / covars).T))
- return lpr
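# Sanity-check sketch for the vectorized diagonal density above, mirroring
# the naive reference (_naive_lmvnpdf_diag) in the deleted tests below: for
# one component, the diagonal Gaussian log-density is a sum of independent
# 1-d normal log-pdfs.
import numpy as np
from scipy import stats

X = np.random.RandomState(0).rand(5, 2)
mu, var = np.array([0.0, 0.0]), np.array([1.0, 2.0])
ref = stats.norm.logpdf(X, mu, np.sqrt(var)).sum(axis=1)  # shape (5,)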
-
-
-def _log_multivariate_normal_density_spherical(X, means, covars):
- """Compute Gaussian log-density at X for a spherical model."""
- cv = covars.copy()
- if covars.ndim == 1:
- cv = cv[:, np.newaxis]
- if cv.shape[1] == 1:
- cv = np.tile(cv, (1, X.shape[-1]))
- return _log_multivariate_normal_density_diag(X, means, cv)
-
-
-def _log_multivariate_normal_density_tied(X, means, covars):
- """Compute Gaussian log-density at X for a tied model."""
- cv = np.tile(covars, (means.shape[0], 1, 1))
- return _log_multivariate_normal_density_full(X, means, cv)
-
-
-def _log_multivariate_normal_density_full(X, means, covars, min_covar=1.e-7):
- """Log probability for full covariance matrices."""
- n_samples, n_dim = X.shape
- nmix = len(means)
- log_prob = np.empty((n_samples, nmix))
- for c, (mu, cv) in enumerate(zip(means, covars)):
- try:
- cv_chol = linalg.cholesky(cv, lower=True)
- except linalg.LinAlgError:
- # The model is most probably stuck in a component with too
- # few observations, we need to reinitialize this components
- try:
- cv_chol = linalg.cholesky(cv + min_covar * np.eye(n_dim),
- lower=True)
- except linalg.LinAlgError:
- raise ValueError("'covars' must be symmetric, "
- "positive-definite")
-
- cv_log_det = 2 * np.sum(np.log(np.diagonal(cv_chol)))
- cv_sol = linalg.solve_triangular(cv_chol, (X - mu).T, lower=True).T
- log_prob[:, c] = - .5 * (np.sum(cv_sol ** 2, axis=1) +
- n_dim * np.log(2 * np.pi) + cv_log_det)
-
- return log_prob
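# The Cholesky route above avoids forming the inverse explicitly: with
# cv = L L^T, log|cv| = 2 * sum(log(diag(L))) and the Mahalanobis term is
# ||L^{-1} (x - mu)||^2, obtained via a triangular solve. A standalone check:
import numpy as np
from scipy import linalg

cv = np.array([[2.0, 0.3], [0.3, 1.0]])
x_minus_mu = np.array([0.5, -1.0])
L = linalg.cholesky(cv, lower=True)
log_det = 2 * np.sum(np.log(np.diagonal(L)))
maha = np.sum(linalg.solve_triangular(L, x_minus_mu, lower=True) ** 2)
assert np.isclose(log_det, np.log(linalg.det(cv)))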
-
-
-def _validate_covars(covars, covariance_type, n_components):
- """Do basic checks on matrix covariance sizes and values."""
- from scipy import linalg
- if covariance_type == 'spherical':
-        if len(covars) != n_components:
-            raise ValueError("'spherical' covars must have length "
-                             "n_components")
-        elif np.any(covars <= 0):
-            raise ValueError("'spherical' covars must be positive")
- elif covariance_type == 'tied':
- if covars.shape[0] != covars.shape[1]:
- raise ValueError("'tied' covars must have shape (n_dim, n_dim)")
- elif (not np.allclose(covars, covars.T)
- or np.any(linalg.eigvalsh(covars) <= 0)):
- raise ValueError("'tied' covars must be symmetric, "
- "positive-definite")
- elif covariance_type == 'diag':
- if len(covars.shape) != 2:
- raise ValueError("'diag' covars must have shape "
- "(n_components, n_dim)")
- elif np.any(covars <= 0):
- raise ValueError("'diag' covars must be non-negative")
- elif covariance_type == 'full':
- if len(covars.shape) != 3:
- raise ValueError("'full' covars must have shape "
- "(n_components, n_dim, n_dim)")
- elif covars.shape[1] != covars.shape[2]:
- raise ValueError("'full' covars must have shape "
- "(n_components, n_dim, n_dim)")
- for n, cv in enumerate(covars):
- if (not np.allclose(cv, cv.T)
- or np.any(linalg.eigvalsh(cv) <= 0)):
- raise ValueError("component %d of 'full' covars must be "
- "symmetric, positive-definite" % n)
- else:
- raise ValueError("covariance_type must be one of " +
- "'spherical', 'tied', 'diag', 'full'")
-
-
-@deprecated("The function distribute_covar_matrix_to_match_covariance_type"
- "is deprecated in 0.18 and will be removed in 0.20.")
-def distribute_covar_matrix_to_match_covariance_type(
- tied_cv, covariance_type, n_components):
- """Create all the covariance matrices from a given template."""
- if covariance_type == 'spherical':
- cv = np.tile(tied_cv.mean() * np.ones(tied_cv.shape[1]),
- (n_components, 1))
- elif covariance_type == 'tied':
- cv = tied_cv
- elif covariance_type == 'diag':
- cv = np.tile(np.diag(tied_cv), (n_components, 1))
- elif covariance_type == 'full':
- cv = np.tile(tied_cv, (n_components, 1, 1))
- else:
- raise ValueError("covariance_type must be one of " +
- "'spherical', 'tied', 'diag', 'full'")
- return cv
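# Shape sketch for the template expansion above (illustrative values):
import numpy as np

tied_cv = np.eye(2)
full = np.tile(tied_cv, (3, 1, 1))        # (n_components, n_dim, n_dim)
diag = np.tile(np.diag(tied_cv), (3, 1))  # (n_components, n_dim)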
-
-
-def _covar_mstep_diag(gmm, X, responsibilities, weighted_X_sum, norm,
- min_covar):
- """Perform the covariance M step for diagonal cases."""
- avg_X2 = np.dot(responsibilities.T, X * X) * norm
- avg_means2 = gmm.means_ ** 2
- avg_X_means = gmm.means_ * weighted_X_sum * norm
- return avg_X2 - 2 * avg_X_means + avg_means2 + min_covar
-
-
-def _covar_mstep_spherical(*args):
- """Perform the covariance M step for spherical cases."""
- cv = _covar_mstep_diag(*args)
- return np.tile(cv.mean(axis=1)[:, np.newaxis], (1, cv.shape[1]))
-
-
-def _covar_mstep_full(gmm, X, responsibilities, weighted_X_sum, norm,
- min_covar):
- """Perform the covariance M step for full cases."""
- # Eq. 12 from K. Murphy, "Fitting a Conditional Linear Gaussian
- # Distribution"
- n_features = X.shape[1]
- cv = np.empty((gmm.n_components, n_features, n_features))
- for c in range(gmm.n_components):
- post = responsibilities[:, c]
- mu = gmm.means_[c]
- diff = X - mu
- with np.errstate(under='ignore'):
- # Underflow Errors in doing post * X.T are not important
- avg_cv = np.dot(post * diff.T, diff) / (post.sum() + 10 * EPS)
- cv[c] = avg_cv + min_covar * np.eye(n_features)
- return cv
-
-
-def _covar_mstep_tied(gmm, X, responsibilities, weighted_X_sum, norm,
- min_covar):
- """Perform the covariance M step for tied cases."""
- # Eq. 15 from K. Murphy, "Fitting a Conditional Linear Gaussian
- # Distribution"
- avg_X2 = np.dot(X.T, X)
- avg_means2 = np.dot(gmm.means_.T, weighted_X_sum)
- out = avg_X2 - avg_means2
- out *= 1. / X.shape[0]
- out.flat[::len(out) + 1] += min_covar
- return out
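# The flat-stride idiom in _covar_mstep_tied above adds min_covar to the
# diagonal in place; an equivalent, more explicit spelling:
import numpy as np

out = np.zeros((3, 3))
out.flat[::len(out) + 1] += 0.5
explicit = np.zeros((3, 3))
explicit[np.diag_indices_from(explicit)] += 0.5
assert np.array_equal(out, explicit)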
-
-_covar_mstep_funcs = {'spherical': _covar_mstep_spherical,
- 'diag': _covar_mstep_diag,
- 'tied': _covar_mstep_tied,
- 'full': _covar_mstep_full,
- }
diff --git a/sklearn/mixture/tests/test_dpgmm.py b/sklearn/mixture/tests/test_dpgmm.py
deleted file mode 100644
index 8ca38626b4cef..0000000000000
--- a/sklearn/mixture/tests/test_dpgmm.py
+++ /dev/null
@@ -1,237 +0,0 @@
-# Important note for the deprecation cleaning of 0.20:
-# All the functions and classes of this file have been deprecated in 0.18.
-# When you remove this file please also remove the related files
-# - 'sklearn/mixture/dpgmm.py'
-# - 'sklearn/mixture/gmm.py'
-# - 'sklearn/mixture/tests/test_gmm.py'
-import unittest
-import sys
-
-import numpy as np
-
-from sklearn.mixture import DPGMM, VBGMM
-from sklearn.mixture.dpgmm import log_normalize
-from sklearn.datasets import make_blobs
-from sklearn.utils.testing import assert_array_less, assert_equal
-from sklearn.utils.testing import assert_warns_message, ignore_warnings
-from sklearn.mixture.tests.test_gmm import GMMTester
-from sklearn.externals.six.moves import cStringIO as StringIO
-from sklearn.mixture.dpgmm import digamma, gammaln
-from sklearn.mixture.dpgmm import wishart_log_det, wishart_logz
-
-
-np.seterr(all='warn')
-
-
-@ignore_warnings(category=DeprecationWarning)
-def test_class_weights():
- # check that the class weights are updated
- # simple 3 cluster dataset
- X, y = make_blobs(random_state=1)
- for Model in [DPGMM, VBGMM]:
- dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50)
- dpgmm.fit(X)
- # get indices of components that are used:
- indices = np.unique(dpgmm.predict(X))
- active = np.zeros(10, dtype=np.bool)
- active[indices] = True
- # used components are important
- assert_array_less(.1, dpgmm.weights_[active])
- # others are not
- assert_array_less(dpgmm.weights_[~active], .05)
-
-
-@ignore_warnings(category=DeprecationWarning)
-def test_verbose_boolean():
-    # checks that the verbose output is the same for the
-    # flag values 1 and True
- # simple 3 cluster dataset
- X, y = make_blobs(random_state=1)
- for Model in [DPGMM, VBGMM]:
- dpgmm_bool = Model(n_components=10, random_state=1, alpha=20,
- n_iter=50, verbose=True)
- dpgmm_int = Model(n_components=10, random_state=1, alpha=20,
- n_iter=50, verbose=1)
-
- old_stdout = sys.stdout
- sys.stdout = StringIO()
- try:
- # generate output with the boolean flag
- dpgmm_bool.fit(X)
- verbose_output = sys.stdout
- verbose_output.seek(0)
- bool_output = verbose_output.readline()
- # generate output with the int flag
- dpgmm_int.fit(X)
- verbose_output = sys.stdout
- verbose_output.seek(0)
- int_output = verbose_output.readline()
- assert_equal(bool_output, int_output)
- finally:
- sys.stdout = old_stdout
-
-
-@ignore_warnings(category=DeprecationWarning)
-def test_verbose_first_level():
- # simple 3 cluster dataset
- X, y = make_blobs(random_state=1)
- for Model in [DPGMM, VBGMM]:
- dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50,
- verbose=1)
-
- old_stdout = sys.stdout
- sys.stdout = StringIO()
- try:
- dpgmm.fit(X)
- finally:
- sys.stdout = old_stdout
-
-
-@ignore_warnings(category=DeprecationWarning)
-def test_verbose_second_level():
- # simple 3 cluster dataset
- X, y = make_blobs(random_state=1)
- for Model in [DPGMM, VBGMM]:
- dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50,
- verbose=2)
-
- old_stdout = sys.stdout
- sys.stdout = StringIO()
- try:
- dpgmm.fit(X)
- finally:
- sys.stdout = old_stdout
-
-
-@ignore_warnings(category=DeprecationWarning)
-def test_digamma():
- assert_warns_message(DeprecationWarning, "The function digamma is"
- " deprecated in 0.18 and will be removed in 0.20. "
- "Use scipy.special.digamma instead.", digamma, 3)
-
-
-@ignore_warnings(category=DeprecationWarning)
-def test_gammaln():
- assert_warns_message(DeprecationWarning, "The function gammaln"
- " is deprecated in 0.18 and will be removed"
- " in 0.20. Use scipy.special.gammaln instead.",
- gammaln, 3)
-
-
-@ignore_warnings(category=DeprecationWarning)
-def test_log_normalize():
- v = np.array([0.1, 0.8, 0.01, 0.09])
- a = np.log(2 * v)
- result = assert_warns_message(DeprecationWarning, "The function "
- "log_normalize is deprecated in 0.18 and"
- " will be removed in 0.20.",
- log_normalize, a)
- assert np.allclose(v, result, rtol=0.01)
-
-
-@ignore_warnings(category=DeprecationWarning)
-def test_wishart_log_det():
- a = np.array([0.1, 0.8, 0.01, 0.09])
- b = np.array([0.2, 0.7, 0.05, 0.1])
- assert_warns_message(DeprecationWarning, "The function "
- "wishart_log_det is deprecated in 0.18 and"
- " will be removed in 0.20.",
- wishart_log_det, a, b, 2, 4)
-
-
-@ignore_warnings(category=DeprecationWarning)
-def test_wishart_logz():
- assert_warns_message(DeprecationWarning, "The function "
- "wishart_logz is deprecated in 0.18 and "
- "will be removed in 0.20.", wishart_logz,
- 3, np.identity(3), 1, 3)
-
-
-@ignore_warnings(category=DeprecationWarning)
-def test_DPGMM_deprecation():
- assert_warns_message(
- DeprecationWarning, "The `DPGMM` class is not working correctly and "
- "it's better to use `sklearn.mixture.BayesianGaussianMixture` class "
- "with parameter `weight_concentration_prior_type='dirichlet_process'` "
- "instead. DPGMM is deprecated in 0.18 and will be removed in 0.20.",
- DPGMM)
-
-
-def do_model(self, **kwds):
- return VBGMM(verbose=False, **kwds)
-
-
-class DPGMMTester(GMMTester):
- model = DPGMM
- do_test_eval = False
-
- def score(self, g, train_obs):
- _, z = g.score_samples(train_obs)
- return g.lower_bound(train_obs, z)
-
-
-class TestDPGMMWithSphericalCovars(unittest.TestCase, DPGMMTester):
- covariance_type = 'spherical'
- setUp = GMMTester._setUp
-
-
-class TestDPGMMWithDiagCovars(unittest.TestCase, DPGMMTester):
- covariance_type = 'diag'
- setUp = GMMTester._setUp
-
-
-class TestDPGMMWithTiedCovars(unittest.TestCase, DPGMMTester):
- covariance_type = 'tied'
- setUp = GMMTester._setUp
-
-
-class TestDPGMMWithFullCovars(unittest.TestCase, DPGMMTester):
- covariance_type = 'full'
- setUp = GMMTester._setUp
-
-
-def test_VBGMM_deprecation():
- assert_warns_message(
- DeprecationWarning, "The `VBGMM` class is not working correctly and "
- "it's better to use `sklearn.mixture.BayesianGaussianMixture` class "
- "with parameter `weight_concentration_prior_type="
- "'dirichlet_distribution'` instead. VBGMM is deprecated "
- "in 0.18 and will be removed in 0.20.", VBGMM)
-
-
-class VBGMMTester(GMMTester):
- model = do_model
- do_test_eval = False
-
- def score(self, g, train_obs):
- _, z = g.score_samples(train_obs)
- return g.lower_bound(train_obs, z)
-
-
-class TestVBGMMWithSphericalCovars(unittest.TestCase, VBGMMTester):
- covariance_type = 'spherical'
- setUp = GMMTester._setUp
-
-
-class TestVBGMMWithDiagCovars(unittest.TestCase, VBGMMTester):
- covariance_type = 'diag'
- setUp = GMMTester._setUp
-
-
-class TestVBGMMWithTiedCovars(unittest.TestCase, VBGMMTester):
- covariance_type = 'tied'
- setUp = GMMTester._setUp
-
-
-class TestVBGMMWithFullCovars(unittest.TestCase, VBGMMTester):
- covariance_type = 'full'
- setUp = GMMTester._setUp
-
-
-def test_vbgmm_no_modify_alpha():
- alpha = 2.
- n_components = 3
- X, y = make_blobs(random_state=1)
- vbgmm = VBGMM(n_components=n_components, alpha=alpha, n_iter=1)
- assert_equal(vbgmm.alpha, alpha)
- assert_equal(vbgmm.fit(X).alpha_, float(alpha) / n_components)
diff --git a/sklearn/mixture/tests/test_gmm.py b/sklearn/mixture/tests/test_gmm.py
deleted file mode 100644
index 137703adfcad4..0000000000000
--- a/sklearn/mixture/tests/test_gmm.py
+++ /dev/null
@@ -1,534 +0,0 @@
-# Important note for the deprecation cleaning of 0.20:
-# All the functions and classes of this file have been deprecated in 0.18.
-# When you remove this file please also remove the related files
-# - 'sklearn/mixture/dpgmm.py'
-# - 'sklearn/mixture/gmm.py'
-# - 'sklearn/mixture/tests/test_dpgmm.py'
-import unittest
-import copy
-import sys
-
-import numpy as np
-from numpy.testing import assert_array_equal, assert_array_almost_equal
-
-from scipy import stats
-from sklearn import mixture
-from sklearn.datasets.samples_generator import make_spd_matrix
-from sklearn.utils.testing import (assert_true, assert_greater,
- assert_raise_message, assert_warns_message,
- ignore_warnings, assert_raises)
-from sklearn.metrics.cluster import adjusted_rand_score
-from sklearn.externals.six.moves import cStringIO as StringIO
-
-
-rng = np.random.RandomState(0)
-
-
-def test_sample_gaussian():
- # Test sample generation from mixture.sample_gaussian where covariance
- # is diagonal, spherical and full
-
- n_features, n_samples = 2, 300
- axis = 1
- mu = rng.randint(10) * rng.rand(n_features)
- cv = (rng.rand(n_features) + 1.0) ** 2
-
- samples = mixture.gmm._sample_gaussian(
- mu, cv, covariance_type='diag', n_samples=n_samples)
-
- assert_true(np.allclose(samples.mean(axis), mu, atol=1.3))
- assert_true(np.allclose(samples.var(axis), cv, atol=1.5))
-
- # the same for spherical covariances
- cv = (rng.rand() + 1.0) ** 2
- samples = mixture.gmm._sample_gaussian(
- mu, cv, covariance_type='spherical', n_samples=n_samples)
-
- assert_true(np.allclose(samples.mean(axis), mu, atol=1.5))
- assert_true(np.allclose(
- samples.var(axis), np.repeat(cv, n_features), atol=1.5))
-
- # and for full covariances
- A = rng.randn(n_features, n_features)
- cv = np.dot(A.T, A) + np.eye(n_features)
- samples = mixture.gmm._sample_gaussian(
- mu, cv, covariance_type='full', n_samples=n_samples)
- assert_true(np.allclose(samples.mean(axis), mu, atol=1.3))
- assert_true(np.allclose(np.cov(samples), cv, atol=2.5))
-
- # Numerical stability check: in SciPy 0.12.0 at least, eigh may return
- # tiny negative values in its second return value.
- x = mixture.gmm._sample_gaussian(
- [0, 0], [[4, 3], [1, .1]], covariance_type='full', random_state=42)
- assert_true(np.isfinite(x).all())
-
-
-def _naive_lmvnpdf_diag(X, mu, cv):
- # slow and naive implementation of lmvnpdf
- ref = np.empty((len(X), len(mu)))
- stds = np.sqrt(cv)
- for i, (m, std) in enumerate(zip(mu, stds)):
- ref[:, i] = np.log(stats.norm.pdf(X, m, std)).sum(axis=1)
- return ref
-
-
-def test_lmvnpdf_diag():
- # test a slow and naive implementation of lmvnpdf and
- # compare it to the vectorized version (mixture.lmvnpdf) to test
- # for correctness
- n_features, n_components, n_samples = 2, 3, 10
- mu = rng.randint(10) * rng.rand(n_components, n_features)
- cv = (rng.rand(n_components, n_features) + 1.0) ** 2
- X = rng.randint(10) * rng.rand(n_samples, n_features)
-
- ref = _naive_lmvnpdf_diag(X, mu, cv)
- lpr = assert_warns_message(DeprecationWarning, "The function"
- " log_multivariate_normal_density is "
- "deprecated in 0.18 and will be removed in 0.20.",
- mixture.log_multivariate_normal_density,
- X, mu, cv, 'diag')
- assert_array_almost_equal(lpr, ref)
-
-
-def test_lmvnpdf_spherical():
- n_features, n_components, n_samples = 2, 3, 10
-
- mu = rng.randint(10) * rng.rand(n_components, n_features)
- spherecv = rng.rand(n_components, 1) ** 2 + 1
- X = rng.randint(10) * rng.rand(n_samples, n_features)
-
- cv = np.tile(spherecv, (n_features, 1))
- reference = _naive_lmvnpdf_diag(X, mu, cv)
- lpr = assert_warns_message(DeprecationWarning, "The function"
- " log_multivariate_normal_density is "
- "deprecated in 0.18 and will be removed in 0.20.",
- mixture.log_multivariate_normal_density,
- X, mu, spherecv, 'spherical')
- assert_array_almost_equal(lpr, reference)
-
-
-def test_lmvnpdf_full():
- n_features, n_components, n_samples = 2, 3, 10
-
- mu = rng.randint(10) * rng.rand(n_components, n_features)
- cv = (rng.rand(n_components, n_features) + 1.0) ** 2
- X = rng.randint(10) * rng.rand(n_samples, n_features)
-
- fullcv = np.array([np.diag(x) for x in cv])
-
- reference = _naive_lmvnpdf_diag(X, mu, cv)
- lpr = assert_warns_message(DeprecationWarning, "The function"
- " log_multivariate_normal_density is "
- "deprecated in 0.18 and will be removed in 0.20.",
- mixture.log_multivariate_normal_density,
- X, mu, fullcv, 'full')
- assert_array_almost_equal(lpr, reference)
-
-
-def test_lvmpdf_full_cv_non_positive_definite():
- n_features, n_samples = 2, 10
- rng = np.random.RandomState(0)
- X = rng.randint(10) * rng.rand(n_samples, n_features)
- mu = np.mean(X, 0)
- cv = np.array([[[-1, 0], [0, 1]]])
- expected_message = "'covars' must be symmetric, positive-definite"
- assert_raise_message(ValueError, expected_message,
- mixture.log_multivariate_normal_density,
- X, mu, cv, 'full')
-
-
-# This function tests the deprecated old GMM class
-@ignore_warnings(category=DeprecationWarning)
-def test_GMM_attributes():
- n_components, n_features = 10, 4
- covariance_type = 'diag'
- g = mixture.GMM(n_components, covariance_type, random_state=rng)
- weights = rng.rand(n_components)
- weights = weights / weights.sum()
- means = rng.randint(-20, 20, (n_components, n_features))
-
- assert_true(g.n_components == n_components)
- assert_true(g.covariance_type == covariance_type)
-
- g.weights_ = weights
- assert_array_almost_equal(g.weights_, weights)
- g.means_ = means
- assert_array_almost_equal(g.means_, means)
-
- covars = (0.1 + 2 * rng.rand(n_components, n_features)) ** 2
- g.covars_ = covars
- assert_array_almost_equal(g.covars_, covars)
- assert_raises(ValueError, g._set_covars, [])
- assert_raises(ValueError, g._set_covars,
- np.zeros((n_components - 2, n_features)))
-
- assert_raises(ValueError, mixture.GMM, n_components=20,
- covariance_type='badcovariance_type')
-
-
-class GMMTester():
- do_test_eval = True
-
- def _setUp(self):
- self.n_components = 10
- self.n_features = 4
- self.weights = rng.rand(self.n_components)
- self.weights = self.weights / self.weights.sum()
- self.means = rng.randint(-20, 20, (self.n_components, self.n_features))
- self.threshold = -0.5
- self.I = np.eye(self.n_features)
- self.covars = {
- 'spherical': (0.1 + 2 * rng.rand(self.n_components,
- self.n_features)) ** 2,
- 'tied': (make_spd_matrix(self.n_features, random_state=0)
- + 5 * self.I),
- 'diag': (0.1 + 2 * rng.rand(self.n_components,
- self.n_features)) ** 2,
- 'full': np.array([make_spd_matrix(self.n_features, random_state=0)
- + 5 * self.I for x in range(self.n_components)])}
-
- # This function tests the deprecated old GMM class
- @ignore_warnings(category=DeprecationWarning)
- def test_eval(self):
- if not self.do_test_eval:
-            return  # DPGMM does not support setting the means and
-        # covariances before fitting. There is no way of fixing this
-        # due to the variational parameters being more expressive than
-        # covariance matrices.
- g = self.model(n_components=self.n_components,
- covariance_type=self.covariance_type, random_state=rng)
- # Make sure the means are far apart so responsibilities.argmax()
- # picks the actual component used to generate the observations.
- g.means_ = 20 * self.means
- g.covars_ = self.covars[self.covariance_type]
- g.weights_ = self.weights
-
- gaussidx = np.repeat(np.arange(self.n_components), 5)
- n_samples = len(gaussidx)
- X = rng.randn(n_samples, self.n_features) + g.means_[gaussidx]
-
- with ignore_warnings(category=DeprecationWarning):
- ll, responsibilities = g.score_samples(X)
-
- self.assertEqual(len(ll), n_samples)
- self.assertEqual(responsibilities.shape,
- (n_samples, self.n_components))
- assert_array_almost_equal(responsibilities.sum(axis=1),
- np.ones(n_samples))
- assert_array_equal(responsibilities.argmax(axis=1), gaussidx)
-
- # This function tests the deprecated old GMM class
- @ignore_warnings(category=DeprecationWarning)
- def test_sample(self, n=100):
- g = self.model(n_components=self.n_components,
- covariance_type=self.covariance_type,
- random_state=rng)
- # Make sure the means are far apart so responsibilities.argmax()
- # picks the actual component used to generate the observations.
- g.means_ = 20 * self.means
- g.covars_ = np.maximum(self.covars[self.covariance_type], 0.1)
- g.weights_ = self.weights
-
- with ignore_warnings(category=DeprecationWarning):
- samples = g.sample(n)
- self.assertEqual(samples.shape, (n, self.n_features))
-
- # This function tests the deprecated old GMM class
- @ignore_warnings(category=DeprecationWarning)
- def test_train(self, params='wmc'):
- g = mixture.GMM(n_components=self.n_components,
- covariance_type=self.covariance_type)
- with ignore_warnings(category=DeprecationWarning):
- g.weights_ = self.weights
- g.means_ = self.means
- g.covars_ = 20 * self.covars[self.covariance_type]
-
- # Create a training set by sampling from the predefined distribution.
- with ignore_warnings(category=DeprecationWarning):
- X = g.sample(n_samples=100)
- g = self.model(n_components=self.n_components,
- covariance_type=self.covariance_type,
- random_state=rng, min_covar=1e-1,
- n_iter=1, init_params=params)
- g.fit(X)
-
- # Do one training iteration at a time so we can keep track of
- # the log likelihood to make sure that it increases after each
- # iteration.
- trainll = []
- with ignore_warnings(category=DeprecationWarning):
- for _ in range(5):
- g.params = params
- g.init_params = ''
- g.fit(X)
- trainll.append(self.score(g, X))
- g.n_iter = 10
- g.init_params = ''
- g.params = params
- g.fit(X) # finish fitting
-
- # Note that the log likelihood will sometimes decrease by a
- # very small amount after it has more or less converged due to
- # the addition of min_covar to the covariance (to prevent
- # underflow). This is why the threshold is set to -0.5
- # instead of 0.
- with ignore_warnings(category=DeprecationWarning):
- delta_min = np.diff(trainll).min()
- self.assertTrue(
- delta_min > self.threshold,
- "The min nll increase is %f which is lower than the admissible"
- " threshold of %f, for model %s. The likelihoods are %s."
- % (delta_min, self.threshold, self.covariance_type, trainll))
-
- # This function tests the deprecated old GMM class
- @ignore_warnings(category=DeprecationWarning)
- def test_train_degenerate(self, params='wmc'):
- # Train on degenerate data with 0 in some dimensions
- # Create a training set by sampling from the predefined
- # distribution.
- X = rng.randn(100, self.n_features)
- X.T[1:] = 0
- g = self.model(n_components=2,
- covariance_type=self.covariance_type,
- random_state=rng, min_covar=1e-3, n_iter=5,
- init_params=params)
- with ignore_warnings(category=DeprecationWarning):
- g.fit(X)
- trainll = g.score(X)
- self.assertTrue(np.sum(np.abs(trainll / 100 / X.shape[1])) < 5)
-
- # This function tests the deprecated old GMM class
- @ignore_warnings(category=DeprecationWarning)
- def test_train_1d(self, params='wmc'):
- # Train on 1-D data
- # Create a training set by sampling from the predefined
- # distribution.
- X = rng.randn(100, 1)
- # X.T[1:] = 0
- g = self.model(n_components=2,
- covariance_type=self.covariance_type,
- random_state=rng, min_covar=1e-7, n_iter=5,
- init_params=params)
- with ignore_warnings(category=DeprecationWarning):
- g.fit(X)
- trainll = g.score(X)
- if isinstance(g, mixture.dpgmm._DPGMMBase):
- self.assertTrue(np.sum(np.abs(trainll / 100)) < 5)
- else:
- self.assertTrue(np.sum(np.abs(trainll / 100)) < 2)
-
- # This function tests the deprecated old GMM class
- @ignore_warnings(category=DeprecationWarning)
- def score(self, g, X):
- with ignore_warnings(category=DeprecationWarning):
- return g.score(X).sum()
-
-
-class TestGMMWithSphericalCovars(unittest.TestCase, GMMTester):
- covariance_type = 'spherical'
- model = mixture.GMM
- setUp = GMMTester._setUp
-
-
-class TestGMMWithDiagonalCovars(unittest.TestCase, GMMTester):
- covariance_type = 'diag'
- model = mixture.GMM
- setUp = GMMTester._setUp
-
-
-class TestGMMWithTiedCovars(unittest.TestCase, GMMTester):
- covariance_type = 'tied'
- model = mixture.GMM
- setUp = GMMTester._setUp
-
-
-class TestGMMWithFullCovars(unittest.TestCase, GMMTester):
- covariance_type = 'full'
- model = mixture.GMM
- setUp = GMMTester._setUp
-
-
-# This function tests the deprecated old GMM class
-@ignore_warnings(category=DeprecationWarning)
-def test_multiple_init():
- # Test that multiple inits do not do much worse than a single one
- X = rng.randn(30, 5)
- X[:10] += 2
- g = mixture.GMM(n_components=2, covariance_type='spherical',
- random_state=rng, min_covar=1e-7, n_iter=5)
- with ignore_warnings(category=DeprecationWarning):
- train1 = g.fit(X).score(X).sum()
- g.n_init = 5
- train2 = g.fit(X).score(X).sum()
- assert_true(train2 >= train1 - 1.e-2)
-
-
-# This function tests the deprecated old GMM class
-@ignore_warnings(category=DeprecationWarning)
-def test_n_parameters():
- n_samples, n_dim, n_components = 7, 5, 2
- X = rng.randn(n_samples, n_dim)
- n_params = {'spherical': 13, 'diag': 21, 'tied': 26, 'full': 41}
- for cv_type in ['full', 'tied', 'diag', 'spherical']:
- with ignore_warnings(category=DeprecationWarning):
- g = mixture.GMM(n_components=n_components, covariance_type=cv_type,
- random_state=rng, min_covar=1e-7, n_iter=1)
- g.fit(X)
- assert_true(g._n_parameters() == n_params[cv_type])
-
-
-# This function tests the deprecated old GMM class
-@ignore_warnings(category=DeprecationWarning)
-def test_1d_1component():
- # Test all of the covariance_types return the same BIC score for
- # 1-dimensional, 1 component fits.
- n_samples, n_dim, n_components = 100, 1, 1
- X = rng.randn(n_samples, n_dim)
- g_full = mixture.GMM(n_components=n_components, covariance_type='full',
- random_state=rng, min_covar=1e-7, n_iter=1)
- with ignore_warnings(category=DeprecationWarning):
- g_full.fit(X)
- g_full_bic = g_full.bic(X)
- for cv_type in ['tied', 'diag', 'spherical']:
- g = mixture.GMM(n_components=n_components, covariance_type=cv_type,
- random_state=rng, min_covar=1e-7, n_iter=1)
- g.fit(X)
- assert_array_almost_equal(g.bic(X), g_full_bic)
-
-
-def assert_fit_predict_correct(model, X):
- model2 = copy.deepcopy(model)
-
- predictions_1 = model.fit(X).predict(X)
- predictions_2 = model2.fit_predict(X)
-
- assert adjusted_rand_score(predictions_1, predictions_2) == 1.0
-
-
-# This function tests the deprecated old GMM class
-@ignore_warnings(category=DeprecationWarning)
-def test_fit_predict():
- """
- test that gmm.fit_predict is equivalent to gmm.fit + gmm.predict
- """
- lrng = np.random.RandomState(101)
-
- n_samples, n_dim, n_comps = 100, 2, 2
- mu = np.array([[8, 8]])
- component_0 = lrng.randn(n_samples, n_dim)
- component_1 = lrng.randn(n_samples, n_dim) + mu
- X = np.vstack((component_0, component_1))
-
- for m_constructor in (mixture.GMM, mixture.VBGMM, mixture.DPGMM):
- model = m_constructor(n_components=n_comps, covariance_type='full',
- min_covar=1e-7, n_iter=5,
- random_state=np.random.RandomState(0))
- assert_fit_predict_correct(model, X)
-
- model = mixture.GMM(n_components=n_comps, n_iter=0)
- z = model.fit_predict(X)
- assert np.all(z == 0), "Quick Initialization Failed!"
-
-
-# This function tests the deprecated old GMM class
-@ignore_warnings(category=DeprecationWarning)
-def test_aic():
- # Test the aic and bic criteria
- n_samples, n_dim, n_components = 50, 3, 2
- X = rng.randn(n_samples, n_dim)
- SGH = 0.5 * (X.var() + np.log(2 * np.pi)) # standard gaussian entropy
-
- for cv_type in ['full', 'tied', 'diag', 'spherical']:
- g = mixture.GMM(n_components=n_components, covariance_type=cv_type,
- random_state=rng, min_covar=1e-7)
- g.fit(X)
- aic = 2 * n_samples * SGH * n_dim + 2 * g._n_parameters()
- bic = (2 * n_samples * SGH * n_dim +
- np.log(n_samples) * g._n_parameters())
- bound = n_dim * 3. / np.sqrt(n_samples)
- assert_true(np.abs(g.aic(X) - aic) / n_samples < bound)
- assert_true(np.abs(g.bic(X) - bic) / n_samples < bound)
-
-
-# This function tests the deprecated old GMM class
-@ignore_warnings(category=DeprecationWarning)
-def check_positive_definite_covars(covariance_type):
- r"""Test that covariance matrices do not become non positive definite
-
- Due to the accumulation of round-off errors, the computation of the
- covariance matrices during the learning phase could lead to non-positive
- definite covariance matrices. Namely the use of the formula:
-
- .. math:: C = (\sum_i w_i x_i x_i^T) - \mu \mu^T
-
- instead of:
-
- .. math:: C = \sum_i w_i (x_i - \mu)(x_i - \mu)^T
-
- while mathematically equivalent, was observed to trigger a ``LinAlgError``
- exception when computing a ``GMM`` with full covariance matrices and
- fixed mean.
-
- This function ensures that some later optimization will not introduce the
- problem again.
- """
- rng = np.random.RandomState(1)
- # we build a dataset with 2 2d components. The components are unbalanced
- # (respective weights 0.9 and 0.1)
- X = rng.randn(100, 2)
- X[-10:] += (3, 3) # Shift the 10 last points
-
- gmm = mixture.GMM(2, params="wc", covariance_type=covariance_type,
- min_covar=1e-3)
-
- # This is a non-regression test for issue #2640. The following call used
- # to trigger:
- # numpy.linalg.linalg.LinAlgError: 2-th leading minor not positive definite
- gmm.fit(X)
-
- if covariance_type == "diag" or covariance_type == "spherical":
- assert_greater(gmm.covars_.min(), 0)
- else:
- if covariance_type == "tied":
- covs = [gmm.covars_]
- else:
- covs = gmm.covars_
-
- for c in covs:
- assert_greater(np.linalg.det(c), 0)
-
-
-def test_positive_definite_covars():
- # Check positive definiteness for all covariance types
- for covariance_type in ["full", "tied", "diag", "spherical"]:
- yield check_positive_definite_covars, covariance_type
-
-
-# This function tests the deprecated old GMM class
-@ignore_warnings(category=DeprecationWarning)
-def test_verbose_first_level():
- # Create sample data
- X = rng.randn(30, 5)
- X[:10] += 2
- g = mixture.GMM(n_components=2, n_init=2, verbose=1)
-
- old_stdout = sys.stdout
- sys.stdout = StringIO()
- try:
- g.fit(X)
- finally:
- sys.stdout = old_stdout
-
-
-# This function tests the deprecated old GMM class
-@ignore_warnings(category=DeprecationWarning)
-def test_verbose_second_level():
- # Create sample data
- X = rng.randn(30, 5)
- X[:10] += 2
- g = mixture.GMM(n_components=2, n_init=2, verbose=2)
-
- old_stdout = sys.stdout
- sys.stdout = StringIO()
- try:
- g.fit(X)
- finally:
- sys.stdout = old_stdout
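Note on the replacement API: the tests above exercise the deprecated
mixture.GMM class, whose functionality lives on in
sklearn.mixture.GaussianMixture. A minimal sketch of the equivalent
workflow (assuming the 0.18+ mixture API; not part of this patch series):

import numpy as np
from sklearn import mixture

rng = np.random.RandomState(0)
# two well-separated 2-D components, as in test_fit_predict above
X = np.vstack([rng.randn(100, 2), rng.randn(100, 2) + [8, 8]])

g = mixture.GaussianMixture(n_components=2, covariance_type='full',
                            random_state=0).fit(X)
labels = g.predict(X)       # hard component assignments
resp = g.predict_proba(X)   # per-sample responsibilities; rows sum to 1
print(g.bic(X), g.aic(X))   # information criteria, cf. test_aic above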
From 7d4b2c11583e6bf2594d1cc1c445d66da7335d6f Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Fri, 8 Sep 2017 12:30:36 -0400
Subject: [PATCH 10/36] more cleanup of deprecated scorers
---
sklearn/metrics/scorer.py | 13 +------------
1 file changed, 1 insertion(+), 12 deletions(-)
diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py
index 05231826a8998..ebb6c7ca25ffe 100644
--- a/sklearn/metrics/scorer.py
+++ b/sklearn/metrics/scorer.py
@@ -225,18 +225,13 @@ def get_scorer(scoring):
scorer : callable
The scorer.
"""
- valid = True
if isinstance(scoring, six.string_types):
try:
scorer = SCORERS[scoring]
except KeyError:
- scorers = [scorer for scorer in SCORERS
- if SCORERS[scorer]._deprecation_msg is None]
- valid = False # Don't raise here to make the error message elegant
- if not valid:
raise ValueError('%r is not a valid scoring value. '
'Valid options are %s'
- % (scoring, sorted(scorers)))
+ % (scoring, sorted(SCORERS.keys())))
else:
scorer = scoring
return scorer
@@ -513,11 +508,6 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False,
# Score function for probabilistic classification
neg_log_loss_scorer = make_scorer(log_loss, greater_is_better=False,
needs_proba=True)
-deprecation_msg = ('Scoring method log_loss was renamed to '
- 'neg_log_loss in version 0.18 and will be removed in 0.20.')
-log_loss_scorer = make_scorer(log_loss, greater_is_better=False,
- needs_proba=True)
-log_loss_scorer._deprecation_msg = deprecation_msg
brier_score_loss_scorer = make_scorer(brier_score_loss,
greater_is_better=False,
needs_proba=True)
@@ -546,7 +536,6 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False,
accuracy=accuracy_scorer, roc_auc=roc_auc_scorer,
balanced_accuracy=balanced_accuracy_scorer,
average_precision=average_precision_scorer,
- log_loss=log_loss_scorer,
neg_log_loss=neg_log_loss_scorer,
brier_score_loss=brier_score_loss_scorer,
# Cluster metrics that use supervised evaluation
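With the log_loss alias dropped from SCORERS, get_scorer resolves only the
current names and the ValueError branch simply lists every registered key.
A small sketch of the resulting behaviour (hedged; not part of the patch):

from sklearn.metrics import get_scorer

scorer = get_scorer('neg_log_loss')  # the renamed scorer still resolves
try:
    get_scorer('log_loss')           # alias removed by this patch
except ValueError as exc:
    print(exc)  # 'log_loss' is not a valid scoring value. Valid options are [...]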
From 2ffa7bdad5b18ea5d516e305fcef57738c215e4a Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Fri, 10 Nov 2017 16:09:28 +0100
Subject: [PATCH 11/36] More scoring cleanup: drop tests for deprecated scorer names
---
sklearn/metrics/tests/test_score_objects.py | 19 -------------------
1 file changed, 19 deletions(-)
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 6af6418635d59..836cdc0f934f8 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -499,25 +499,6 @@ def test_scorer_memmap_input():
yield check_scorer_memmap, name
-def test_deprecated_names():
- X, y = make_blobs(random_state=0, centers=2)
- X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
- clf = LogisticRegression(random_state=0)
- clf.fit(X_train, y_train)
-
- for name in ('mean_absolute_error', 'mean_squared_error',
- 'median_absolute_error', 'log_loss'):
- warning_msg = "Scoring method %s was renamed to" % name
- for scorer in (get_scorer(name), SCORERS[name]):
- assert_warns_message(DeprecationWarning,
- warning_msg,
- scorer, clf, X, y)
-
- assert_warns_message(DeprecationWarning,
- warning_msg,
- cross_val_score, clf, X, y, scoring=name)
-
-
def test_scoring_is_not_metric():
assert_raises_regexp(ValueError, 'make_scorer', check_scoring,
LogisticRegression(), f1_score)
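The deleted test only covered the DeprecationWarning path for the renamed
scorer names; the neg_* spellings are the supported ones going forward. A
hedged usage sketch with the current names (dataset and estimator are
illustrative, mirroring the deleted test's setup):

from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = make_blobs(random_state=0, centers=2)
clf = LogisticRegression(random_state=0)
# prefer the neg_* names over the deprecated spellings
scores = cross_val_score(clf, X, y, scoring='neg_log_loss')
print(scores)  # negated log loss per fold; higher is better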
From 0bf414616b133155f2cad12c063761eec75e0c5c Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Fri, 10 Nov 2017 16:22:53 +0100
Subject: [PATCH 12/36] Remove `hamming_loss` deprecated parameter `classes`
---
sklearn/metrics/classification.py | 14 +-------------
sklearn/metrics/tests/test_classification.py | 1 -
2 files changed, 1 insertion(+), 14 deletions(-)
diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py
index 7d8b887c66624..c14c8ffe855af 100644
--- a/sklearn/metrics/classification.py
+++ b/sklearn/metrics/classification.py
@@ -1528,8 +1528,7 @@ class 2 1.00 0.67 0.80 3
return report
-def hamming_loss(y_true, y_pred, labels=None, sample_weight=None,
- classes=None):
+def hamming_loss(y_true, y_pred, labels=None, sample_weight=None):
"""Compute the average Hamming loss.
The Hamming loss is the fraction of labels that are incorrectly predicted.
@@ -1555,13 +1554,6 @@ def hamming_loss(y_true, y_pred, labels=None, sample_weight=None,
.. versionadded:: 0.18
- classes : array, shape = [n_labels], optional
- Integer array of labels.
-
- .. deprecated:: 0.18
- This parameter has been deprecated in favor of ``labels`` in
- version 0.18 and will be removed in 0.20. Use ``labels`` instead.
-
Returns
-------
loss : float or int,
@@ -1609,10 +1601,6 @@ def hamming_loss(y_true, y_pred, labels=None, sample_weight=None,
>>> hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2)))
0.75
"""
- if classes is not None:
- warnings.warn("'classes' was renamed to 'labels' in version 0.18 and "
- "will be removed in 0.20.", DeprecationWarning)
- labels = classes
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
check_consistent_length(y_true, y_pred, sample_weight)
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index c259036807f7f..4f51f614d2a47 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -936,7 +936,6 @@ def test_multilabel_hamming_loss():
assert_equal(hamming_loss(y1, np.zeros_like(y1), sample_weight=w), 2. / 3)
# sp_hamming only works with 1-D arrays
assert_equal(hamming_loss(y1[0], y2[0]), sp_hamming(y1[0], y2[0]))
- assert_warns(DeprecationWarning, hamming_loss, y1, y2, classes=[0, 1])
def test_multilabel_jaccard_similarity_score():
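After this change hamming_loss accepts only the labels keyword; passing the
removed classes parameter is now an ordinary TypeError rather than a
DeprecationWarning. A minimal sketch mirroring the doctest kept in the
docstring (hedged; not part of the patch):

import numpy as np
from sklearn.metrics import hamming_loss

y_true = np.array([[0, 1], [1, 1]])
y_pred = np.zeros((2, 2))
print(hamming_loss(y_true, y_pred))  # 0.75, as in the doctest above
# hamming_loss(y_true, y_pred, classes=[0, 1])  # now raises TypeError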
From b36341e23c0b515773f5027edef6f18ac80c61d9 Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Fri, 10 Nov 2017 16:41:01 +0100
Subject: [PATCH 13/36] Fix leftover splitter class names in example (issue #6660)
---
examples/model_selection/plot_nested_cross_validation_iris.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/model_selection/plot_nested_cross_validation_iris.py b/examples/model_selection/plot_nested_cross_validation_iris.py
index 917746c359d4b..b40dc91fc4d8f 100644
--- a/examples/model_selection/plot_nested_cross_validation_iris.py
+++ b/examples/model_selection/plot_nested_cross_validation_iris.py
@@ -75,7 +75,7 @@
# Choose cross-validation techniques for the inner and outer loops,
# independently of the dataset.
- # E.g "LabelKFold", "LeaveOneOut", "LeaveOneLabelOut", etc.
+ # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)
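The updated comment points at the model_selection splitters. For reference,
a short sketch of GroupKFold standing in for the removed LabelKFold (hedged;
toy data chosen to match the old LabelKFold doctest deleted elsewhere in
this series):

import numpy as np
from sklearn.model_selection import GroupKFold

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
groups = np.array([0, 0, 2, 2])
for train_index, test_index in GroupKFold(n_splits=2).split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
# each group's samples end up together in a single test fold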
From 4b7aa69655a3ab24398e9ded0c2daf880b49215f Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Fri, 10 Nov 2017 17:45:42 +0100
Subject: [PATCH 14/36] Fix doctest expected output
---
doc/modules/model_evaluation.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 4a19e27e9c11c..a122728e825a6 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -104,7 +104,7 @@ Usage examples:
>>> model = svm.SVC()
>>> cross_val_score(model, X, y, scoring='wrong_choice')
Traceback (most recent call last):
- ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']
+ ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']
.. note::
From 195fcf38a9d43133a60ea1fda49619e3b4b8a34a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Tue, 22 May 2018 13:55:54 -0400
Subject: [PATCH 15/36] unused imports
---
sklearn/decomposition/pca.py | 6 ++----
sklearn/utils/estimator_checks.py | 2 +-
2 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py
index 0c0ec5d179a8a..75080092f55ae 100644
--- a/sklearn/decomposition/pca.py
+++ b/sklearn/decomposition/pca.py
@@ -22,9 +22,7 @@
from ..externals import six
from .base import _BasePCA
-from ..base import BaseEstimator, TransformerMixin
-from ..utils import deprecated
-from ..utils import check_random_state, as_float_array
+from ..utils import check_random_state
from ..utils import check_array
from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
from ..utils.extmath import stable_cumsum
@@ -590,4 +588,4 @@ def score(self, X, y=None):
ll : float
Average log-likelihood of the samples under the current model
"""
- return np.mean(self.score_samples(X))
\ No newline at end of file
+ return np.mean(self.score_samples(X))
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index f7a7a5bceefec..71b5fbe2706c3 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -38,7 +38,7 @@
from sklearn.utils.testing import create_memmap_backed_data
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-from sklearn.base import (clone, TransformerMixin, ClusterMixin,
+from sklearn.base import (clone, ClusterMixin,
BaseEstimator, is_classifier, is_regressor,
is_outlier_detector)
From daa5e4b53a759d3f39081428fbbb2a032b2d2899 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Tue, 22 May 2018 13:56:37 -0400
Subject: [PATCH 16/36] add vscode to gitignore
---
.gitignore | 1 +
1 file changed, 1 insertion(+)
diff --git a/.gitignore b/.gitignore
index 9fa8c09bdf0b0..55050f0a31ed8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,6 +54,7 @@ benchmarks/bench_covertype_data/
*.prefs
.pydevproject
.idea
+.vscode
*.c
*.cpp
From b72c9b9dfebccc593ffa5a8bb87b5e37e15a448b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Tue, 22 May 2018 13:58:02 -0400
Subject: [PATCH 17/36] delete files again after botched merge.
---
sklearn/cross_validation.py | 2069 ------------------------
sklearn/grid_search.py | 1046 ------------
sklearn/tests/test_cross_validation.py | 1252 --------------
sklearn/tests/test_grid_search.py | 815 ----------
4 files changed, 5182 deletions(-)
delete mode 100644 sklearn/cross_validation.py
delete mode 100644 sklearn/grid_search.py
delete mode 100644 sklearn/tests/test_cross_validation.py
delete mode 100644 sklearn/tests/test_grid_search.py
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
deleted file mode 100644
index e06466b2fcf27..0000000000000
--- a/sklearn/cross_validation.py
+++ /dev/null
@@ -1,2069 +0,0 @@
-"""
-The :mod:`sklearn.cross_validation` module includes utilities for cross-
-validation and performance evaluation.
-"""
-
-# Author: Alexandre Gramfort ,
-# Gael Varoquaux ,
-# Olivier Grisel
-# License: BSD 3 clause
-
-from __future__ import print_function
-from __future__ import division
-
-import warnings
-from itertools import chain, combinations
-from math import ceil, floor, factorial
-import numbers
-import time
-from abc import ABCMeta, abstractmethod
-
-import numpy as np
-import scipy.sparse as sp
-
-from .base import is_classifier, clone
-from .utils import indexable, check_random_state, safe_indexing
-from .utils.validation import (_is_arraylike, _num_samples,
- column_or_1d)
-from .utils.multiclass import type_of_target
-from .externals.joblib import Parallel, delayed, logger
-from .externals.six import with_metaclass
-from .externals.six.moves import zip
-from .metrics.scorer import check_scoring
-from .gaussian_process.kernels import Kernel as GPKernel
-from .exceptions import FitFailedWarning
-
-
-warnings.warn("This module was deprecated in version 0.18 in favor of the "
- "model_selection module into which all the refactored classes "
- "and functions are moved. Also note that the interface of the "
- "new CV iterators are different from that of this module. "
- "This module will be removed in 0.20.", DeprecationWarning)
-
-
-__all__ = ['KFold',
- 'LabelKFold',
- 'LeaveOneLabelOut',
- 'LeaveOneOut',
- 'LeavePLabelOut',
- 'LeavePOut',
- 'ShuffleSplit',
- 'StratifiedKFold',
- 'StratifiedShuffleSplit',
- 'PredefinedSplit',
- 'LabelShuffleSplit',
- 'check_cv',
- 'cross_val_score',
- 'cross_val_predict',
- 'permutation_test_score',
- 'train_test_split']
-
-
-class _PartitionIterator(with_metaclass(ABCMeta)):
- """Base class for CV iterators where train_mask = ~test_mask
-
- Implementations must define `_iter_test_masks` or `_iter_test_indices`.
-
- Parameters
- ----------
- n : int
- Total number of elements in dataset.
- """
-
- def __init__(self, n):
- if abs(n - int(n)) >= np.finfo('f').eps:
- raise ValueError("n must be an integer")
- self.n = int(n)
-
- def __iter__(self):
- ind = np.arange(self.n)
- for test_index in self._iter_test_masks():
- train_index = np.logical_not(test_index)
- train_index = ind[train_index]
- test_index = ind[test_index]
- yield train_index, test_index
-
- # Since subclasses must implement either _iter_test_masks or
- # _iter_test_indices, neither can be abstract.
- def _iter_test_masks(self):
- """Generates boolean masks corresponding to test sets.
-
- By default, delegates to _iter_test_indices()
- """
- for test_index in self._iter_test_indices():
- test_mask = self._empty_mask()
- test_mask[test_index] = True
- yield test_mask
-
- def _iter_test_indices(self):
- """Generates integer indices corresponding to test sets."""
- raise NotImplementedError
-
- def _empty_mask(self):
- return np.zeros(self.n, dtype=np.bool)
-
-
-class LeaveOneOut(_PartitionIterator):
- """Leave-One-Out cross validation iterator.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.LeaveOneOut` instead.
-
- Provides train/test indices to split data in train test sets. Each
- sample is used once as a test set (singleton) while the remaining
- samples form the training set.
-
- Note: ``LeaveOneOut(n)`` is equivalent to ``KFold(n, n_folds=n)`` and
- ``LeavePOut(n, p=1)``.
-
- Due to the high number of test sets (which is the same as the
- number of samples) this cross validation method can be very costly.
- For large datasets one should favor KFold, StratifiedKFold or
- ShuffleSplit.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- n : int
- Total number of elements in dataset.
-
- Examples
- --------
- >>> from sklearn import cross_validation
- >>> X = np.array([[1, 2], [3, 4]])
- >>> y = np.array([1, 2])
- >>> loo = cross_validation.LeaveOneOut(2)
- >>> len(loo)
- 2
- >>> print(loo)
- sklearn.cross_validation.LeaveOneOut(n=2)
- >>> for train_index, test_index in loo:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- ... print(X_train, X_test, y_train, y_test)
- TRAIN: [1] TEST: [0]
- [[3 4]] [[1 2]] [2] [1]
- TRAIN: [0] TEST: [1]
- [[1 2]] [[3 4]] [1] [2]
-
- See also
- --------
- LeaveOneLabelOut for splitting the data according to explicit,
- domain-specific stratification of the dataset.
- """
-
- def _iter_test_indices(self):
- return range(self.n)
-
- def __repr__(self):
- return '%s.%s(n=%i)' % (
- self.__class__.__module__,
- self.__class__.__name__,
- self.n,
- )
-
- def __len__(self):
- return self.n
-
-
-class LeavePOut(_PartitionIterator):
- """Leave-P-Out cross validation iterator
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.LeavePOut` instead.
-
- Provides train/test indices to split data in train test sets. This results
- in testing on all distinct samples of size p, while the remaining n - p
- samples form the training set in each iteration.
-
- Note: ``LeavePOut(n, p)`` is NOT equivalent to ``KFold(n, n_folds=n // p)``
- which creates non-overlapping test sets.
-
- Due to the high number of iterations, which grows combinatorially with
- the number of samples, this cross validation method can be very costly.
- For large datasets one should favor KFold, StratifiedKFold or ShuffleSplit.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- n : int
- Total number of elements in dataset.
-
- p : int
- Size of the test sets.
-
- Examples
- --------
- >>> from sklearn import cross_validation
- >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
- >>> y = np.array([1, 2, 3, 4])
- >>> lpo = cross_validation.LeavePOut(4, 2)
- >>> len(lpo)
- 6
- >>> print(lpo)
- sklearn.cross_validation.LeavePOut(n=4, p=2)
- >>> for train_index, test_index in lpo:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- TRAIN: [2 3] TEST: [0 1]
- TRAIN: [1 3] TEST: [0 2]
- TRAIN: [1 2] TEST: [0 3]
- TRAIN: [0 3] TEST: [1 2]
- TRAIN: [0 2] TEST: [1 3]
- TRAIN: [0 1] TEST: [2 3]
- """
-
- def __init__(self, n, p):
- super(LeavePOut, self).__init__(n)
- self.p = p
-
- def _iter_test_indices(self):
- for comb in combinations(range(self.n), self.p):
- yield np.array(comb)
-
- def __repr__(self):
- return '%s.%s(n=%i, p=%i)' % (
- self.__class__.__module__,
- self.__class__.__name__,
- self.n,
- self.p,
- )
-
- def __len__(self):
- return int(factorial(self.n) / factorial(self.n - self.p)
- / factorial(self.p))
-
-
-class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)):
- """Base class to validate KFold approaches"""
-
- @abstractmethod
- def __init__(self, n, n_folds, shuffle, random_state):
- super(_BaseKFold, self).__init__(n)
-
- if abs(n_folds - int(n_folds)) >= np.finfo('f').eps:
- raise ValueError("n_folds must be an integer")
- self.n_folds = n_folds = int(n_folds)
-
- if n_folds <= 1:
- raise ValueError(
- "k-fold cross validation requires at least one"
- " train / test split by setting n_folds=2 or more,"
- " got n_folds={0}.".format(n_folds))
- if n_folds > self.n:
- raise ValueError(
- ("Cannot have number of folds n_folds={0} greater"
- " than the number of samples: {1}.").format(n_folds, n))
-
- if not isinstance(shuffle, bool):
- raise TypeError("shuffle must be True or False;"
- " got {0}".format(shuffle))
- self.shuffle = shuffle
- self.random_state = random_state
-
-
-class KFold(_BaseKFold):
- """K-Folds cross validation iterator.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.KFold` instead.
-
- Provides train/test indices to split data in train test sets. Split
- dataset into k consecutive folds (without shuffling by default).
-
- Each fold is then used as a validation set once while the k - 1 remaining
- fold(s) form the training set.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- n : int
- Total number of elements.
-
- n_folds : int, default=3
- Number of folds. Must be at least 2.
-
- shuffle : boolean, optional
- Whether to shuffle the data before splitting into batches.
-
- random_state : int, RandomState instance or None, optional, default=None
- If int, random_state is the seed used by the random number
- generator; If RandomState instance, random_state is the random number
- generator; If None, the random number generator is the RandomState
- instance used by `np.random`. Used when ``shuffle`` == True.
-
- Examples
- --------
- >>> from sklearn.cross_validation import KFold
- >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
- >>> y = np.array([1, 2, 3, 4])
- >>> kf = KFold(4, n_folds=2)
- >>> len(kf)
- 2
- >>> print(kf) # doctest: +NORMALIZE_WHITESPACE
- sklearn.cross_validation.KFold(n=4, n_folds=2, shuffle=False,
- random_state=None)
- >>> for train_index, test_index in kf:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- TRAIN: [2 3] TEST: [0 1]
- TRAIN: [0 1] TEST: [2 3]
-
- Notes
- -----
- The first n % n_folds folds have size n // n_folds + 1, other folds have
- size n // n_folds.
-
- See also
- --------
- StratifiedKFold takes label information into account to avoid building
- folds with imbalanced class distributions (for binary or multiclass
- classification tasks).
-
- LabelKFold: K-fold iterator variant with non-overlapping labels.
- """
-
- def __init__(self, n, n_folds=3, shuffle=False,
- random_state=None):
- super(KFold, self).__init__(n, n_folds, shuffle, random_state)
- self.idxs = np.arange(n)
- if shuffle:
- rng = check_random_state(self.random_state)
- rng.shuffle(self.idxs)
-
- def _iter_test_indices(self):
- n = self.n
- n_folds = self.n_folds
- fold_sizes = (n // n_folds) * np.ones(n_folds, dtype=np.int)
- fold_sizes[:n % n_folds] += 1
- current = 0
- for fold_size in fold_sizes:
- start, stop = current, current + fold_size
- yield self.idxs[start:stop]
- current = stop
-
- def __repr__(self):
- return '%s.%s(n=%i, n_folds=%i, shuffle=%s, random_state=%s)' % (
- self.__class__.__module__,
- self.__class__.__name__,
- self.n,
- self.n_folds,
- self.shuffle,
- self.random_state,
- )
-
- def __len__(self):
- return self.n_folds
-
-
-class LabelKFold(_BaseKFold):
- """K-fold iterator variant with non-overlapping labels.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.GroupKFold` instead.
-
- The same label will not appear in two different folds (the number of
- distinct labels has to be at least equal to the number of folds).
-
- The folds are approximately balanced in the sense that the number of
- distinct labels is approximately the same in each fold.
-
- .. versionadded:: 0.17
-
- Parameters
- ----------
- labels : array-like with shape (n_samples, )
- Contains a label for each sample.
- The folds are built so that the same label does not appear in two
- different folds.
-
- n_folds : int, default=3
- Number of folds. Must be at least 2.
-
- Examples
- --------
- >>> from sklearn.cross_validation import LabelKFold
- >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
- >>> y = np.array([1, 2, 3, 4])
- >>> labels = np.array([0, 0, 2, 2])
- >>> label_kfold = LabelKFold(labels, n_folds=2)
- >>> len(label_kfold)
- 2
- >>> print(label_kfold)
- sklearn.cross_validation.LabelKFold(n_labels=4, n_folds=2)
- >>> for train_index, test_index in label_kfold:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- ... print(X_train, X_test, y_train, y_test)
- ...
- TRAIN: [0 1] TEST: [2 3]
- [[1 2]
- [3 4]] [[5 6]
- [7 8]] [1 2] [3 4]
- TRAIN: [2 3] TEST: [0 1]
- [[5 6]
- [7 8]] [[1 2]
- [3 4]] [3 4] [1 2]
-
- See also
- --------
- LeaveOneLabelOut for splitting the data according to explicit,
- domain-specific stratification of the dataset.
- """
- def __init__(self, labels, n_folds=3):
- super(LabelKFold, self).__init__(len(labels), n_folds,
- shuffle=False, random_state=None)
-
- unique_labels, labels = np.unique(labels, return_inverse=True)
- n_labels = len(unique_labels)
-
- if n_folds > n_labels:
- raise ValueError(
- ("Cannot have number of folds n_folds={0} greater"
- " than the number of labels: {1}.").format(n_folds,
- n_labels))
-
- # Weight labels by their number of occurrences
- n_samples_per_label = np.bincount(labels)
-
- # Distribute the most frequent labels first
- indices = np.argsort(n_samples_per_label)[::-1]
- n_samples_per_label = n_samples_per_label[indices]
-
- # Total weight of each fold
- n_samples_per_fold = np.zeros(n_folds)
-
- # Mapping from label index to fold index
- label_to_fold = np.zeros(len(unique_labels))
-
- # Distribute samples by adding the largest weight to the lightest fold
- for label_index, weight in enumerate(n_samples_per_label):
- lightest_fold = np.argmin(n_samples_per_fold)
- n_samples_per_fold[lightest_fold] += weight
- label_to_fold[indices[label_index]] = lightest_fold
-
- self.idxs = label_to_fold[labels]
-
- def _iter_test_indices(self):
- for f in range(self.n_folds):
- yield np.where(self.idxs == f)[0]
-
- def __repr__(self):
- return '{0}.{1}(n_labels={2}, n_folds={3})'.format(
- self.__class__.__module__,
- self.__class__.__name__,
- self.n,
- self.n_folds,
- )
-
- def __len__(self):
- return self.n_folds
-
-
-class StratifiedKFold(_BaseKFold):
- """Stratified K-Folds cross validation iterator
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.StratifiedKFold` instead.
-
- Provides train/test indices to split data in train test sets.
-
- This cross-validation object is a variation of KFold that
- returns stratified folds. The folds are made by preserving
- the percentage of samples for each class.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- y : array-like, [n_samples]
- Samples to split in K folds.
-
- n_folds : int, default=3
- Number of folds. Must be at least 2.
-
- shuffle : boolean, optional
- Whether to shuffle each stratification of the data before splitting
- into batches.
-
- random_state : int, RandomState instance or None, optional, default=None
- If int, random_state is the seed used by the random number
- generator; If RandomState instance, random_state is the random number
- generator; If None, the random number generator is the RandomState
- instance used by `np.random`. Used when ``shuffle`` == True.
-
- Examples
- --------
- >>> from sklearn.cross_validation import StratifiedKFold
- >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
- >>> y = np.array([0, 0, 1, 1])
- >>> skf = StratifiedKFold(y, n_folds=2)
- >>> len(skf)
- 2
- >>> print(skf) # doctest: +NORMALIZE_WHITESPACE
- sklearn.cross_validation.StratifiedKFold(labels=[0 0 1 1], n_folds=2,
- shuffle=False, random_state=None)
- >>> for train_index, test_index in skf:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- TRAIN: [1 3] TEST: [0 2]
- TRAIN: [0 2] TEST: [1 3]
-
- Notes
- -----
- Train and test sizes may be different in each fold, with a difference of at
- most ``n_classes``.
-
- See also
- --------
- LabelKFold: K-fold iterator variant with non-overlapping labels.
- """
-
- def __init__(self, y, n_folds=3, shuffle=False,
- random_state=None):
- super(StratifiedKFold, self).__init__(
- len(y), n_folds, shuffle, random_state)
- y = np.asarray(y)
- n_samples = y.shape[0]
- unique_labels, y_inversed = np.unique(y, return_inverse=True)
- label_counts = np.bincount(y_inversed)
- min_labels = np.min(label_counts)
- if np.all(self.n_folds > label_counts):
- raise ValueError("All the n_labels for individual classes"
- " are less than %d folds."
- % (self.n_folds))
- if self.n_folds > min_labels:
- warnings.warn(("The least populated class in y has only %d"
- " members, which is too few. The minimum"
- " number of labels for any class cannot"
- " be less than n_folds=%d."
- % (min_labels, self.n_folds)), Warning)
-
- # don't want to use the same seed in each label's shuffle
- if self.shuffle:
- rng = check_random_state(self.random_state)
- else:
- rng = self.random_state
-
- # pre-assign each sample to a test fold index using individual KFold
- # splitting strategies for each label so as to respect the
- # balance of labels
- per_label_cvs = [
- KFold(max(c, self.n_folds), self.n_folds, shuffle=self.shuffle,
- random_state=rng) for c in label_counts]
- test_folds = np.zeros(n_samples, dtype=np.int)
- for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)):
- for label, (_, test_split) in zip(unique_labels, per_label_splits):
- label_test_folds = test_folds[y == label]
- # the test split can be too big because we used
- # KFold(max(c, self.n_folds), self.n_folds) instead of
- # KFold(c, self.n_folds) to make it possible to not crash even
- # if the data is not 100% stratifiable for all the labels
- # (we use a warning instead of raising an exception)
- # If this is the case, let's trim it:
- test_split = test_split[test_split < len(label_test_folds)]
- label_test_folds[test_split] = test_fold_idx
- test_folds[y == label] = label_test_folds
-
- self.test_folds = test_folds
- self.y = y
-
- def _iter_test_masks(self):
- for i in range(self.n_folds):
- yield self.test_folds == i
-
- def __repr__(self):
- return '%s.%s(labels=%s, n_folds=%i, shuffle=%s, random_state=%s)' % (
- self.__class__.__module__,
- self.__class__.__name__,
- self.y,
- self.n_folds,
- self.shuffle,
- self.random_state,
- )
-
- def __len__(self):
- return self.n_folds
-
-
-class LeaveOneLabelOut(_PartitionIterator):
- """Leave-One-Label_Out cross-validation iterator
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.LeaveOneGroupOut` instead.
-
- Provides train/test indices to split data according to a third-party
- provided label. This label information can be used to encode arbitrary
- domain specific stratifications of the samples as integers.
-
- For instance the labels could be the year of collection of the samples
- and thus allow for cross-validation against time-based splits.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- labels : array-like of int with shape (n_samples,)
- Arbitrary domain-specific stratification of the data to be used
- to draw the splits.
-
- Examples
- --------
- >>> from sklearn import cross_validation
- >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
- >>> y = np.array([1, 2, 1, 2])
- >>> labels = np.array([1, 1, 2, 2])
- >>> lol = cross_validation.LeaveOneLabelOut(labels)
- >>> len(lol)
- 2
- >>> print(lol)
- sklearn.cross_validation.LeaveOneLabelOut(labels=[1 1 2 2])
- >>> for train_index, test_index in lol:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- ... print(X_train, X_test, y_train, y_test)
- TRAIN: [2 3] TEST: [0 1]
- [[5 6]
- [7 8]] [[1 2]
- [3 4]] [1 2] [1 2]
- TRAIN: [0 1] TEST: [2 3]
- [[1 2]
- [3 4]] [[5 6]
- [7 8]] [1 2] [1 2]
-
- See also
- --------
- LabelKFold: K-fold iterator variant with non-overlapping labels.
- """
-
- def __init__(self, labels):
- super(LeaveOneLabelOut, self).__init__(len(labels))
- # We make a copy of labels to avoid side-effects during iteration
- self.labels = np.array(labels, copy=True)
- self.unique_labels = np.unique(labels)
- self.n_unique_labels = len(self.unique_labels)
-
- def _iter_test_masks(self):
- for i in self.unique_labels:
- yield self.labels == i
-
- def __repr__(self):
- return '%s.%s(labels=%s)' % (
- self.__class__.__module__,
- self.__class__.__name__,
- self.labels,
- )
-
- def __len__(self):
- return self.n_unique_labels
-
-
-class LeavePLabelOut(_PartitionIterator):
- """Leave-P-Label_Out cross-validation iterator
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.LeavePGroupsOut` instead.
-
- Provides train/test indices to split data according to a third-party
- provided label. This label information can be used to encode arbitrary
- domain specific stratifications of the samples as integers.
-
- For instance the labels could be the year of collection of the samples
- and thus allow for cross-validation against time-based splits.
-
- The difference between LeavePLabelOut and LeaveOneLabelOut is that
- the former builds the test sets with all the samples assigned to
- ``p`` different values of the labels while the latter uses samples
- all assigned the same label.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- labels : array-like of int with shape (n_samples,)
- Arbitrary domain-specific stratification of the data to be used
- to draw the splits.
-
- p : int
- Number of labels to leave out in the test split.
-
- Examples
- --------
- >>> from sklearn import cross_validation
- >>> X = np.array([[1, 2], [3, 4], [5, 6]])
- >>> y = np.array([1, 2, 1])
- >>> labels = np.array([1, 2, 3])
- >>> lpl = cross_validation.LeavePLabelOut(labels, p=2)
- >>> len(lpl)
- 3
- >>> print(lpl)
- sklearn.cross_validation.LeavePLabelOut(labels=[1 2 3], p=2)
- >>> for train_index, test_index in lpl:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- ... print(X_train, X_test, y_train, y_test)
- TRAIN: [2] TEST: [0 1]
- [[5 6]] [[1 2]
- [3 4]] [1] [1 2]
- TRAIN: [1] TEST: [0 2]
- [[3 4]] [[1 2]
- [5 6]] [2] [1 1]
- TRAIN: [0] TEST: [1 2]
- [[1 2]] [[3 4]
- [5 6]] [1] [2 1]
-
- See also
- --------
- LabelKFold: K-fold iterator variant with non-overlapping labels.
- """
-
- def __init__(self, labels, p):
- # We make a copy of labels to avoid side-effects during iteration
- super(LeavePLabelOut, self).__init__(len(labels))
- self.labels = np.array(labels, copy=True)
- self.unique_labels = np.unique(labels)
- self.n_unique_labels = len(self.unique_labels)
- self.p = p
-
- def _iter_test_masks(self):
- comb = combinations(range(self.n_unique_labels), self.p)
- for idx in comb:
- test_index = self._empty_mask()
- idx = np.array(idx)
- for l in self.unique_labels[idx]:
- test_index[self.labels == l] = True
- yield test_index
-
- def __repr__(self):
- return '%s.%s(labels=%s, p=%s)' % (
- self.__class__.__module__,
- self.__class__.__name__,
- self.labels,
- self.p,
- )
-
- def __len__(self):
- return int(factorial(self.n_unique_labels) /
- factorial(self.n_unique_labels - self.p) /
- factorial(self.p))
-
-
-class BaseShuffleSplit(with_metaclass(ABCMeta)):
- """Base class for ShuffleSplit and StratifiedShuffleSplit"""
-
- def __init__(self, n, n_iter=10, test_size=0.1, train_size=None,
- random_state=None):
- self.n = n
- self.n_iter = n_iter
- self.test_size = test_size
- self.train_size = train_size
- self.random_state = random_state
- self.n_train, self.n_test = _validate_shuffle_split(n, test_size,
- train_size)
-
- def __iter__(self):
- for train, test in self._iter_indices():
- yield train, test
- return
-
- @abstractmethod
- def _iter_indices(self):
- """Generate (train, test) indices"""
-
-
-class ShuffleSplit(BaseShuffleSplit):
- """Random permutation cross-validation iterator.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.ShuffleSplit` instead.
-
- Yields indices to split data into training and test sets.
-
- Note: contrary to other cross-validation strategies, random splits
- do not guarantee that all folds will be different, although this is
- still very likely for sizeable datasets.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- n : int
- Total number of elements in the dataset.
-
- n_iter : int (default 10)
- Number of re-shuffling & splitting iterations.
-
- test_size : float (default 0.1), int, or None
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the dataset to include in the test split. If
- int, represents the absolute number of test samples. If None,
- the value is automatically set to the complement of the train size.
-
- train_size : float, int, or None (default is None)
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the dataset to include in the train split. If
- int, represents the absolute number of train samples. If None,
- the value is automatically set to the complement of the test size.
-
- random_state : int, RandomState instance or None, optional (default None)
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- Examples
- --------
- >>> from sklearn import cross_validation
- >>> rs = cross_validation.ShuffleSplit(4, n_iter=3,
- ... test_size=.25, random_state=0)
- >>> len(rs)
- 3
- >>> print(rs)
- ... # doctest: +ELLIPSIS
- ShuffleSplit(4, n_iter=3, test_size=0.25, ...)
- >>> for train_index, test_index in rs:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ...
- TRAIN: [3 1 0] TEST: [2]
- TRAIN: [2 1 3] TEST: [0]
- TRAIN: [0 2 1] TEST: [3]
-
- >>> rs = cross_validation.ShuffleSplit(4, n_iter=3,
- ... train_size=0.5, test_size=.25, random_state=0)
- >>> for train_index, test_index in rs:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ...
- TRAIN: [3 1] TEST: [2]
- TRAIN: [2 1] TEST: [0]
- TRAIN: [0 2] TEST: [3]
-
- """
-
- def _iter_indices(self):
- rng = check_random_state(self.random_state)
- for i in range(self.n_iter):
- # random partition
- permutation = rng.permutation(self.n)
- ind_test = permutation[:self.n_test]
- ind_train = permutation[self.n_test:self.n_test + self.n_train]
- yield ind_train, ind_test
-
- def __repr__(self):
- return ('%s(%d, n_iter=%d, test_size=%s, '
- 'random_state=%s)' % (
- self.__class__.__name__,
- self.n,
- self.n_iter,
- str(self.test_size),
- self.random_state,
- ))
-
- def __len__(self):
- return self.n_iter
-
-
-def _validate_shuffle_split(n, test_size, train_size):
- if test_size is None and train_size is None:
- raise ValueError(
- 'test_size and train_size can not both be None')
-
- if test_size is not None:
- if np.asarray(test_size).dtype.kind == 'f':
- if test_size >= 1.:
- raise ValueError(
- 'test_size=%f should be smaller '
- 'than 1.0 or be an integer' % test_size)
- elif np.asarray(test_size).dtype.kind == 'i':
- if test_size >= n:
- raise ValueError(
- 'test_size=%d should be smaller '
- 'than the number of samples %d' % (test_size, n))
- else:
- raise ValueError("Invalid value for test_size: %r" % test_size)
-
- if train_size is not None:
- if np.asarray(train_size).dtype.kind == 'f':
- if train_size >= 1.:
- raise ValueError("train_size=%f should be smaller "
- "than 1.0 or be an integer" % train_size)
- elif np.asarray(test_size).dtype.kind == 'f' and \
- train_size + test_size > 1.:
- raise ValueError('The sum of test_size and train_size = %f, '
- 'should be smaller than 1.0. Reduce '
- 'test_size and/or train_size.' %
- (train_size + test_size))
- elif np.asarray(train_size).dtype.kind == 'i':
- if train_size >= n:
- raise ValueError("train_size=%d should be smaller "
- "than the number of samples %d" %
- (train_size, n))
- else:
- raise ValueError("Invalid value for train_size: %r" % train_size)
-
- if np.asarray(test_size).dtype.kind == 'f':
- n_test = ceil(test_size * n)
- elif np.asarray(test_size).dtype.kind == 'i':
- n_test = float(test_size)
-
- if train_size is None:
- n_train = n - n_test
- else:
- if np.asarray(train_size).dtype.kind == 'f':
- n_train = floor(train_size * n)
- else:
- n_train = float(train_size)
-
- if test_size is None:
- n_test = n - n_train
-
- if n_train + n_test > n:
- raise ValueError('The sum of train_size and test_size = %d, '
- 'should be smaller than the number of '
- 'samples %d. Reduce test_size and/or '
- 'train_size.' % (n_train + n_test, n))
-
- return int(n_train), int(n_test)
-
-
-def _approximate_mode(class_counts, n_draws, rng):
- """Computes approximate mode of multivariate hypergeometric.
-
- This is an approximation to the mode of the multivariate
- hypergeometric given by class_counts and n_draws.
- It shouldn't be off by more than one.
-
- It is the most likely outcome of drawing n_draws
- samples from the population given by class_counts.
-
- Parameters
- ----------
- class_counts : ndarray of int
- Population per class.
- n_draws : int
- Number of draws (samples to draw) from the overall population.
- rng : random state
- Used to break ties.
-
- Returns
- -------
- sampled_classes : ndarray of int
- Number of samples drawn from each class.
- np.sum(sampled_classes) == n_draws
- """
- # this computes a bad approximation to the mode of the
- # multivariate hypergeometric given by class_counts and n_draws
- continuous = n_draws * class_counts / class_counts.sum()
- # floored means we don't overshoot n_samples, but probably undershoot
- floored = np.floor(continuous)
- # we add samples according to how much "left over" probability
- # they had, until we arrive at n_samples
- need_to_add = int(n_draws - floored.sum())
- if need_to_add > 0:
- remainder = continuous - floored
- values = np.sort(np.unique(remainder))[::-1]
- # add according to remainder, but break ties
- # randomly to avoid biases
- for value in values:
- inds, = np.where(remainder == value)
- # if we need_to_add less than what's in inds
- # we draw randomly from them.
- # if we need to add more, we add them all and
- # go to the next value
- add_now = min(len(inds), need_to_add)
- inds = rng.choice(inds, size=add_now, replace=False)
- floored[inds] += 1
- need_to_add -= add_now
- if need_to_add == 0:
- break
- return floored.astype(np.int)
-
-
-class StratifiedShuffleSplit(BaseShuffleSplit):
- """Stratified ShuffleSplit cross validation iterator
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.StratifiedShuffleSplit` instead.
-
- Provides train/test indices to split data in train test sets.
-
- This cross-validation object is a merge of StratifiedKFold and
- ShuffleSplit, which returns stratified randomized folds. The folds
- are made by preserving the percentage of samples for each class.
-
- Note: like the ShuffleSplit strategy, stratified random splits
- do not guarantee that all folds will be different, although this is
- still very likely for sizeable datasets.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- y : array, [n_samples]
- Labels of samples.
-
- n_iter : int (default 10)
- Number of re-shuffling & splitting iterations.
-
- test_size : float (default 0.1), int, or None
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the dataset to include in the test split. If
- int, represents the absolute number of test samples. If None,
- the value is automatically set to the complement of the train size.
-
- train_size : float, int, or None (default is None)
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the dataset to include in the train split. If
- int, represents the absolute number of train samples. If None,
- the value is automatically set to the complement of the test size.
-
- random_state : int, RandomState instance or None, optional (default None)
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- Examples
- --------
- >>> from sklearn.cross_validation import StratifiedShuffleSplit
- >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
- >>> y = np.array([0, 0, 1, 1])
- >>> sss = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0)
- >>> len(sss)
- 3
- >>> print(sss) # doctest: +ELLIPSIS
- StratifiedShuffleSplit(labels=[0 0 1 1], n_iter=3, ...)
- >>> for train_index, test_index in sss:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- TRAIN: [1 2] TEST: [3 0]
- TRAIN: [0 2] TEST: [1 3]
- TRAIN: [0 2] TEST: [3 1]
- """
-
- def __init__(self, y, n_iter=10, test_size=0.1, train_size=None,
- random_state=None):
-
- super(StratifiedShuffleSplit, self).__init__(
- len(y), n_iter, test_size, train_size, random_state)
-
- self.y = np.array(y)
- self.classes, self.y_indices = np.unique(y, return_inverse=True)
- n_cls = self.classes.shape[0]
-
- if np.min(np.bincount(self.y_indices)) < 2:
- raise ValueError("The least populated class in y has only 1"
- " member, which is too few. The minimum"
- " number of labels for any class cannot"
- " be less than 2.")
-
- if self.n_train < n_cls:
- raise ValueError('The train_size = %d should be greater or '
- 'equal to the number of classes = %d' %
- (self.n_train, n_cls))
- if self.n_test < n_cls:
- raise ValueError('The test_size = %d should be greater or '
- 'equal to the number of classes = %d' %
- (self.n_test, n_cls))
-
- def _iter_indices(self):
- rng = check_random_state(self.random_state)
- cls_count = np.bincount(self.y_indices)
-
- for n in range(self.n_iter):
- # if there are ties in the class-counts, we want
- # to make sure to break them anew in each iteration
- n_i = _approximate_mode(cls_count, self.n_train, rng)
- class_counts_remaining = cls_count - n_i
- t_i = _approximate_mode(class_counts_remaining, self.n_test, rng)
-
- train = []
- test = []
-
- for i, _ in enumerate(self.classes):
- permutation = rng.permutation(cls_count[i])
- perm_indices_class_i = np.where(
- (i == self.y_indices))[0][permutation]
-
- train.extend(perm_indices_class_i[:n_i[i]])
- test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])
- train = rng.permutation(train)
- test = rng.permutation(test)
-
- yield train, test
-
- def __repr__(self):
- return ('%s(labels=%s, n_iter=%d, test_size=%s, '
- 'random_state=%s)' % (
- self.__class__.__name__,
- self.y,
- self.n_iter,
- str(self.test_size),
- self.random_state,
- ))
-
- def __len__(self):
- return self.n_iter
-
-
-class PredefinedSplit(_PartitionIterator):
- """Predefined split cross validation iterator
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.PredefinedSplit` instead.
-
- Splits the data into training/test set folds according to a predefined
- scheme. Each sample can be assigned to at most one test set fold, as
- specified by the user through the ``test_fold`` parameter.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- test_fold : "array-like, shape (n_samples,)
- test_fold[i] gives the test set fold of sample i. A value of -1
- indicates that the corresponding sample is not part of any test set
- folds, but will instead always be put into the training fold.
-
- Examples
- --------
- >>> from sklearn.cross_validation import PredefinedSplit
- >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
- >>> y = np.array([0, 0, 1, 1])
- >>> ps = PredefinedSplit(test_fold=[0, 1, -1, 1])
- >>> len(ps)
- 2
- >>> print(ps) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
- sklearn.cross_validation.PredefinedSplit(test_fold=[ 0 1 -1 1])
- >>> for train_index, test_index in ps:
- ... print("TRAIN:", train_index, "TEST:", test_index)
- ... X_train, X_test = X[train_index], X[test_index]
- ... y_train, y_test = y[train_index], y[test_index]
- TRAIN: [1 2 3] TEST: [0]
- TRAIN: [0 2] TEST: [1 3]
- """
-
- def __init__(self, test_fold):
- super(PredefinedSplit, self).__init__(len(test_fold))
- self.test_fold = np.array(test_fold, dtype=np.int)
- self.test_fold = column_or_1d(self.test_fold)
- self.unique_folds = np.unique(self.test_fold)
- self.unique_folds = self.unique_folds[self.unique_folds != -1]
-
- def _iter_test_indices(self):
- for f in self.unique_folds:
- yield np.where(self.test_fold == f)[0]
-
- def __repr__(self):
- return '%s.%s(test_fold=%s)' % (
- self.__class__.__module__,
- self.__class__.__name__,
- self.test_fold)
-
- def __len__(self):
- return len(self.unique_folds)
-
-
-class LabelShuffleSplit(ShuffleSplit):
- """Shuffle-Labels-Out cross-validation iterator
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.GroupShuffleSplit` instead.
-
- Provides randomized train/test indices to split data according to a
- third-party provided label. This label information can be used to encode
- arbitrary domain specific stratifications of the samples as integers.
-
- For instance the labels could be the year of collection of the samples
- and thus allow for cross-validation against time-based splits.
-
- The difference between LeavePLabelOut and LabelShuffleSplit is that
- the former generates splits using all subsets of size ``p`` unique labels,
- whereas LabelShuffleSplit generates a user-determined number of random
- test splits, each with a user-determined fraction of unique labels.
-
- For example, a less computationally intensive alternative to
- ``LeavePLabelOut(labels, p=10)`` would be
- ``LabelShuffleSplit(labels, test_size=10, n_iter=100)``.
-
- Note: The parameters ``test_size`` and ``train_size`` refer to labels, and
- not to samples, as in ShuffleSplit.
-
- .. versionadded:: 0.17
-
- Parameters
- ----------
- labels : array, [n_samples]
- Labels of samples
-
- n_iter : int (default 5)
- Number of re-shuffling and splitting iterations.
-
- test_size : float (default 0.2), int, or None
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the labels to include in the test split. If
- int, represents the absolute number of test labels. If None,
- the value is automatically set to the complement of the train size.
-
- train_size : float, int, or None (default is None)
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the labels to include in the train split. If
- int, represents the absolute number of train labels. If None,
- the value is automatically set to the complement of the test size.
-
- random_state : int, RandomState instance or None, optional (default None)
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
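- Examples
- --------
- A minimal usage sketch; the exact indices drawn depend on
- ``random_state``, so no output is asserted here:
-
- >>> from sklearn.cross_validation import LabelShuffleSplit
- >>> labels = [1, 1, 2, 2, 3, 3]
- >>> lss = LabelShuffleSplit(labels, n_iter=2, test_size=0.5,
- ...                         random_state=0)
- >>> for train_index, test_index in lss:  # doctest: +SKIP
- ...     print("TRAIN:", train_index, "TEST:", test_index)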
- """
- def __init__(self, labels, n_iter=5, test_size=0.2, train_size=None,
- random_state=None):
-
- classes, label_indices = np.unique(labels, return_inverse=True)
-
- super(LabelShuffleSplit, self).__init__(
- len(classes),
- n_iter=n_iter,
- test_size=test_size,
- train_size=train_size,
- random_state=random_state)
-
- self.labels = labels
- self.classes = classes
- self.label_indices = label_indices
-
- def __repr__(self):
- return ('%s(labels=%s, n_iter=%d, test_size=%s, '
- 'random_state=%s)' % (
- self.__class__.__name__,
- self.labels,
- self.n_iter,
- str(self.test_size),
- self.random_state,
- ))
-
- def __len__(self):
- return self.n_iter
-
- def _iter_indices(self):
- for label_train, label_test in super(LabelShuffleSplit,
- self)._iter_indices():
- # these are the indices of classes in the partition
- # invert them into data indices
-
- train = np.flatnonzero(np.in1d(self.label_indices, label_train))
- test = np.flatnonzero(np.in1d(self.label_indices, label_test))
-
- yield train, test
-
-
-##############################################################################
-def _index_param_value(X, v, indices):
- """Private helper function for parameter value indexing."""
- if not _is_arraylike(v) or _num_samples(v) != _num_samples(X):
- # pass through: skip indexing
- return v
- if sp.issparse(v):
- v = v.tocsr()
- return safe_indexing(v, indices)
-
-
-def cross_val_predict(estimator, X, y=None, cv=None, n_jobs=1,
- verbose=0, fit_params=None, pre_dispatch='2*n_jobs'):
- """Generate cross-validated estimates for each input data point
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :func:`sklearn.model_selection.cross_val_predict` instead.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- estimator : estimator object implementing 'fit' and 'predict'
- The object to use to fit the data.
-
- X : array-like
- The data to fit. Can be, for example, a list or an array of at
- least 2 dimensions.
-
- y : array-like, optional, default: None
- The target variable to try to predict in the case of
- supervised learning.
-
- cv : int, cross-validation generator or an iterable, optional
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
-
- - None, to use the default 3-fold cross-validation,
- - integer, to specify the number of folds,
- - an object to be used as a cross-validation generator, or
- - an iterable yielding train/test splits.
-
- For integer/None inputs, if the estimator is a classifier and ``y`` is
- either binary or multiclass, :class:`StratifiedKFold` is used. In all
- other cases, :class:`KFold` is used.
-
- Refer to the :ref:`User Guide ` for the various
- cross-validation strategies that can be used here.
-
- n_jobs : integer, optional
- The number of CPUs to use to do the computation. -1 means
- 'all CPUs'.
-
- verbose : integer, optional
- The verbosity level.
-
- fit_params : dict, optional
- Parameters to pass to the fit method of the estimator.
-
- pre_dispatch : int, or string, optional
- Controls the number of jobs that get dispatched during parallel
- execution. Reducing this number can be useful to avoid an
- explosion of memory consumption when more jobs get dispatched
- than CPUs can process. This parameter can be:
-
- - None, in which case all the jobs are immediately
- created and spawned. Use this for lightweight and
- fast-running jobs, to avoid delays due to on-demand
- spawning of the jobs
-
- - An int, giving the exact number of total jobs that are
- spawned
-
- - A string, giving an expression as a function of n_jobs,
- as in '2*n_jobs'
-
- Returns
- -------
- preds : ndarray
- This is the result of calling 'predict'.
-
- Examples
- --------
- >>> from sklearn import datasets, linear_model
- >>> from sklearn.cross_validation import cross_val_predict
- >>> diabetes = datasets.load_diabetes()
- >>> X = diabetes.data[:150]
- >>> y = diabetes.target[:150]
- >>> lasso = linear_model.Lasso()
- >>> y_pred = cross_val_predict(lasso, X, y)
- """
- X, y = indexable(X, y)
-
- cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
- # We clone the estimator to make sure that all the folds are
- # independent, and that it is pickle-able.
- parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
- pre_dispatch=pre_dispatch)
- preds_blocks = parallel(delayed(_fit_and_predict)(clone(estimator), X, y,
- train, test, verbose,
- fit_params)
- for train, test in cv)
-
- preds = [p for p, _ in preds_blocks]
- locs = np.concatenate([loc for _, loc in preds_blocks])
- if not _check_is_partition(locs, _num_samples(X)):
- raise ValueError('cross_val_predict only works for partitions')
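- # invert the concatenated test-fold ordering so that the
- # predictions line up with the original row order of X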
- inv_locs = np.empty(len(locs), dtype=int)
- inv_locs[locs] = np.arange(len(locs))
-
- # Check for sparse predictions
- if sp.issparse(preds[0]):
- preds = sp.vstack(preds, format=preds[0].format)
- else:
- preds = np.concatenate(preds)
- return preds[inv_locs]
-
-
-def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params):
- """Fit estimator and predict values for a given dataset split.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- estimator : estimator object implementing 'fit' and 'predict'
- The object to use to fit the data.
-
- X : array-like of shape at least 2D
- The data to fit.
-
- y : array-like, optional, default: None
- The target variable to try to predict in the case of
- supervised learning.
-
- train : array-like, shape (n_train_samples,)
- Indices of training samples.
-
- test : array-like, shape (n_test_samples,)
- Indices of test samples.
-
- verbose : integer
- The verbosity level.
-
- fit_params : dict or None
- Parameters that will be passed to ``estimator.fit``.
-
- Returns
- -------
- preds : sequence
- Result of calling 'estimator.predict'.
-
- test : array-like
- This is the value of the ``test`` parameter.
- """
- # Adjust length of sample weights
- fit_params = fit_params if fit_params is not None else {}
- fit_params = dict([(k, _index_param_value(X, v, train))
- for k, v in fit_params.items()])
-
- X_train, y_train = _safe_split(estimator, X, y, train)
- X_test, _ = _safe_split(estimator, X, y, test, train)
-
- if y_train is None:
- estimator.fit(X_train, **fit_params)
- else:
- estimator.fit(X_train, y_train, **fit_params)
- preds = estimator.predict(X_test)
- return preds, test
-
-
-def _check_is_partition(locs, n):
- """Check whether locs is a reordering of the array np.arange(n)
-
- Parameters
- ----------
- locs : ndarray
- integer array to test
- n : int
- number of expected elements
-
- Returns
- -------
- is_partition : bool
- True iff sorted(locs) is range(n)
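-
- Examples
- --------
- A small deterministic sanity check (``np`` is the ``numpy`` import
- already used throughout this module):
-
- >>> _check_is_partition(np.array([2, 0, 1]), 3)
- True
- >>> _check_is_partition(np.array([0, 0, 1]), 3)
- False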
- """
- if len(locs) != n:
- return False
- hit = np.zeros(n, bool)
- hit[locs] = True
- if not np.all(hit):
- return False
- return True
-
-
-def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
- verbose=0, fit_params=None, pre_dispatch='2*n_jobs'):
- """Evaluate a score by cross-validation
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :func:`sklearn.model_selection.cross_val_score` instead.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- estimator : estimator object implementing 'fit'
- The object to use to fit the data.
-
- X : array-like
- The data to fit. Can be, for example, a list or an array of at
- least 2 dimensions.
-
- y : array-like, optional, default: None
- The target variable to try to predict in the case of
- supervised learning.
-
- scoring : string, callable or None, optional, default: None
- A string (see model evaluation documentation) or
- a scorer callable object / function with signature
- ``scorer(estimator, X, y)``.
-
- cv : int, cross-validation generator or an iterable, optional
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
-
- - None, to use the default 3-fold cross-validation,
- - integer, to specify the number of folds,
- - an object to be used as a cross-validation generator, or
- - an iterable yielding train/test splits.
-
- For integer/None inputs, if the estimator is a classifier and ``y`` is
- either binary or multiclass, :class:`StratifiedKFold` is used. In all
- other cases, :class:`KFold` is used.
-
- Refer to the :ref:`User Guide ` for the various
- cross-validation strategies that can be used here.
-
- n_jobs : integer, optional
- The number of CPUs to use to do the computation. -1 means
- 'all CPUs'.
-
- verbose : integer, optional
- The verbosity level.
-
- fit_params : dict, optional
- Parameters to pass to the fit method of the estimator.
-
- pre_dispatch : int, or string, optional
- Controls the number of jobs that get dispatched during parallel
- execution. Reducing this number can be useful to avoid an
- explosion of memory consumption when more jobs get dispatched
- than CPUs can process. This parameter can be:
-
- - None, in which case all the jobs are immediately
- created and spawned. Use this for lightweight and
- fast-running jobs, to avoid delays due to on-demand
- spawning of the jobs
-
- - An int, giving the exact number of total jobs that are
- spawned
-
- - A string, giving an expression as a function of n_jobs,
- as in '2*n_jobs'
-
- Returns
- -------
- scores : array of float, shape=(len(list(cv)),)
- Array of scores of the estimator for each run of the cross validation.
-
- Examples
- --------
- >>> from sklearn import datasets, linear_model
- >>> from sklearn.cross_validation import cross_val_score
- >>> diabetes = datasets.load_diabetes()
- >>> X = diabetes.data[:150]
- >>> y = diabetes.target[:150]
- >>> lasso = linear_model.Lasso()
- >>> print(cross_val_score(lasso, X, y)) # doctest: +ELLIPSIS
- [0.33150734 0.08022311 0.03531764]
-
- See Also
- --------
- :func:`sklearn.metrics.make_scorer`:
- Make a scorer from a performance metric or loss function.
-
- """
- X, y = indexable(X, y)
-
- cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
- scorer = check_scoring(estimator, scoring=scoring)
- # We clone the estimator to make sure that all the folds are
- # independent, and that it is pickle-able.
- parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
- pre_dispatch=pre_dispatch)
- scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
- train, test, verbose, None,
- fit_params)
- for train, test in cv)
- return np.array(scores)[:, 0]
-
-
-def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
- parameters, fit_params, return_train_score=False,
- return_parameters=False, error_score='raise'):
- """Fit estimator and compute scores for a given dataset split.
-
- Parameters
- ----------
- estimator : estimator object implementing 'fit'
- The object to use to fit the data.
-
- X : array-like of shape at least 2D
- The data to fit.
-
- y : array-like, optional, default: None
- The target variable to try to predict in the case of
- supervised learning.
-
- scorer : callable
- A scorer callable object / function with signature
- ``scorer(estimator, X, y)``.
-
- train : array-like, shape (n_train_samples,)
- Indices of training samples.
-
- test : array-like, shape (n_test_samples,)
- Indices of test samples.
-
- verbose : integer
- The verbosity level.
-
- parameters : dict or None
- Parameters to be set on the estimator.
-
- fit_params : dict or None
- Parameters that will be passed to ``estimator.fit``.
-
- return_train_score : boolean, optional, default: False
- Compute and return score on training set.
-
- return_parameters : boolean, optional, default: False
- Return the parameters that have been used for the estimator.
-
- error_score : 'raise' (default) or numeric
- Value to assign to the score if an error occurs in estimator fitting.
- If set to 'raise', the error is raised. If a numeric value is given,
- FitFailedWarning is raised. This parameter does not affect the refit
- step, which will always raise the error.
-
- Returns
- -------
- train_score : float, optional
- Score on training set, returned only if `return_train_score` is `True`.
-
- test_score : float
- Score on test set.
-
- n_test_samples : int
- Number of test samples.
-
- scoring_time : float
- Time spent for fitting and scoring in seconds.
-
- parameters : dict or None, optional
- The parameters that have been evaluated.
- """
- if verbose > 1:
- if parameters is None:
- msg = ''
- else:
- msg = '%s' % (', '.join('%s=%s' % (k, v)
- for k, v in parameters.items()))
- print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))
-
- # Adjust length of sample weights
- fit_params = fit_params if fit_params is not None else {}
- fit_params = dict([(k, _index_param_value(X, v, train))
- for k, v in fit_params.items()])
-
- if parameters is not None:
- estimator.set_params(**parameters)
-
- start_time = time.time()
-
- X_train, y_train = _safe_split(estimator, X, y, train)
- X_test, y_test = _safe_split(estimator, X, y, test, train)
-
- try:
- if y_train is None:
- estimator.fit(X_train, **fit_params)
- else:
- estimator.fit(X_train, y_train, **fit_params)
-
- except Exception as e:
- if error_score == 'raise':
- raise
- elif isinstance(error_score, numbers.Number):
- test_score = error_score
- if return_train_score:
- train_score = error_score
- warnings.warn("Classifier fit failed. The score on this train-test"
- " partition for these parameters will be set to %f. "
- "Details: \n%r" % (error_score, e), FitFailedWarning)
- else:
- raise ValueError("error_score must be the string 'raise' or a"
- " numeric value. (Hint: if using 'raise', please"
- " make sure that it has been spelled correctly.)"
- )
-
- else:
- test_score = _score(estimator, X_test, y_test, scorer)
- if return_train_score:
- train_score = _score(estimator, X_train, y_train, scorer)
-
- scoring_time = time.time() - start_time
-
- if verbose > 2:
- msg += ", score=%f" % test_score
- if verbose > 1:
- end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
- print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
-
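- # assemble the return value in the documented order:
- # [train_score?, test_score, n_test_samples, scoring_time, parameters?]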
- ret = [train_score] if return_train_score else []
- ret.extend([test_score, _num_samples(X_test), scoring_time])
- if return_parameters:
- ret.append(parameters)
- return ret
-
-
-def _safe_split(estimator, X, y, indices, train_indices=None):
- """Create subset of dataset and properly handle kernels."""
- if hasattr(estimator, 'kernel') and callable(estimator.kernel) \
- and not isinstance(estimator.kernel, GPKernel):
- # cannot compute the kernel values with custom function
- raise ValueError("Cannot use a custom kernel function. "
- "Precompute the kernel matrix instead.")
-
- if not hasattr(X, "shape"):
- if getattr(estimator, "_pairwise", False):
- raise ValueError("Precomputed kernels or affinity matrices have "
- "to be passed as arrays or sparse matrices.")
- X_subset = [X[idx] for idx in indices]
- else:
- if getattr(estimator, "_pairwise", False):
- # X is a precomputed square kernel matrix
- if X.shape[0] != X.shape[1]:
- raise ValueError("X should be a square kernel matrix")
- if train_indices is None:
- X_subset = X[np.ix_(indices, indices)]
- else:
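- # for a test split, keep the kernel columns of the training
- # samples so the precomputed Gram matrix remains usable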
- X_subset = X[np.ix_(indices, train_indices)]
- else:
- X_subset = safe_indexing(X, indices)
-
- if y is not None:
- y_subset = safe_indexing(y, indices)
- else:
- y_subset = None
-
- return X_subset, y_subset
-
-
-def _score(estimator, X_test, y_test, scorer):
- """Compute the score of an estimator on a given test set."""
- if y_test is None:
- score = scorer(estimator, X_test)
- else:
- score = scorer(estimator, X_test, y_test)
- if hasattr(score, 'item'):
- try:
- # e.g. unwrap memmapped scalars
- score = score.item()
- except ValueError:
- # non-scalar?
- pass
- if not isinstance(score, numbers.Number):
- raise ValueError("scoring must return a number, got %s (%s) instead."
- % (str(score), type(score)))
- return score
-
-
-def _permutation_test_score(estimator, X, y, cv, scorer):
- """Auxiliary function for permutation_test_score"""
- avg_score = []
- for train, test in cv:
- X_train, y_train = _safe_split(estimator, X, y, train)
- X_test, y_test = _safe_split(estimator, X, y, test, train)
- estimator.fit(X_train, y_train)
- avg_score.append(scorer(estimator, X_test, y_test))
- return np.mean(avg_score)
-
-
-def _shuffle(y, labels, random_state):
- """Return a shuffled copy of y eventually shuffle among same labels."""
- if labels is None:
- ind = random_state.permutation(len(y))
- else:
- ind = np.arange(len(labels))
- for label in np.unique(labels):
- this_mask = (labels == label)
- ind[this_mask] = random_state.permutation(ind[this_mask])
- return safe_indexing(y, ind)
-
-
-def check_cv(cv, X=None, y=None, classifier=False):
- """Input checker utility for building a CV in a user friendly way.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :func:`sklearn.model_selection.check_cv` instead.
-
- Parameters
- ----------
- cv : int, cross-validation generator or an iterable, optional
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
-
- - None, to use the default 3-fold cross-validation,
- - integer, to specify the number of folds,
- - an object to be used as a cross-validation generator, or
- - an iterable yielding train/test splits.
-
- For integer/None inputs, if classifier is True and ``y`` is binary or
- multiclass, :class:`StratifiedKFold` is used. In all other cases,
- :class:`KFold` is used.
-
- Refer to the :ref:`User Guide ` for the various
- cross-validation strategies that can be used here.
-
- X : array-like
- The data the cross-val object will be applied on.
-
- y : array-like
- The target variable for a supervised learning problem.
-
- classifier : boolean, optional
- Whether the task is a classification task, in which case
- stratified KFold will be used.
-
- Returns
- -------
- checked_cv : a cross-validation generator instance.
- The return value is guaranteed to be a cv generator instance, whatever
- the input type.
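-
- Examples
- --------
- A minimal sketch of the dispatch described above:
-
- >>> import numpy as np
- >>> from sklearn.cross_validation import check_cv, StratifiedKFold
- >>> X = np.ones((6, 2))
- >>> y = np.array([0, 0, 0, 1, 1, 1])
- >>> cv = check_cv(3, X, y, classifier=True)
- >>> isinstance(cv, StratifiedKFold)
- True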
- """
- is_sparse = sp.issparse(X)
- if cv is None:
- cv = 3
- if isinstance(cv, numbers.Integral):
- if classifier:
- if type_of_target(y) in ['binary', 'multiclass']:
- cv = StratifiedKFold(y, cv)
- else:
- cv = KFold(_num_samples(y), cv)
- else:
- if not is_sparse:
- n_samples = len(X)
- else:
- n_samples = X.shape[0]
- cv = KFold(n_samples, cv)
- return cv
-
-
-def permutation_test_score(estimator, X, y, cv=None,
- n_permutations=100, n_jobs=1, labels=None,
- random_state=0, verbose=0, scoring=None):
- """Evaluate the significance of a cross-validated score with permutations
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :func:`sklearn.model_selection.permutation_test_score` instead.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- estimator : estimator object implementing 'fit'
- The object to use to fit the data.
-
- X : array-like of shape at least 2D
- The data to fit.
-
- y : array-like
- The target variable to try to predict in the case of
- supervised learning.
-
- scoring : string, callable or None, optional, default: None
- A string (see model evaluation documentation) or
- a scorer callable object / function with signature
- ``scorer(estimator, X, y)``.
-
- cv : int, cross-validation generator or an iterable, optional
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
-
- - None, to use the default 3-fold cross-validation,
- - integer, to specify the number of folds,
- - an object to be used as a cross-validation generator, or
- - an iterable yielding train/test splits.
-
- For integer/None inputs, if the estimator is a classifier and ``y`` is
- either binary or multiclass, :class:`StratifiedKFold` is used. In all
- other cases, :class:`KFold` is used.
-
- Refer to the :ref:`User Guide ` for the various
- cross-validation strategies that can be used here.
-
- n_permutations : integer, optional
- Number of times to permute ``y``.
-
- n_jobs : integer, optional
- The number of CPUs to use to do the computation. -1 means
- 'all CPUs'.
-
- labels : array-like of shape [n_samples] (optional)
- Labels constrain the permutation among groups of samples with
- the same label.
-
- random_state : int, RandomState instance or None, optional (default=0)
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- verbose : integer, optional
- The verbosity level.
-
- Returns
- -------
- score : float
- The true score without permuting targets.
-
- permutation_scores : array, shape (n_permutations,)
- The scores obtained for each permutation.
-
- pvalue : float
- The p-value, which approximates the probability that the score would
- be obtained by chance. This is calculated as:
-
- `(C + 1) / (n_permutations + 1)`
-
- where C is the number of permutations whose score >= the true score.
-
- The best possible p-value is 1/(n_permutations + 1), the worst is 1.0.
-
- Notes
- -----
- This function implements Test 1 in:
-
- Ojala and Garriga. Permutation Tests for Studying Classifier
- Performance. The Journal of Machine Learning Research (2010)
- vol. 11
-
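- Examples
- --------
- A hedged sketch; the permutation scores vary with the permutations
- drawn, so no output is asserted:
-
- >>> from sklearn import datasets, svm
- >>> from sklearn.cross_validation import permutation_test_score
- >>> iris = datasets.load_iris()
- >>> score, perm_scores, pvalue = permutation_test_score(
- ...     svm.SVC(kernel='linear'), iris.data, iris.target,
- ...     n_permutations=10, random_state=0)  # doctest: +SKIP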
- """
- X, y = indexable(X, y)
- cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
- scorer = check_scoring(estimator, scoring=scoring)
- random_state = check_random_state(random_state)
-
- # We clone the estimator to make sure that all the folds are
- # independent, and that it is pickle-able.
- score = _permutation_test_score(clone(estimator), X, y, cv, scorer)
- permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
- delayed(_permutation_test_score)(
- clone(estimator), X, _shuffle(y, labels, random_state), cv,
- scorer)
- for _ in range(n_permutations))
- permutation_scores = np.array(permutation_scores)
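- # add one to the numerator and denominator so the unpermuted score
- # itself counts as one permutation, as documented above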
- pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
- return score, permutation_scores, pvalue
-
-
-def train_test_split(*arrays, **options):
- """Split arrays or matrices into random train and test subsets
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :func:`sklearn.model_selection.train_test_split` instead.
-
- Quick utility that wraps input validation and
- ``next(iter(ShuffleSplit(n_samples)))`` and application to input
- data into a single call for splitting (and optionally subsampling)
- data in a one-liner.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- *arrays : sequence of indexables with same length / shape[0]
- Allowed inputs are lists, numpy arrays, scipy-sparse
- matrices or pandas dataframes.
-
- test_size : float, int, or None (default is None)
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the dataset to include in the test split. If
- int, represents the absolute number of test samples. If None,
- the value is automatically set to the complement of the train size.
- If train size is also None, test size is set to 0.25.
-
- train_size : float, int, or None (default is None)
- If float, should be between 0.0 and 1.0 and represent the
- proportion of the dataset to include in the train split. If
- int, represents the absolute number of train samples. If None,
- the value is automatically set to the complement of the test size.
-
- random_state : int, RandomState instance or None, optional (default=None)
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- stratify : array-like or None (default is None)
- If not None, data is split in a stratified fashion, using this as
- the labels array.
-
- .. versionadded:: 0.17
- *stratify* splitting
-
- Returns
- -------
- splitting : list, length = 2 * len(arrays)
- List containing train-test split of inputs.
-
- .. versionadded:: 0.16
- If the input is sparse, the output will be a
- ``scipy.sparse.csr_matrix``. Else, output type is the same as the
- input type.
-
- Examples
- --------
- >>> import numpy as np
- >>> from sklearn.cross_validation import train_test_split
- >>> X, y = np.arange(10).reshape((5, 2)), range(5)
- >>> X
- array([[0, 1],
- [2, 3],
- [4, 5],
- [6, 7],
- [8, 9]])
- >>> list(y)
- [0, 1, 2, 3, 4]
-
- >>> X_train, X_test, y_train, y_test = train_test_split(
- ... X, y, test_size=0.33, random_state=42)
- ...
- >>> X_train
- array([[4, 5],
- [0, 1],
- [6, 7]])
- >>> y_train
- [2, 0, 3]
- >>> X_test
- array([[2, 3],
- [8, 9]])
- >>> y_test
- [1, 4]
-
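- Stratified splitting preserves class proportions; a sketch whose
- exact rows depend on ``random_state``:
-
- >>> y2 = np.array([0, 0, 0, 1, 1])
- >>> X_train, X_test, y2_train, y2_test = train_test_split(
- ...     X, y2, stratify=y2, test_size=0.4,
- ...     random_state=42)  # doctest: +SKIP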
- """
- n_arrays = len(arrays)
- if n_arrays == 0:
- raise ValueError("At least one array required as input")
-
- test_size = options.pop('test_size', None)
- train_size = options.pop('train_size', None)
- random_state = options.pop('random_state', None)
- stratify = options.pop('stratify', None)
-
- if options:
- raise TypeError("Invalid parameters passed: %s" % str(options))
-
- if test_size is None and train_size is None:
- test_size = 0.25
- arrays = indexable(*arrays)
- if stratify is not None:
- cv = StratifiedShuffleSplit(stratify, test_size=test_size,
- train_size=train_size,
- random_state=random_state)
- else:
- n_samples = _num_samples(arrays[0])
- cv = ShuffleSplit(n_samples, test_size=test_size,
- train_size=train_size,
- random_state=random_state)
-
- train, test = next(iter(cv))
- return list(chain.from_iterable((safe_indexing(a, train),
- safe_indexing(a, test)) for a in arrays))
diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py
deleted file mode 100644
index 6ba673d2fbf7c..0000000000000
--- a/sklearn/grid_search.py
+++ /dev/null
@@ -1,1046 +0,0 @@
-"""
-The :mod:`sklearn.grid_search` module includes utilities to fine-tune
-the parameters of an estimator.
-"""
-from __future__ import print_function
-
-# Author: Alexandre Gramfort ,
-# Gael Varoquaux
-# Andreas Mueller
-# Olivier Grisel
-# License: BSD 3 clause
-
-from abc import ABCMeta, abstractmethod
-from collections import Mapping, namedtuple, Sized
-from functools import partial, reduce
-from itertools import product
-import operator
-import warnings
-
-import numpy as np
-
-from .base import BaseEstimator, is_classifier, clone
-from .base import MetaEstimatorMixin
-from .cross_validation import check_cv
-from .cross_validation import _fit_and_score
-from .externals.joblib import Parallel, delayed
-from .externals import six
-from .utils import check_random_state
-from .utils.random import sample_without_replacement
-from .utils.validation import _num_samples, indexable
-from .utils.metaestimators import if_delegate_has_method
-from .metrics.scorer import check_scoring
-
-
-__all__ = ['GridSearchCV', 'ParameterGrid', 'fit_grid_point',
- 'ParameterSampler', 'RandomizedSearchCV']
-
-
-warnings.warn("This module was deprecated in version 0.18 in favor of the "
- "model_selection module into which all the refactored classes "
- "and functions are moved. This module will be removed in 0.20.",
- DeprecationWarning)
-
-
-class ParameterGrid(object):
- """Grid of parameters with a discrete number of values for each.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.ParameterGrid` instead.
-
- Can be used to iterate over parameter value combinations with the
- Python built-in function iter.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- param_grid : dict of string to sequence, or sequence of such
- The parameter grid to explore, as a dictionary mapping estimator
- parameters to sequences of allowed values.
-
- An empty dict signifies default parameters.
-
- A sequence of dicts signifies a sequence of grids to search, and is
- useful to avoid exploring parameter combinations that make no sense
- or have no effect. See the examples below.
-
- Examples
- --------
- >>> from sklearn.grid_search import ParameterGrid
- >>> param_grid = {'a': [1, 2], 'b': [True, False]}
- >>> list(ParameterGrid(param_grid)) == (
- ... [{'a': 1, 'b': True}, {'a': 1, 'b': False},
- ... {'a': 2, 'b': True}, {'a': 2, 'b': False}])
- True
-
- >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}]
- >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'},
- ... {'kernel': 'rbf', 'gamma': 1},
- ... {'kernel': 'rbf', 'gamma': 10}]
- True
- >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1}
- True
-
- See also
- --------
- :class:`GridSearchCV`:
- uses ``ParameterGrid`` to perform a full parallelized parameter search.
- """
-
- def __init__(self, param_grid):
- if isinstance(param_grid, Mapping):
- # wrap dictionary in a singleton list to support either dict
- # or list of dicts
- param_grid = [param_grid]
- self.param_grid = param_grid
-
- def __iter__(self):
- """Iterate over the points in the grid.
-
- Returns
- -------
- params : iterator over dict of string to any
- Yields dictionaries mapping each estimator parameter to one of its
- allowed values.
- """
- for p in self.param_grid:
- # Always sort the keys of a dictionary, for reproducibility
- items = sorted(p.items())
- if not items:
- yield {}
- else:
- keys, values = zip(*items)
- for v in product(*values):
- params = dict(zip(keys, v))
- yield params
-
- def __len__(self):
- """Number of points on the grid."""
- # Product function that can handle iterables (np.product can't).
- product = partial(reduce, operator.mul)
- return sum(product(len(v) for v in p.values()) if p else 1
- for p in self.param_grid)
-
- def __getitem__(self, ind):
- """Get the parameters that would be ``ind``th in iteration
-
- Parameters
- ----------
- ind : int
- The iteration index
-
- Returns
- -------
- params : dict of string to any
- Equal to list(self)[ind]
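-
- Examples
- --------
- A small sanity check of the indexing contract:
-
- >>> grid = ParameterGrid({'a': [1, 2]})
- >>> grid[0] == list(grid)[0]
- True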
- """
- # This is used to make discrete sampling without replacement memory
- # efficient.
- for sub_grid in self.param_grid:
- # XXX: could memoize information used here
- if not sub_grid:
- if ind == 0:
- return {}
- else:
- ind -= 1
- continue
-
- # Reverse so the most frequently cycling parameter comes first
- keys, values_lists = zip(*sorted(sub_grid.items())[::-1])
- sizes = [len(v_list) for v_list in values_lists]
- total = np.product(sizes)
-
- if ind >= total:
- # Try the next grid
- ind -= total
- else:
- out = {}
- for key, v_list, n in zip(keys, values_lists, sizes):
- ind, offset = divmod(ind, n)
- out[key] = v_list[offset]
- return out
-
- raise IndexError('ParameterGrid index out of range')
-
-
-class ParameterSampler(object):
- """Generator on parameters sampled from given distributions.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.ParameterSampler` instead.
-
- Non-deterministic iterable over random candidate combinations for hyper-
- parameter search. If all parameters are presented as a list,
- sampling without replacement is performed. If at least one parameter
- is given as a distribution, sampling with replacement is used.
- It is highly recommended to use continuous distributions for continuous
- parameters.
-
- Note that as of SciPy 0.12, the ``scipy.stats.distributions`` do not accept
- a custom RNG instance and always use the singleton RNG from
- ``numpy.random``. Hence setting ``random_state`` will not guarantee a
- deterministic iteration whenever ``scipy.stats`` distributions are used to
- define the parameter search space.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- param_distributions : dict
- Dictionary where the keys are parameters and values
- are distributions from which a parameter is to be sampled.
- Distributions either have to provide a ``rvs`` function
- to sample from them, or can be given as a list of values,
- where a uniform distribution is assumed.
-
- n_iter : integer
- Number of parameter settings that are produced.
-
- random_state : int, RandomState instance or None, optional (default=None)
- Pseudo random number generator state used for random uniform sampling
- from lists of possible values instead of scipy.stats distributions.
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- Returns
- -------
- params : dict of string to any
- **Yields** dictionaries mapping each estimator parameter to
- a sampled value.
-
- Examples
- --------
- >>> from sklearn.grid_search import ParameterSampler
- >>> from scipy.stats.distributions import expon
- >>> import numpy as np
- >>> np.random.seed(0)
- >>> param_grid = {'a':[1, 2], 'b': expon()}
- >>> param_list = list(ParameterSampler(param_grid, n_iter=4))
- >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items())
- ... for d in param_list]
- >>> rounded_list == [{'b': 0.89856, 'a': 1},
- ... {'b': 0.923223, 'a': 1},
- ... {'b': 1.878964, 'a': 2},
- ... {'b': 1.038159, 'a': 2}]
- True
- """
- def __init__(self, param_distributions, n_iter, random_state=None):
- self.param_distributions = param_distributions
- self.n_iter = n_iter
- self.random_state = random_state
-
- def __iter__(self):
- # check if all distributions are given as lists
- # in this case we want to sample without replacement
- all_lists = np.all([not hasattr(v, "rvs")
- for v in self.param_distributions.values()])
- rnd = check_random_state(self.random_state)
-
- if all_lists:
- # look up sampled parameter settings in parameter grid
- param_grid = ParameterGrid(self.param_distributions)
- grid_size = len(param_grid)
-
- if grid_size < self.n_iter:
- raise ValueError(
- "The total space of parameters %d is smaller "
- "than n_iter=%d." % (grid_size, self.n_iter)
- + " For exhaustive searches, use GridSearchCV.")
- for i in sample_without_replacement(grid_size, self.n_iter,
- random_state=rnd):
- yield param_grid[i]
-
- else:
- # Always sort the keys of a dictionary, for reproducibility
- items = sorted(self.param_distributions.items())
- for _ in six.moves.range(self.n_iter):
- params = dict()
- for k, v in items:
- if hasattr(v, "rvs"):
- params[k] = v.rvs()
- else:
- params[k] = v[rnd.randint(len(v))]
- yield params
-
- def __len__(self):
- """Number of points that will be sampled."""
- return self.n_iter
-
-
-def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
- verbose, error_score='raise', **fit_params):
- """Run fit on one set of parameters.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :func:`sklearn.model_selection.fit_grid_point` instead.
-
- Parameters
- ----------
- X : array-like, sparse matrix or list
- Input data.
-
- y : array-like or None
- Targets for input data.
-
- estimator : estimator object
- An object of that type is instantiated for each grid point.
- This is assumed to implement the scikit-learn estimator interface.
- Either estimator needs to provide a ``score`` function,
- or ``scoring`` must be passed.
-
- parameters : dict
- Parameters to be set on estimator for this grid point.
-
- train : ndarray, dtype int or bool
- Boolean mask or indices for training set.
-
- test : ndarray, dtype int or bool
- Boolean mask or indices for test set.
-
- scorer : callable or None.
- If provided must be a scorer callable object / function with signature
- ``scorer(estimator, X, y)``.
-
- verbose : int
- Verbosity level.
-
- **fit_params : kwargs
- Additional parameters passed to the fit function of the estimator.
-
- error_score : 'raise' (default) or numeric
- Value to assign to the score if an error occurs in estimator fitting.
- If set to 'raise', the error is raised. If a numeric value is given,
- FitFailedWarning is raised. This parameter does not affect the refit
- step, which will always raise the error.
-
- Returns
- -------
- score : float
- Score of this parameter setting on given training / test split.
-
- parameters : dict
- The parameters that have been evaluated.
-
- n_samples_test : int
- Number of test samples in this split.
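-
- Examples
- --------
- A hedged sketch of scoring a single candidate on a single split;
- the resulting score is not asserted here:
-
- >>> import numpy as np
- >>> from sklearn.svm import SVC
- >>> from sklearn.grid_search import fit_grid_point
- >>> from sklearn.metrics.scorer import check_scoring
- >>> X = np.array([[1.], [2.], [3.], [4.]])
- >>> y = np.array([0, 0, 1, 1])
- >>> svc = SVC()
- >>> score, params, n_test = fit_grid_point(
- ...     X, y, svc, {'C': 1.0}, np.array([0, 1, 3]), np.array([2]),
- ...     check_scoring(svc), verbose=0)  # doctest: +SKIP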
- """
- score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train,
- test, verbose, parameters,
- fit_params, error_score)
- return score, parameters, n_samples_test
-
-
-def _check_param_grid(param_grid):
- if hasattr(param_grid, 'items'):
- param_grid = [param_grid]
-
- for p in param_grid:
- for name, v in p.items():
- if isinstance(v, np.ndarray) and v.ndim > 1:
- raise ValueError("Parameter array should be one-dimensional.")
-
- check = [isinstance(v, k) for k in (list, tuple, np.ndarray)]
- if True not in check:
- raise ValueError("Parameter values for parameter ({0}) need "
- "to be a sequence.".format(name))
-
- if len(v) == 0:
- raise ValueError("Parameter values for parameter ({0}) need "
- "to be a non-empty sequence.".format(name))
-
-
-class _CVScoreTuple (namedtuple('_CVScoreTuple',
- ('parameters',
- 'mean_validation_score',
- 'cv_validation_scores'))):
- # A raw namedtuple is very memory efficient as it packs the attributes
- # in a struct, getting rid of the instance __dict__; in particular it
- # does not copy the strings for the keys on each instance.
- # Deriving a namedtuple class just to introduce the __repr__ method
- # would reintroduce the __dict__ on the instance, so we tell the
- # Python interpreter that this subclass uses static __slots__ instead
- # of dynamic attributes. Furthermore we don't need any additional slot
- # in the subclass, so we set __slots__ to the empty tuple.
- __slots__ = ()
-
- def __repr__(self):
- """Simple custom repr to summarize the main info"""
- return "mean: {0:.5f}, std: {1:.5f}, params: {2}".format(
- self.mean_validation_score,
- np.std(self.cv_validation_scores),
- self.parameters)
-
-
-class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator,
- MetaEstimatorMixin)):
- """Base class for hyper parameter search with cross-validation."""
-
- @abstractmethod
- def __init__(self, estimator, scoring=None,
- fit_params=None, n_jobs=1, iid=True,
- refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
- error_score='raise'):
-
- self.scoring = scoring
- self.estimator = estimator
- self.n_jobs = n_jobs
- self.fit_params = fit_params if fit_params is not None else {}
- self.iid = iid
- self.refit = refit
- self.cv = cv
- self.verbose = verbose
- self.pre_dispatch = pre_dispatch
- self.error_score = error_score
-
- @property
- def _estimator_type(self):
- return self.estimator._estimator_type
-
- @property
- def classes_(self):
- return self.best_estimator_.classes_
-
- def score(self, X, y=None):
- """Returns the score on the given data, if the estimator has been refit.
-
- This uses the score defined by ``scoring`` where provided, and the
- ``best_estimator_.score`` method otherwise.
-
- Parameters
- ----------
- X : array-like, shape = [n_samples, n_features]
- Input data, where n_samples is the number of samples and
- n_features is the number of features.
-
- y : array-like, shape = [n_samples] or [n_samples, n_output], optional
- Target relative to X for classification or regression;
- None for unsupervised learning.
-
- Returns
- -------
- score : float
-
- Notes
- -----
- * The long-standing behavior of this method changed in version 0.16.
- * It no longer uses the metric provided by ``estimator.score`` if the
- ``scoring`` parameter was set when fitting.
-
- """
- if self.scorer_ is None:
- raise ValueError("No score function explicitly defined, "
- "and the estimator doesn't provide one %s"
- % self.best_estimator_)
- return self.scorer_(self.best_estimator_, X, y)
-
- @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
- def predict(self, X):
- """Call predict on the estimator with the best found parameters.
-
- Only available if ``refit=True`` and the underlying estimator supports
- ``predict``.
-
- Parameters
- ----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
-
- """
- return self.best_estimator_.predict(X)
-
- @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
- def predict_proba(self, X):
- """Call predict_proba on the estimator with the best found parameters.
-
- Only available if ``refit=True`` and the underlying estimator supports
- ``predict_proba``.
-
- Parameters
- ----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
-
- """
- return self.best_estimator_.predict_proba(X)
-
- @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
- def predict_log_proba(self, X):
- """Call predict_log_proba on the estimator with the best found parameters.
-
- Only available if ``refit=True`` and the underlying estimator supports
- ``predict_log_proba``.
-
- Parameters
- ----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
-
- """
- return self.best_estimator_.predict_log_proba(X)
-
- @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
- def decision_function(self, X):
- """Call decision_function on the estimator with the best found parameters.
-
- Only available if ``refit=True`` and the underlying estimator supports
- ``decision_function``.
-
- Parameters
- ----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
-
- """
- return self.best_estimator_.decision_function(X)
-
- @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
- def transform(self, X):
- """Call transform on the estimator with the best found parameters.
-
- Only available if the underlying estimator supports ``transform`` and
- ``refit=True``.
-
- Parameters
- ----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
-
- """
- return self.best_estimator_.transform(X)
-
- @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
- def inverse_transform(self, Xt):
- """Call inverse_transform on the estimator with the best found parameters.
-
- Only available if the underlying estimator implements ``inverse_transform`` and
- ``refit=True``.
-
- Parameters
- ----------
- Xt : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
-
- """
- return self.best_estimator_.inverse_transform(Xt)
-
- def _fit(self, X, y, parameter_iterable):
- """Actual fitting, performing the search over parameters."""
-
- estimator = self.estimator
- cv = self.cv
- self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
-
- n_samples = _num_samples(X)
- X, y = indexable(X, y)
-
- if y is not None:
- if len(y) != n_samples:
- raise ValueError('Target variable (y) has a different number '
- 'of samples (%i) than data (X: %i samples)'
- % (len(y), n_samples))
- cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
-
- if self.verbose > 0:
- if isinstance(parameter_iterable, Sized):
- n_candidates = len(parameter_iterable)
- print("Fitting {0} folds for each of {1} candidates, totalling"
- " {2} fits".format(len(cv), n_candidates,
- n_candidates * len(cv)))
-
- base_estimator = clone(self.estimator)
-
- pre_dispatch = self.pre_dispatch
-
- out = Parallel(
- n_jobs=self.n_jobs, verbose=self.verbose,
- pre_dispatch=pre_dispatch
- )(
- delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
- train, test, self.verbose, parameters,
- self.fit_params, return_parameters=True,
- error_score=self.error_score)
- for parameters in parameter_iterable
- for train, test in cv)
-
- # out is a list of 4-tuples:
- # (test_score, n_test_samples, scoring_time, parameters)
- n_fits = len(out)
- n_folds = len(cv)
-
- scores = list()
- grid_scores = list()
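- # each candidate occupies n_folds consecutive entries of `out`;
- # aggregate its fold scores before moving to the next candidate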
- for grid_start in range(0, n_fits, n_folds):
- n_test_samples = 0
- score = 0
- all_scores = []
- for this_score, this_n_test_samples, _, parameters in \
- out[grid_start:grid_start + n_folds]:
- all_scores.append(this_score)
- if self.iid:
- this_score *= this_n_test_samples
- n_test_samples += this_n_test_samples
- score += this_score
- if self.iid:
- score /= float(n_test_samples)
- else:
- score /= float(n_folds)
- scores.append((score, parameters))
- # TODO: shall we also store the test_fold_sizes?
- grid_scores.append(_CVScoreTuple(
- parameters,
- score,
- np.array(all_scores)))
- # Store the computed scores
- self.grid_scores_ = grid_scores
-
- # Find the best parameters by comparing on the mean validation score:
- # note that `sorted` is deterministic in the way it breaks ties
- best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
- reverse=True)[0]
- self.best_params_ = best.parameters
- self.best_score_ = best.mean_validation_score
-
- if self.refit:
- # fit the best estimator using the entire dataset
- # clone first to work around broken estimators
- best_estimator = clone(base_estimator).set_params(
- **best.parameters)
- if y is not None:
- best_estimator.fit(X, y, **self.fit_params)
- else:
- best_estimator.fit(X, **self.fit_params)
- self.best_estimator_ = best_estimator
- return self
-
-
-class GridSearchCV(BaseSearchCV):
- """Exhaustive search over specified parameter values for an estimator.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.GridSearchCV` instead.
-
- Important members are fit, predict.
-
- GridSearchCV implements a "fit" and a "score" method.
- It also implements "predict", "predict_proba", "decision_function",
- "transform" and "inverse_transform" if they are implemented in the
- estimator used.
-
- The parameters of the estimator used to apply these methods are optimized
- by cross-validated grid-search over a parameter grid.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- estimator : estimator object.
- An object of that type is instantiated for each grid point.
- This is assumed to implement the scikit-learn estimator interface.
- Either estimator needs to provide a ``score`` function,
- or ``scoring`` must be passed.
-
- param_grid : dict or list of dictionaries
- Dictionary with parameter names (string) as keys and lists of
- parameter settings to try as values, or a list of such
- dictionaries, in which case the grids spanned by each dictionary
- in the list are explored. This enables searching over any sequence
- of parameter settings.
-
- scoring : string, callable or None, default=None
- A string (see model evaluation documentation) or
- a scorer callable object / function with signature
- ``scorer(estimator, X, y)``.
- If ``None``, the ``score`` method of the estimator is used.
-
- fit_params : dict, optional
- Parameters to pass to the fit method.
-
- n_jobs : int, default: 1
- The maximum number of estimators fit in parallel.
-
- - If -1 all CPUs are used.
-
- - If 1 is given, no parallel computing code is used at all,
- which is useful for debugging.
-
- - For ``n_jobs`` below -1, ``(n_cpus + n_jobs + 1)`` are used.
- For example, with ``n_jobs = -2`` all CPUs but one are used.
-
- .. versionchanged:: 0.17
- Upgraded to joblib 0.9.3.
-
- pre_dispatch : int, or string, optional
- Controls the number of jobs that get dispatched during parallel
- execution. Reducing this number can be useful to avoid an
- explosion of memory consumption when more jobs get dispatched
- than CPUs can process. This parameter can be:
-
- - None, in which case all the jobs are immediately
- created and spawned. Use this for lightweight and
- fast-running jobs, to avoid delays due to on-demand
- spawning of the jobs
-
- - An int, giving the exact number of total jobs that are
- spawned
-
- - A string, giving an expression as a function of n_jobs,
- as in '2*n_jobs'
-
- iid : boolean, default=True
- If True, the data is assumed to be identically distributed across
- the folds, and the loss minimized is the total loss per sample,
- and not the mean loss across the folds.
-
- cv : int, cross-validation generator or an iterable, optional
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
-
- - None, to use the default 3-fold cross-validation,
- - integer, to specify the number of folds,
- - an object to be used as a cross-validation generator, or
- - an iterable yielding train/test splits.
-
- For integer/None inputs, if the estimator is a classifier and ``y`` is
- either binary or multiclass,
- :class:`sklearn.model_selection.StratifiedKFold` is used. In all
- other cases, :class:`sklearn.model_selection.KFold` is used.
-
- Refer to the :ref:`User Guide ` for the various
- cross-validation strategies that can be used here.
-
- refit : boolean, default=True
- Refit the best estimator with the entire dataset.
- If "False", it is impossible to make predictions using
- this GridSearchCV instance after fitting.
-
- verbose : integer
- Controls the verbosity: the higher, the more messages.
-
- error_score : 'raise' (default) or numeric
- Value to assign to the score if an error occurs in estimator fitting.
- If set to 'raise', the error is raised. If a numeric value is given,
- FitFailedWarning is raised. This parameter does not affect the refit
- step, which will always raise the error.
-
-
- Examples
- --------
- >>> from sklearn import svm, grid_search, datasets
- >>> iris = datasets.load_iris()
- >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
- >>> svr = svm.SVC(gamma="scale")
- >>> clf = grid_search.GridSearchCV(svr, parameters)
- >>> clf.fit(iris.data, iris.target)
- ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
- GridSearchCV(cv=None, error_score=...,
- estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=...,
- decision_function_shape='ovr', degree=..., gamma=...,
- kernel='rbf', max_iter=-1, probability=False,
- random_state=None, shrinking=True, tol=...,
- verbose=False),
- fit_params={}, iid=..., n_jobs=1,
- param_grid=..., pre_dispatch=..., refit=...,
- scoring=..., verbose=...)
-
-
- Attributes
- ----------
- grid_scores_ : list of namedtuples
- Contains scores for all parameter combinations in param_grid.
- Each entry corresponds to one parameter setting.
- Each namedtuple has the attributes:
-
- * ``parameters``, a dict of parameter settings
- * ``mean_validation_score``, the mean score over the
- cross-validation folds
- * ``cv_validation_scores``, the list of scores for each fold
-
- best_estimator_ : estimator
- Estimator that was chosen by the search, i.e. the estimator
- which gave the highest score (or smallest loss if specified)
- on the left-out data. Not available if ``refit=False``.
-
- best_score_ : float
- Score of best_estimator on the left-out data.
-
- best_params_ : dict
- Parameter setting that gave the best results on the hold out data.
-
- scorer_ : function
- Scorer function used on the held out data to choose the best
- parameters for the model.
-
- Notes
- -----
- The parameters selected are those that maximize the score of the left out
- data, unless an explicit score is passed in which case it is used instead.
-
- If `n_jobs` was set to a value higher than one, the data is copied for each
- point in the grid (and not `n_jobs` times). This is done for efficiency
- reasons if individual jobs take very little time, but may raise errors if
- the dataset is large and not enough memory is available. A workaround in
- this case is to set `pre_dispatch`. Then, the memory is copied only
- `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
- n_jobs`.
-
- See Also
- --------
- :class:`ParameterGrid`:
- generates all the combinations of a hyperparameter grid.
-
- :func:`sklearn.cross_validation.train_test_split`:
- utility function to split the data into a development set usable
- for fitting a GridSearchCV instance and an evaluation set for
- its final evaluation.
-
- :func:`sklearn.metrics.make_scorer`:
- Make a scorer from a performance metric or loss function.
-
- """
-
- def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
- n_jobs=1, iid=True, refit=True, cv=None, verbose=0,
- pre_dispatch='2*n_jobs', error_score='raise'):
-
- super(GridSearchCV, self).__init__(
- estimator, scoring, fit_params, n_jobs, iid,
- refit, cv, verbose, pre_dispatch, error_score)
- self.param_grid = param_grid
- _check_param_grid(param_grid)
-
- def fit(self, X, y=None):
- """Run fit with all sets of parameters.
-
- Parameters
- ----------
-
- X : array-like, shape = [n_samples, n_features]
- Training vector, where n_samples is the number of samples and
- n_features is the number of features.
-
- y : array-like, shape = [n_samples] or [n_samples, n_output], optional
- Target relative to X for classification or regression;
- None for unsupervised learning.
-
- """
- return self._fit(X, y, ParameterGrid(self.param_grid))
-
-
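-# An illustrative sketch (hypothetical literal values): GridSearchCV relies
-# on the ParameterGrid class defined earlier in this module to expand the
-# grid, yielding one dict of parameter settings per candidate fit:
-#
-#     list(ParameterGrid({'kernel': ('linear', 'rbf'), 'C': [1, 10]}))
-#     # -> [{'C': 1, 'kernel': 'linear'}, {'C': 1, 'kernel': 'rbf'},
-#     #     {'C': 10, 'kernel': 'linear'}, {'C': 10, 'kernel': 'rbf'}]
-
-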
-class RandomizedSearchCV(BaseSearchCV):
- """Randomized search on hyper parameters.
-
- .. deprecated:: 0.18
- This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.RandomizedSearchCV` instead.
-
- RandomizedSearchCV implements a "fit" and a "score" method.
- It also implements "predict", "predict_proba", "decision_function",
- "transform" and "inverse_transform" if they are implemented in the
- estimator used.
-
- The parameters of the estimator used to apply these methods are optimized
- by cross-validated search over parameter settings.
-
- In contrast to GridSearchCV, not all parameter values are tried out, but
- rather a fixed number of parameter settings is sampled from the specified
- distributions. The number of parameter settings that are tried is
- given by n_iter.
-
- If all parameters are presented as a list, sampling without replacement
- is performed. If at least one parameter is given as a distribution,
- sampling with replacement is used. It is highly recommended to use
- continuous distributions for continuous parameters (see the Examples
- section below).
-
- Read more in the :ref:`User Guide <randomized_parameter_search>`.
-
- Parameters
- ----------
- estimator : estimator object.
- An object of that type is instantiated for each sampled parameter setting.
- This is assumed to implement the scikit-learn estimator interface.
- Either estimator needs to provide a ``score`` function,
- or ``scoring`` must be passed.
-
- param_distributions : dict
- Dictionary with parameter names (string) as keys and distributions
- or lists of parameters to try. Distributions must provide a ``rvs``
- method for sampling (such as those from scipy.stats.distributions).
- If a list is given, it is sampled uniformly.
-
- n_iter : int, default=10
- Number of parameter settings that are sampled. n_iter trades
- off runtime vs quality of the solution.
-
- scoring : string, callable or None, default=None
- A string (see model evaluation documentation) or
- a scorer callable object / function with signature
- ``scorer(estimator, X, y)``.
- If ``None``, the ``score`` method of the estimator is used.
-
- fit_params : dict, optional
- Parameters to pass to the fit method.
-
- n_jobs : int, default=1
- The maximum number of estimators fit in parallel.
-
- - If -1 all CPUs are used.
-
- - If 1 is given, no parallel computing code is used at all,
- which is useful for debugging.
-
- - For ``n_jobs`` below -1, ``(n_cpus + n_jobs + 1)`` are used.
- For example, with ``n_jobs = -2`` all CPUs but one are used.
-
- pre_dispatch : int, or string, optional
- Controls the number of jobs that get dispatched during parallel
- execution. Reducing this number can be useful to avoid an
- explosion of memory consumption when more jobs get dispatched
- than CPUs can process. This parameter can be:
-
- - None, in which case all the jobs are immediately
- created and spawned. Use this for lightweight and
- fast-running jobs, to avoid delays due to on-demand
- spawning of the jobs
-
- - An int, giving the exact number of total jobs that are
- spawned
-
- - A string, giving an expression as a function of n_jobs,
- as in '2*n_jobs'
-
- iid : boolean, default=True
- If True, the data is assumed to be identically distributed across
- the folds, and the loss minimized is the total loss per sample,
- and not the mean loss across the folds.
-
- cv : int, cross-validation generator or an iterable, optional
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
-
- - None, to use the default 3-fold cross-validation,
- An integer, to specify the number of folds.
- - An object to be used as a cross-validation generator.
- - An iterable yielding train/test splits.
-
- For integer/None inputs, if the estimator is a classifier and ``y`` is
- either binary or multiclass,
- :class:`sklearn.model_selection.StratifiedKFold` is used. In all
- other cases, :class:`sklearn.model_selection.KFold` is used.
-
- Refer to the :ref:`User Guide <cross_validation>` for the various
- cross-validation strategies that can be used here.
-
- refit : boolean, default=True
- Refit the best estimator with the entire dataset.
- If "False", it is impossible to make predictions using
- this RandomizedSearchCV instance after fitting.
-
- verbose : integer
- Controls the verbosity: the higher, the more messages.
-
- random_state : int, RandomState instance or None, optional, default=None
- Pseudo random number generator state used for random uniform sampling
- from lists of possible values instead of scipy.stats distributions.
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- error_score : 'raise' (default) or numeric
- Value to assign to the score if an error occurs in estimator fitting.
- If set to 'raise', the error is raised. If a numeric value is given,
- FitFailedWarning is raised. This parameter does not affect the refit
- step, which will always raise the error.
-
-
- Attributes
- ----------
- grid_scores_ : list of namedtuples
- Contains scores for all parameter settings that were sampled.
- Each entry corresponds to one parameter setting.
- Each namedtuple has the attributes:
-
- * ``parameters``, a dict of parameter settings
- * ``mean_validation_score``, the mean score over the
- cross-validation folds
- * ``cv_validation_scores``, the list of scores for each fold
-
- best_estimator_ : estimator
- Estimator that was chosen by the search, i.e. the estimator
- which gave the highest score (or smallest loss if specified)
- on the left-out data. Not available if refit=False.
-
- best_score_ : float
- Score of ``best_estimator_`` on the left-out data.
-
- best_params_ : dict
- Parameter setting that gave the best results on the held-out data.
-
- Notes
- -----
- The parameters selected are those that maximize the score of the held-out
- data, according to the scoring parameter.
-
- If `n_jobs` was set to a value higher than one, the data is copied for each
- parameter setting (and not `n_jobs` times). This is done for efficiency
- reasons if individual jobs take very little time, but may raise errors if
- the dataset is large and not enough memory is available. A workaround in
- this case is to set `pre_dispatch`. Then, the memory is copied only
- `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
- n_jobs`.
-
- See Also
- --------
- :class:`GridSearchCV`:
- Does exhaustive search over a grid of parameters.
-
- :class:`ParameterSampler`:
- A generator over parameter settings, constructed from
- param_distributions.
-
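- Examples
- --------
- A minimal sketch; ``expon`` is just one possible ``rvs``-providing
- distribution from ``scipy.stats``, and the small ``n_iter`` is purely
- illustrative:
-
- >>> from scipy.stats import expon
- >>> from sklearn import svm, grid_search, datasets
- >>> iris = datasets.load_iris()
- >>> param_dist = {'C': expon(scale=10), 'kernel': ['linear', 'rbf']}
- >>> clf = grid_search.RandomizedSearchCV(svm.SVC(gamma="scale"),
- ...                                      param_dist, n_iter=4,
- ...                                      random_state=0)
- >>> clf = clf.fit(iris.data, iris.target)  # doctest: +SKIP
-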
- """
-
- def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
- fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
- verbose=0, pre_dispatch='2*n_jobs', random_state=None,
- error_score='raise'):
-
- self.param_distributions = param_distributions
- self.n_iter = n_iter
- self.random_state = random_state
- super(RandomizedSearchCV, self).__init__(
- estimator=estimator, scoring=scoring, fit_params=fit_params,
- n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
- pre_dispatch=pre_dispatch, error_score=error_score)
-
- def fit(self, X, y=None):
- """Run fit on the estimator with randomly drawn parameters.
-
- Parameters
- ----------
- X : array-like, shape = [n_samples, n_features]
- Training vector, where n_samples is the number of samples and
- n_features is the number of features.
-
- y : array-like, shape = [n_samples] or [n_samples, n_output], optional
- Target relative to X for classification or regression;
- None for unsupervised learning.
-
- """
- sampled_params = ParameterSampler(self.param_distributions,
- self.n_iter,
- random_state=self.random_state)
- return self._fit(X, y, sampled_params)
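-
-
-# Migration sketch (illustrative only, following the deprecation notes
-# above): the replacements in sklearn.model_selection accept the same core
-# constructor arguments, e.g.
-#
-#     from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
-#     search = GridSearchCV(estimator, param_grid, cv=5).fit(X, y)
-#     rand_search = RandomizedSearchCV(estimator, param_distributions,
-#                                      n_iter=10, random_state=0).fit(X, y)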
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
deleted file mode 100644
index 6a29a621ff588..0000000000000
--- a/sklearn/tests/test_cross_validation.py
+++ /dev/null
@@ -1,1252 +0,0 @@
-"""Test the cross_validation module"""
-from __future__ import division
-import warnings
-
-import numpy as np
-from scipy.sparse import coo_matrix
-from scipy.sparse import csr_matrix
-from scipy import stats
-
-from sklearn.exceptions import ConvergenceWarning
-from sklearn.utils.testing import assert_true
-from sklearn.utils.testing import assert_false
-from sklearn.utils.testing import assert_equal
-from sklearn.utils.testing import assert_almost_equal
-from sklearn.utils.testing import assert_raises
-from sklearn.utils.testing import assert_greater
-from sklearn.utils.testing import assert_greater_equal
-from sklearn.utils.testing import assert_less
-from sklearn.utils.testing import assert_not_equal
-from sklearn.utils.testing import assert_array_almost_equal
-from sklearn.utils.testing import assert_array_equal
-from sklearn.utils.testing import assert_warns_message
-from sklearn.utils.testing import assert_raise_message
-from sklearn.utils.testing import ignore_warnings
-from sklearn.utils.mocking import CheckingClassifier, MockDataFrame
-
-with warnings.catch_warnings():
- warnings.simplefilter('ignore')
- from sklearn import cross_validation as cval
-
-from sklearn.datasets import make_regression
-from sklearn.datasets import load_boston
-from sklearn.datasets import load_digits
-from sklearn.datasets import load_iris
-from sklearn.datasets import make_multilabel_classification
-from sklearn.metrics import explained_variance_score
-from sklearn.metrics import make_scorer
-from sklearn.metrics import precision_score
-from sklearn.externals import six
-from sklearn.externals.six.moves import zip
-
-from sklearn.linear_model import Ridge
-from sklearn.multiclass import OneVsRestClassifier
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.svm import SVC
-from sklearn.cluster import KMeans
-
-from sklearn.preprocessing import Imputer
-from sklearn.pipeline import Pipeline
-
-
-class MockClassifier(object):
- """Dummy classifier to test the cross-validation"""
-
- def __init__(self, a=0, allow_nd=False):
- self.a = a
- self.allow_nd = allow_nd
-
- def fit(self, X, Y=None, sample_weight=None, class_prior=None,
- sparse_sample_weight=None, sparse_param=None, dummy_int=None,
- dummy_str=None, dummy_obj=None, callback=None):
- """The dummy arguments are to test that this fit function can
- accept non-array arguments through cross-validation, such as:
- - int
- - str (this is actually array-like)
- - object
- - function
- """
- self.dummy_int = dummy_int
- self.dummy_str = dummy_str
- self.dummy_obj = dummy_obj
- if callback is not None:
- callback(self)
-
- if self.allow_nd:
- X = X.reshape(len(X), -1)
- if X.ndim >= 3 and not self.allow_nd:
- raise ValueError('X cannot be d')
- if sample_weight is not None:
- assert_true(sample_weight.shape[0] == X.shape[0],
- 'MockClassifier extra fit_param sample_weight.shape[0]'
- ' is {0}, should be {1}'.format(sample_weight.shape[0],
- X.shape[0]))
- if class_prior is not None:
- assert_true(class_prior.shape[0] == len(np.unique(y)),
- 'MockClassifier extra fit_param class_prior.shape[0]'
- ' is {0}, should be {1}'.format(class_prior.shape[0],
- len(np.unique(y))))
- if sparse_sample_weight is not None:
- fmt = ('MockClassifier extra fit_param sparse_sample_weight'
- '.shape[0] is {0}, should be {1}')
- assert_true(sparse_sample_weight.shape[0] == X.shape[0],
- fmt.format(sparse_sample_weight.shape[0], X.shape[0]))
- if sparse_param is not None:
- fmt = ('MockClassifier extra fit_param sparse_param.shape '
- 'is ({0}, {1}), should be ({2}, {3})')
- assert_true(sparse_param.shape == P_sparse.shape,
- fmt.format(sparse_param.shape[0],
- sparse_param.shape[1],
- P_sparse.shape[0], P_sparse.shape[1]))
- return self
-
- def predict(self, T):
- if self.allow_nd:
- T = T.reshape(len(T), -1)
- return T[:, 0]
-
- def score(self, X=None, Y=None):
- return 1. / (1 + np.abs(self.a))
-
- def get_params(self, deep=False):
- return {'a': self.a, 'allow_nd': self.allow_nd}
-
-X = np.ones((10, 2))
-X_sparse = coo_matrix(X)
-W_sparse = coo_matrix((np.array([1]), (np.array([1]), np.array([0]))),
- shape=(10, 1))
-P_sparse = coo_matrix(np.eye(5))
-
-# avoid StratifiedKFold's Warning about least populated class in y
-y = np.arange(10) % 3
-
-##############################################################################
-# Tests
-
-
-def check_valid_split(train, test, n_samples=None):
- # Use python sets to get more informative assertion failure messages
- train, test = set(train), set(test)
-
- # Train and test split should not overlap
- assert_equal(train.intersection(test), set())
-
- if n_samples is not None:
- # Check that the union of train and test split covers all the indices
- assert_equal(train.union(test), set(range(n_samples)))
-
-
-def check_cv_coverage(cv, expected_n_iter=None, n_samples=None):
- # Check that all the samples appear at least once in a test fold
- if expected_n_iter is not None:
- assert_equal(len(cv), expected_n_iter)
- else:
- expected_n_iter = len(cv)
-
- collected_test_samples = set()
- iterations = 0
- for train, test in cv:
- check_valid_split(train, test, n_samples=n_samples)
- iterations += 1
- collected_test_samples.update(test)
-
- # Check that the accumulated test samples cover the whole dataset
- assert_equal(iterations, expected_n_iter)
- if n_samples is not None:
- assert_equal(collected_test_samples, set(range(n_samples)))
-
-
-def test_kfold_valueerrors():
- # Check that errors are raised if there are not enough samples
- assert_raises(ValueError, cval.KFold, 3, 4)
-
- # Check that a warning is raised if the least populated class has too few
- # members.
- y = [3, 3, -1, -1, 3]
-
- cv = assert_warns_message(Warning, "The least populated class",
- cval.StratifiedKFold, y, 3)
-
- # Check that despite the warning the folds are still computed, even
- # though not all classes are necessarily represented on each side
- # of every split
- check_cv_coverage(cv, expected_n_iter=3, n_samples=len(y))
-
- # Check that errors are raised if the number of samples in any
- # individual class is less than n_folds.
- y = [3, 3, -1, -1, 2]
-
- assert_raises(ValueError, cval.StratifiedKFold, y, 3)
-
- # Error when number of folds is <= 1
- assert_raises(ValueError, cval.KFold, 2, 0)
- assert_raises(ValueError, cval.KFold, 2, 1)
- error_string = ("k-fold cross validation requires at least one"
- " train / test split")
- assert_raise_message(ValueError, error_string,
- cval.StratifiedKFold, y, 0)
- assert_raise_message(ValueError, error_string,
- cval.StratifiedKFold, y, 1)
-
- # When n is not integer:
- assert_raises(ValueError, cval.KFold, 2.5, 2)
-
- # When n_folds is not integer:
- assert_raises(ValueError, cval.KFold, 5, 1.5)
- assert_raises(ValueError, cval.StratifiedKFold, y, 1.5)
-
-
-def test_kfold_indices():
- # Check all indices are returned in the test folds
- kf = cval.KFold(300, 3)
- check_cv_coverage(kf, expected_n_iter=3, n_samples=300)
-
- # Check all indices are returned in the test folds even when equal-sized
- # folds are not possible
- kf = cval.KFold(17, 3)
- check_cv_coverage(kf, expected_n_iter=3, n_samples=17)
-
-
-def test_kfold_no_shuffle():
- # Manually check that KFold preserves the data ordering on toy datasets
- splits = iter(cval.KFold(4, 2))
- train, test = next(splits)
- assert_array_equal(test, [0, 1])
- assert_array_equal(train, [2, 3])
-
- train, test = next(splits)
- assert_array_equal(test, [2, 3])
- assert_array_equal(train, [0, 1])
-
- splits = iter(cval.KFold(5, 2))
- train, test = next(splits)
- assert_array_equal(test, [0, 1, 2])
- assert_array_equal(train, [3, 4])
-
- train, test = next(splits)
- assert_array_equal(test, [3, 4])
- assert_array_equal(train, [0, 1, 2])
-
-
-def test_stratified_kfold_no_shuffle():
- # Manually check that StratifiedKFold preserves the data ordering as much
- # as possible on toy datasets in order to avoid hiding sample dependencies
- # when possible
- splits = iter(cval.StratifiedKFold([1, 1, 0, 0], 2))
- train, test = next(splits)
- assert_array_equal(test, [0, 2])
- assert_array_equal(train, [1, 3])
-
- train, test = next(splits)
- assert_array_equal(test, [1, 3])
- assert_array_equal(train, [0, 2])
-
- splits = iter(cval.StratifiedKFold([1, 1, 1, 0, 0, 0, 0], 2))
- train, test = next(splits)
- assert_array_equal(test, [0, 1, 3, 4])
- assert_array_equal(train, [2, 5, 6])
-
- train, test = next(splits)
- assert_array_equal(test, [2, 5, 6])
- assert_array_equal(train, [0, 1, 3, 4])
-
-
-def test_stratified_kfold_ratios():
- # Check that stratified kfold preserves label ratios in individual splits
- # Repeat with shuffling turned off and on
- n_samples = 1000
- labels = np.array([4] * int(0.10 * n_samples) +
- [0] * int(0.89 * n_samples) +
- [1] * int(0.01 * n_samples))
- for shuffle in [False, True]:
- for train, test in cval.StratifiedKFold(labels, 5, shuffle=shuffle):
- assert_almost_equal(np.sum(labels[train] == 4) / len(train), 0.10,
- 2)
- assert_almost_equal(np.sum(labels[train] == 0) / len(train), 0.89,
- 2)
- assert_almost_equal(np.sum(labels[train] == 1) / len(train), 0.01,
- 2)
- assert_almost_equal(np.sum(labels[test] == 4) / len(test), 0.10, 2)
- assert_almost_equal(np.sum(labels[test] == 0) / len(test), 0.89, 2)
- assert_almost_equal(np.sum(labels[test] == 1) / len(test), 0.01, 2)
-
-
-def test_kfold_balance():
- # Check that KFold returns folds with balanced sizes
- for kf in [cval.KFold(i, 5) for i in range(11, 17)]:
- sizes = []
- for _, test in kf:
- sizes.append(len(test))
-
- assert_true((np.max(sizes) - np.min(sizes)) <= 1)
- assert_equal(np.sum(sizes), kf.n)
-
-
-def test_stratifiedkfold_balance():
- # Check that StratifiedKFold returns folds with balanced sizes (only when
- # stratification is possible)
- # Repeat with shuffling turned off and on
- labels = [0] * 3 + [1] * 14
- for shuffle in [False, True]:
- for skf in [cval.StratifiedKFold(labels[:i], 3, shuffle=shuffle)
- for i in range(11, 17)]:
- sizes = []
- for _, test in skf:
- sizes.append(len(test))
-
- assert_true((np.max(sizes) - np.min(sizes)) <= 1)
- assert_equal(np.sum(sizes), skf.n)
-
-
-def test_shuffle_kfold():
- # Check the indices are shuffled properly, and that all indices are
- # returned in the different test folds
- kf = cval.KFold(300, 3, shuffle=True, random_state=0)
- ind = np.arange(300)
-
- all_folds = None
- for train, test in kf:
- assert_true(np.any(np.arange(100) != ind[test]))
- assert_true(np.any(np.arange(100, 200) != ind[test]))
- assert_true(np.any(np.arange(200, 300) != ind[test]))
-
- if all_folds is None:
- all_folds = ind[test].copy()
- else:
- all_folds = np.concatenate((all_folds, ind[test]))
-
- all_folds.sort()
- assert_array_equal(all_folds, ind)
-
-
-def test_shuffle_stratifiedkfold():
- # Check that shuffling is happening when requested, and for proper
- # sample coverage
- labels = [0] * 20 + [1] * 20
- kf0 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=0))
- kf1 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=1))
- for (_, test0), (_, test1) in zip(kf0, kf1):
- assert_true(set(test0) != set(test1))
- check_cv_coverage(kf0, expected_n_iter=5, n_samples=40)
-
-
-def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372
- # The digits samples are dependent: they are apparently grouped by authors
- # although we don't have any information on the groups segment locations
- # for this data. We can highlight this fact by computing k-fold cross-
- # validation with and without shuffling: we observe that the shuffling case
- # wrongly makes the IID assumption and is therefore too optimistic: it
- # estimates a much higher accuracy (around 0.96) than the non
- # shuffling variant (around 0.86).
-
- digits = load_digits()
- X, y = digits.data[:800], digits.target[:800]
- model = SVC(C=10, gamma=0.005)
- n = len(y)
-
- cv = cval.KFold(n, 5, shuffle=False)
- mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
- assert_greater(0.88, mean_score)
- assert_greater(mean_score, 0.85)
-
- # Shuffling the data artificially breaks the dependency and hides the
- # overfitting of the model with regard to the writing style of the authors
- # by yielding a seriously overestimated score:
-
- cv = cval.KFold(n, 5, shuffle=True, random_state=0)
- mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
- assert_greater(mean_score, 0.95)
-
- cv = cval.KFold(n, 5, shuffle=True, random_state=1)
- mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
- assert_greater(mean_score, 0.95)
-
- # Similarly, StratifiedKFold should try to shuffle the data as little
- # as possible (while respecting the balanced class constraints)
- # and thus be able to detect the dependency by not overestimating
- # the CV score either. As the digits dataset is approximately balanced
- # the estimated mean score is close to the score measured with
- # non-shuffled KFold
-
- cv = cval.StratifiedKFold(y, 5)
- mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
- assert_greater(0.88, mean_score)
- assert_greater(mean_score, 0.85)
-
-
-def test_label_kfold():
- rng = np.random.RandomState(0)
-
- # Parameters of the test
- n_labels = 15
- n_samples = 1000
- n_folds = 5
-
- # Construct the test data
- tolerance = 0.05 * n_samples # 5 percent error allowed
- labels = rng.randint(0, n_labels, n_samples)
- folds = cval.LabelKFold(labels, n_folds=n_folds).idxs
- ideal_n_labels_per_fold = n_samples // n_folds
-
- # Check that folds have approximately the same size
- assert_equal(len(folds), len(labels))
- for i in np.unique(folds):
- assert_greater_equal(tolerance,
- abs(sum(folds == i) - ideal_n_labels_per_fold))
-
- # Check that each label appears only in 1 fold
- for label in np.unique(labels):
- assert_equal(len(np.unique(folds[labels == label])), 1)
-
- # Check that no label is on both sides of the split
- labels = np.asarray(labels, dtype=object)
- for train, test in cval.LabelKFold(labels, n_folds=n_folds):
- assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)
-
- # Construct the test data
- labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
- 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
- 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
- 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
- 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
- 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis',
- 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']
- labels = np.asarray(labels, dtype=object)
-
- n_labels = len(np.unique(labels))
- n_samples = len(labels)
- n_folds = 5
- tolerance = 0.05 * n_samples # 5 percent error allowed
- folds = cval.LabelKFold(labels, n_folds=n_folds).idxs
- ideal_n_labels_per_fold = n_samples // n_folds
-
- # Check that folds have approximately the same size
- assert_equal(len(folds), len(labels))
- for i in np.unique(folds):
- assert_greater_equal(tolerance,
- abs(sum(folds == i) - ideal_n_labels_per_fold))
-
- # Check that each label appears only in 1 fold
- for label in np.unique(labels):
- assert_equal(len(np.unique(folds[labels == label])), 1)
-
- # Check that no label is on both sides of the split
- for train, test in cval.LabelKFold(labels, n_folds=n_folds):
- assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)
-
- # Should fail if there are more folds than labels
- labels = np.array([1, 1, 1, 2, 2])
- assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3)
-
-
-def test_shuffle_split():
- ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
- ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)
- ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0)
- for typ in six.integer_types:
- ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0)
- for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
- assert_array_equal(t1[0], t2[0])
- assert_array_equal(t2[0], t3[0])
- assert_array_equal(t3[0], t4[0])
- assert_array_equal(t1[1], t2[1])
- assert_array_equal(t2[1], t3[1])
- assert_array_equal(t3[1], t4[1])
-
-
-def test_stratified_shuffle_split_init():
- y = np.asarray([0, 1, 1, 1, 2, 2, 2])
- # Check that error is raised if there is a class with only one sample
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.2)
-
- # Check that error is raised if the test set size is smaller than n_classes
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 2)
- # Check that error is raised if the train set size is smaller than
- # n_classes
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 3, 2)
-
- y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
- # Check that errors are raised if there are not enough samples
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.5, 0.6)
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 8, 0.6)
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.6, 8)
-
- # Train size or test size too small
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, train_size=2)
- assert_raises(ValueError, cval.StratifiedShuffleSplit, y, test_size=2)
-
-
-def test_stratified_shuffle_split_iter():
- ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
- np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
- np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
- np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
- np.array([-1] * 800 + [1] * 50)
- ]
-
- for y in ys:
- sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
- random_state=0)
- test_size = np.ceil(0.33 * len(y))
- train_size = len(y) - test_size
- for train, test in sss:
- assert_array_equal(np.unique(y[train]), np.unique(y[test]))
- # Checks if folds keep classes proportions
- p_train = (np.bincount(np.unique(y[train],
- return_inverse=True)[1]) /
- float(len(y[train])))
- p_test = (np.bincount(np.unique(y[test],
- return_inverse=True)[1]) /
- float(len(y[test])))
- assert_array_almost_equal(p_train, p_test, 1)
- assert_equal(len(train) + len(test), y.size)
- assert_equal(len(train), train_size)
- assert_equal(len(test), test_size)
- assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
-
-
-def test_stratified_shuffle_split_even():
- # Test that StratifiedShuffleSplit draws each index with an
- # approximately equal chance
- n_folds = 5
- n_iter = 1000
-
- def assert_counts_are_ok(idx_counts, p):
- # Here we test that the distribution of the counts
- # per index is close enough to a binomial
- threshold = 0.05 / n_splits
- bf = stats.binom(n_splits, p)
- for count in idx_counts:
- p = bf.pmf(count)
- assert_true(p > threshold,
- "An index is not drawn with chance corresponding "
- "to even draws")
-
- for n_samples in (6, 22):
- labels = np.array((n_samples // 2) * [0, 1])
- splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter,
- test_size=1. / n_folds,
- random_state=0)
-
- train_counts = [0] * n_samples
- test_counts = [0] * n_samples
- n_splits = 0
- for train, test in splits:
- n_splits += 1
- for counter, ids in [(train_counts, train), (test_counts, test)]:
- for id in ids:
- counter[id] += 1
- assert_equal(n_splits, n_iter)
-
- assert_equal(len(train), splits.n_train)
- assert_equal(len(test), splits.n_test)
- assert_equal(len(set(train).intersection(test)), 0)
-
- label_counts = np.unique(labels)
- assert_equal(splits.test_size, 1.0 / n_folds)
- assert_equal(splits.n_train + splits.n_test, len(labels))
- assert_equal(len(label_counts), 2)
- ex_test_p = float(splits.n_test) / n_samples
- ex_train_p = float(splits.n_train) / n_samples
-
- assert_counts_are_ok(train_counts, ex_train_p)
- assert_counts_are_ok(test_counts, ex_test_p)
-
-
-def test_stratified_shuffle_split_overlap_train_test_bug():
- # See https://github.com/scikit-learn/scikit-learn/issues/6121 for
- # the original bug report
- labels = [0, 1, 2, 3] * 3 + [4, 5] * 5
-
- splits = cval.StratifiedShuffleSplit(labels, n_iter=1,
- test_size=0.5, random_state=0)
- train, test = next(iter(splits))
-
- assert_array_equal(np.intersect1d(train, test), [])
-
-
-def test_predefinedsplit_with_kfold_split():
- # Check that PredefinedSplit can reproduce a split generated by KFold.
- folds = -1 * np.ones(10)
- kf_train = []
- kf_test = []
- for i, (train_ind, test_ind) in enumerate(cval.KFold(10, 5, shuffle=True)):
- kf_train.append(train_ind)
- kf_test.append(test_ind)
- folds[test_ind] = i
- ps_train = []
- ps_test = []
- ps = cval.PredefinedSplit(folds)
- for train_ind, test_ind in ps:
- ps_train.append(train_ind)
- ps_test.append(test_ind)
- assert_array_equal(ps_train, kf_train)
- assert_array_equal(ps_test, kf_test)
-
-
-def test_label_shuffle_split():
- ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
- np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
- np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
- np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
- ]
-
- for y in ys:
- n_iter = 6
- test_size = 1. / 3
- slo = cval.LabelShuffleSplit(y, n_iter, test_size=test_size,
- random_state=0)
-
- # Make sure the repr works
- repr(slo)
-
- # Test that the length is correct
- assert_equal(len(slo), n_iter)
-
- y_unique = np.unique(y)
-
- for train, test in slo:
- # First test: no train label is in the test set and vice versa
- y_train_unique = np.unique(y[train])
- y_test_unique = np.unique(y[test])
- assert_false(np.any(np.in1d(y[train], y_test_unique)))
- assert_false(np.any(np.in1d(y[test], y_train_unique)))
-
- # Second test: train and test add up to all the data
- assert_equal(y[train].size + y[test].size, y.size)
-
- # Third test: train and test are disjoint
- assert_array_equal(np.intersect1d(train, test), [])
-
- # Fourth test: the number of unique train and test labels is
- # correct, +- 1 for rounding error
- assert_true(abs(len(y_test_unique) -
- round(test_size * len(y_unique))) <= 1)
- assert_true(abs(len(y_train_unique) -
- round((1.0 - test_size) * len(y_unique))) <= 1)
-
-
-def test_leave_label_out_changing_labels():
- # Check that LeaveOneLabelOut and LeavePLabelOut work normally if
- # the labels variable is changed before calling __iter__
- labels = np.array([0, 1, 2, 1, 1, 2, 0, 0])
- labels_changing = np.array(labels, copy=True)
- lolo = cval.LeaveOneLabelOut(labels)
- lolo_changing = cval.LeaveOneLabelOut(labels_changing)
- lplo = cval.LeavePLabelOut(labels, p=2)
- lplo_changing = cval.LeavePLabelOut(labels_changing, p=2)
- labels_changing[:] = 0
- for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]:
- for (train, test), (train_chan, test_chan) in zip(llo, llo_changing):
- assert_array_equal(train, train_chan)
- assert_array_equal(test, test_chan)
-
-
-def test_cross_val_score():
- clf = MockClassifier()
- for a in range(-10, 10):
- clf.a = a
- # Smoke test
- scores = cval.cross_val_score(clf, X, y)
- assert_array_equal(scores, clf.score(X, y))
-
- # test with multioutput y
- scores = cval.cross_val_score(clf, X_sparse, X)
- assert_array_equal(scores, clf.score(X_sparse, X))
-
- scores = cval.cross_val_score(clf, X_sparse, y)
- assert_array_equal(scores, clf.score(X_sparse, y))
-
- # test with multioutput y
- scores = cval.cross_val_score(clf, X_sparse, X)
- assert_array_equal(scores, clf.score(X_sparse, X))
-
- # test with X and y as list
- list_check = lambda x: isinstance(x, list)
- clf = CheckingClassifier(check_X=list_check)
- scores = cval.cross_val_score(clf, X.tolist(), y.tolist())
-
- clf = CheckingClassifier(check_y=list_check)
- scores = cval.cross_val_score(clf, X, y.tolist())
-
- assert_raises(ValueError, cval.cross_val_score, clf, X, y,
- scoring="sklearn")
-
- # test with 3d X
- X_3d = X[:, :, np.newaxis]
- clf = MockClassifier(allow_nd=True)
- scores = cval.cross_val_score(clf, X_3d, y)
-
- clf = MockClassifier(allow_nd=False)
- assert_raises(ValueError, cval.cross_val_score, clf, X_3d, y)
-
-
-def test_cross_val_score_pandas():
- # check cross_val_score doesn't destroy pandas dataframe
- types = [(MockDataFrame, MockDataFrame)]
- try:
- from pandas import Series, DataFrame
- types.append((Series, DataFrame))
- except ImportError:
- pass
- for TargetType, InputFeatureType in types:
- # X dataframe, y series
- X_df, y_ser = InputFeatureType(X), TargetType(y)
- check_df = lambda x: isinstance(x, InputFeatureType)
- check_series = lambda x: isinstance(x, TargetType)
- clf = CheckingClassifier(check_X=check_df, check_y=check_series)
- cval.cross_val_score(clf, X_df, y_ser)
-
-
-def test_cross_val_score_mask():
- # test that cross_val_score works with boolean masks
- svm = SVC(kernel="linear")
- iris = load_iris()
- X, y = iris.data, iris.target
- cv_indices = cval.KFold(len(y), 5)
- scores_indices = cval.cross_val_score(svm, X, y, cv=cv_indices)
- cv_indices = cval.KFold(len(y), 5)
- cv_masks = []
- for train, test in cv_indices:
- mask_train = np.zeros(len(y), dtype=np.bool)
- mask_test = np.zeros(len(y), dtype=np.bool)
- mask_train[train] = 1
- mask_test[test] = 1
- cv_masks.append((mask_train, mask_test))
- scores_masks = cval.cross_val_score(svm, X, y, cv=cv_masks)
- assert_array_equal(scores_indices, scores_masks)
-
-
-def test_cross_val_score_precomputed():
- # test for svm with precomputed kernel
- svm = SVC(kernel="precomputed")
- iris = load_iris()
- X, y = iris.data, iris.target
- linear_kernel = np.dot(X, X.T)
- score_precomputed = cval.cross_val_score(svm, linear_kernel, y)
- svm = SVC(kernel="linear")
- score_linear = cval.cross_val_score(svm, X, y)
- assert_array_equal(score_precomputed, score_linear)
-
- # Error raised for non-square X
- svm = SVC(kernel="precomputed")
- assert_raises(ValueError, cval.cross_val_score, svm, X, y)
-
- # test error is raised when the precomputed kernel is not array-like
- # or sparse
- assert_raises(ValueError, cval.cross_val_score, svm,
- linear_kernel.tolist(), y)
-
-
-def test_cross_val_score_fit_params():
- clf = MockClassifier()
- n_samples = X.shape[0]
- n_classes = len(np.unique(y))
-
- DUMMY_INT = 42
- DUMMY_STR = '42'
- DUMMY_OBJ = object()
-
- def assert_fit_params(clf):
- # Function to test that the values are passed correctly to the
- # classifier arguments for non-array type
-
- assert_equal(clf.dummy_int, DUMMY_INT)
- assert_equal(clf.dummy_str, DUMMY_STR)
- assert_equal(clf.dummy_obj, DUMMY_OBJ)
-
- fit_params = {'sample_weight': np.ones(n_samples),
- 'class_prior': np.ones(n_classes) / n_classes,
- 'sparse_sample_weight': W_sparse,
- 'sparse_param': P_sparse,
- 'dummy_int': DUMMY_INT,
- 'dummy_str': DUMMY_STR,
- 'dummy_obj': DUMMY_OBJ,
- 'callback': assert_fit_params}
- cval.cross_val_score(clf, X, y, fit_params=fit_params)
-
-
-def test_cross_val_score_score_func():
- clf = MockClassifier()
- _score_func_args = []
-
- def score_func(y_test, y_predict):
- _score_func_args.append((y_test, y_predict))
- return 1.0
-
- with warnings.catch_warnings(record=True):
- scoring = make_scorer(score_func)
- score = cval.cross_val_score(clf, X, y, scoring=scoring)
- assert_array_equal(score, [1.0, 1.0, 1.0])
- assert len(_score_func_args) == 3
-
-
-def test_cross_val_score_errors():
- class BrokenEstimator:
- pass
-
- assert_raises(TypeError, cval.cross_val_score, BrokenEstimator(), X)
-
-
-def test_train_test_split_errors():
- assert_raises(ValueError, cval.train_test_split)
- assert_raises(ValueError, cval.train_test_split, range(3), train_size=1.1)
- assert_raises(ValueError, cval.train_test_split, range(3), test_size=0.6,
- train_size=0.6)
- assert_raises(ValueError, cval.train_test_split, range(3),
- test_size=np.float32(0.6), train_size=np.float32(0.6))
- assert_raises(ValueError, cval.train_test_split, range(3),
- test_size="wrong_type")
- assert_raises(ValueError, cval.train_test_split, range(3), test_size=2,
- train_size=4)
- assert_raises(TypeError, cval.train_test_split, range(3),
- some_argument=1.1)
- assert_raises(ValueError, cval.train_test_split, range(3), range(42))
-
-
-def test_train_test_split():
- X = np.arange(100).reshape((10, 10))
- X_s = coo_matrix(X)
- y = np.arange(10)
-
- # simple test
- split = cval.train_test_split(X, y, test_size=None, train_size=.5)
- X_train, X_test, y_train, y_test = split
- assert_equal(len(y_test), len(y_train))
- # test correspondence of X and y
- assert_array_equal(X_train[:, 0], y_train * 10)
- assert_array_equal(X_test[:, 0], y_test * 10)
-
- # conversion of lists to arrays (deprecated?)
- with warnings.catch_warnings(record=True):
- split = cval.train_test_split(X, X_s, y.tolist())
- X_train, X_test, X_s_train, X_s_test, y_train, y_test = split
- assert_array_equal(X_train, X_s_train.toarray())
- assert_array_equal(X_test, X_s_test.toarray())
-
- # don't convert lists to anything else by default
- split = cval.train_test_split(X, X_s, y.tolist())
- X_train, X_test, X_s_train, X_s_test, y_train, y_test = split
- assert_true(isinstance(y_train, list))
- assert_true(isinstance(y_test, list))
-
- # allow nd-arrays
- X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2)
- y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11)
- split = cval.train_test_split(X_4d, y_3d)
- assert_equal(split[0].shape, (7, 5, 3, 2))
- assert_equal(split[1].shape, (3, 5, 3, 2))
- assert_equal(split[2].shape, (7, 7, 11))
- assert_equal(split[3].shape, (3, 7, 11))
-
- # test stratification option
- y = np.array([1, 1, 1, 1, 2, 2, 2, 2])
- for test_size, exp_test_size in zip([2, 4, 0.25, 0.5, 0.75],
- [2, 4, 2, 4, 6]):
- train, test = cval.train_test_split(y,
- test_size=test_size,
- stratify=y,
- random_state=0)
- assert_equal(len(test), exp_test_size)
- assert_equal(len(test) + len(train), len(y))
- # check the 1:1 ratio of ones and twos in the data is preserved
- assert_equal(np.sum(train == 1), np.sum(train == 2))
-
-
-def test_train_test_split_pandas():
- # check train_test_split doesn't destroy pandas dataframes
- types = [MockDataFrame]
- try:
- from pandas import DataFrame
- types.append(DataFrame)
- except ImportError:
- pass
- for InputFeatureType in types:
- # X dataframe
- X_df = InputFeatureType(X)
- X_train, X_test = cval.train_test_split(X_df)
- assert_true(isinstance(X_train, InputFeatureType))
- assert_true(isinstance(X_test, InputFeatureType))
-
-
-def test_train_test_split_mock_pandas():
- # X mock dataframe
- X_df = MockDataFrame(X)
- X_train, X_test = cval.train_test_split(X_df)
- assert_true(isinstance(X_train, MockDataFrame))
- assert_true(isinstance(X_test, MockDataFrame))
-
-
-def test_cross_val_score_with_score_func_classification():
- iris = load_iris()
- clf = SVC(kernel='linear')
-
- # Default score (should be the accuracy score)
- scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5)
- assert_array_almost_equal(scores, [0.97, 1., 0.97, 0.97, 1.], 2)
-
- # Correct classification score (aka. zero / one score) - should be the
- # same as the default estimator score
- zo_scores = cval.cross_val_score(clf, iris.data, iris.target,
- scoring="accuracy", cv=5)
- assert_array_almost_equal(zo_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
-
- # F1 score (classes are balanced, so f1_score should be equal to the
- # zero/one score)
- f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
- scoring="f1_weighted", cv=5)
- assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
-
-
-def test_cross_val_score_with_score_func_regression():
- X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
- random_state=0)
- reg = Ridge()
-
- # Default score of the Ridge regression estimator
- scores = cval.cross_val_score(reg, X, y, cv=5)
- assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
-
- # R2 score (aka. determination coefficient) - should be the
- # same as the default estimator score
- r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
- assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
-
- # Mean squared error; this is a loss function, so "scores" are negative
- neg_mse_scores = cval.cross_val_score(reg, X, y, cv=5,
- scoring="neg_mean_squared_error")
- expected_neg_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
- assert_array_almost_equal(neg_mse_scores, expected_neg_mse, 2)
-
- # Explained variance
- scoring = make_scorer(explained_variance_score)
- ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=scoring)
- assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
-
-
-def test_permutation_score():
- iris = load_iris()
- X = iris.data
- X_sparse = coo_matrix(X)
- y = iris.target
- svm = SVC(kernel='linear')
- cv = cval.StratifiedKFold(y, 2)
-
- score, scores, pvalue = cval.permutation_test_score(
- svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
- assert_greater(score, 0.9)
- assert_almost_equal(pvalue, 0.0, 1)
-
- score_label, _, pvalue_label = cval.permutation_test_score(
- svm, X, y, n_permutations=30, cv=cv, scoring="accuracy",
- labels=np.ones(y.size), random_state=0)
- assert_true(score_label == score)
- assert_true(pvalue_label == pvalue)
-
- # check that we obtain the same results with a sparse representation
- svm_sparse = SVC(kernel='linear')
- cv_sparse = cval.StratifiedKFold(y, 2)
- score_label, _, pvalue_label = cval.permutation_test_score(
- svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse,
- scoring="accuracy", labels=np.ones(y.size), random_state=0)
-
- assert_true(score_label == score)
- assert_true(pvalue_label == pvalue)
-
- # test with custom scoring object
- def custom_score(y_true, y_pred):
- return (((y_true == y_pred).sum() - (y_true != y_pred).sum())
- / y_true.shape[0])
-
- scorer = make_scorer(custom_score)
- score, _, pvalue = cval.permutation_test_score(
- svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0)
- assert_almost_equal(score, .93, 2)
- assert_almost_equal(pvalue, 0.01, 3)
-
- # set random y
- y = np.mod(np.arange(len(y)), 3)
-
- score, scores, pvalue = cval.permutation_test_score(
- svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
-
- assert_less(score, 0.5)
- assert_greater(pvalue, 0.2)
-
-
-def test_cross_val_generator_with_indices():
- X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
- y = np.array([1, 1, 2, 2])
- labels = np.array([1, 2, 3, 4])
- # explicitly passing indices value is deprecated
- loo = cval.LeaveOneOut(4)
- lpo = cval.LeavePOut(4, 2)
- kf = cval.KFold(4, 2)
- skf = cval.StratifiedKFold(y, 2)
- lolo = cval.LeaveOneLabelOut(labels)
- lopo = cval.LeavePLabelOut(labels, 2)
- ps = cval.PredefinedSplit([1, 1, 2, 2])
- ss = cval.ShuffleSplit(2)
- for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
- for train, test in cv:
- assert_not_equal(np.asarray(train).dtype.kind, 'b')
- assert_not_equal(np.asarray(test).dtype.kind, 'b')
- X[train], X[test]
- y[train], y[test]
-
-
-@ignore_warnings
-def test_cross_val_generator_with_default_indices():
- X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
- y = np.array([1, 1, 2, 2])
- labels = np.array([1, 2, 3, 4])
- loo = cval.LeaveOneOut(4)
- lpo = cval.LeavePOut(4, 2)
- kf = cval.KFold(4, 2)
- skf = cval.StratifiedKFold(y, 2)
- lolo = cval.LeaveOneLabelOut(labels)
- lopo = cval.LeavePLabelOut(labels, 2)
- ss = cval.ShuffleSplit(2)
- ps = cval.PredefinedSplit([1, 1, 2, 2])
- for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
- for train, test in cv:
- assert_not_equal(np.asarray(train).dtype.kind, 'b')
- assert_not_equal(np.asarray(test).dtype.kind, 'b')
- X[train], X[test]
- y[train], y[test]
-
-
-def test_shufflesplit_errors():
- assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=2.0)
- assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=1.0)
- assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=0.1,
- train_size=0.95)
- assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=11)
- assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10)
- assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3)
- assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j)
- assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None,
- train_size=None)
-
-
-def test_shufflesplit_reproducible():
- # Check that iterating twice on the ShuffleSplit gives the same
- # sequence of train-test when the random_state is given
- ss = cval.ShuffleSplit(10, random_state=21)
- assert_array_equal(list(a for a, b in ss), list(a for a, b in ss))
-
-
-def test_safe_split_with_precomputed_kernel():
- clf = SVC(gamma="scale")
- clfp = SVC(kernel="precomputed")
-
- iris = load_iris()
- X, y = iris.data, iris.target
- K = np.dot(X, X.T)
-
- cv = cval.ShuffleSplit(X.shape[0], test_size=0.25, random_state=0)
- tr, te = list(cv)[0]
-
- X_tr, y_tr = cval._safe_split(clf, X, y, tr)
- K_tr, y_tr2 = cval._safe_split(clfp, K, y, tr)
- assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T))
-
- X_te, y_te = cval._safe_split(clf, X, y, te, tr)
- K_te, y_te2 = cval._safe_split(clfp, K, y, te, tr)
- assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T))
-
-
-def test_cross_val_score_allow_nans():
- # Check that cross_val_score allows input data with NaNs
- X = np.arange(200, dtype=np.float64).reshape(10, -1)
- X[2, :] = np.nan
- y = np.repeat([0, 1], X.shape[0] / 2)
- p = Pipeline([
- ('imputer', Imputer(strategy='mean', missing_values='NaN')),
- ('classifier', MockClassifier()),
- ])
- cval.cross_val_score(p, X, y, cv=5)
-
-
-def test_train_test_split_allow_nans():
- # Check that train_test_split allows input data with NaNs
- X = np.arange(200, dtype=np.float64).reshape(10, -1)
- X[2, :] = np.nan
- y = np.repeat([0, 1], X.shape[0] / 2)
- cval.train_test_split(X, y, test_size=0.2, random_state=42)
-
-
-def test_permutation_test_score_allow_nans():
- # Check that permutation_test_score allows input data with NaNs
- X = np.arange(200, dtype=np.float64).reshape(10, -1)
- X[2, :] = np.nan
- y = np.repeat([0, 1], X.shape[0] / 2)
- p = Pipeline([
- ('imputer', Imputer(strategy='mean', missing_values='NaN')),
- ('classifier', MockClassifier()),
- ])
- cval.permutation_test_score(p, X, y, cv=5)
-
-
-def test_check_cv_return_types():
- X = np.ones((9, 2))
- cv = cval.check_cv(3, X, classifier=False)
- assert_true(isinstance(cv, cval.KFold))
-
- y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
- cv = cval.check_cv(3, X, y_binary, classifier=True)
- assert_true(isinstance(cv, cval.StratifiedKFold))
-
- y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
- cv = cval.check_cv(3, X, y_multiclass, classifier=True)
- assert_true(isinstance(cv, cval.StratifiedKFold))
-
- X = np.ones((5, 2))
- y_multilabel = [[1, 0, 1], [1, 1, 0], [0, 0, 0], [0, 1, 1], [1, 0, 0]]
- cv = cval.check_cv(3, X, y_multilabel, classifier=True)
- assert_true(isinstance(cv, cval.KFold))
-
- y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
- cv = cval.check_cv(3, X, y_multioutput, classifier=True)
- assert_true(isinstance(cv, cval.KFold))
-
-
-def test_cross_val_score_multilabel():
- X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
- [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
- y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
- [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
- clf = KNeighborsClassifier(n_neighbors=1)
- scoring_micro = make_scorer(precision_score, average='micro')
- scoring_macro = make_scorer(precision_score, average='macro')
- scoring_samples = make_scorer(precision_score, average='samples')
- score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
- score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
- score_samples = cval.cross_val_score(clf, X, y,
- scoring=scoring_samples, cv=5)
- assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
- assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
- assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
-
-
-def test_cross_val_predict():
- boston = load_boston()
- X, y = boston.data, boston.target
- cv = cval.KFold(len(boston.target))
-
- est = Ridge()
-
- # Naive loop (should be same as cross_val_predict):
- preds2 = np.zeros_like(y)
- for train, test in cv:
- est.fit(X[train], y[train])
- preds2[test] = est.predict(X[test])
-
- preds = cval.cross_val_predict(est, X, y, cv=cv)
- assert_array_almost_equal(preds, preds2)
-
- preds = cval.cross_val_predict(est, X, y)
- assert_equal(len(preds), len(y))
-
- cv = cval.LeaveOneOut(len(y))
- preds = cval.cross_val_predict(est, X, y, cv=cv)
- assert_equal(len(preds), len(y))
-
- Xsp = X.copy()
- Xsp *= (Xsp > np.median(Xsp))
- Xsp = coo_matrix(Xsp)
- preds = cval.cross_val_predict(est, Xsp, y)
- assert_equal(len(preds), len(y))
-
- preds = cval.cross_val_predict(KMeans(), X)
- assert_equal(len(preds), len(y))
-
- def bad_cv():
- for i in range(4):
- yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8])
-
- assert_raises(ValueError, cval.cross_val_predict, est, X, y, cv=bad_cv())
-
-
-def test_cross_val_predict_input_types():
- clf = Ridge()
- # Smoke test
- predictions = cval.cross_val_predict(clf, X, y)
- assert_equal(predictions.shape, (10,))
-
- # test with multioutput y
- with ignore_warnings(category=ConvergenceWarning):
- predictions = cval.cross_val_predict(clf, X_sparse, X)
- assert_equal(predictions.shape, (10, 2))
-
- predictions = cval.cross_val_predict(clf, X_sparse, y)
- assert_array_equal(predictions.shape, (10,))
-
- # test with multioutput y
- with ignore_warnings(category=ConvergenceWarning):
- predictions = cval.cross_val_predict(clf, X_sparse, X)
- assert_array_equal(predictions.shape, (10, 2))
-
- # test with X and y as list
- list_check = lambda x: isinstance(x, list)
- clf = CheckingClassifier(check_X=list_check)
- predictions = cval.cross_val_predict(clf, X.tolist(), y.tolist())
-
- clf = CheckingClassifier(check_y=list_check)
- predictions = cval.cross_val_predict(clf, X, y.tolist())
-
- # test with 3d X
- X_3d = X[:, :, np.newaxis]
- check_3d = lambda x: x.ndim == 3
- clf = CheckingClassifier(check_X=check_3d)
- predictions = cval.cross_val_predict(clf, X_3d, y)
- assert_array_equal(predictions.shape, (10,))
-
-
-def test_cross_val_predict_pandas():
- # check cross_val_predict doesn't destroy pandas dataframe
- types = [(MockDataFrame, MockDataFrame)]
- try:
- from pandas import Series, DataFrame
- types.append((Series, DataFrame))
- except ImportError:
- pass
- for TargetType, InputFeatureType in types:
- # X dataframe, y series
- X_df, y_ser = InputFeatureType(X), TargetType(y)
- check_df = lambda x: isinstance(x, InputFeatureType)
- check_series = lambda x: isinstance(x, TargetType)
- clf = CheckingClassifier(check_X=check_df, check_y=check_series)
- cval.cross_val_predict(clf, X_df, y_ser)
-
-
-def test_sparse_fit_params():
- iris = load_iris()
- X, y = iris.data, iris.target
- clf = MockClassifier()
- fit_params = {'sparse_sample_weight': coo_matrix(np.eye(X.shape[0]))}
- a = cval.cross_val_score(clf, X, y, fit_params=fit_params)
- assert_array_equal(a, np.ones(3))
-
-
-def test_check_is_partition():
- p = np.arange(100)
- assert_true(cval._check_is_partition(p, 100))
- assert_false(cval._check_is_partition(np.delete(p, 23), 100))
-
- p[0] = 23
- assert_false(cval._check_is_partition(p, 100))
-
-
-def test_cross_val_predict_sparse_prediction():
- # check that cross_val_predict gives same result for sparse and dense input
- X, y = make_multilabel_classification(n_classes=2, n_labels=1,
- allow_unlabeled=False,
- return_indicator=True,
- random_state=1)
- X_sparse = csr_matrix(X)
- y_sparse = csr_matrix(y)
- classif = OneVsRestClassifier(SVC(kernel='linear'))
- preds = cval.cross_val_predict(classif, X, y, cv=10)
- preds_sparse = cval.cross_val_predict(classif, X_sparse, y_sparse, cv=10)
- preds_sparse = preds_sparse.toarray()
- assert_array_almost_equal(preds_sparse, preds)
diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py
deleted file mode 100644
index 7a42757daea89..0000000000000
--- a/sklearn/tests/test_grid_search.py
+++ /dev/null
@@ -1,815 +0,0 @@
-"""
-Testing for grid search module (sklearn.grid_search)
-
-"""
-
-from collections import Iterable, Sized
-from sklearn.externals.six.moves import cStringIO as StringIO
-from sklearn.externals.six.moves import xrange
-from itertools import chain, product
-import pickle
-import warnings
-import sys
-
-import numpy as np
-import scipy.sparse as sp
-
-from sklearn.utils.testing import assert_equal
-from sklearn.utils.testing import assert_not_equal
-from sklearn.utils.testing import assert_raises
-from sklearn.utils.testing import assert_warns
-from sklearn.utils.testing import assert_raise_message
-from sklearn.utils.testing import assert_false, assert_true
-from sklearn.utils.testing import assert_array_equal
-from sklearn.utils.testing import assert_almost_equal
-from sklearn.utils.testing import assert_array_almost_equal
-from sklearn.utils.testing import assert_no_warnings
-from sklearn.utils.testing import ignore_warnings
-from sklearn.utils.mocking import CheckingClassifier, MockDataFrame
-
-from scipy.stats import bernoulli, expon, uniform
-
-from sklearn.externals.six.moves import zip
-from sklearn.base import BaseEstimator
-from sklearn.datasets import make_classification
-from sklearn.datasets import make_blobs
-from sklearn.datasets import make_multilabel_classification
-from sklearn.svm import LinearSVC, SVC
-from sklearn.tree import DecisionTreeRegressor
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.cluster import KMeans
-from sklearn.neighbors import KernelDensity
-from sklearn.metrics import f1_score
-from sklearn.metrics import make_scorer
-from sklearn.metrics import roc_auc_score
-from sklearn.linear_model import Ridge
-
-from sklearn.exceptions import FitFailedWarning
-
-with warnings.catch_warnings():
- warnings.simplefilter('ignore')
- from sklearn.grid_search import (GridSearchCV, RandomizedSearchCV,
- ParameterGrid, ParameterSampler)
- from sklearn.cross_validation import KFold, StratifiedKFold
-
-from sklearn.preprocessing import Imputer
-from sklearn.pipeline import Pipeline
-
-
-# Neither of the following two estimators inherits from BaseEstimator,
-# to test hyperparameter search on user-defined classifiers.
-class MockClassifier(object):
- """Dummy classifier to test the cross-validation"""
- def __init__(self, foo_param=0):
- self.foo_param = foo_param
-
- def fit(self, X, Y):
- assert_true(len(X) == len(Y))
- return self
-
- def predict(self, T):
- return T.shape[0]
-
- def transform(self, X):
- return X - self.foo_param
-
- def inverse_transform(self, X):
- return X + self.foo_param
-
- predict_proba = predict
- decision_function = predict
-
- def score(self, X=None, Y=None):
- if self.foo_param > 1:
- score = 1.
- else:
- score = 0.
- return score
-
- def get_params(self, deep=False):
- return {'foo_param': self.foo_param}
-
- def set_params(self, **params):
- self.foo_param = params['foo_param']
- return self
-
-
-class LinearSVCNoScore(LinearSVC):
- """An LinearSVC classifier that has no score method."""
- @property
- def score(self):
- raise AttributeError
-
-X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
-y = np.array([1, 1, 2, 2])
-
-
-def assert_grid_iter_equals_getitem(grid):
- assert_equal(list(grid), [grid[i] for i in range(len(grid))])
-
-
-def test_parameter_grid():
- # Test basic properties of ParameterGrid.
- params1 = {"foo": [1, 2, 3]}
- grid1 = ParameterGrid(params1)
- assert_true(isinstance(grid1, Iterable))
- assert_true(isinstance(grid1, Sized))
- assert_equal(len(grid1), 3)
- assert_grid_iter_equals_getitem(grid1)
-
- params2 = {"foo": [4, 2],
- "bar": ["ham", "spam", "eggs"]}
- grid2 = ParameterGrid(params2)
- assert_equal(len(grid2), 6)
-
- # loop to assert we can iterate over the grid multiple times
- for i in xrange(2):
- # tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2)
- points = set(tuple(chain(*(sorted(p.items())))) for p in grid2)
- assert_equal(points,
- set(("bar", x, "foo", y)
- for x, y in product(params2["bar"], params2["foo"])))
-
- assert_grid_iter_equals_getitem(grid2)
-
- # Special case: empty grid (useful to get default estimator settings)
- empty = ParameterGrid({})
- assert_equal(len(empty), 1)
- assert_equal(list(empty), [{}])
- assert_grid_iter_equals_getitem(empty)
- assert_raises(IndexError, lambda: empty[1])
-
- has_empty = ParameterGrid([{'C': [1, 10]}, {}, {'C': [.5]}])
- assert_equal(len(has_empty), 4)
- assert_equal(list(has_empty), [{'C': 1}, {'C': 10}, {}, {'C': .5}])
- assert_grid_iter_equals_getitem(has_empty)
-
-
-def test_grid_search():
- # Test that the best estimator contains the right value for foo_param
- clf = MockClassifier()
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3)
- # make sure it selects the smallest parameter in case of ties
- old_stdout = sys.stdout
- sys.stdout = StringIO()
- grid_search.fit(X, y)
- sys.stdout = old_stdout
- assert_equal(grid_search.best_estimator_.foo_param, 2)
-
- for i, foo_i in enumerate([1, 2, 3]):
- assert_true(grid_search.grid_scores_[i][0]
- == {'foo_param': foo_i})
- # Smoke test the score etc:
- grid_search.score(X, y)
- grid_search.predict_proba(X)
- grid_search.decision_function(X)
- grid_search.transform(X)
-
- # Test exception handling on scoring
- grid_search.scoring = 'sklearn'
- assert_raises(ValueError, grid_search.fit, X, y)
-
-
-def test_transform_inverse_transform_round_trip():
- clf = MockClassifier()
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3)
- grid_search.fit(X, y)
- X_round_trip = grid_search.inverse_transform(grid_search.transform(X))
- assert_array_equal(X, X_round_trip)
-
-
-@ignore_warnings
-def test_grid_search_no_score():
- # Test grid-search on classifier that has no score function.
- clf = LinearSVC(random_state=0)
- X, y = make_blobs(random_state=0, centers=2)
- Cs = [.1, 1, 10]
- clf_no_score = LinearSVCNoScore(random_state=0)
- grid_search = GridSearchCV(clf, {'C': Cs}, scoring='accuracy')
- grid_search.fit(X, y)
-
- grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs},
- scoring='accuracy')
- # smoketest grid search
- grid_search_no_score.fit(X, y)
-
- # check that best params are equal
- assert_equal(grid_search_no_score.best_params_, grid_search.best_params_)
- # check that we can call score and that it gives the correct result
- assert_equal(grid_search.score(X, y), grid_search_no_score.score(X, y))
-
- # giving no scoring function raises an error
- grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs})
- assert_raise_message(TypeError, "no scoring", grid_search_no_score.fit,
- [[1]])
-
-
-def test_grid_search_score_method():
- X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2,
- random_state=0)
- clf = LinearSVC(random_state=0)
- grid = {'C': [.1]}
-
- search_no_scoring = GridSearchCV(clf, grid, scoring=None).fit(X, y)
- search_accuracy = GridSearchCV(clf, grid, scoring='accuracy').fit(X, y)
- search_no_score_method_auc = GridSearchCV(LinearSVCNoScore(), grid,
- scoring='roc_auc').fit(X, y)
- search_auc = GridSearchCV(clf, grid, scoring='roc_auc').fit(X, y)
-
- # ChangedBehaviourWarning occurred previously (prior to #9005)
- score_no_scoring = assert_no_warnings(search_no_scoring.score, X, y)
- score_accuracy = assert_no_warnings(search_accuracy.score, X, y)
- score_no_score_auc = assert_no_warnings(search_no_score_method_auc.score,
- X, y)
- score_auc = assert_no_warnings(search_auc.score, X, y)
-
- # ensure the test is sane
- assert_true(score_auc < 1.0)
- assert_true(score_accuracy < 1.0)
- assert_not_equal(score_auc, score_accuracy)
-
- assert_almost_equal(score_accuracy, score_no_scoring)
- assert_almost_equal(score_auc, score_no_score_auc)
-
-
-def test_trivial_grid_scores():
- # Test search over a "grid" with only one point.
- # Non-regression test: grid_scores_ wouldn't be set by GridSearchCV.
- clf = MockClassifier()
- grid_search = GridSearchCV(clf, {'foo_param': [1]})
- grid_search.fit(X, y)
- assert_true(hasattr(grid_search, "grid_scores_"))
-
- random_search = RandomizedSearchCV(clf, {'foo_param': [0]}, n_iter=1)
- random_search.fit(X, y)
- assert_true(hasattr(random_search, "grid_scores_"))
-
-
-def test_no_refit():
- # Test that grid search can be used for model selection only
- clf = MockClassifier()
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=False)
- grid_search.fit(X, y)
- assert_true(hasattr(grid_search, "best_params_"))
-
-
-def test_grid_search_error():
-    # Test that grid search raises an error when X and y have
-    # mismatched lengths
- X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
-
- clf = LinearSVC()
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
- assert_raises(ValueError, cv.fit, X_[:180], y_)
-
-
-def test_grid_search_iid():
- # test the iid parameter
- # noise-free simple 2d-data
- X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]], random_state=0,
- cluster_std=0.1, shuffle=False, n_samples=80)
-    # split dataset into two folds that are not iid:
-    # the first contains data from all 4 blobs, the second only from two.
- mask = np.ones(X.shape[0], dtype=np.bool)
- mask[np.where(y == 1)[0][::2]] = 0
- mask[np.where(y == 2)[0][::2]] = 0
- # this leads to perfect classification on one fold and a score of 1/3 on
- # the other
- svm = SVC(kernel='linear')
- # create "cv" for splits
- cv = [[mask, ~mask], [~mask, mask]]
- # once with iid=True (default)
- grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv)
- grid_search.fit(X, y)
- first = grid_search.grid_scores_[0]
- assert_equal(first.parameters['C'], 1)
- assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.])
-    # for the first split, 1/4 of the dataset is in the test set, for the
-    # second 3/4; take the weighted average (see the sketch after this test)
- assert_almost_equal(first.mean_validation_score,
- 1 * 1. / 4. + 1. / 3. * 3. / 4.)
-
- # once with iid=False
- grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv,
- iid=False)
- grid_search.fit(X, y)
- first = grid_search.grid_scores_[0]
- assert_equal(first.parameters['C'], 1)
- # scores are the same as above
- assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.])
- # averaged score is just mean of scores
- assert_almost_equal(first.mean_validation_score,
- np.mean(first.cv_validation_scores))
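
The weighted average asserted above can be checked by hand. A small numeric
sketch of the two averaging schemes; the fold test sizes, 20 and 60 out of 80
samples, follow from the masks built in this test:

    import numpy as np

    fold_scores = np.array([1.0, 1.0 / 3.0])
    test_sizes = np.array([20, 60])  # 1/4 and 3/4 of the 80 samples
    # iid=True: average weighted by the number of test samples per fold
    print(np.average(fold_scores, weights=test_sizes))  # 0.5
    # iid=False: plain mean of the per-fold scores
    print(fold_scores.mean())  # ~0.667
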
-
-
-def test_grid_search_one_grid_point():
- X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
- param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]}
-
- clf = SVC()
- cv = GridSearchCV(clf, param_dict)
- cv.fit(X_, y_)
-
- clf = SVC(C=1.0, kernel="rbf", gamma=0.1)
- clf.fit(X_, y_)
-
- assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_)
-
-
-def test_grid_search_bad_param_grid():
- param_dict = {"C": 1.0}
- clf = SVC()
- assert_raises(ValueError, GridSearchCV, clf, param_dict)
-
- param_dict = {"C": []}
- clf = SVC()
- assert_raises(ValueError, GridSearchCV, clf, param_dict)
-
- param_dict = {"C": np.ones(6).reshape(3, 2)}
- clf = SVC()
- assert_raises(ValueError, GridSearchCV, clf, param_dict)
-
-
-def test_grid_search_sparse():
- # Test that grid search works with both dense and sparse matrices
- X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
-
- clf = LinearSVC()
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
- cv.fit(X_[:180], y_[:180])
- y_pred = cv.predict(X_[180:])
- C = cv.best_estimator_.C
-
- X_ = sp.csr_matrix(X_)
- clf = LinearSVC()
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
- cv.fit(X_[:180].tocoo(), y_[:180])
- y_pred2 = cv.predict(X_[180:])
- C2 = cv.best_estimator_.C
-
- assert_true(np.mean(y_pred == y_pred2) >= .9)
- assert_equal(C, C2)
-
-
-def test_grid_search_sparse_scoring():
- X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
-
- clf = LinearSVC()
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
- cv.fit(X_[:180], y_[:180])
- y_pred = cv.predict(X_[180:])
- C = cv.best_estimator_.C
-
- X_ = sp.csr_matrix(X_)
- clf = LinearSVC()
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
- cv.fit(X_[:180], y_[:180])
- y_pred2 = cv.predict(X_[180:])
- C2 = cv.best_estimator_.C
-
- assert_array_equal(y_pred, y_pred2)
- assert_equal(C, C2)
- # Smoke test the score
-    # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y_[:180]),
-    #                            cv.score(X_[:180], y_[:180]))
-
- # test loss where greater is worse
- def f1_loss(y_true_, y_pred_):
- return -f1_score(y_true_, y_pred_)
- F1Loss = make_scorer(f1_loss, greater_is_better=False)
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss)
- cv.fit(X_[:180], y_[:180])
- y_pred3 = cv.predict(X_[180:])
- C3 = cv.best_estimator_.C
-
- assert_equal(C, C3)
- assert_array_equal(y_pred, y_pred3)
-
-
-def test_grid_search_precomputed_kernel():
- # Test that grid search works when the input features are given in the
- # form of a precomputed kernel matrix
- X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
-
- # compute the training kernel matrix corresponding to the linear kernel
- K_train = np.dot(X_[:180], X_[:180].T)
- y_train = y_[:180]
-
- clf = SVC(kernel='precomputed')
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
- cv.fit(K_train, y_train)
-
- assert_true(cv.best_score_ >= 0)
-
- # compute the test kernel matrix
- K_test = np.dot(X_[180:], X_[:180].T)
- y_test = y_[180:]
-
- y_pred = cv.predict(K_test)
-
- assert_true(np.mean(y_pred == y_test) >= 0)
-
- # test error is raised when the precomputed kernel is not array-like
- # or sparse
- assert_raises(ValueError, cv.fit, K_train.tolist(), y_train)
-
-
-def test_grid_search_precomputed_kernel_error_nonsquare():
- # Test that grid search returns an error with a non-square precomputed
- # training kernel matrix
- K_train = np.zeros((10, 20))
- y_train = np.ones((10, ))
- clf = SVC(kernel='precomputed')
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
- assert_raises(ValueError, cv.fit, K_train, y_train)
-
-
-def test_grid_search_precomputed_kernel_error_kernel_function():
- # Test that grid search returns an error when using a kernel_function
- X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
- kernel_function = lambda x1, x2: np.dot(x1, x2.T)
- clf = SVC(kernel=kernel_function)
- cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
- assert_raises(ValueError, cv.fit, X_, y_)
-
-
-class BrokenClassifier(BaseEstimator):
- """Broken classifier that cannot be fit twice"""
-
- def __init__(self, parameter=None):
- self.parameter = parameter
-
- def fit(self, X, y):
- assert_true(not hasattr(self, 'has_been_fit_'))
- self.has_been_fit_ = True
-
- def predict(self, X):
- return np.zeros(X.shape[0])
-
-
-@ignore_warnings
-def test_refit():
- # Regression test for bug in refitting
- # Simulates re-fitting a broken estimator; this used to break with
- # sparse SVMs.
- X = np.arange(100).reshape(10, 10)
- y = np.array([0] * 5 + [1] * 5)
-
- clf = GridSearchCV(BrokenClassifier(), [{'parameter': [0, 1]}],
- scoring="precision", refit=True)
- clf.fit(X, y)
-
-
-def test_gridsearch_nd():
-    # Pass 4-d X and 3-d y arrays to GridSearchCV
- X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2)
- y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11)
- check_X = lambda x: x.shape[1:] == (5, 3, 2)
- check_y = lambda x: x.shape[1:] == (7, 11)
- clf = CheckingClassifier(check_X=check_X, check_y=check_y)
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]})
- grid_search.fit(X_4d, y_3d).score(X, y)
- assert_true(hasattr(grid_search, "grid_scores_"))
-
-
-def test_X_as_list():
- # Pass X as list in GridSearchCV
- X = np.arange(100).reshape(10, 10)
- y = np.array([0] * 5 + [1] * 5)
-
- clf = CheckingClassifier(check_X=lambda x: isinstance(x, list))
- cv = KFold(n=len(X), n_folds=3)
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv)
- grid_search.fit(X.tolist(), y).score(X, y)
- assert_true(hasattr(grid_search, "grid_scores_"))
-
-
-def test_y_as_list():
- # Pass y as list in GridSearchCV
- X = np.arange(100).reshape(10, 10)
- y = np.array([0] * 5 + [1] * 5)
-
- clf = CheckingClassifier(check_y=lambda x: isinstance(x, list))
- cv = KFold(n=len(X), n_folds=3)
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv)
- grid_search.fit(X, y.tolist()).score(X, y)
- assert_true(hasattr(grid_search, "grid_scores_"))
-
-
-def test_pandas_input():
- # check cross_val_score doesn't destroy pandas dataframe
- types = [(MockDataFrame, MockDataFrame)]
- try:
- from pandas import Series, DataFrame
- types.append((DataFrame, Series))
- except ImportError:
- pass
-
- X = np.arange(100).reshape(10, 10)
- y = np.array([0] * 5 + [1] * 5)
-
- for InputFeatureType, TargetType in types:
- # X dataframe, y series
- X_df, y_ser = InputFeatureType(X), TargetType(y)
- check_df = lambda x: isinstance(x, InputFeatureType)
- check_series = lambda x: isinstance(x, TargetType)
- clf = CheckingClassifier(check_X=check_df, check_y=check_series)
-
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]})
- grid_search.fit(X_df, y_ser).score(X_df, y_ser)
- grid_search.predict(X_df)
- assert_true(hasattr(grid_search, "grid_scores_"))
-
-
-def test_unsupervised_grid_search():
- # test grid-search with unsupervised estimator
- X, y = make_blobs(random_state=0)
- km = KMeans(random_state=0)
- grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]),
- scoring='adjusted_rand_score')
- grid_search.fit(X, y)
- # ARI can find the right number :)
- assert_equal(grid_search.best_params_["n_clusters"], 3)
-
- # Now without a score, and without y
- grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]))
- grid_search.fit(X)
- assert_equal(grid_search.best_params_["n_clusters"], 4)
-
-
-def test_gridsearch_no_predict():
- # test grid-search with an estimator without predict.
- # slight duplication of a test from KDE
- def custom_scoring(estimator, X):
- return 42 if estimator.bandwidth == .1 else 0
- X, _ = make_blobs(cluster_std=.1, random_state=1,
- centers=[[0, 1], [1, 0], [0, 0]])
- search = GridSearchCV(KernelDensity(),
- param_grid=dict(bandwidth=[.01, .1, 1]),
- scoring=custom_scoring)
- search.fit(X)
- assert_equal(search.best_params_['bandwidth'], .1)
- assert_equal(search.best_score_, 42)
-
-
-def test_param_sampler():
- # test basic properties of param sampler
- param_distributions = {"kernel": ["rbf", "linear"],
- "C": uniform(0, 1)}
- sampler = ParameterSampler(param_distributions=param_distributions,
- n_iter=10, random_state=0)
- samples = [x for x in sampler]
- assert_equal(len(samples), 10)
- for sample in samples:
- assert_true(sample["kernel"] in ["rbf", "linear"])
- assert_true(0 <= sample["C"] <= 1)
-
-
-def test_randomized_search_grid_scores():
-    # Make a dataset with a lot of noise to get various kinds of prediction
- # errors across CV folds and parameter settings
- X, y = make_classification(n_samples=200, n_features=100, n_informative=3,
- random_state=0)
-
- # XXX: as of today (scipy 0.12) it's not possible to set the random seed
- # of scipy.stats distributions: the assertions in this test should thus
- # not depend on the randomization
- params = dict(C=expon(scale=10),
- gamma=expon(scale=0.1))
- n_cv_iter = 3
- n_search_iter = 30
- search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter,
- param_distributions=params, iid=False)
- search.fit(X, y)
- assert_equal(len(search.grid_scores_), n_search_iter)
-
- # Check consistency of the structure of each cv_score item
- for cv_score in search.grid_scores_:
- assert_equal(len(cv_score.cv_validation_scores), n_cv_iter)
-        # Because we set iid to False, the mean_validation_score is the
- # mean of the fold mean scores instead of the aggregate sample-wise
- # mean score
- assert_almost_equal(np.mean(cv_score.cv_validation_scores),
- cv_score.mean_validation_score)
- assert_equal(list(sorted(cv_score.parameters.keys())),
- list(sorted(params.keys())))
-
- # Check the consistency with the best_score_ and best_params_ attributes
- sorted_grid_scores = list(sorted(search.grid_scores_,
- key=lambda x: x.mean_validation_score))
- best_score = sorted_grid_scores[-1].mean_validation_score
- assert_equal(search.best_score_, best_score)
-
- tied_best_params = [s.parameters for s in sorted_grid_scores
- if s.mean_validation_score == best_score]
- assert_true(search.best_params_ in tied_best_params,
- "best_params_={0} is not part of the"
- " tied best models: {1}".format(
- search.best_params_, tied_best_params))
-
-
-def test_grid_search_score_consistency():
- # test that correct scores are used
- clf = LinearSVC(random_state=0)
- X, y = make_blobs(random_state=0, centers=2)
- Cs = [.1, 1, 10]
- for score in ['f1', 'roc_auc']:
- grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score)
- grid_search.fit(X, y)
- cv = StratifiedKFold(n_folds=3, y=y)
- for C, scores in zip(Cs, grid_search.grid_scores_):
- clf.set_params(C=C)
- scores = scores[2] # get the separate runs from grid scores
- i = 0
- for train, test in cv:
- clf.fit(X[train], y[train])
- if score == "f1":
- correct_score = f1_score(y[test], clf.predict(X[test]))
- elif score == "roc_auc":
- dec = clf.decision_function(X[test])
- correct_score = roc_auc_score(y[test], dec)
- assert_almost_equal(correct_score, scores[i])
- i += 1
-
-
-def test_pickle():
- # Test that a fit search can be pickled
- clf = MockClassifier()
- grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True)
- grid_search.fit(X, y)
- pickle.dumps(grid_search) # smoke test
-
- random_search = RandomizedSearchCV(clf, {'foo_param': [1, 2, 3]},
- refit=True, n_iter=3)
- random_search.fit(X, y)
- pickle.dumps(random_search) # smoke test
-
-
-def test_grid_search_with_multioutput_data():
- # Test search with multi-output estimator
-
- X, y = make_multilabel_classification(random_state=0)
-
- est_parameters = {"max_depth": [1, 2, 3, 4]}
- cv = KFold(y.shape[0], random_state=0)
-
- estimators = [DecisionTreeRegressor(random_state=0),
- DecisionTreeClassifier(random_state=0)]
-
- # Test with grid search cv
- for est in estimators:
- grid_search = GridSearchCV(est, est_parameters, cv=cv)
- grid_search.fit(X, y)
- for parameters, _, cv_validation_scores in grid_search.grid_scores_:
- est.set_params(**parameters)
-
- for i, (train, test) in enumerate(cv):
- est.fit(X[train], y[train])
- correct_score = est.score(X[test], y[test])
- assert_almost_equal(correct_score,
- cv_validation_scores[i])
-
- # Test with a randomized search
- for est in estimators:
- random_search = RandomizedSearchCV(est, est_parameters,
- cv=cv, n_iter=3)
- random_search.fit(X, y)
- for parameters, _, cv_validation_scores in random_search.grid_scores_:
- est.set_params(**parameters)
-
- for i, (train, test) in enumerate(cv):
- est.fit(X[train], y[train])
- correct_score = est.score(X[test], y[test])
- assert_almost_equal(correct_score,
- cv_validation_scores[i])
-
-
-def test_predict_proba_disabled():
- # Test predict_proba when disabled on estimator.
- X = np.arange(20).reshape(5, -1)
- y = [0, 0, 1, 1, 1]
- clf = SVC(gamma='scale', probability=False)
- gs = GridSearchCV(clf, {}, cv=2).fit(X, y)
- assert_false(hasattr(gs, "predict_proba"))
-
-
-def test_grid_search_allows_nans():
- # Test GridSearchCV with Imputer
- X = np.arange(20, dtype=np.float64).reshape(5, -1)
- X[2, :] = np.nan
- y = [0, 0, 1, 1, 1]
- p = Pipeline([
- ('imputer', Imputer(strategy='mean', missing_values='NaN')),
- ('classifier', MockClassifier()),
- ])
- GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
-
-
-class FailingClassifier(BaseEstimator):
- """Classifier that raises a ValueError on fit()"""
-
- FAILING_PARAMETER = 2
-
- def __init__(self, parameter=None):
- self.parameter = parameter
-
- def fit(self, X, y=None):
- if self.parameter == FailingClassifier.FAILING_PARAMETER:
- raise ValueError("Failing classifier failed as required")
-
- def predict(self, X):
- return np.zeros(X.shape[0])
-
-
-def test_grid_search_failing_classifier():
-    # GridSearchCV with error_score != 'raise'
- # Ensures that a warning is raised and score reset where appropriate.
-
- X, y = make_classification(n_samples=20, n_features=10, random_state=0)
-
- clf = FailingClassifier()
-
- # refit=False because we only want to check that errors caused by fits
- # to individual folds will be caught and warnings raised instead. If
- # refit was done, then an exception would be raised on refit and not
- # caught by grid_search (expected behavior), and this would cause an
- # error in this test.
- gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy',
- refit=False, error_score=0.0)
-
- assert_warns(FitFailedWarning, gs.fit, X, y)
-
- # Ensure that grid scores were set to zero as required for those fits
- # that are expected to fail.
- assert all(np.all(this_point.cv_validation_scores == 0.0)
- for this_point in gs.grid_scores_
- if this_point.parameters['parameter'] ==
- FailingClassifier.FAILING_PARAMETER)
-
- gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy',
- refit=False, error_score=float('nan'))
- assert_warns(FitFailedWarning, gs.fit, X, y)
- assert all(np.all(np.isnan(this_point.cv_validation_scores))
- for this_point in gs.grid_scores_
- if this_point.parameters['parameter'] ==
- FailingClassifier.FAILING_PARAMETER)
-
-
-def test_grid_search_failing_classifier_raise():
-    # GridSearchCV with error_score == 'raise' raises the error
-
- X, y = make_classification(n_samples=20, n_features=10, random_state=0)
-
- clf = FailingClassifier()
-
- # refit=False because we want to test the behaviour of the grid search part
- gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy',
- refit=False, error_score='raise')
-
- # FailingClassifier issues a ValueError so this is what we look for.
- assert_raises(ValueError, gs.fit, X, y)
-
-
-def test_parameters_sampler_replacement():
- # raise error if n_iter too large
- params = {'first': [0, 1], 'second': ['a', 'b', 'c']}
- sampler = ParameterSampler(params, n_iter=7)
- assert_raises(ValueError, list, sampler)
-    # degenerates to an exhaustive grid search if n_iter equals the grid size
- sampler = ParameterSampler(params, n_iter=6)
- samples = list(sampler)
- assert_equal(len(samples), 6)
- for values in ParameterGrid(params):
- assert_true(values in samples)
-
- # test sampling without replacement in a large grid
- params = {'a': range(10), 'b': range(10), 'c': range(10)}
- sampler = ParameterSampler(params, n_iter=99, random_state=42)
- samples = list(sampler)
- assert_equal(len(samples), 99)
- hashable_samples = ["a%db%dc%d" % (p['a'], p['b'], p['c'])
- for p in samples]
- assert_equal(len(set(hashable_samples)), 99)
-
- # doesn't go into infinite loops
- params_distribution = {'first': bernoulli(.5), 'second': ['a', 'b', 'c']}
- sampler = ParameterSampler(params_distribution, n_iter=7)
- samples = list(sampler)
- assert_equal(len(samples), 7)
-
-
-def test_classes__property():
-    # Test that the classes_ property matches best_estimator_.classes_
- X = np.arange(100).reshape(10, 10)
- y = np.array([0] * 5 + [1] * 5)
- Cs = [.1, 1, 10]
-
- grid_search = GridSearchCV(LinearSVC(random_state=0), {'C': Cs})
- grid_search.fit(X, y)
- assert_array_equal(grid_search.best_estimator_.classes_,
- grid_search.classes_)
-
- # Test that regressors do not have a classes_ attribute
- grid_search = GridSearchCV(Ridge(), {'alpha': [1.0, 2.0]})
- grid_search.fit(X, y)
- assert_false(hasattr(grid_search, 'classes_'))
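
The tests above exercise the deprecated sklearn.grid_search and
sklearn.cross_validation APIs that this series removes. A minimal migration
sketch, assuming scikit-learn >= 0.18 where sklearn.model_selection is
available (the data shapes mirror those used in the deleted tests):

    import numpy as np
    from sklearn.model_selection import GridSearchCV, KFold
    from sklearn.svm import LinearSVC

    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)

    # The old KFold(n=len(X), n_folds=3) becomes KFold(n_splits=3); the
    # number of samples is now inferred from the data passed to split().
    search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]},
                          cv=KFold(n_splits=3))
    search.fit(X, y)

    # grid_scores_ on the deprecated classes is replaced by cv_results_.
    print(search.best_params_)
    print(sorted(search.cv_results_))
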
From f30720a9239589866d5a3d45168d9bd35889194d Mon Sep 17 00:00:00 2001
From: Andreas Müller
Date: Thu, 24 May 2018 12:27:35 -0400
Subject: [PATCH 18/36] import fix
---
sklearn/model_selection/tests/test_validation.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py
index 2929916619769..8da0b33a22262 100644
--- a/sklearn/model_selection/tests/test_validation.py
+++ b/sklearn/model_selection/tests/test_validation.py
@@ -12,7 +12,7 @@
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.exceptions import FitFailedWarning
-from sklearn.tests.test_grid_search import FailingClassifier
+from sklearn.model_selection.tests.test_search import FailingClassifier
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_false
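
For context, the corrected import points test_validation.py at the
FailingClassifier that now lives in the model_selection test package (the
class itself appears earlier in this series). A minimal usage sketch,
assuming the post-patch module layout:

    from sklearn.model_selection.tests.test_search import FailingClassifier

    clf = FailingClassifier(parameter=FailingClassifier.FAILING_PARAMETER)
    try:
        clf.fit([[0], [1]], [0, 1])  # raises for the failing parameter value
    except ValueError as exc:
        print(exc)  # "Failing classifier failed as required"
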
From 048a5cab045fc5eb980dbc9711d9a0a94009b481 Mon Sep 17 00:00:00 2001
From: Andreas Müller
Date: Sat, 26 May 2018 12:27:01 -0400
Subject: [PATCH 19/36] pep8
---
sklearn/utils/estimator_checks.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 71b5fbe2706c3..8474741f85f52 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -38,6 +38,7 @@
from sklearn.utils.testing import create_memmap_backed_data
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+
from sklearn.base import (clone, ClusterMixin,
BaseEstimator, is_classifier, is_regressor,
is_outlier_detector)
@@ -78,7 +79,6 @@
ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MICEImputer',
'MinMaxScaler', 'QuantileTransformer']
-
def _yield_non_meta_checks(name, estimator):
yield check_estimators_dtypes
yield check_fit_score_takes_y
@@ -256,8 +256,10 @@ def _yield_all_checks(name, estimator):
for check in _yield_outliers_checks(name, estimator):
yield check
yield check_fit2d_predict1d
- yield check_fit2d_1sample
yield check_methods_subset_invariance
+ if name != 'GaussianProcess': # FIXME
+ # XXX GaussianProcess deprecated in 0.20
+ yield check_fit2d_1sample
yield check_fit2d_1feature
yield check_fit1d
yield check_get_params_invariance
From ddd45bdd347795ae823a308c72ca376b5abd7afe Mon Sep 17 00:00:00 2001
From: Andreas Müller
Date: Sat, 26 May 2018 12:29:43 -0400
Subject: [PATCH 20/36] pep8
---
sklearn/utils/estimator_checks.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 8474741f85f52..5e03fa2c96ee5 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -79,6 +79,7 @@
ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MICEImputer',
'MinMaxScaler', 'QuantileTransformer']
+
def _yield_non_meta_checks(name, estimator):
yield check_estimators_dtypes
yield check_fit_score_takes_y
From 49b149897133798106df8cdba6dbd8a05142cdeb Mon Sep 17 00:00:00 2001
From: Andreas Müller
Date: Sat, 26 May 2018 12:29:49 -0400
Subject: [PATCH 21/36] delete old GMM
---
sklearn/mixture/dpgmm.py | 859 ---------------------------------------
sklearn/mixture/gmm.py | 853 --------------------------------------
2 files changed, 1712 deletions(-)
delete mode 100644 sklearn/mixture/dpgmm.py
delete mode 100644 sklearn/mixture/gmm.py
diff --git a/sklearn/mixture/dpgmm.py b/sklearn/mixture/dpgmm.py
deleted file mode 100644
index 9cf77fee74f14..0000000000000
--- a/sklearn/mixture/dpgmm.py
+++ /dev/null
@@ -1,859 +0,0 @@
-"""Bayesian Gaussian Mixture Models and
-Dirichlet Process Gaussian Mixture Models"""
-from __future__ import print_function
-
-# Author: Alexandre Passos (alexandre.tp@gmail.com)
-# Bertrand Thirion
-#
-# Based on mixture.py by:
-# Ron Weiss
-# Fabian Pedregosa
-#
-
-# Important note for the deprecation cleanup of 0.20:
-# All the functions and classes in this file were deprecated in 0.18.
-# When you remove this file, please also remove the related files:
-# - 'sklearn/mixture/gmm.py'
-# - 'sklearn/mixture/test_dpgmm.py'
-# - 'sklearn/mixture/test_gmm.py'
-
-import numpy as np
-from scipy.special import digamma as _digamma, gammaln as _gammaln
-from scipy import linalg
-from scipy.linalg import pinvh
-from scipy.spatial.distance import cdist
-
-from ..externals.six.moves import xrange
-from ..utils import check_random_state, check_array, deprecated
-from ..utils.fixes import logsumexp
-from ..utils.extmath import squared_norm, stable_cumsum
-from ..utils.validation import check_is_fitted
-from .. import cluster
-from .gmm import _GMMBase
-
-
-@deprecated("The function digamma is deprecated in 0.18 and "
- "will be removed in 0.20. Use scipy.special.digamma instead.")
-def digamma(x):
- return _digamma(x + np.finfo(np.float32).eps)
-
-
-@deprecated("The function gammaln is deprecated in 0.18 and "
- "will be removed in 0.20. Use scipy.special.gammaln instead.")
-def gammaln(x):
- return _gammaln(x + np.finfo(np.float32).eps)
-
-
-@deprecated("The function log_normalize is deprecated in 0.18 and "
- "will be removed in 0.20.")
-def log_normalize(v, axis=0):
- """Normalized probabilities from unnormalized log-probabilities"""
- v = np.rollaxis(v, axis)
- v = v.copy()
- v -= v.max(axis=0)
- out = logsumexp(v)
- v = np.exp(v - out)
- v += np.finfo(np.float32).eps
- v /= np.sum(v, axis=0)
- return np.swapaxes(v, 0, axis)
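
log_normalize above is the usual exp-normalize (softmax) computed through
log-sum-exp for numerical stability. A worked sketch of the core identity,
ignoring the eps smoothing and axis handling, and assuming SciPy's
scipy.special.logsumexp (available since SciPy 0.19):

    import numpy as np
    from scipy.special import logsumexp

    log_p = np.array([-1.0, -2.0, -3.0])  # unnormalized log-probabilities
    p = np.exp(log_p - logsumexp(log_p))  # normalize in log space, then exp
    assert np.isclose(p.sum(), 1.0)
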
-
-
-@deprecated("The function wishart_log_det is deprecated in 0.18 and "
- "will be removed in 0.20.")
-def wishart_log_det(a, b, detB, n_features):
- """Expected value of the log of the determinant of a Wishart
-
- The expected value of the logarithm of the determinant of a
-    Wishart-distributed random variable with the specified parameters."""
- l = np.sum(digamma(0.5 * (a - np.arange(-1, n_features - 1))))
- l += n_features * np.log(2)
- return l + detB
-
-
-@deprecated("The function wishart_logz is deprecated in 0.18 and "
- "will be removed in 0.20.")
-def wishart_logz(v, s, dets, n_features):
- "The logarithm of the normalization constant for the wishart distribution"
- z = 0.
- z += 0.5 * v * n_features * np.log(2)
- z += (0.25 * (n_features * (n_features - 1)) * np.log(np.pi))
- z += 0.5 * v * np.log(dets)
- z += np.sum(gammaln(0.5 * (v - np.arange(n_features) + 1)))
- return z
-
-
-def _bound_wishart(a, B, detB):
- """Returns a function of the dof, scale matrix and its determinant
- used as an upper bound in variational approximation of the evidence"""
- n_features = B.shape[0]
- logprior = wishart_logz(a, B, detB, n_features)
- logprior -= wishart_logz(n_features,
- np.identity(n_features),
- 1, n_features)
- logprior += 0.5 * (a - 1) * wishart_log_det(a, B, detB, n_features)
- logprior += 0.5 * a * np.trace(B)
- return logprior
-
-
-##############################################################################
-# Variational bound on the log likelihood of each class
-##############################################################################
-
-
-def _sym_quad_form(x, mu, A):
- """helper function to calculate symmetric quadratic form x.T * A * x"""
- q = (cdist(x, mu[np.newaxis], "mahalanobis", VI=A) ** 2).reshape(-1)
- return q
-
-
-def _bound_state_log_lik(X, initial_bound, precs, means, covariance_type):
- """Update the bound with likelihood terms, for standard covariance types"""
- n_components, n_features = means.shape
- n_samples = X.shape[0]
- bound = np.empty((n_samples, n_components))
- bound[:] = initial_bound
- if covariance_type in ['diag', 'spherical']:
- for k in range(n_components):
- d = X - means[k]
- bound[:, k] -= 0.5 * np.sum(d * d * precs[k], axis=1)
- elif covariance_type == 'tied':
- for k in range(n_components):
- bound[:, k] -= 0.5 * _sym_quad_form(X, means[k], precs)
- elif covariance_type == 'full':
- for k in range(n_components):
- bound[:, k] -= 0.5 * _sym_quad_form(X, means[k], precs[k])
- return bound
-
-
-class _DPGMMBase(_GMMBase):
- """Variational Inference for the Infinite Gaussian Mixture Model.
-
- DPGMM stands for Dirichlet Process Gaussian Mixture Model, and it
- is an infinite mixture model with the Dirichlet Process as a prior
- distribution on the number of clusters. In practice the
- approximate inference algorithm uses a truncated distribution with
- a fixed maximum number of components, but almost always the number
- of components actually used depends on the data.
-
- Stick-breaking Representation of a Gaussian mixture model
- probability distribution. This class allows for easy and efficient
- inference of an approximate posterior distribution over the
- parameters of a Gaussian mixture model with a variable number of
- components (smaller than the truncation parameter n_components).
-
- Initialization is with normally-distributed means and identity
- covariance, for proper convergence.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- n_components : int, default 1
- Number of mixture components.
-
- covariance_type : string, default 'diag'
- String describing the type of covariance parameters to
- use. Must be one of 'spherical', 'tied', 'diag', 'full'.
-
- alpha : float, default 1
- Real number representing the concentration parameter of
-        the Dirichlet Process. Intuitively, the Dirichlet Process
- is as likely to start a new cluster for a point as it is
- to add that point to a cluster with alpha elements. A
- higher alpha means more clusters, as the expected number
- of clusters is ``alpha*log(N)``.
-
- tol : float, default 1e-3
- Convergence threshold.
-
- n_iter : int, default 10
- Maximum number of iterations to perform before convergence.
-
- params : string, default 'wmc'
- Controls which parameters are updated in the training
- process. Can contain any combination of 'w' for weights,
- 'm' for means, and 'c' for covars.
-
- init_params : string, default 'wmc'
- Controls which parameters are updated in the initialization
- process. Can contain any combination of 'w' for weights,
- 'm' for means, and 'c' for covars. Defaults to 'wmc'.
-
- verbose : int, default 0
- Controls output verbosity.
-
- Attributes
- ----------
- covariance_type : string
- String describing the type of covariance parameters used by
- the DP-GMM. Must be one of 'spherical', 'tied', 'diag', 'full'.
-
- n_components : int
- Number of mixture components.
-
- weights_ : array, shape (`n_components`,)
- Mixing weights for each mixture component.
-
- means_ : array, shape (`n_components`, `n_features`)
- Mean parameters for each mixture component.
-
- precs_ : array
- Precision (inverse covariance) parameters for each mixture
- component. The shape depends on `covariance_type`::
-
-            (`n_components`, `n_features`) if 'spherical',
- (`n_features`, `n_features`) if 'tied',
- (`n_components`, `n_features`) if 'diag',
- (`n_components`, `n_features`, `n_features`) if 'full'
-
- converged_ : bool
- True when convergence was reached in fit(), False otherwise.
-
- See Also
- --------
- GMM : Finite Gaussian mixture model fit with EM
-
- VBGMM : Finite Gaussian mixture model fit with a variational
- algorithm, better for situations where there might be too little
- data to get a good estimate of the covariance matrix.
- """
- def __init__(self, n_components=1, covariance_type='diag', alpha=1.0,
- random_state=None, tol=1e-3, verbose=0, min_covar=None,
- n_iter=10, params='wmc', init_params='wmc'):
- self.alpha = alpha
- super(_DPGMMBase, self).__init__(n_components, covariance_type,
- random_state=random_state,
- tol=tol, min_covar=min_covar,
- n_iter=n_iter, params=params,
- init_params=init_params,
- verbose=verbose)
-
- def _get_precisions(self):
- """Return precisions as a full matrix."""
- if self.covariance_type == 'full':
- return self.precs_
- elif self.covariance_type in ['diag', 'spherical']:
- return [np.diag(cov) for cov in self.precs_]
- elif self.covariance_type == 'tied':
- return [self.precs_] * self.n_components
-
- def _get_covars(self):
- return [pinvh(c) for c in self._get_precisions()]
-
- def _set_covars(self, covars):
- raise NotImplementedError("""The variational algorithm does
- not support setting the covariance parameters.""")
-
- def score_samples(self, X):
- """Return the likelihood of the data under the model.
-
- Compute the bound on log probability of X under the model
- and return the posterior distribution (responsibilities) of
- each mixture component for each element of X.
-
- This is done by computing the parameters for the mean-field of
- z for each observation.
-
- Parameters
- ----------
- X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- logprob : array_like, shape (n_samples,)
- Log probabilities of each data point in X
- responsibilities : array_like, shape (n_samples, n_components)
- Posterior probabilities of each mixture component for each
- observation
- """
- check_is_fitted(self, 'gamma_')
-
- X = check_array(X)
- if X.ndim == 1:
- X = X[:, np.newaxis]
- sd = digamma(self.gamma_.T[1] + self.gamma_.T[2])
- dgamma1 = digamma(self.gamma_.T[1]) - sd
- dgamma2 = np.zeros(self.n_components)
- dgamma2[0] = digamma(self.gamma_[0, 2]) - digamma(self.gamma_[0, 1] +
- self.gamma_[0, 2])
- for j in range(1, self.n_components):
- dgamma2[j] = dgamma2[j - 1] + digamma(self.gamma_[j - 1, 2])
- dgamma2[j] -= sd[j - 1]
- dgamma = dgamma1 + dgamma2
-        # Free memory and developers' cognitive load:
- del dgamma1, dgamma2, sd
-
- if self.covariance_type not in ['full', 'tied', 'diag', 'spherical']:
- raise NotImplementedError("This ctype is not implemented: %s"
- % self.covariance_type)
- p = _bound_state_log_lik(X, self._initial_bound + self.bound_prec_,
- self.precs_, self.means_,
- self.covariance_type)
- z = p + dgamma
- z = log_normalize(z, axis=-1)
- bound = np.sum(z * p, axis=-1)
- return bound, z
-
- def _update_concentration(self, z):
- """Update the concentration parameters for each cluster"""
- sz = np.sum(z, axis=0)
- self.gamma_.T[1] = 1. + sz
- self.gamma_.T[2].fill(0)
- for i in range(self.n_components - 2, -1, -1):
- self.gamma_[i, 2] = self.gamma_[i + 1, 2] + sz[i]
- self.gamma_.T[2] += self.alpha
-
- def _update_means(self, X, z):
- """Update the variational distributions for the means"""
- n_features = X.shape[1]
- for k in range(self.n_components):
- if self.covariance_type in ['spherical', 'diag']:
- num = np.sum(z.T[k].reshape((-1, 1)) * X, axis=0)
- num *= self.precs_[k]
- den = 1. + self.precs_[k] * np.sum(z.T[k])
- self.means_[k] = num / den
- elif self.covariance_type in ['tied', 'full']:
- if self.covariance_type == 'tied':
- cov = self.precs_
- else:
- cov = self.precs_[k]
- den = np.identity(n_features) + cov * np.sum(z.T[k])
- num = np.sum(z.T[k].reshape((-1, 1)) * X, axis=0)
- num = np.dot(cov, num)
- self.means_[k] = linalg.lstsq(den, num)[0]
-
- def _update_precisions(self, X, z):
- """Update the variational distributions for the precisions"""
- n_features = X.shape[1]
- if self.covariance_type == 'spherical':
- self.dof_ = 0.5 * n_features * np.sum(z, axis=0)
- for k in range(self.n_components):
-                # could be more memory-efficient?
- sq_diff = np.sum((X - self.means_[k]) ** 2, axis=1)
- self.scale_[k] = 1.
- self.scale_[k] += 0.5 * np.sum(z.T[k] * (sq_diff + n_features))
- self.bound_prec_[k] = (
- 0.5 * n_features * (
- digamma(self.dof_[k]) - np.log(self.scale_[k])))
- self.precs_ = np.tile(self.dof_ / self.scale_, [n_features, 1]).T
-
- elif self.covariance_type == 'diag':
- for k in range(self.n_components):
- self.dof_[k].fill(1. + 0.5 * np.sum(z.T[k], axis=0))
- sq_diff = (X - self.means_[k]) ** 2 # see comment above
- self.scale_[k] = np.ones(n_features) + 0.5 * np.dot(
- z.T[k], (sq_diff + 1))
- self.precs_[k] = self.dof_[k] / self.scale_[k]
- self.bound_prec_[k] = 0.5 * np.sum(digamma(self.dof_[k])
- - np.log(self.scale_[k]))
- self.bound_prec_[k] -= 0.5 * np.sum(self.precs_[k])
-
- elif self.covariance_type == 'tied':
- self.dof_ = 2 + X.shape[0] + n_features
- self.scale_ = (X.shape[0] + 1) * np.identity(n_features)
- for k in range(self.n_components):
- diff = X - self.means_[k]
- self.scale_ += np.dot(diff.T, z[:, k:k + 1] * diff)
- self.scale_ = pinvh(self.scale_)
- self.precs_ = self.dof_ * self.scale_
- self.det_scale_ = linalg.det(self.scale_)
- self.bound_prec_ = 0.5 * wishart_log_det(
- self.dof_, self.scale_, self.det_scale_, n_features)
- self.bound_prec_ -= 0.5 * self.dof_ * np.trace(self.scale_)
-
- elif self.covariance_type == 'full':
- for k in range(self.n_components):
- sum_resp = np.sum(z.T[k])
- self.dof_[k] = 2 + sum_resp + n_features
- self.scale_[k] = (sum_resp + 1) * np.identity(n_features)
- diff = X - self.means_[k]
- self.scale_[k] += np.dot(diff.T, z[:, k:k + 1] * diff)
- self.scale_[k] = pinvh(self.scale_[k])
- self.precs_[k] = self.dof_[k] * self.scale_[k]
- self.det_scale_[k] = linalg.det(self.scale_[k])
- self.bound_prec_[k] = 0.5 * wishart_log_det(
- self.dof_[k], self.scale_[k], self.det_scale_[k],
- n_features)
- self.bound_prec_[k] -= 0.5 * self.dof_[k] * np.trace(
- self.scale_[k])
-
- def _monitor(self, X, z, n, end=False):
- """Monitor the lower bound during iteration
-
- Debug method to help see exactly when it is failing to converge as
- expected.
-
- Note: this is very expensive and should not be used by default."""
- if self.verbose > 0:
- print("Bound after updating %8s: %f" % (n, self.lower_bound(X, z)))
- if end:
- print("Cluster proportions:", self.gamma_.T[1])
- print("covariance_type:", self.covariance_type)
-
- def _do_mstep(self, X, z, params):
- """Maximize the variational lower bound
-
- Update each of the parameters to maximize the lower bound."""
- self._monitor(X, z, "z")
- self._update_concentration(z)
- self._monitor(X, z, "gamma")
- if 'm' in params:
- self._update_means(X, z)
- self._monitor(X, z, "mu")
- if 'c' in params:
- self._update_precisions(X, z)
- self._monitor(X, z, "a and b", end=True)
-
- def _initialize_gamma(self):
- "Initializes the concentration parameters"
- self.gamma_ = self.alpha * np.ones((self.n_components, 3))
-
- def _bound_concentration(self):
- """The variational lower bound for the concentration parameter."""
- logprior = gammaln(self.alpha) * self.n_components
- logprior += np.sum((self.alpha - 1) * (
- digamma(self.gamma_.T[2]) - digamma(self.gamma_.T[1] +
- self.gamma_.T[2])))
- logprior += np.sum(- gammaln(self.gamma_.T[1] + self.gamma_.T[2]))
- logprior += np.sum(gammaln(self.gamma_.T[1]) +
- gammaln(self.gamma_.T[2]))
- logprior -= np.sum((self.gamma_.T[1] - 1) * (
- digamma(self.gamma_.T[1]) - digamma(self.gamma_.T[1] +
- self.gamma_.T[2])))
- logprior -= np.sum((self.gamma_.T[2] - 1) * (
- digamma(self.gamma_.T[2]) - digamma(self.gamma_.T[1] +
- self.gamma_.T[2])))
- return logprior
-
- def _bound_means(self):
- "The variational lower bound for the mean parameters"
- logprior = 0.
- logprior -= 0.5 * squared_norm(self.means_)
- logprior -= 0.5 * self.means_.shape[1] * self.n_components
- return logprior
-
- def _bound_precisions(self):
- """Returns the bound term related to precisions"""
- logprior = 0.
- if self.covariance_type == 'spherical':
- logprior += np.sum(gammaln(self.dof_))
- logprior -= np.sum(
- (self.dof_ - 1) * digamma(np.maximum(0.5, self.dof_)))
- logprior += np.sum(- np.log(self.scale_) + self.dof_
- - self.precs_[:, 0])
- elif self.covariance_type == 'diag':
- logprior += np.sum(gammaln(self.dof_))
- logprior -= np.sum(
- (self.dof_ - 1) * digamma(np.maximum(0.5, self.dof_)))
- logprior += np.sum(- np.log(self.scale_) + self.dof_ - self.precs_)
- elif self.covariance_type == 'tied':
- logprior += _bound_wishart(self.dof_, self.scale_, self.det_scale_)
- elif self.covariance_type == 'full':
- for k in range(self.n_components):
- logprior += _bound_wishart(self.dof_[k],
- self.scale_[k],
- self.det_scale_[k])
- return logprior
-
- def _bound_proportions(self, z):
- """Returns the bound term related to proportions"""
- dg12 = digamma(self.gamma_.T[1] + self.gamma_.T[2])
- dg1 = digamma(self.gamma_.T[1]) - dg12
- dg2 = digamma(self.gamma_.T[2]) - dg12
-
- cz = stable_cumsum(z[:, ::-1], axis=-1)[:, -2::-1]
- logprior = np.sum(cz * dg2[:-1]) + np.sum(z * dg1)
- del cz # Save memory
- z_non_zeros = z[z > np.finfo(np.float32).eps]
- logprior -= np.sum(z_non_zeros * np.log(z_non_zeros))
- return logprior
-
- def _logprior(self, z):
- logprior = self._bound_concentration()
- logprior += self._bound_means()
- logprior += self._bound_precisions()
- logprior += self._bound_proportions(z)
- return logprior
-
- def lower_bound(self, X, z):
- """returns a lower bound on model evidence based on X and membership"""
- check_is_fitted(self, 'means_')
-
- if self.covariance_type not in ['full', 'tied', 'diag', 'spherical']:
- raise NotImplementedError("This ctype is not implemented: %s"
- % self.covariance_type)
- X = np.asarray(X)
- if X.ndim == 1:
- X = X[:, np.newaxis]
- c = np.sum(z * _bound_state_log_lik(X, self._initial_bound +
- self.bound_prec_, self.precs_,
- self.means_, self.covariance_type))
-
- return c + self._logprior(z)
-
- def _set_weights(self):
- for i in xrange(self.n_components):
- self.weights_[i] = self.gamma_[i, 1] / (self.gamma_[i, 1]
- + self.gamma_[i, 2])
- self.weights_ /= np.sum(self.weights_)
-
- def _fit(self, X, y=None):
- """Estimate model parameters with the variational
- algorithm.
-
-        An initialization step is performed before entering the EM
- algorithm. If you want to avoid this step, set the keyword
- argument init_params to the empty string '' when creating
- the object. Likewise, if you would like just to do an
- initialization, set n_iter=0.
-
- Parameters
- ----------
- X : array_like, shape (n, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- responsibilities : array, shape (n_samples, n_components)
- Posterior probabilities of each mixture component for each
- observation.
- """
- self.random_state_ = check_random_state(self.random_state)
-
- # initialization step
- X = check_array(X)
- if X.ndim == 1:
- X = X[:, np.newaxis]
-
- n_samples, n_features = X.shape
- z = np.ones((n_samples, self.n_components))
- z /= self.n_components
-
- self._initial_bound = - 0.5 * n_features * np.log(2 * np.pi)
- self._initial_bound -= np.log(2 * np.pi * np.e)
-
- if (self.init_params != '') or not hasattr(self, 'gamma_'):
- self._initialize_gamma()
-
- if 'm' in self.init_params or not hasattr(self, 'means_'):
- self.means_ = cluster.KMeans(
- n_clusters=self.n_components,
- random_state=self.random_state_).fit(X).cluster_centers_[::-1]
-
- if 'w' in self.init_params or not hasattr(self, 'weights_'):
- self.weights_ = np.tile(1.0 / self.n_components, self.n_components)
-
- if 'c' in self.init_params or not hasattr(self, 'precs_'):
- if self.covariance_type == 'spherical':
- self.dof_ = np.ones(self.n_components)
- self.scale_ = np.ones(self.n_components)
- self.precs_ = np.ones((self.n_components, n_features))
- self.bound_prec_ = 0.5 * n_features * (
- digamma(self.dof_) - np.log(self.scale_))
- elif self.covariance_type == 'diag':
- self.dof_ = 1 + 0.5 * n_features
- self.dof_ *= np.ones((self.n_components, n_features))
- self.scale_ = np.ones((self.n_components, n_features))
- self.precs_ = np.ones((self.n_components, n_features))
- self.bound_prec_ = 0.5 * (np.sum(digamma(self.dof_) -
- np.log(self.scale_), 1))
- self.bound_prec_ -= 0.5 * np.sum(self.precs_, 1)
- elif self.covariance_type == 'tied':
- self.dof_ = 1.
- self.scale_ = np.identity(n_features)
- self.precs_ = np.identity(n_features)
- self.det_scale_ = 1.
- self.bound_prec_ = 0.5 * wishart_log_det(
- self.dof_, self.scale_, self.det_scale_, n_features)
- self.bound_prec_ -= 0.5 * self.dof_ * np.trace(self.scale_)
- elif self.covariance_type == 'full':
- self.dof_ = (1 + self.n_components + n_samples)
- self.dof_ *= np.ones(self.n_components)
- self.scale_ = [2 * np.identity(n_features)
- for _ in range(self.n_components)]
- self.precs_ = [np.identity(n_features)
- for _ in range(self.n_components)]
- self.det_scale_ = np.ones(self.n_components)
- self.bound_prec_ = np.zeros(self.n_components)
- for k in range(self.n_components):
- self.bound_prec_[k] = wishart_log_det(
- self.dof_[k], self.scale_[k], self.det_scale_[k],
- n_features)
- self.bound_prec_[k] -= (self.dof_[k] *
- np.trace(self.scale_[k]))
- self.bound_prec_ *= 0.5
-
- # EM algorithms
- current_log_likelihood = None
- # reset self.converged_ to False
- self.converged_ = False
-
- for i in range(self.n_iter):
- prev_log_likelihood = current_log_likelihood
- # Expectation step
- curr_logprob, z = self.score_samples(X)
-
- current_log_likelihood = (
- curr_logprob.mean() + self._logprior(z) / n_samples)
-
- # Check for convergence.
- if prev_log_likelihood is not None:
- change = abs(current_log_likelihood - prev_log_likelihood)
- if change < self.tol:
- self.converged_ = True
- break
-
- # Maximization step
- self._do_mstep(X, z, self.params)
-
- if self.n_iter == 0:
- # Need to make sure that there is a z value to output
- # Output zeros because it was just a quick initialization
- z = np.zeros((X.shape[0], self.n_components))
-
- self._set_weights()
-
- return z
-
-
-@deprecated("The `DPGMM` class is not working correctly and it's better "
- "to use `sklearn.mixture.BayesianGaussianMixture` class with "
- "parameter `weight_concentration_prior_type='dirichlet_process'` "
- "instead. DPGMM is deprecated in 0.18 and will be "
- "removed in 0.20.")
-class DPGMM(_DPGMMBase):
- """Dirichlet Process Gaussian Mixture Models
-
- .. deprecated:: 0.18
- This class will be removed in 0.20.
- Use :class:`sklearn.mixture.BayesianGaussianMixture` with
- parameter ``weight_concentration_prior_type='dirichlet_process'``
- instead.
-
- """
-
- def __init__(self, n_components=1, covariance_type='diag', alpha=1.0,
- random_state=None, tol=1e-3, verbose=0, min_covar=None,
- n_iter=10, params='wmc', init_params='wmc'):
- super(DPGMM, self).__init__(
- n_components=n_components, covariance_type=covariance_type,
- alpha=alpha, random_state=random_state, tol=tol, verbose=verbose,
- min_covar=min_covar, n_iter=n_iter, params=params,
- init_params=init_params)
-
-
-@deprecated("The `VBGMM` class is not working correctly and it's better "
- "to use `sklearn.mixture.BayesianGaussianMixture` class with "
- "parameter `weight_concentration_prior_type="
- "'dirichlet_distribution'` instead. "
- "VBGMM is deprecated in 0.18 and will be removed in 0.20.")
-class VBGMM(_DPGMMBase):
- """Variational Inference for the Gaussian Mixture Model
-
- .. deprecated:: 0.18
- This class will be removed in 0.20.
- Use :class:`sklearn.mixture.BayesianGaussianMixture` with parameter
- ``weight_concentration_prior_type='dirichlet_distribution'`` instead.
-
- Variational inference for a Gaussian mixture model probability
- distribution. This class allows for easy and efficient inference
- of an approximate posterior distribution over the parameters of a
- Gaussian mixture model with a fixed number of components.
-
- Initialization is with normally-distributed means and identity
- covariance, for proper convergence.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- n_components : int, default 1
- Number of mixture components.
-
- covariance_type : string, default 'diag'
- String describing the type of covariance parameters to
- use. Must be one of 'spherical', 'tied', 'diag', 'full'.
-
- alpha : float, default 1
- Real number representing the concentration parameter of
-        the Dirichlet distribution. Intuitively, the higher the
- value of alpha the more likely the variational mixture of
- Gaussians model will use all components it can.
-
- tol : float, default 1e-3
- Convergence threshold.
-
- n_iter : int, default 10
- Maximum number of iterations to perform before convergence.
-
- params : string, default 'wmc'
- Controls which parameters are updated in the training
- process. Can contain any combination of 'w' for weights,
- 'm' for means, and 'c' for covars.
-
- init_params : string, default 'wmc'
- Controls which parameters are updated in the initialization
- process. Can contain any combination of 'w' for weights,
- 'm' for means, and 'c' for covars. Defaults to 'wmc'.
-
- verbose : int, default 0
- Controls output verbosity.
-
- Attributes
- ----------
- covariance_type : string
- String describing the type of covariance parameters used by
- the DP-GMM. Must be one of 'spherical', 'tied', 'diag', 'full'.
-
- n_features : int
- Dimensionality of the Gaussians.
-
- n_components : int (read-only)
- Number of mixture components.
-
- weights_ : array, shape (`n_components`,)
- Mixing weights for each mixture component.
-
- means_ : array, shape (`n_components`, `n_features`)
- Mean parameters for each mixture component.
-
- precs_ : array
- Precision (inverse covariance) parameters for each mixture
- component. The shape depends on `covariance_type`::
-
-        (`n_components`, `n_features`) if 'spherical',
- (`n_features`, `n_features`) if 'tied',
- (`n_components`, `n_features`) if 'diag',
- (`n_components`, `n_features`, `n_features`) if 'full'
-
- converged_ : bool
- True when convergence was reached in fit(), False
- otherwise.
-
- See Also
- --------
- GMM : Finite Gaussian mixture model fit with EM
-    DPGMM : Infinite Gaussian mixture model, using the Dirichlet
- process, fit with a variational algorithm
- """
-
- def __init__(self, n_components=1, covariance_type='diag', alpha=1.0,
- random_state=None, tol=1e-3, verbose=0,
- min_covar=None, n_iter=10, params='wmc', init_params='wmc'):
- super(VBGMM, self).__init__(
- n_components, covariance_type, random_state=random_state,
- tol=tol, verbose=verbose, min_covar=min_covar,
- n_iter=n_iter, params=params, init_params=init_params)
- self.alpha = alpha
-
- def _fit(self, X, y=None):
- """Estimate model parameters with the variational algorithm.
-
-        An initialization step is performed before entering the EM
- algorithm. If you want to avoid this step, set the keyword
- argument init_params to the empty string '' when creating
- the object. Likewise, if you just would like to do an
- initialization, set n_iter=0.
-
- Parameters
- ----------
- X : array_like, shape (n, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- responsibilities : array, shape (n_samples, n_components)
- Posterior probabilities of each mixture component for each
- observation.
- """
- self.alpha_ = float(self.alpha) / self.n_components
- return super(VBGMM, self)._fit(X, y)
-
- def score_samples(self, X):
- """Return the likelihood of the data under the model.
-
- Compute the bound on log probability of X under the model
- and return the posterior distribution (responsibilities) of
- each mixture component for each element of X.
-
- This is done by computing the parameters for the mean-field of
- z for each observation.
-
- Parameters
- ----------
- X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- logprob : array_like, shape (n_samples,)
- Log probabilities of each data point in X
- responsibilities : array_like, shape (n_samples, n_components)
- Posterior probabilities of each mixture component for each
- observation
- """
- check_is_fitted(self, 'gamma_')
-
- X = check_array(X)
- if X.ndim == 1:
- X = X[:, np.newaxis]
- dg = digamma(self.gamma_) - digamma(np.sum(self.gamma_))
-
- if self.covariance_type not in ['full', 'tied', 'diag', 'spherical']:
- raise NotImplementedError("This ctype is not implemented: %s"
- % self.covariance_type)
- p = _bound_state_log_lik(X, self._initial_bound + self.bound_prec_,
- self.precs_, self.means_,
- self.covariance_type)
-
- z = p + dg
- z = log_normalize(z, axis=-1)
- bound = np.sum(z * p, axis=-1)
- return bound, z
-
- def _update_concentration(self, z):
- for i in range(self.n_components):
- self.gamma_[i] = self.alpha_ + np.sum(z.T[i])
-
- def _initialize_gamma(self):
- self.gamma_ = self.alpha_ * np.ones(self.n_components)
-
- def _bound_proportions(self, z):
- logprior = 0.
- dg = digamma(self.gamma_)
- dg -= digamma(np.sum(self.gamma_))
- logprior += np.sum(dg.reshape((-1, 1)) * z.T)
- z_non_zeros = z[z > np.finfo(np.float32).eps]
- logprior -= np.sum(z_non_zeros * np.log(z_non_zeros))
- return logprior
-
- def _bound_concentration(self):
- logprior = gammaln(np.sum(self.gamma_)) - gammaln(self.n_components
- * self.alpha_)
- logprior -= np.sum(gammaln(self.gamma_) - gammaln(self.alpha_))
- sg = digamma(np.sum(self.gamma_))
- logprior += np.sum((self.gamma_ - self.alpha_)
- * (digamma(self.gamma_) - sg))
- return logprior
-
- def _monitor(self, X, z, n, end=False):
- """Monitor the lower bound during iteration
-
- Debug method to help see exactly when it is failing to converge as
- expected.
-
- Note: this is very expensive and should not be used by default."""
- if self.verbose > 0:
- print("Bound after updating %8s: %f" % (n, self.lower_bound(X, z)))
- if end:
- print("Cluster proportions:", self.gamma_)
- print("covariance_type:", self.covariance_type)
-
- def _set_weights(self):
- self.weights_[:] = self.gamma_
- self.weights_ /= np.sum(self.weights_)
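
The deprecation messages above name the replacement for both classes. A rough
migration sketch; the hyperparameters are not numerically equivalent, so
weight_concentration_prior=1.0 below is only an assumed stand-in for the old
alpha=1.0:

    import numpy as np
    from sklearn.mixture import BayesianGaussianMixture

    rng = np.random.RandomState(0)
    X = np.vstack([rng.randn(100, 2) + [3, 3], rng.randn(100, 2)])

    # DPGMM -> Dirichlet Process prior on the mixture weights
    dp = BayesianGaussianMixture(
        n_components=5,
        weight_concentration_prior_type='dirichlet_process',
        weight_concentration_prior=1.0, random_state=0).fit(X)

    # VBGMM -> finite Dirichlet distribution prior
    vb = BayesianGaussianMixture(
        n_components=5,
        weight_concentration_prior_type='dirichlet_distribution',
        weight_concentration_prior=1.0, random_state=0).fit(X)

    print(dp.weights_.round(3))  # superfluous components get near-zero weight
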
diff --git a/sklearn/mixture/gmm.py b/sklearn/mixture/gmm.py
deleted file mode 100644
index b3c231314cc21..0000000000000
--- a/sklearn/mixture/gmm.py
+++ /dev/null
@@ -1,853 +0,0 @@
-"""
-Gaussian Mixture Models.
-
-This implementation corresponds to the frequentist (non-Bayesian) formulation
-of Gaussian Mixture Models.
-"""
-
-# Author: Ron Weiss
-# Fabian Pedregosa
-# Bertrand Thirion
-
-# Important note for the deprecation cleanup of 0.20:
-# All the functions and classes in this file were deprecated in 0.18.
-# When you remove this file, please also remove the related files:
-# - 'sklearn/mixture/dpgmm.py'
-# - 'sklearn/mixture/test_dpgmm.py'
-# - 'sklearn/mixture/test_gmm.py'
-from time import time
-
-import numpy as np
-from scipy import linalg
-
-from ..base import BaseEstimator
-from ..utils import check_random_state, check_array, deprecated
-from ..utils.fixes import logsumexp
-from ..utils.validation import check_is_fitted
-from .. import cluster
-
-from sklearn.externals.six.moves import zip
-
-EPS = np.finfo(float).eps
-
-@deprecated("The function log_multivariate_normal_density is deprecated in 0.18"
- " and will be removed in 0.20.")
-def log_multivariate_normal_density(X, means, covars, covariance_type='diag'):
- """Compute the log probability under a multivariate Gaussian distribution.
-
- Parameters
- ----------
- X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row corresponds to a
- single data point.
-
- means : array_like, shape (n_components, n_features)
- List of n_features-dimensional mean vectors for n_components Gaussians.
- Each row corresponds to a single mean vector.
-
- covars : array_like
- List of n_components covariance parameters for each Gaussian. The shape
- depends on `covariance_type`:
- (n_components, n_features) if 'spherical',
- (n_features, n_features) if 'tied',
- (n_components, n_features) if 'diag',
- (n_components, n_features, n_features) if 'full'
-
- covariance_type : string
- Type of the covariance parameters. Must be one of
- 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'.
-
- Returns
- -------
- lpr : array_like, shape (n_samples, n_components)
- Array containing the log probabilities of each data point in
- X under each of the n_components multivariate Gaussian distributions.
- """
- log_multivariate_normal_density_dict = {
- 'spherical': _log_multivariate_normal_density_spherical,
- 'tied': _log_multivariate_normal_density_tied,
- 'diag': _log_multivariate_normal_density_diag,
- 'full': _log_multivariate_normal_density_full}
- return log_multivariate_normal_density_dict[covariance_type](
- X, means, covars)
-
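No direct replacement for this dispatcher was kept in scikit-learn. A
minimal sketch of computing the same per-component log-densities with
SciPy for the 'full' case, assuming `X`, `means` and per-component full
covariance matrices `covars` shaped as in the docstring above:

    import numpy as np
    from scipy.stats import multivariate_normal

    def log_density_full(X, means, covars):
        # one column of log-densities per mixture component
        return np.column_stack([
            multivariate_normal.logpdf(X, mean=mu, cov=cv)
            for mu, cv in zip(means, covars)])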
-
-@deprecated("The function sample_gaussian is deprecated in 0.18"
- " and will be removed in 0.20."
- " Use numpy.random.multivariate_normal instead.")
-def sample_gaussian(mean, covar, covariance_type='diag', n_samples=1,
- random_state=None):
- """Generate random samples from a Gaussian distribution.
-
- Parameters
- ----------
- mean : array_like, shape (n_features,)
- Mean of the distribution.
-
- covar : array_like
- Covariance of the distribution. The shape depends on `covariance_type`:
- scalar if 'spherical',
- (n_features) if 'diag',
- (n_features, n_features) if 'tied', or 'full'
-
- covariance_type : string, optional
- Type of the covariance parameters. Must be one of
- 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'.
-
- n_samples : int, optional
- Number of samples to generate. Defaults to 1.
-
- Returns
- -------
- X : array
- Randomly generated sample. The shape depends on `n_samples`:
-            (n_features,) if ``n_samples == 1``,
-            (n_features, n_samples) otherwise.
- """
-    return _sample_gaussian(mean, covar, covariance_type=covariance_type,
-                            n_samples=n_samples, random_state=random_state)
-
-
-def _sample_gaussian(mean, covar, covariance_type='diag', n_samples=1,
- random_state=None):
- rng = check_random_state(random_state)
- n_dim = len(mean)
- rand = rng.randn(n_dim, n_samples)
- if n_samples == 1:
- rand.shape = (n_dim,)
-
- if covariance_type == 'spherical':
- rand *= np.sqrt(covar)
- elif covariance_type == 'diag':
- rand = np.dot(np.diag(np.sqrt(covar)), rand)
- else:
- s, U = linalg.eigh(covar)
- s.clip(0, out=s) # get rid of tiny negatives
- np.sqrt(s, out=s)
- U *= s
- rand = np.dot(U, rand)
-
- return (rand.T + mean).T
-
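The deprecation message above points to ``numpy.random.multivariate_normal``.
A minimal sketch of the replacement for the 'full' case (spherical and
diagonal covariances first need expanding to a matrix, e.g. with
``np.diag``):

    import numpy as np

    rng = np.random.RandomState(0)
    mean = np.zeros(2)
    cov = np.eye(2)  # (n_features, n_features), i.e. the 'full' layout
    X = rng.multivariate_normal(mean, cov, size=10)  # shape (10, 2)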
-
-class _GMMBase(BaseEstimator):
- """Gaussian Mixture Model.
-
- Representation of a Gaussian mixture model probability distribution.
- This class allows for easy evaluation of, sampling from, and
- maximum-likelihood estimation of the parameters of a GMM distribution.
-
- Initializes parameters such that every mixture component has zero
- mean and identity covariance.
-
-    Read more in the :ref:`User Guide <gmm>`.
-
- Parameters
- ----------
- n_components : int, optional
- Number of mixture components. Defaults to 1.
-
- covariance_type : string, optional
- String describing the type of covariance parameters to
- use. Must be one of 'spherical', 'tied', 'diag', 'full'.
- Defaults to 'diag'.
-
- random_state : int, RandomState instance or None, optional (default=None)
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
- by `np.random`.
-
- min_covar : float, optional
- Floor on the diagonal of the covariance matrix to prevent
- overfitting. Defaults to 1e-3.
-
- tol : float, optional
- Convergence threshold. EM iterations will stop when average
- gain in log-likelihood is below this threshold. Defaults to 1e-3.
-
- n_iter : int, optional
- Number of EM iterations to perform.
-
- n_init : int, optional
-        Number of initializations to perform. The best result is kept.
-
- params : string, optional
- Controls which parameters are updated in the training
- process. Can contain any combination of 'w' for weights,
- 'm' for means, and 'c' for covars. Defaults to 'wmc'.
-
- init_params : string, optional
- Controls which parameters are updated in the initialization
- process. Can contain any combination of 'w' for weights,
- 'm' for means, and 'c' for covars. Defaults to 'wmc'.
-
- verbose : int, default: 0
- Enable verbose output. If 1 then it always prints the current
- initialization and iteration step. If greater than 1 then
-        it additionally prints the change and time needed for each step.
-
- Attributes
- ----------
- weights_ : array, shape (`n_components`,)
- This attribute stores the mixing weights for each mixture component.
-
- means_ : array, shape (`n_components`, `n_features`)
- Mean parameters for each mixture component.
-
- covars_ : array
- Covariance parameters for each mixture component. The shape
- depends on `covariance_type`::
-
- (n_components, n_features) if 'spherical',
- (n_features, n_features) if 'tied',
- (n_components, n_features) if 'diag',
- (n_components, n_features, n_features) if 'full'
-
- converged_ : bool
- True when convergence was reached in fit(), False otherwise.
-
- See Also
- --------
-
-    DPGMM : Infinite Gaussian mixture model, using the Dirichlet
- process, fit with a variational algorithm
-
-
-    VBGMM : Finite Gaussian mixture model fit with a variational
- algorithm, better for situations where there might be too little
- data to get a good estimate of the covariance matrix.
-
- Examples
- --------
-
- >>> import numpy as np
- >>> from sklearn import mixture
- >>> np.random.seed(1)
- >>> g = mixture.GMM(n_components=2)
- >>> # Generate random observations with two modes centered on 0
- >>> # and 10 to use for training.
- >>> obs = np.concatenate((np.random.randn(100, 1),
- ... 10 + np.random.randn(300, 1)))
- >>> g.fit(obs) # doctest: +NORMALIZE_WHITESPACE
- GMM(covariance_type='diag', init_params='wmc', min_covar=0.001,
- n_components=2, n_init=1, n_iter=100, params='wmc',
- random_state=None, tol=0.001, verbose=0)
- >>> np.round(g.weights_, 2)
- array([0.75, 0.25])
- >>> np.round(g.means_, 2)
- array([[10.05],
- [ 0.06]])
- >>> np.round(g.covars_, 2) # doctest: +SKIP
- array([[[ 1.02]],
- [[ 0.96]]])
- >>> g.predict([[0], [2], [9], [10]]) # doctest: +ELLIPSIS
- array([1, 1, 0, 0]...)
- >>> np.round(g.score([[0], [2], [9], [10]]), 2)
- array([-2.19, -4.58, -1.75, -1.21])
- >>> # Refit the model on new data (initial parameters remain the
- >>> # same), this time with an even split between the two modes.
- >>> g.fit(20 * [[0]] + 20 * [[10]]) # doctest: +NORMALIZE_WHITESPACE
- GMM(covariance_type='diag', init_params='wmc', min_covar=0.001,
- n_components=2, n_init=1, n_iter=100, params='wmc',
- random_state=None, tol=0.001, verbose=0)
- >>> np.round(g.weights_, 2)
- array([0.5, 0.5])
-
- """
-
- def __init__(self, n_components=1, covariance_type='diag',
- random_state=None, tol=1e-3, min_covar=1e-3,
- n_iter=100, n_init=1, params='wmc', init_params='wmc',
- verbose=0):
- self.n_components = n_components
- self.covariance_type = covariance_type
- self.tol = tol
- self.min_covar = min_covar
- self.random_state = random_state
- self.n_iter = n_iter
- self.n_init = n_init
- self.params = params
- self.init_params = init_params
- self.verbose = verbose
-
- if covariance_type not in ['spherical', 'tied', 'diag', 'full']:
- raise ValueError('Invalid value for covariance_type: %s' %
- covariance_type)
-
- if n_init < 1:
- raise ValueError('GMM estimation requires at least one run')
-
- def _get_covars(self):
-        """Covariance parameters for each mixture component.
-
-        Regardless of ``covariance_type``, the covariance of each
-        component is returned as a full (n_features, n_features)
-        matrix; ``covars_`` itself keeps the compact per-type shapes
-        documented on the class.
-        """
- if self.covariance_type == 'full':
- return self.covars_
- elif self.covariance_type == 'diag':
- return [np.diag(cov) for cov in self.covars_]
- elif self.covariance_type == 'tied':
- return [self.covars_] * self.n_components
- elif self.covariance_type == 'spherical':
- return [np.diag(cov) for cov in self.covars_]
-
- def _set_covars(self, covars):
- """Provide values for covariance."""
- covars = np.asarray(covars)
- _validate_covars(covars, self.covariance_type, self.n_components)
- self.covars_ = covars
-
- def score_samples(self, X):
-        """Return the per-sample log-likelihood of the data under the model.
-
- Compute the log probability of X under the model and
- return the posterior distribution (responsibilities) of each
- mixture component for each element of X.
-
- Parameters
- ----------
- X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- logprob : array_like, shape (n_samples,)
- Log probabilities of each data point in X.
-
- responsibilities : array_like, shape (n_samples, n_components)
- Posterior probabilities of each mixture component for each
-            observation.
- """
- check_is_fitted(self, 'means_')
-
- X = check_array(X)
- if X.ndim == 1:
- X = X[:, np.newaxis]
- if X.size == 0:
- return np.array([]), np.empty((0, self.n_components))
-        if X.shape[1] != self.means_.shape[1]:
-            raise ValueError('X has %d features, but the model was fitted '
-                             'with %d features' %
-                             (X.shape[1], self.means_.shape[1]))
-
- lpr = (log_multivariate_normal_density(X, self.means_, self.covars_,
- self.covariance_type) +
- np.log(self.weights_))
- logprob = logsumexp(lpr, axis=1)
- responsibilities = np.exp(lpr - logprob[:, np.newaxis])
- return logprob, responsibilities
-
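The normalization above is the log-sum-exp trick: posteriors are derived
from the weighted per-component log-densities without ever exponentiating
unnormalized values. A standalone sketch with assumed per-sample,
per-component values ``lpr``:

    import numpy as np
    from scipy.special import logsumexp

    lpr = np.log(np.array([[0.2, 0.3], [0.05, 0.15]]))  # assumed values
    logprob = logsumexp(lpr, axis=1)             # log p(x) per sample
    resp = np.exp(lpr - logprob[:, np.newaxis])  # rows sum to one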
- def score(self, X, y=None):
- """Compute the log probability under the model.
-
- Parameters
- ----------
- X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- logprob : array_like, shape (n_samples,)
-            Log probabilities of each data point in X.
- """
- logprob, _ = self.score_samples(X)
- return logprob
-
- def predict(self, X):
- """Predict label for data.
-
- Parameters
- ----------
- X : array-like, shape = [n_samples, n_features]
-
- Returns
- -------
-        C : array, shape = (n_samples,)
-            Component memberships.
- """
- logprob, responsibilities = self.score_samples(X)
- return responsibilities.argmax(axis=1)
-
- def predict_proba(self, X):
- """Predict posterior probability of data under each Gaussian
- in the model.
-
- Parameters
- ----------
- X : array-like, shape = [n_samples, n_features]
-
- Returns
- -------
- responsibilities : array-like, shape = (n_samples, n_components)
- Returns the probability of the sample for each Gaussian
- (state) in the model.
- """
- logprob, responsibilities = self.score_samples(X)
- return responsibilities
-
- def sample(self, n_samples=1, random_state=None):
- """Generate random samples from the model.
-
- Parameters
- ----------
- n_samples : int, optional
- Number of samples to generate. Defaults to 1.
-
- Returns
- -------
- X : array_like, shape (n_samples, n_features)
- List of samples
- """
- check_is_fitted(self, 'means_')
-
- if random_state is None:
- random_state = self.random_state
- random_state = check_random_state(random_state)
- weight_cdf = np.cumsum(self.weights_)
-
- X = np.empty((n_samples, self.means_.shape[1]))
- rand = random_state.rand(n_samples)
- # decide which component to use for each sample
- comps = weight_cdf.searchsorted(rand)
- # for each component, generate all needed samples
- for comp in range(self.n_components):
- # occurrences of current component in X
- comp_in_X = (comp == comps)
- # number of those occurrences
- num_comp_in_X = comp_in_X.sum()
- if num_comp_in_X > 0:
- if self.covariance_type == 'tied':
- cv = self.covars_
- elif self.covariance_type == 'spherical':
- cv = self.covars_[comp][0]
- else:
- cv = self.covars_[comp]
- X[comp_in_X] = _sample_gaussian(
- self.means_[comp], cv, self.covariance_type,
- num_comp_in_X, random_state=random_state).T
- return X
-
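Component assignment in ``sample`` above is inverse-CDF sampling: each
uniform draw is located in the cumulative weight vector with
``searchsorted``. A standalone sketch with assumed weights:

    import numpy as np

    rng = np.random.RandomState(0)
    weights = np.array([0.2, 0.5, 0.3])
    # index of the component responsible for each of 8 draws
    comps = np.cumsum(weights).searchsorted(rng.rand(8))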
- def fit_predict(self, X, y=None):
- """Fit and then predict labels for data.
-
-        Warning: Due to the final maximization step of the EM algorithm,
-        the labels from a low-iteration fit may not match a subsequent
-        call to ``predict(X)``.
-
- .. versionadded:: 0.17
- *fit_predict* method in Gaussian Mixture Model.
-
- Parameters
- ----------
- X : array-like, shape = [n_samples, n_features]
-
- Returns
- -------
-        C : array, shape = (n_samples,)
-            Component memberships.
- """
- return self._fit(X, y).argmax(axis=1)
-
- def _fit(self, X, y=None, do_prediction=False):
- """Estimate model parameters with the EM algorithm.
-
-        An initialization step is performed before entering the
- expectation-maximization (EM) algorithm. If you want to avoid
- this step, set the keyword argument init_params to the empty
- string '' when creating the GMM object. Likewise, if you would
- like just to do an initialization, set n_iter=0.
-
- Parameters
- ----------
-        X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- responsibilities : array, shape (n_samples, n_components)
- Posterior probabilities of each mixture component for each
- observation.
- """
-
- # initialization step
- X = check_array(X, dtype=np.float64, ensure_min_samples=2,
- estimator=self)
- if X.shape[0] < self.n_components:
- raise ValueError(
- 'GMM estimation with %s components, but got only %s samples' %
- (self.n_components, X.shape[0]))
-
- max_log_prob = -np.infty
-
- if self.verbose > 0:
- print('Expectation-maximization algorithm started.')
-
- for init in range(self.n_init):
- if self.verbose > 0:
- print('Initialization ' + str(init + 1))
- start_init_time = time()
-
- if 'm' in self.init_params or not hasattr(self, 'means_'):
- self.means_ = cluster.KMeans(
- n_clusters=self.n_components,
- random_state=self.random_state).fit(X).cluster_centers_
- if self.verbose > 1:
- print('\tMeans have been initialized.')
-
- if 'w' in self.init_params or not hasattr(self, 'weights_'):
- self.weights_ = np.tile(1.0 / self.n_components,
- self.n_components)
- if self.verbose > 1:
- print('\tWeights have been initialized.')
-
- if 'c' in self.init_params or not hasattr(self, 'covars_'):
- cv = np.cov(X.T) + self.min_covar * np.eye(X.shape[1])
- if not cv.shape:
- cv.shape = (1, 1)
- self.covars_ = \
- distribute_covar_matrix_to_match_covariance_type(
- cv, self.covariance_type, self.n_components)
- if self.verbose > 1:
- print('\tCovariance matrices have been initialized.')
-
- # EM algorithms
- current_log_likelihood = None
- # reset self.converged_ to False
- self.converged_ = False
-
- for i in range(self.n_iter):
- if self.verbose > 0:
- print('\tEM iteration ' + str(i + 1))
- start_iter_time = time()
- prev_log_likelihood = current_log_likelihood
- # Expectation step
- log_likelihoods, responsibilities = self.score_samples(X)
- current_log_likelihood = log_likelihoods.mean()
-
- # Check for convergence.
- if prev_log_likelihood is not None:
- change = abs(current_log_likelihood - prev_log_likelihood)
- if self.verbose > 1:
- print('\t\tChange: ' + str(change))
- if change < self.tol:
- self.converged_ = True
- if self.verbose > 0:
- print('\t\tEM algorithm converged.')
- break
-
- # Maximization step
- self._do_mstep(X, responsibilities, self.params,
- self.min_covar)
- if self.verbose > 1:
- print('\t\tEM iteration ' + str(i + 1) + ' took {0:.5f}s'.format(
- time() - start_iter_time))
-
- # if the results are better, keep it
- if self.n_iter:
- if current_log_likelihood > max_log_prob:
- max_log_prob = current_log_likelihood
- best_params = {'weights': self.weights_,
- 'means': self.means_,
- 'covars': self.covars_}
- if self.verbose > 1:
- print('\tBetter parameters were found.')
-
- if self.verbose > 1:
- print('\tInitialization ' + str(init + 1) + ' took {0:.5f}s'.format(
- time() - start_init_time))
-
-        # Check that at least one initialization produced a valid likelihood.
- if np.isneginf(max_log_prob) and self.n_iter:
- raise RuntimeError(
- "EM algorithm was never able to compute a valid likelihood " +
- "given initial parameters. Try different init parameters " +
- "(or increasing n_init) or check for degenerate data.")
-
- if self.n_iter:
- self.covars_ = best_params['covars']
- self.means_ = best_params['means']
- self.weights_ = best_params['weights']
- else: # self.n_iter == 0 occurs when using GMM within HMM
- # Need to make sure that there are responsibilities to output
- # Output zeros because it was just a quick initialization
- responsibilities = np.zeros((X.shape[0], self.n_components))
-
- return responsibilities
-
- def fit(self, X, y=None):
- """Estimate model parameters with the EM algorithm.
-
-        An initialization step is performed before entering the
- expectation-maximization (EM) algorithm. If you want to avoid
- this step, set the keyword argument init_params to the empty
- string '' when creating the GMM object. Likewise, if you would
- like just to do an initialization, set n_iter=0.
-
- Parameters
- ----------
-        X : array_like, shape (n_samples, n_features)
- List of n_features-dimensional data points. Each row
- corresponds to a single data point.
-
- Returns
- -------
- self
- """
- self._fit(X, y)
- return self
-
- def _do_mstep(self, X, responsibilities, params, min_covar=0):
-        """Perform the M-step of the EM algorithm and return the cluster weights.
- """
- weights = responsibilities.sum(axis=0)
- weighted_X_sum = np.dot(responsibilities.T, X)
- inverse_weights = 1.0 / (weights[:, np.newaxis] + 10 * EPS)
-
- if 'w' in params:
- self.weights_ = (weights / (weights.sum() + 10 * EPS) + EPS)
- if 'm' in params:
- self.means_ = weighted_X_sum * inverse_weights
- if 'c' in params:
- covar_mstep_func = _covar_mstep_funcs[self.covariance_type]
- self.covars_ = covar_mstep_func(
- self, X, responsibilities, weighted_X_sum, inverse_weights,
- min_covar)
- return weights
-
- def _n_parameters(self):
- """Return the number of free parameters in the model."""
- ndim = self.means_.shape[1]
- if self.covariance_type == 'full':
- cov_params = self.n_components * ndim * (ndim + 1) / 2.
- elif self.covariance_type == 'diag':
- cov_params = self.n_components * ndim
- elif self.covariance_type == 'tied':
- cov_params = ndim * (ndim + 1) / 2.
- elif self.covariance_type == 'spherical':
- cov_params = self.n_components
- mean_params = ndim * self.n_components
- return int(cov_params + mean_params + self.n_components - 1)
-
- def bic(self, X):
- """Bayesian information criterion for the current model fit
- and the proposed data.
-
- Parameters
- ----------
-        X : array of shape (n_samples, n_dimensions)
-
- Returns
- -------
- bic : float (the lower the better)
- """
- return (-2 * self.score(X).sum() +
- self._n_parameters() * np.log(X.shape[0]))
-
- def aic(self, X):
- """Akaike information criterion for the current model fit
- and the proposed data.
-
- Parameters
- ----------
-        X : array of shape (n_samples, n_dimensions)
-
- Returns
- -------
- aic : float (the lower the better)
- """
-        return -2 * self.score(X).sum() + 2 * self._n_parameters()
-
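For reference, the two criteria above follow the standard definitions. A
minimal sketch, with a hypothetical fitted model ``g`` and data ``X``,
using the (deleted) private helper from this class:

    import numpy as np

    L = g.score(X).sum()     # total log-likelihood of X
    p = g._n_parameters()    # number of free parameters
    N = X.shape[0]
    bic = -2 * L + p * np.log(N)
    aic = -2 * L + 2 * p

Lower values indicate a better fit once model complexity is penalized.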
-
-@deprecated("The class GMM is deprecated in 0.18 and will be "
- " removed in 0.20. Use class GaussianMixture instead.")
-class GMM(_GMMBase):
- """
- Legacy Gaussian Mixture Model
-
- .. deprecated:: 0.18
- This class will be removed in 0.20.
- Use :class:`sklearn.mixture.GaussianMixture` instead.
-
- """
-
- def __init__(self, n_components=1, covariance_type='diag',
- random_state=None, tol=1e-3, min_covar=1e-3,
- n_iter=100, n_init=1, params='wmc', init_params='wmc',
- verbose=0):
- super(GMM, self).__init__(
- n_components=n_components, covariance_type=covariance_type,
- random_state=random_state, tol=tol, min_covar=min_covar,
- n_iter=n_iter, n_init=n_init, params=params,
- init_params=init_params, verbose=verbose)
-
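Code still constructing ``GMM`` objects should move to
:class:`sklearn.mixture.GaussianMixture`, as the deprecation message says.
A minimal migration sketch; note that ``n_iter`` becomes ``max_iter`` and
the fitted covariances live in ``covariances_`` rather than ``covars_``:

    import numpy as np
    from sklearn.mixture import GaussianMixture

    X = np.concatenate((np.random.randn(100, 1),
                        10 + np.random.randn(300, 1)))
    g = GaussianMixture(n_components=2, covariance_type='diag',
                        max_iter=100).fit(X)
    print(g.weights_, g.means_, g.covariances_)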
-#########################################################################
-# some helper routines
-#########################################################################
-
-
-def _log_multivariate_normal_density_diag(X, means, covars):
- """Compute Gaussian log-density at X for a diagonal model."""
- n_samples, n_dim = X.shape
- lpr = -0.5 * (n_dim * np.log(2 * np.pi) + np.sum(np.log(covars), 1)
- + np.sum((means ** 2) / covars, 1)
- - 2 * np.dot(X, (means / covars).T)
- + np.dot(X ** 2, (1.0 / covars).T))
- return lpr
-
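The vectorized expression above is the diagonal Gaussian log-density with
the quadratic form expanded into matrix products. A reference form for a
single sample and component (assumed 1-D arrays ``x``, ``mu``, ``var``)
makes the identity explicit:

    import numpy as np

    def log_density_diag_ref(x, mu, var):
        # -0.5 * sum_d (log(2*pi*var_d) + (x_d - mu_d)**2 / var_d)
        return -0.5 * np.sum(np.log(2 * np.pi * var) + (x - mu) ** 2 / var)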
-
-def _log_multivariate_normal_density_spherical(X, means, covars):
- """Compute Gaussian log-density at X for a spherical model."""
- cv = covars.copy()
- if covars.ndim == 1:
- cv = cv[:, np.newaxis]
- if cv.shape[1] == 1:
- cv = np.tile(cv, (1, X.shape[-1]))
- return _log_multivariate_normal_density_diag(X, means, cv)
-
-
-def _log_multivariate_normal_density_tied(X, means, covars):
- """Compute Gaussian log-density at X for a tied model."""
- cv = np.tile(covars, (means.shape[0], 1, 1))
- return _log_multivariate_normal_density_full(X, means, cv)
-
-
-def _log_multivariate_normal_density_full(X, means, covars, min_covar=1.e-7):
- """Log probability for full covariance matrices."""
- n_samples, n_dim = X.shape
- nmix = len(means)
- log_prob = np.empty((n_samples, nmix))
- for c, (mu, cv) in enumerate(zip(means, covars)):
- try:
- cv_chol = linalg.cholesky(cv, lower=True)
- except linalg.LinAlgError:
-            # The model is most probably stuck in a component with too
-            # few observations; we need to reinitialize this component.
- try:
- cv_chol = linalg.cholesky(cv + min_covar * np.eye(n_dim),
- lower=True)
- except linalg.LinAlgError:
- raise ValueError("'covars' must be symmetric, "
- "positive-definite")
-
- cv_log_det = 2 * np.sum(np.log(np.diagonal(cv_chol)))
- cv_sol = linalg.solve_triangular(cv_chol, (X - mu).T, lower=True).T
- log_prob[:, c] = - .5 * (np.sum(cv_sol ** 2, axis=1) +
- n_dim * np.log(2 * np.pi) + cv_log_det)
-
- return log_prob
-
-
-def _validate_covars(covars, covariance_type, n_components):
-    """Do basic checks on covariance matrix sizes and values."""
-    from scipy import linalg
-    if covariance_type == 'spherical':
-        if len(covars) != n_components:
-            raise ValueError("'spherical' covars must have length "
-                             "n_components")
-        elif np.any(covars <= 0):
-            raise ValueError("'spherical' covars must be positive")
- elif covariance_type == 'tied':
- if covars.shape[0] != covars.shape[1]:
- raise ValueError("'tied' covars must have shape (n_dim, n_dim)")
- elif (not np.allclose(covars, covars.T)
- or np.any(linalg.eigvalsh(covars) <= 0)):
- raise ValueError("'tied' covars must be symmetric, "
- "positive-definite")
- elif covariance_type == 'diag':
- if len(covars.shape) != 2:
- raise ValueError("'diag' covars must have shape "
- "(n_components, n_dim)")
- elif np.any(covars <= 0):
-            raise ValueError("'diag' covars must be positive")
- elif covariance_type == 'full':
- if len(covars.shape) != 3:
- raise ValueError("'full' covars must have shape "
- "(n_components, n_dim, n_dim)")
- elif covars.shape[1] != covars.shape[2]:
- raise ValueError("'full' covars must have shape "
- "(n_components, n_dim, n_dim)")
- for n, cv in enumerate(covars):
- if (not np.allclose(cv, cv.T)
- or np.any(linalg.eigvalsh(cv) <= 0)):
- raise ValueError("component %d of 'full' covars must be "
- "symmetric, positive-definite" % n)
- else:
- raise ValueError("covariance_type must be one of " +
- "'spherical', 'tied', 'diag', 'full'")
-
-
-@deprecated("The function distribute_covar_matrix_to_match_covariance_type"
- "is deprecated in 0.18 and will be removed in 0.20.")
-def distribute_covar_matrix_to_match_covariance_type(
- tied_cv, covariance_type, n_components):
- """Create all the covariance matrices from a given template."""
- if covariance_type == 'spherical':
- cv = np.tile(tied_cv.mean() * np.ones(tied_cv.shape[1]),
- (n_components, 1))
- elif covariance_type == 'tied':
- cv = tied_cv
- elif covariance_type == 'diag':
- cv = np.tile(np.diag(tied_cv), (n_components, 1))
- elif covariance_type == 'full':
- cv = np.tile(tied_cv, (n_components, 1, 1))
- else:
- raise ValueError("covariance_type must be one of " +
- "'spherical', 'tied', 'diag', 'full'")
- return cv
-
-
-def _covar_mstep_diag(gmm, X, responsibilities, weighted_X_sum, norm,
- min_covar):
- """Perform the covariance M step for diagonal cases."""
- avg_X2 = np.dot(responsibilities.T, X * X) * norm
- avg_means2 = gmm.means_ ** 2
- avg_X_means = gmm.means_ * weighted_X_sum * norm
- return avg_X2 - 2 * avg_X_means + avg_means2 + min_covar
-
-
-def _covar_mstep_spherical(*args):
- """Perform the covariance M step for spherical cases."""
- cv = _covar_mstep_diag(*args)
- return np.tile(cv.mean(axis=1)[:, np.newaxis], (1, cv.shape[1]))
-
-
-def _covar_mstep_full(gmm, X, responsibilities, weighted_X_sum, norm,
- min_covar):
- """Perform the covariance M step for full cases."""
- # Eq. 12 from K. Murphy, "Fitting a Conditional Linear Gaussian
- # Distribution"
- n_features = X.shape[1]
- cv = np.empty((gmm.n_components, n_features, n_features))
- for c in range(gmm.n_components):
- post = responsibilities[:, c]
- mu = gmm.means_[c]
- diff = X - mu
- with np.errstate(under='ignore'):
- # Underflow Errors in doing post * X.T are not important
- avg_cv = np.dot(post * diff.T, diff) / (post.sum() + 10 * EPS)
- cv[c] = avg_cv + min_covar * np.eye(n_features)
- return cv
-
-
-def _covar_mstep_tied(gmm, X, responsibilities, weighted_X_sum, norm,
- min_covar):
- """Perform the covariance M step for tied cases."""
- # Eq. 15 from K. Murphy, "Fitting a Conditional Linear Gaussian
- # Distribution"
- avg_X2 = np.dot(X.T, X)
- avg_means2 = np.dot(gmm.means_.T, weighted_X_sum)
- out = avg_X2 - avg_means2
- out *= 1. / X.shape[0]
- out.flat[::len(out) + 1] += min_covar
- return out
-
-_covar_mstep_funcs = {'spherical': _covar_mstep_spherical,
- 'diag': _covar_mstep_diag,
- 'tied': _covar_mstep_tied,
- 'full': _covar_mstep_full,
- }
From 3144d76f4133867d356f3918587e61f0e7b9f333 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Sat, 26 May 2018 12:35:05 -0400
Subject: [PATCH 22/36] remove deprecated scorers
---
sklearn/metrics/scorer.py | 48 +++++----------------------------------
1 file changed, 6 insertions(+), 42 deletions(-)
diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py
index 3cafcd7ced27a..590b9826b4ef3 100644
--- a/sklearn/metrics/scorer.py
+++ b/sklearn/metrics/scorer.py
@@ -18,17 +18,17 @@
# Arnaud Joly
# License: Simplified BSD
-from abc import ABCMeta, abstractmethod
+from abc import ABCMeta
from collections import Iterable
-import warnings
import numpy as np
from . import (r2_score, median_absolute_error, mean_absolute_error,
mean_squared_error, mean_squared_log_error, accuracy_score,
f1_score, roc_auc_score, average_precision_score,
- precision_score, recall_score, log_loss, balanced_accuracy_score,
- explained_variance_score, brier_score_loss)
+ precision_score, recall_score, log_loss,
+ balanced_accuracy_score, explained_variance_score,
+ brier_score_loss)
from .cluster import adjusted_rand_score
from .cluster import homogeneity_score
@@ -49,16 +49,6 @@ def __init__(self, score_func, sign, kwargs):
self._kwargs = kwargs
self._score_func = score_func
self._sign = sign
- # XXX After removing the deprecated scorers (v0.20) remove the
- # XXX deprecation_msg property again and remove __call__'s body again
- self._deprecation_msg = None
-
- @abstractmethod
- def __call__(self, estimator, X, y, sample_weight=None):
- if self._deprecation_msg is not None:
- warnings.warn(self._deprecation_msg,
- category=DeprecationWarning,
- stacklevel=2)
def __repr__(self):
kwargs_string = "".join([", %s=%s" % (str(k), str(v))
@@ -97,8 +87,7 @@ def __call__(self, estimator, X, y_true, sample_weight=None):
score : float
Score function applied to prediction of estimator on X.
"""
- super(_PredictScorer, self).__call__(estimator, X, y_true,
- sample_weight=sample_weight)
+
y_pred = estimator.predict(X)
if sample_weight is not None:
return self._sign * self._score_func(y_true, y_pred,
@@ -134,8 +123,6 @@ def __call__(self, clf, X, y, sample_weight=None):
score : float
Score function applied to prediction of estimator on X.
"""
- super(_ProbaScorer, self).__call__(clf, X, y,
- sample_weight=sample_weight)
y_type = type_of_target(y)
y_pred = clf.predict_proba(X)
if y_type == "binary":
@@ -178,8 +165,6 @@ def __call__(self, clf, X, y, sample_weight=None):
score : float
Score function applied to prediction of estimator on X.
"""
- super(_ThresholdScorer, self).__call__(clf, X, y,
- sample_weight=sample_weight)
y_type = type_of_target(y)
if y_type not in ("binary", "multilabel-indicator"):
raise ValueError("{0} format is not supported".format(y_type))
@@ -471,31 +456,13 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False,
r2_scorer = make_scorer(r2_score)
neg_mean_squared_error_scorer = make_scorer(mean_squared_error,
greater_is_better=False)
-deprecation_msg = ('Scoring method mean_squared_error was renamed to '
- 'neg_mean_squared_error in version 0.18 and will '
- 'be removed in 0.20.')
-mean_squared_error_scorer = make_scorer(mean_squared_error,
- greater_is_better=False)
-mean_squared_error_scorer._deprecation_msg = deprecation_msg
neg_mean_squared_log_error_scorer = make_scorer(mean_squared_log_error,
greater_is_better=False)
neg_mean_absolute_error_scorer = make_scorer(mean_absolute_error,
greater_is_better=False)
-deprecation_msg = ('Scoring method mean_absolute_error was renamed to '
- 'neg_mean_absolute_error in version 0.18 and will '
- 'be removed in 0.20.')
-mean_absolute_error_scorer = make_scorer(mean_absolute_error,
- greater_is_better=False)
-mean_absolute_error_scorer._deprecation_msg = deprecation_msg
+
neg_median_absolute_error_scorer = make_scorer(median_absolute_error,
greater_is_better=False)
-deprecation_msg = ('Scoring method median_absolute_error was renamed to '
- 'neg_median_absolute_error in version 0.18 and will '
- 'be removed in 0.20.')
-median_absolute_error_scorer = make_scorer(median_absolute_error,
- greater_is_better=False)
-median_absolute_error_scorer._deprecation_msg = deprecation_msg
-
# Standard Classification Scores
accuracy_scorer = make_scorer(accuracy_score)
@@ -535,9 +502,6 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False,
neg_mean_absolute_error=neg_mean_absolute_error_scorer,
neg_mean_squared_error=neg_mean_squared_error_scorer,
neg_mean_squared_log_error=neg_mean_squared_log_error_scorer,
- median_absolute_error=median_absolute_error_scorer,
- mean_absolute_error=mean_absolute_error_scorer,
- mean_squared_error=mean_squared_error_scorer,
accuracy=accuracy_scorer, roc_auc=roc_auc_scorer,
balanced_accuracy=balanced_accuracy_scorer,
average_precision=average_precision_scorer,
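With these aliases gone, callers must use the ``neg_*`` scorer names
introduced in 0.18. A minimal sketch, assuming a hypothetical estimator
``est`` and data ``X, y``:

    from sklearn.model_selection import cross_val_score

    # scoring='mean_squared_error' is no longer accepted
    scores = cross_val_score(est, X, y, scoring='neg_mean_squared_error')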
From ec66b5d641d6d7262120d25a33e6d02562e35b35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Sat, 26 May 2018 12:38:15 -0400
Subject: [PATCH 23/36] remove deprecated attributes X_, y_ in isotonic
---
sklearn/isotonic.py | 29 -----------------------------
1 file changed, 29 deletions(-)
diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py
index 77bb7e903a183..79c4d11b207f8 100644
--- a/sklearn/isotonic.py
+++ b/sklearn/isotonic.py
@@ -8,7 +8,6 @@
from scipy.stats import spearmanr
from .base import BaseEstimator, TransformerMixin, RegressorMixin
from .utils import as_float_array, check_array, check_consistent_length
-from .utils import deprecated
from ._isotonic import _inplace_contiguous_isotonic_regression, _make_unique
import warnings
import math
@@ -217,34 +216,6 @@ def __init__(self, y_min=None, y_max=None, increasing=True,
self.increasing = increasing
self.out_of_bounds = out_of_bounds
- @property
- @deprecated("Attribute ``X_`` is deprecated in version 0.18 and will be"
- " removed in version 0.20.")
- def X_(self):
- return self._X_
-
- @X_.setter
- def X_(self, value):
- self._X_ = value
-
- @X_.deleter
- def X_(self):
- del self._X_
-
- @property
- @deprecated("Attribute ``y_`` is deprecated in version 0.18 and will"
- " be removed in version 0.20.")
- def y_(self):
- return self._y_
-
- @y_.setter
- def y_(self, value):
- self._y_ = value
-
- @y_.deleter
- def y_(self):
- del self._y_
-
def _check_fit_data(self, X, y, sample_weight=None):
if len(X.shape) != 1:
raise ValueError("X should be a 1d array")
From 9a879e11294c6b2136fd8822f92f2d2cb743c4e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Sat, 26 May 2018 12:49:29 -0400
Subject: [PATCH 24/36] many more deprecations
---
sklearn/base.py | 51 +------------
sklearn/cluster/tests/test_hierarchical.py | 4 +-
sklearn/linear_model/base.py | 76 +-------------------
sklearn/linear_model/least_angle.py | 14 +---
sklearn/model_selection/_search.py | 26 -------
sklearn/model_selection/_split.py | 1 +
sklearn/model_selection/tests/test_search.py | 1 +
7 files changed, 11 insertions(+), 162 deletions(-)
diff --git a/sklearn/base.py b/sklearn/base.py
index 5b96a3de729ab..d75adb06d61b1 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -67,57 +67,10 @@ def clone(estimator, safe=True):
for name in new_object_params:
param1 = new_object_params[name]
param2 = params_set[name]
- if param1 is param2:
- # this should always happen
- continue
- if isinstance(param1, np.ndarray):
- # For most ndarrays, we do not test for complete equality
- if not isinstance(param2, type(param1)):
- equality_test = False
- elif (param1.ndim > 0
- and param1.shape[0] > 0
- and isinstance(param2, np.ndarray)
- and param2.ndim > 0
- and param2.shape[0] > 0):
- equality_test = (
- param1.shape == param2.shape
- and param1.dtype == param2.dtype
- and (_first_and_last_element(param1) ==
- _first_and_last_element(param2))
- )
- else:
- equality_test = np.all(param1 == param2)
- elif sparse.issparse(param1):
- # For sparse matrices equality doesn't work
- if not sparse.issparse(param2):
- equality_test = False
- elif param1.size == 0 or param2.size == 0:
- equality_test = (
- param1.__class__ == param2.__class__
- and param1.size == 0
- and param2.size == 0
- )
- else:
- equality_test = (
- param1.__class__ == param2.__class__
- and (_first_and_last_element(param1) ==
- _first_and_last_element(param2))
- and param1.nnz == param2.nnz
- and param1.shape == param2.shape
- )
- else:
- # fall back on standard equality
- equality_test = param1 == param2
- if equality_test:
- warnings.warn("Estimator %s modifies parameters in __init__."
- " This behavior is deprecated as of 0.18 and "
- "support for this behavior will be removed in 0.20."
- % type(estimator).__name__, DeprecationWarning)
- else:
+ if param1 is not param2:
raise RuntimeError('Cannot clone object %s, as the constructor '
- 'does not seem to set parameter %s' %
+ 'either does not set or modifies parameter %s' %
(estimator, name))
-
return new_object
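After this change, an estimator whose ``__init__`` copies or casts a
parameter fails loudly at ``clone`` time instead of emitting a
DeprecationWarning. A minimal sketch of the now-forbidden pattern, with a
hypothetical estimator:

    from sklearn.base import BaseEstimator, clone

    class ModifiesParam(BaseEstimator):
        def __init__(self, values=(1, 2)):
            self.values = list(values)  # copy in __init__: clone() raises

    clone(ModifiesParam())  # RuntimeError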
diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py
index 83ddc9729cebf..b3056b95d225c 100644
--- a/sklearn/cluster/tests/test_hierarchical.py
+++ b/sklearn/cluster/tests/test_hierarchical.py
@@ -51,6 +51,7 @@ def test_deprecation_of_n_components_in_linkage_tree():
assert_equal(n_leaves, n_leaves_t)
assert_equal(parent, parent_t)
+
def test_linkage_misc():
# Misc tests on linkage
rng = np.random.RandomState(42)
@@ -511,7 +512,8 @@ def test_connectivity_callable():
connectivity = kneighbors_graph(X, 3, include_self=False)
aglc1 = AgglomerativeClustering(connectivity=connectivity)
aglc2 = AgglomerativeClustering(
- connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False))
+ connectivity=partial(kneighbors_graph, n_neighbors=3,
+ include_self=False))
aglc1.fit(X)
aglc2.fit(X)
assert_array_equal(aglc1.labels_, aglc2.labels_)
diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index 6bcdd624083e9..09c389cb336d7 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -26,7 +26,7 @@
from ..externals import six
from ..externals.joblib import Parallel, delayed
from ..base import BaseEstimator, ClassifierMixin, RegressorMixin
-from ..utils import check_array, check_X_y, deprecated, as_float_array
+from ..utils import check_array, check_X_y
from ..utils.validation import FLOAT_DTYPES
from ..utils import check_random_state
from ..utils.extmath import safe_sparse_dot
@@ -67,80 +67,6 @@ def make_dataset(X, y, sample_weight, random_state=None):
return dataset, intercept_decay
-@deprecated("sparse_center_data was deprecated in version 0.18 and will be "
- "removed in 0.20. Use utilities in preprocessing.data instead")
-def sparse_center_data(X, y, fit_intercept, normalize=False):
- """
- Compute information needed to center data to have mean zero along
- axis 0. Be aware that X will not be centered since it would break
- the sparsity, but will be normalized if asked so.
- """
- if fit_intercept:
- # we might require not to change the csr matrix sometimes
- # store a copy if normalize is True.
- # Change dtype to float64 since mean_variance_axis accepts
- # it that way.
- if sp.isspmatrix(X) and X.getformat() == 'csr':
- X = sp.csr_matrix(X, copy=normalize, dtype=np.float64)
- else:
- X = sp.csc_matrix(X, copy=normalize, dtype=np.float64)
-
- X_offset, X_var = mean_variance_axis(X, axis=0)
- if normalize:
- # transform variance to std in-place
- X_var *= X.shape[0]
- X_std = np.sqrt(X_var, X_var)
- del X_var
- X_std[X_std == 0] = 1
- inplace_column_scale(X, 1. / X_std)
- else:
- X_std = np.ones(X.shape[1])
- y_offset = y.mean(axis=0)
- y = y - y_offset
- else:
- X_offset = np.zeros(X.shape[1])
- X_std = np.ones(X.shape[1])
- y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
-
- return X, y, X_offset, y_offset, X_std
-
-
-@deprecated("center_data was deprecated in version 0.18 and will be removed "
- "in 0.20. Use utilities in preprocessing.data instead")
-def center_data(X, y, fit_intercept, normalize=False, copy=True,
- sample_weight=None):
- """
- Centers data to have mean zero along axis 0. This is here because
- nearly all linear models will want their data to be centered.
- If sample_weight is not None, then the weighted mean of X and y
- is zero, and not the mean itself
- """
- X = as_float_array(X, copy)
- if fit_intercept:
- if isinstance(sample_weight, numbers.Number):
- sample_weight = None
- if sp.issparse(X):
- X_offset = np.zeros(X.shape[1])
- X_std = np.ones(X.shape[1])
- else:
- X_offset = np.average(X, axis=0, weights=sample_weight)
- X -= X_offset
- # XXX: currently scaled to variance=n_samples
- if normalize:
- X_std = np.sqrt(np.sum(X ** 2, axis=0))
- X_std[X_std == 0] = 1
- X /= X_std
- else:
- X_std = np.ones(X.shape[1])
- y_offset = np.average(y, axis=0, weights=sample_weight)
- y = y - y_offset
- else:
- X_offset = np.zeros(X.shape[1])
- X_std = np.ones(X.shape[1])
- y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
- return X, y, X_offset, y_offset, X_std
-
-
def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
sample_weight=None, return_mean=False):
"""
diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py
index 929e6ab6d08b3..cd10edcc4e944 100644
--- a/sklearn/linear_model/least_angle.py
+++ b/sklearn/linear_model/least_angle.py
@@ -544,7 +544,7 @@ class Lars(LinearModel, RegressorMixin):
remove fit_intercept which is set True by default.
.. deprecated:: 0.20
-
+
The option is broken and deprecated. It will be removed in v0.22.
Attributes
@@ -619,10 +619,8 @@ def _fit(self, X, y, max_iter, alpha, fit_path, Xy=None):
"""Auxiliary method to fit the model using X, y as training data"""
n_features = X.shape[1]
- X, y, X_offset, y_offset, X_scale = self._preprocess_data(X, y,
- self.fit_intercept,
- self.normalize,
- self.copy_X)
+ X, y, X_offset, y_offset, X_scale = self._preprocess_data(
+ X, y, self.fit_intercept, self.normalize, self.copy_X)
if y.ndim == 1:
y = y[:, np.newaxis]
@@ -1174,12 +1172,6 @@ def alpha(self):
# impedance matching for the above Lars.fit (should not be documented)
return self.alpha_
- @property
- @deprecated("Attribute ``cv_mse_path_`` is deprecated in 0.18 and "
- "will be removed in 0.20. Use ``mse_path_`` instead")
- def cv_mse_path_(self):
- return self.mse_path_
-
class LassoLarsCV(LarsCV):
"""Cross-validated Lasso, using the LARS algorithm
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index 99d6096af73db..b803289aebf1e 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -779,32 +779,6 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
return self
- @property
- def grid_scores_(self):
- check_is_fitted(self, 'cv_results_')
- if self.multimetric_:
- raise AttributeError("grid_scores_ attribute is not available for"
- " multi-metric evaluation.")
- warnings.warn(
- "The grid_scores_ attribute was deprecated in version 0.18"
- " in favor of the more elaborate cv_results_ attribute."
- " The grid_scores_ attribute will not be available from 0.20",
- DeprecationWarning)
-
- grid_scores = list()
-
- for i, (params, mean, std) in enumerate(zip(
- self.cv_results_['params'],
- self.cv_results_['mean_test_score'],
- self.cv_results_['std_test_score'])):
- scores = np.array(list(self.cv_results_['split%d_test_score'
- % s][i]
- for s in range(self.n_splits_)),
- dtype=np.float64)
- grid_scores.append(_CVScoreTuple(params, mean, scores))
-
- return grid_scores
-
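Everything ``grid_scores_`` exposed is available in richer form in
``cv_results_``. A minimal sketch of the equivalent lookup on a
hypothetical fitted ``search`` object:

    results = search.cv_results_  # search: hypothetical fitted *SearchCV
    for params, mean, std in zip(results['params'],
                                 results['mean_test_score'],
                                 results['std_test_score']):
        print(params, mean, std)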
class GridSearchCV(BaseSearchCV):
"""Exhaustive search over specified parameter values for an estimator.
diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index 866cb4cc53aa8..202009ab4951a 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -2058,6 +2058,7 @@ def train_test_split(*arrays, **options):
# Tell nose that train_test_split is not a test
train_test_split.__test__ = False
+
def _build_repr(self):
# XXX This is copied from BaseEstimator's get_params
cls = self.__class__
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index f436c7b55cf36..974431320d4fc 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -138,6 +138,7 @@ def test_validate_parameter_grid_input(input, error_type, error_message):
with pytest.raises(error_type, message=error_message):
ParameterGrid(input)
+
def test_parameter_grid():
# Test basic properties of ParameterGrid.
From d50d694f3ea97db5443c0efa68dbd0843add8c06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Sat, 26 May 2018 13:09:06 -0400
Subject: [PATCH 25/36] remove grid_scores_ tests
---
sklearn/model_selection/tests/test_search.py | 26 --------------------
1 file changed, 26 deletions(-)
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index 974431320d4fc..a689db24679fc 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -799,30 +799,6 @@ def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand):
for key in param_keys + score_keys))
-def check_cv_results_grid_scores_consistency(search):
- # TODO Remove test in 0.20
- if search.multimetric_:
- assert_raise_message(AttributeError, "not available for multi-metric",
- getattr, search, 'grid_scores_')
- else:
- cv_results = search.cv_results_
- res_scores = np.vstack(list([cv_results["split%d_test_score" % i]
- for i in range(search.n_splits_)])).T
- res_means = cv_results["mean_test_score"]
- res_params = cv_results["params"]
- n_cand = len(res_params)
- grid_scores = assert_warns(DeprecationWarning, getattr,
- search, 'grid_scores_')
- assert_equal(len(grid_scores), n_cand)
- # Check consistency of the structure of grid_scores
- for i in range(n_cand):
- assert_equal(grid_scores[i].parameters, res_params[i])
- assert_array_equal(grid_scores[i].cv_validation_scores,
- res_scores[i, :])
- assert_array_equal(grid_scores[i].mean_validation_score,
- res_means[i])
-
-
def test_grid_search_cv_results():
X, y = make_classification(n_samples=50, n_features=4,
random_state=42)
@@ -873,7 +849,6 @@ def test_grid_search_cv_results():
cv_results['param_degree'].mask[i])
for i in range(n_candidates)
if cv_results['param_kernel'][i] == 'rbf'))
- check_cv_results_grid_scores_consistency(search)
def test_random_search_cv_results():
@@ -908,7 +883,6 @@ def test_random_search_cv_results():
# For random_search, all the param array vals should be unmasked
assert_false(any(cv_results['param_C'].mask) or
any(cv_results['param_gamma'].mask))
- check_cv_results_grid_scores_consistency(search)
@ignore_warnings(category=DeprecationWarning)
From 278add76cb77cf5d9dd53e26c75963de9cf9dc10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Sat, 26 May 2018 13:09:29 -0400
Subject: [PATCH 26/36] export_graphviz out_file deprecation
---
sklearn/tree/export.py | 16 ++++++----------
1 file changed, 6 insertions(+), 10 deletions(-)
diff --git a/sklearn/tree/export.py b/sklearn/tree/export.py
index 65e0695e4a6ac..ef13790e65b42 100644
--- a/sklearn/tree/export.py
+++ b/sklearn/tree/export.py
@@ -14,7 +14,6 @@
from numbers import Integral
import numpy as np
-import warnings
from ..externals import six
from ..utils.validation import check_is_fitted
@@ -73,7 +72,7 @@ def __repr__(self):
SENTINEL = Sentinel()
-def export_graphviz(decision_tree, out_file=SENTINEL, max_depth=None,
+def export_graphviz(decision_tree, out_file=None, max_depth=None,
feature_names=None, class_names=None, label='all',
filled=False, leaves_parallel=False, impurity=True,
node_ids=False, proportion=False, rotate=False,
@@ -97,9 +96,12 @@ def export_graphviz(decision_tree, out_file=SENTINEL, max_depth=None,
decision_tree : decision tree regressor or classifier
The decision tree to be exported to GraphViz.
- out_file : file object or string, optional (default='tree.dot')
+ out_file : file object or string, optional (default=None)
Handle or name of the output file. If ``None``, the result is
- returned as a string. This will the default from version 0.20.
+ returned as a string.
+
+ .. versionchanged:: 0.20
+ Default of out_file changed from "tree.dot" to None.
max_depth : int, optional (default=None)
The maximum depth of the representation. If None, the tree is fully
@@ -395,12 +397,6 @@ def recurse(tree, node_id, criterion, parent=None, depth=0):
own_file = False
return_string = False
try:
- if out_file == SENTINEL:
- warnings.warn("out_file can be set to None starting from 0.18. "
- "This will be the default in 0.20.",
- DeprecationWarning)
- out_file = "tree.dot"
-
if isinstance(out_file, six.string_types):
if six.PY3:
out_file = open(out_file, "w", encoding="utf-8")
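With the sentinel gone, calling ``export_graphviz`` without ``out_file``
now simply returns the dot source. A minimal sketch with a hypothetical
fitted tree ``clf``:

    from sklearn.tree import export_graphviz

    dot_source = export_graphviz(clf)  # out_file=None is the default now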
From 40939f42b819bfaccc48be5f34882e136c1c66ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Sat, 26 May 2018 13:09:53 -0400
Subject: [PATCH 27/36] LDA learning_method changed to batch
---
sklearn/decomposition/online_lda.py | 18 +++++++-----------
1 file changed, 7 insertions(+), 11 deletions(-)
diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py
index bf35b20f35b8b..fa40e2ef6802d 100644
--- a/sklearn/decomposition/online_lda.py
+++ b/sklearn/decomposition/online_lda.py
@@ -156,12 +156,11 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
to `1 / n_components`.
In the literature, this is called `beta`.
- learning_method : 'batch' | 'online', default='online'
+ learning_method : 'batch' | 'online', default='batch'
Method used to update `_component`. Only used in `fit` method.
In general, if the data size is large, the online update will be much
faster than the batch update.
- The default learning method is going to be changed to 'batch' in the
- 0.20 release.
+
Valid options::
'batch': Batch variational Bayes method. Use all training data in
@@ -172,6 +171,9 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
variable incrementally. The learning rate is controlled by the
``learning_decay`` and the ``learning_offset`` parameters.
+ .. versionchanged:: 0.20
+ The default learning method is now ``"batch"``.
+
learning_decay : float, optional (default=0.7)
It is a parameter that control learning rate in the online learning
method. The value should be set between (0.5, 1.0] to guarantee
@@ -262,7 +264,7 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
"""
def __init__(self, n_components=10, doc_topic_prior=None,
- topic_word_prior=None, learning_method=None,
+ topic_word_prior=None, learning_method='batch',
learning_decay=.7, learning_offset=10., max_iter=10,
batch_size=128, evaluate_every=-1, total_samples=1e6,
perp_tol=1e-1, mean_change_tol=1e-3, max_doc_update_iter=100,
@@ -307,7 +309,7 @@ def _check_params(self):
raise ValueError("Invalid 'learning_offset' parameter: %r"
% self.learning_offset)
- if self.learning_method not in ("batch", "online", None):
+ if self.learning_method not in ("batch", "online"):
raise ValueError("Invalid 'learning_method' parameter: %r"
% self.learning_method)
@@ -529,12 +531,6 @@ def fit(self, X, y=None):
max_iter = self.max_iter
evaluate_every = self.evaluate_every
learning_method = self.learning_method
- if learning_method is None:
- warnings.warn("The default value for 'learning_method' will be "
- "changed from 'online' to 'batch' in the release "
- "0.20. This warning was introduced in 0.18.",
- DeprecationWarning)
- learning_method = 'online'
batch_size = self.batch_size
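Callers that relied on the implicit 'online' default must now request it
explicitly; otherwise the batch update runs. A minimal sketch with a
hypothetical document-term matrix ``X``:

    from sklearn.decomposition import LatentDirichletAllocation

    lda = LatentDirichletAllocation(n_components=10).fit(X)  # batch default
    lda_online = LatentDirichletAllocation(
        n_components=10, learning_method='online').fit(X)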
From 2a34f54823d468f9071a8702c6b63a661cddb9cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Sat, 26 May 2018 13:10:21 -0400
Subject: [PATCH 28/36] change clone test from deprecation warning to error.
---
sklearn/tests/test_base.py | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index 4620dcbd03604..31c4d80967a1d 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -24,7 +24,6 @@
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import datasets
-from sklearn.utils import deprecated
from sklearn.base import TransformerMixin
from sklearn.utils.mocking import MockDataFrame
@@ -132,6 +131,9 @@ def test_clone_buggy():
varg_est = VargEstimator()
assert_raises(RuntimeError, clone, varg_est)
+ est = ModifyInitParams()
+ assert_raises(RuntimeError, clone, est)
+
def test_clone_empty_array():
# Regression test for cloning estimators with empty arrays
@@ -152,16 +154,6 @@ def test_clone_nan():
assert_true(clf.empty is clf2.empty)
-def test_clone_copy_init_params():
- # test for deprecation warning when copying or casting an init parameter
- est = ModifyInitParams()
- message = ("Estimator ModifyInitParams modifies parameters in __init__. "
- "This behavior is deprecated as of 0.18 and support "
- "for this behavior will be removed in 0.20.")
-
- assert_warns_message(DeprecationWarning, message, clone, est)
-
-
def test_clone_sparse_matrices():
sparse_matrix_classes = [
getattr(sp, name)
From a53c000fc469c1c71c282553cbce70b336a3d1d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Sat, 26 May 2018 13:10:29 -0400
Subject: [PATCH 29/36] sphinx formatting
---
sklearn/model_selection/_search.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index b803289aebf1e..9c7fd7d00b345 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -861,7 +861,7 @@ class GridSearchCV(BaseSearchCV):
will change to False in version 0.21, to correspond to the standard
definition of cross-validation.
- ..versionchanged:: 0.20
+ .. versionchanged:: 0.20
Parameter ``iid`` will change from True to False by default in
version 0.22, and will be removed in 0.24.
@@ -1195,7 +1195,7 @@ class RandomizedSearchCV(BaseSearchCV):
will change to False in version 0.21, to correspond to the standard
definition of cross-validation.
- ..versionchanged:: 0.20
+ .. versionchanged:: 0.20
Parameter ``iid`` will change from True to False by default in
version 0.22, and will be removed in 0.24.
From 22a7cde877b269f74675a0c040171340112109cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Sat, 26 May 2018 13:21:18 -0400
Subject: [PATCH 30/36] remove removed classes and functions from classes.rst
---
doc/modules/classes.rst | 43 -----------------------------------------
1 file changed, 43 deletions(-)
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index f9dc4e4dfced0..fc78214e71f7a 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -1502,46 +1502,3 @@ To be removed in 0.21
datasets.load_mlcomp
linear_model.lasso_stability_path
-
-
-To be removed in 0.20
----------------------
-
-.. autosummary::
- :toctree: generated/
- :template: deprecated_class.rst
-
- cross_validation.KFold
- cross_validation.LabelKFold
- cross_validation.LeaveOneLabelOut
- cross_validation.LeaveOneOut
- cross_validation.LeavePOut
- cross_validation.LeavePLabelOut
- cross_validation.LabelShuffleSplit
- cross_validation.ShuffleSplit
- cross_validation.StratifiedKFold
- cross_validation.StratifiedShuffleSplit
- cross_validation.PredefinedSplit
- decomposition.RandomizedPCA
- gaussian_process.GaussianProcess
- grid_search.ParameterGrid
- grid_search.ParameterSampler
- grid_search.GridSearchCV
- grid_search.RandomizedSearchCV
- mixture.DPGMM
- mixture.GMM
- mixture.VBGMM
-
-
-.. autosummary::
- :toctree: generated/
- :template: deprecated_function.rst
-
- cross_validation.check_cv
- cross_validation.cross_val_predict
- cross_validation.cross_val_score
- cross_validation.permutation_test_score
- cross_validation.train_test_split
- grid_search.fit_grid_point
- learning_curve.learning_curve
- learning_curve.validation_curve
From 5d3fbe9f332a4b54bc943ec8f3aa934ca13b2c9e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Sat, 26 May 2018 13:55:32 -0400
Subject: [PATCH 31/36] fix doctests
---
doc/modules/model_evaluation.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index b5da615fa8851..0fa1037e4d623 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -104,7 +104,7 @@ Usage examples:
>>> model = svm.SVC()
>>> cross_val_score(model, X, y, scoring='wrong_choice')
Traceback (most recent call last):
- ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']
+ ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']
.. note::
From 9ca4f47a9d4300c3d8f3ef62c8c0165b80064561 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Mon, 4 Jun 2018 13:20:28 -0400
Subject: [PATCH 32/36] address jnothman's comments; some minor fixes
---
doc/tutorial/machine_learning_map/index.rst | 2 +-
sklearn/isotonic.py | 2 +-
sklearn/model_selection/tests/test_search.py | 1 -
sklearn/utils/estimator_checks.py | 16 +++++-----------
4 files changed, 7 insertions(+), 14 deletions(-)
diff --git a/doc/tutorial/machine_learning_map/index.rst b/doc/tutorial/machine_learning_map/index.rst
index b198fa766ec99..3690d76b31bd9 100644
--- a/doc/tutorial/machine_learning_map/index.rst
+++ b/doc/tutorial/machine_learning_map/index.rst
@@ -100,7 +100,7 @@ Click on any estimator in the chart below to see its documentation.
-
+
diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py
index 79c4d11b207f8..7b74048e18594 100644
--- a/sklearn/isotonic.py
+++ b/sklearn/isotonic.py
@@ -188,7 +188,7 @@ class IsotonicRegression(BaseEstimator, TransformerMixin, RegressorMixin):
Maximum value of input array `X_` for right bound.
f_ : function
- The stepwise interpolating function that covers the domain `X_`.
+ The stepwise interpolating function that covers the input domain ``X``.
Notes
-----
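For reference, a minimal sketch of the ``f_`` attribute this docstring hunk
documents (data here is illustrative)::

    import numpy as np
    from sklearn.isotonic import IsotonicRegression

    X = np.arange(10, dtype=float)
    y = np.array([1, 2, 1, 3, 4, 4, 6, 5, 8, 9], dtype=float)
    ir = IsotonicRegression().fit(X, y)
    # f_ is the fitted stepwise interpolating function over the input domain of X
    print(ir.f_(np.array([2.5, 7.5])))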
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index a689db24679fc..db74c4296a44b 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -416,7 +416,6 @@ def test_classes__property():
def test_trivial_cv_results_attr():
# Test search over a "grid" with only one point.
- # Non-regression test: grid_scores_ wouldn't be set by GridSearchCV.
clf = MockClassifier()
grid_search = GridSearchCV(clf, {'foo_param': [1]})
grid_search.fit(X, y)
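A minimal sketch of the one-point "grid" this test exercises, using a real
estimator in place of the test's MockClassifier (parameter choice is
illustrative)::

    from sklearn.datasets import load_iris
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    X, y = load_iris(return_X_y=True)
    # A grid with a single candidate still populates cv_results_.
    grid_search = GridSearchCV(SVC(), {'C': [1.0]}, cv=3)
    grid_search.fit(X, y)
    print(grid_search.cv_results_['mean_test_score'])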
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 5e03fa2c96ee5..f63beada36281 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -67,7 +67,7 @@
BOSTON = None
CROSS_DECOMPOSITION = ['PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD']
MULTI_OUTPUT = ['CCA', 'DecisionTreeRegressor', 'ElasticNet',
- 'ExtraTreeRegressor', 'ExtraTreesRegressor', 'GaussianProcess',
+ 'ExtraTreeRegressor', 'ExtraTreesRegressor',
'GaussianProcessRegressor', 'TransformedTargetRegressor',
'KNeighborsRegressor', 'KernelRidge', 'Lars', 'Lasso',
'LassoLars', 'LinearRegression', 'MultiTaskElasticNet',
@@ -104,10 +104,8 @@ def _yield_non_meta_checks(name, estimator):
# Test that all estimators check their input for NaN's and infs
yield check_estimators_nan_inf
- if name not in ['GaussianProcess']:
- # FIXME!
- # in particular GaussianProcess!
- yield check_estimators_overwrite_params
+ yield check_estimators_overwrite_params
+
if hasattr(estimator, 'sparsify'):
yield check_sparsify_coefficients
@@ -185,9 +183,7 @@ def _yield_regressor_checks(name, regressor):
if name != 'CCA':
# check that the regressor handles int input
yield check_regressors_int
- if name != "GaussianProcessRegressor":
- # Test if NotFittedError is raised
- yield check_estimators_unfitted
+ yield check_estimators_unfitted
yield check_non_transformer_estimators_n_iter
@@ -258,9 +254,7 @@ def _yield_all_checks(name, estimator):
yield check
yield check_fit2d_predict1d
yield check_methods_subset_invariance
- if name != 'GaussianProcess': # FIXME
- # XXX GaussianProcess deprecated in 0.20
- yield check_fit2d_1sample
+ yield check_fit2d_1sample
yield check_fit2d_1feature
yield check_fit1d
yield check_get_params_invariance
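For context, a minimal sketch of the generator pattern these hunks edit (the
names below are illustrative, not the actual sklearn checks): each
``_yield_*`` helper yields check callables, optionally gated on the
estimator's class name, and a driver runs everything yielded. Removing a gate,
as above, simply means the check now applies to that estimator too::

    def check_has_fit(name, estimator):
        assert hasattr(estimator, 'fit')

    def check_exposes_params(name, estimator):
        assert hasattr(estimator, 'get_params')

    def _yield_example_checks(name, estimator):
        yield check_has_fit
        if name != 'SomeLegacyEstimator':  # a special-case gate like those removed
            yield check_exposes_params

    def run_checks(name, estimator):
        for check in _yield_example_checks(name, estimator):
            check(name, estimator)

    from sklearn.svm import SVC
    run_checks('SVC', SVC())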
From fab56a3094bd3aebbfbfbdfa0896057e92bc97a8 Mon Sep 17 00:00:00 2001
From: Andreas Müller
Date: Mon, 4 Jun 2018 13:23:08 -0400
Subject: [PATCH 33/36] keep randomized PCA in incremental benchmark.
---
benchmarks/bench_plot_incremental_pca.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/benchmarks/bench_plot_incremental_pca.py b/benchmarks/bench_plot_incremental_pca.py
index 43b6ff9452c78..11af628d3b931 100644
--- a/benchmarks/bench_plot_incremental_pca.py
+++ b/benchmarks/bench_plot_incremental_pca.py
@@ -111,6 +111,7 @@ def variable_batch_size_comparison(data):
all_times = defaultdict(list)
all_errors = defaultdict(list)
pca = PCA(n_components=n_components)
+ rpca = PCA(n_components=n_components, svd_solver='randomized', random_state=1999)
results_dict = {k: benchmark(est, data) for k, est in [('pca', pca)]}
# Create flat baselines to compare the variation over batch size
From 5ca5cfc6f44a8117140b5d7f8903c7382b1a04fa Mon Sep 17 00:00:00 2001
From: Joel Nothman
Date: Sun, 24 Jun 2018 21:07:41 +1000
Subject: [PATCH 34/36] Revert change to estimator_checks about
GaussianProcessRegressor
---
sklearn/utils/estimator_checks.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index c04d671c9485d..b937d7f893c7a 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -136,8 +136,9 @@ def _yield_classifier_checks(name, classifier):
yield check_supervised_y_2d
yield check_supervised_y_no_nan
- # test if NotFittedError is raised
- yield check_estimators_unfitted
+ if name != "GaussianProcessRegressor":
+ # test if NotFittedError is raised
+ yield check_estimators_unfitted
if 'class_weight' in classifier.get_params().keys():
yield check_class_weight_classifiers
From bdf90c47951da8ca2a4532100e428da05252c712 Mon Sep 17 00:00:00 2001
From: Joel Nothman
Date: Sun, 24 Jun 2018 21:21:03 +1000
Subject: [PATCH 35/36] Clean up revert of removing rpca from benchmark
---
benchmarks/bench_plot_incremental_pca.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/benchmarks/bench_plot_incremental_pca.py b/benchmarks/bench_plot_incremental_pca.py
index 11af628d3b931..8579abcae3bed 100644
--- a/benchmarks/bench_plot_incremental_pca.py
+++ b/benchmarks/bench_plot_incremental_pca.py
@@ -111,8 +111,10 @@ def variable_batch_size_comparison(data):
all_times = defaultdict(list)
all_errors = defaultdict(list)
pca = PCA(n_components=n_components)
- rpca = PCA(n_components=n_components, svd_solver='randomized', random_state=1999)
- results_dict = {k: benchmark(est, data) for k, est in [('pca', pca)]}
+ rpca = PCA(n_components=n_components, svd_solver='randomized',
+ random_state=1999)
+ results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
+ ('rpca', rpca)]}
# Create flat baselines to compare the variation over batch size
all_times['pca'].extend([results_dict['pca']['time']] *
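A minimal sketch of the resulting comparison, with the benchmark reduced to a
bare fit timing and random data standing in for the real inputs (the actual
script also tracks reconstruction error and batch sizes)::

    import time

    import numpy as np
    from sklearn.decomposition import PCA

    def benchmark(est, data):
        # Time a single fit; the real benchmark records more statistics.
        tic = time.time()
        est.fit(data)
        return {'time': time.time() - tic}

    data = np.random.RandomState(1999).rand(2000, 100)
    n_components = 10
    pca = PCA(n_components=n_components)
    rpca = PCA(n_components=n_components, svd_solver='randomized',
               random_state=1999)
    results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
                                                           ('rpca', rpca)]}
    print(results_dict)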
From ee5710def764aeb13966763ca06fda8b72fc5982 Mon Sep 17 00:00:00 2001
From: Joel Nothman
Date: Sun, 24 Jun 2018 21:40:12 +1000
Subject: [PATCH 36/36] Fix misplaced GaussianProcessRegressor exemption in
estimator checks
---
sklearn/utils/estimator_checks.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 50c7f9ea4a480..8a1b33f4d92d3 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -138,9 +138,7 @@ def _yield_classifier_checks(name, classifier):
yield check_supervised_y_2d
yield check_supervised_y_no_nan
- if name != "GaussianProcessRegressor":
- # test if NotFittedError is raised
- yield check_estimators_unfitted
+ yield check_estimators_unfitted
if 'class_weight' in classifier.get_params().keys():
yield check_class_weight_classifiers
@@ -186,7 +184,9 @@ def _yield_regressor_checks(name, regressor):
if name != 'CCA':
# check that the regressor handles int input
yield check_regressors_int
- yield check_estimators_unfitted
+ if name != "GaussianProcessRegressor":
+ # test if NotFittedError is raised
+ yield check_estimators_unfitted
yield check_non_transformer_estimators_n_iter
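A minimal sketch of the behavior behind this exemption: an unfitted
GaussianProcessRegressor can already ``predict`` (it returns the GP prior
mean) rather than raising NotFittedError, which is what
``check_estimators_unfitted`` expects of other estimators::

    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor

    X = np.linspace(0, 1, 5).reshape(-1, 1)
    gpr = GaussianProcessRegressor()
    # No NotFittedError: the unfitted model predicts from the GP prior.
    print(gpr.predict(X))  # prior mean, i.e. zeros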