From 9fb7312b9a64360c4df73c14914d0a89f339004c Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Wed, 26 Jun 2019 11:55:37 +0200 Subject: [PATCH 01/11] Use resample to compute the small training set in HistGBT --- .../gradient_boosting.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 5054c58ffc49f..4a594bc04fc9e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -6,7 +6,7 @@ import numpy as np from timeit import default_timer as time from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin -from sklearn.utils import check_X_y, check_random_state, check_array +from sklearn.utils import check_X_y, check_random_state, check_array, resample from sklearn.utils.validation import check_is_fitted from sklearn.utils.multiclass import check_classification_targets from sklearn.metrics import check_scoring @@ -386,15 +386,16 @@ def _get_small_trainset(self, X_binned_train, y_train, seed): with scorers. """ subsample_size = 10000 - rng = check_random_state(seed) - indices = np.arange(X_binned_train.shape[0]) if X_binned_train.shape[0] > subsample_size: - # TODO: not critical but stratify using resample() - indices = rng.choice(indices, subsample_size, replace=False) - X_binned_small_train = X_binned_train[indices] - y_small_train = y_train[indices] - X_binned_small_train = np.ascontiguousarray(X_binned_small_train) - return X_binned_small_train, y_small_train + indices = np.arange(X_binned_train.shape[0]) + indices = resample(indices, n_samples=subsample_size, + replace=False, random_state=seed) + X_binned_small_train = X_binned_train[indices] + y_small_train = y_train[indices] + X_binned_small_train = np.ascontiguousarray(X_binned_small_train) + return X_binned_small_train, y_small_train + else: + return X_binned_train, y_train def _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, X_binned_val, y_val): From e0c7332a8c56163d3164697783dcd87025011890 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Mon, 1 Jul 2019 12:09:09 +0200 Subject: [PATCH 02/11] Stratify the resampling --- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 4a594bc04fc9e..33c128c66ec4c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -5,7 +5,8 @@ import numpy as np from timeit import default_timer as time -from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin +from sklearn.base import (BaseEstimator, RegressorMixin, ClassifierMixin, + is_classifier) from sklearn.utils import check_X_y, check_random_state, check_array, resample from sklearn.utils.validation import check_is_fitted from sklearn.utils.multiclass import check_classification_targets @@ -388,8 +389,10 @@ def _get_small_trainset(self, X_binned_train, y_train, seed): subsample_size = 10000 if X_binned_train.shape[0] > subsample_size: indices = np.arange(X_binned_train.shape[0]) + stratify = y_train if is_classifier(self) else None indices = resample(indices, n_samples=subsample_size, - replace=False, random_state=seed) + replace=False, 
random_state=seed, + stratify=stratify) X_binned_small_train = X_binned_train[indices] y_small_train = y_train[indices] X_binned_small_train = np.ascontiguousarray(X_binned_small_train) From 29f5b506976e071e544b0e7a719c305d1087bbd7 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Mon, 1 Jul 2019 13:57:22 +0200 Subject: [PATCH 03/11] Up-to-date branch --- doc/whats_new/v0.22.rst | 4 + examples/compose/plot_column_transformer.py | 2 +- .../plot_ica_blind_source_separation.py | 2 +- examples/manifold/plot_lle_digits.py | 11 +- examples/manifold/plot_mds.py | 6 +- sklearn/_build_utils/__init__.py | 11 + sklearn/cluster/tests/test_optics.py | 2 +- sklearn/externals/_lobpcg.py | 661 ++++++++++++++++++ .../tests/test_variance_threshold.py | 4 +- sklearn/impute/tests/test_impute.py | 23 +- sklearn/linear_model/tests/test_ridge.py | 5 +- sklearn/manifold/spectral_embedding_.py | 3 +- sklearn/model_selection/_search.py | 15 + .../tests/test_discretization.py | 10 +- sklearn/utils/estimator_checks.py | 10 +- sklearn/utils/fixes.py | 6 + 16 files changed, 743 insertions(+), 32 deletions(-) create mode 100644 sklearn/externals/_lobpcg.py diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 4e6e4b0dd8258..1089284a9f6a9 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -129,6 +129,10 @@ Miscellaneous using a non-fitted estimators are now more uniform. :pr:`13013` by :user:`Agamemnon Krasoulis `. +- |Fix| Port `lobpcg` from SciPy which implement some bug fixes but only + available in 1.3+. + :pr:`14195` by :user:`Guillaume Lemaitre `. + Changes to estimator checks --------------------------- diff --git a/examples/compose/plot_column_transformer.py b/examples/compose/plot_column_transformer.py index 02599a12396d6..181e3e9127b56 100644 --- a/examples/compose/plot_column_transformer.py +++ b/examples/compose/plot_column_transformer.py @@ -116,7 +116,7 @@ def transform(self, posts): )), # Use a SVC classifier on the combined features - ('svc', LinearSVC()), + ('svc', LinearSVC(dual=False)), ], verbose=True) # limit the list of categories to make running this example faster. 
diff --git a/examples/decomposition/plot_ica_blind_source_separation.py b/examples/decomposition/plot_ica_blind_source_separation.py index fb7689064dd06..b405b1770cd34 100644 --- a/examples/decomposition/plot_ica_blind_source_separation.py +++ b/examples/decomposition/plot_ica_blind_source_separation.py @@ -69,5 +69,5 @@ for sig, color in zip(model.T, colors): plt.plot(sig, color=color) -plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.46) +plt.tight_layout() plt.show() diff --git a/examples/manifold/plot_lle_digits.py b/examples/manifold/plot_lle_digits.py index 4a3002a05d0dd..e2b0953e7e747 100644 --- a/examples/manifold/plot_lle_digits.py +++ b/examples/manifold/plot_lle_digits.py @@ -28,14 +28,13 @@ # Gael Varoquaux # License: BSD 3 clause (C) INRIA 2011 -print(__doc__) from time import time - import numpy as np import matplotlib.pyplot as plt from matplotlib import offsetbox from sklearn import (manifold, datasets, decomposition, ensemble, discriminant_analysis, random_projection, neighbors) +print(__doc__) digits = datasets.load_digits(n_class=6) X = digits.data @@ -99,7 +98,7 @@ def plot_embedding(X, title=None): plot_embedding(X_projected, "Random Projection of the digits") -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Projection on to the first 2 principal components print("Computing PCA projection") @@ -116,7 +115,8 @@ def plot_embedding(X, title=None): X2 = X.copy() X2.flat[::X.shape[1] + 1] += 0.01 # Make X invertible t0 = time() -X_lda = discriminant_analysis.LinearDiscriminantAnalysis(n_components=2).fit_transform(X2, y) +X_lda = discriminant_analysis.LinearDiscriminantAnalysis(n_components=2 + ).fit_transform(X2, y) plot_embedding(X_lda, "Linear Discriminant projection of the digits (time %.2fs)" % (time() - t0)) @@ -235,7 +235,8 @@ def plot_embedding(X, title=None): # ---------------------------------------------------------------------- # NCA projection of the digits dataset print("Computing NCA projection") -nca = neighbors.NeighborhoodComponentsAnalysis(n_components=2, random_state=0) +nca = neighbors.NeighborhoodComponentsAnalysis(init='random', + n_components=2, random_state=0) t0 = time() X_nca = nca.fit_transform(X, y) diff --git a/examples/manifold/plot_mds.py b/examples/manifold/plot_mds.py index 6398e2f7a6242..555d9b5e92bc3 100644 --- a/examples/manifold/plot_mds.py +++ b/examples/manifold/plot_mds.py @@ -22,6 +22,7 @@ from sklearn.metrics import euclidean_distances from sklearn.decomposition import PCA +EPSILON = np.finfo(np.float32).eps n_samples = 20 seed = np.random.RandomState(seed=3) X_true = seed.randint(0, 20, 2 * n_samples).astype(np.float) @@ -68,9 +69,8 @@ plt.scatter(npos[:, 0], npos[:, 1], color='darkorange', s=s, lw=0, label='NMDS') plt.legend(scatterpoints=1, loc='best', shadow=False) -similarities = similarities.max() / similarities * 100 -similarities[np.isinf(similarities)] = 0 - +similarities = similarities.max() / (similarities + EPSILON) * 100 +np.fill_diagonal(similarities, 0) # Plot the edges start_idx, end_idx = np.where(pos) # a sequence of (*line0*, *line1*, *line2*), where:: diff --git a/sklearn/_build_utils/__init__.py b/sklearn/_build_utils/__init__.py index 2d872dd378998..5353d3b297965 100644 --- a/sklearn/_build_utils/__init__.py +++ b/sklearn/_build_utils/__init__.py @@ -8,6 +8,7 @@ import os from distutils.version import LooseVersion +import contextlib from numpy.distutils.system_info import get_info @@ -86,7 +87,17 @@ def 
maybe_cythonize_extensions(top_path, config): exc.args += (message,) raise + n_jobs = 1 + with contextlib.suppress(ImportError): + import joblib + if LooseVersion(joblib.__version__) > LooseVersion("0.13.0"): + # earlier joblib versions don't account for CPU affinity + # constraints, and may over-estimate the number of available + # CPU particularly in CI (cf loky#114) + n_jobs = joblib.effective_n_jobs() + config.ext_modules = cythonize( config.ext_modules, + nthreads=n_jobs, compile_time_env={'SKLEARN_OPENMP_SUPPORTED': with_openmp}, compiler_directives={'language_level': 3}) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 1e3d99746c9e9..1028d7174bb4e 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -109,7 +109,7 @@ def test_extract_xi(): clust = OPTICS(min_samples=3, min_cluster_size=3, max_eps=20, cluster_method='xi', - xi=0.1).fit(X) + xi=0.3).fit(X) # this may fail if the predecessor correction is not at work! assert_array_equal(clust.labels_, expected_labels) diff --git a/sklearn/externals/_lobpcg.py b/sklearn/externals/_lobpcg.py new file mode 100644 index 0000000000000..30492c97c182b --- /dev/null +++ b/sklearn/externals/_lobpcg.py @@ -0,0 +1,661 @@ +""" +scikit-learn copy of scipy/sparse/linalg/eigen/lobpcg/lobpcg.py v1.3.0 +to be deleted after scipy 1.3.0 becomes a dependency in scikit-lean +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Locally Optimal Block Preconditioned Conjugate Gradient Method (LOBPCG). + +References +---------- +.. [1] A. V. Knyazev (2001), + Toward the Optimal Preconditioned Eigensolver: Locally Optimal + Block Preconditioned Conjugate Gradient Method. + SIAM Journal on Scientific Computing 23, no. 2, + pp. 517-541. http://dx.doi.org/10.1137/S1064827500366124 + +.. [2] A. V. Knyazev, I. Lashuk, M. E. Argentati, and E. Ovchinnikov (2007), + Block Locally Optimal Preconditioned Eigenvalue Xolvers (BLOPEX) + in hypre and PETSc. https://arxiv.org/abs/0705.2626 + +.. [3] A. V. Knyazev's C and MATLAB implementations: + https://bitbucket.org/joseroman/blopex +""" + +from __future__ import division, print_function, absolute_import +import warnings +import numpy as np +from scipy.linalg import (inv, eigh, cho_factor, cho_solve, cholesky, + LinAlgError) +from scipy.sparse.linalg import aslinearoperator + +__all__ = ['lobpcg'] + + +def bmat(*args, **kwargs): + with warnings.catch_warnings(record=True): + warnings.filterwarnings( + 'ignore', '.*the matrix subclass is not the recommended way.*') + return np.bmat(*args, **kwargs) + + +def _save(ar, fileName): + # Used only when verbosity level > 10. + np.savetxt(fileName, ar) + + +def _report_nonhermitian(M, a, b, name): + """ + Report if `M` is not a hermitian matrix given the tolerances `a`, `b`. + """ + from scipy.linalg import norm + + md = M - M.T.conj() + + nmd = norm(md, 1) + tol = np.spacing(max(10**a, (10**b)*norm(M, 1))) + if nmd > tol: + print('matrix %s is not sufficiently Hermitian for a=%d, b=%d:' + % (name, a, b)) + print('condition: %.e < %e' % (nmd, tol)) + + +def _as2d(ar): + """ + If the input array is 2D return it, if it is 1D, append a dimension, + making it a column vector. + """ + if ar.ndim == 2: + return ar + else: # Assume 1! 
+ aux = np.array(ar, copy=False) + aux.shape = (ar.shape[0], 1) + return aux + + +def _makeOperator(operatorInput, expectedShape): + """Takes a dense numpy array or a sparse matrix or + a function and makes an operator performing matrix * blockvector + products.""" + if operatorInput is None: + return None + else: + operator = aslinearoperator(operatorInput) + + if operator.shape != expectedShape: + raise ValueError('operator has invalid shape') + + return operator + + +def _applyConstraints(blockVectorV, factYBY, blockVectorBY, blockVectorY): + """Changes blockVectorV in place.""" + gramYBV = np.dot(blockVectorBY.T.conj(), blockVectorV) + tmp = cho_solve(factYBY, gramYBV) + blockVectorV -= np.dot(blockVectorY, tmp) + + +def _b_orthonormalize(B, blockVectorV, blockVectorBV=None, retInvR=False): + if blockVectorBV is None: + if B is not None: + blockVectorBV = B(blockVectorV) + else: + blockVectorBV = blockVectorV # Shared data!!! + gramVBV = np.dot(blockVectorV.T.conj(), blockVectorBV) + gramVBV = cholesky(gramVBV) + gramVBV = inv(gramVBV, overwrite_a=True) + # gramVBV is now R^{-1}. + blockVectorV = np.dot(blockVectorV, gramVBV) + if B is not None: + blockVectorBV = np.dot(blockVectorBV, gramVBV) + else: + blockVectorBV = None + + if retInvR: + return blockVectorV, blockVectorBV, gramVBV + else: + return blockVectorV, blockVectorBV + + +def _get_indx(_lambda, num, largest): + """Get `num` indices into `_lambda` depending on `largest` option.""" + ii = np.argsort(_lambda) + if largest: + ii = ii[:-num-1:-1] + else: + ii = ii[:num] + + return ii + + +def lobpcg(A, X, + B=None, M=None, Y=None, + tol=None, maxiter=20, + largest=True, verbosityLevel=0, + retLambdaHistory=False, retResidualNormsHistory=False): + """Locally Optimal Block Preconditioned Conjugate Gradient Method (LOBPCG) + + LOBPCG is a preconditioned eigensolver for large symmetric positive + definite (SPD) generalized eigenproblems. + + Parameters + ---------- + A : {sparse matrix, dense matrix, LinearOperator} + The symmetric linear operator of the problem, usually a + sparse matrix. Often called the "stiffness matrix". + X : array_like + Initial approximation to the k eigenvectors. If A has + shape=(n,n) then X should have shape shape=(n,k). + B : {dense matrix, sparse matrix, LinearOperator}, optional + the right hand side operator in a generalized eigenproblem. + by default, B = Identity + often called the "mass matrix" + M : {dense matrix, sparse matrix, LinearOperator}, optional + preconditioner to A; by default M = Identity + M should approximate the inverse of A + Y : array_like, optional + n-by-sizeY matrix of constraints, sizeY < n + The iterations will be performed in the B-orthogonal complement + of the column-space of Y. Y must be full rank. + tol : scalar, optional + Solver tolerance (stopping criterion) + by default: tol=n*sqrt(eps) + maxiter : integer, optional + maximum number of iterations + by default: maxiter=min(n,20) + largest : bool, optional + when True, solve for the largest eigenvalues, otherwise the smallest + verbosityLevel : integer, optional + controls solver output. default: verbosityLevel = 0. + retLambdaHistory : boolean, optional + whether to return eigenvalue history + retResidualNormsHistory : boolean, optional + whether to return history of residual norms + + Returns + ------- + w : array + Array of k eigenvalues + v : array + An array of k eigenvectors. V has the same shape as X. + lambdas : list of arrays, optional + The eigenvalue history, if `retLambdaHistory` is True. 
+ rnorms : list of arrays, optional + The history of residual norms, if `retResidualNormsHistory` is True. + + Examples + -------- + + Solve A x = lambda B x with constraints and preconditioning. + + >>> from scipy.sparse import spdiags, issparse + >>> from scipy.sparse.linalg import lobpcg, LinearOperator + >>> n = 100 + >>> vals = [np.arange(n, dtype=np.float64) + 1] + >>> A = spdiags(vals, 0, n, n) + >>> A.toarray() + array([[ 1., 0., 0., ..., 0., 0., 0.], + [ 0., 2., 0., ..., 0., 0., 0.], + [ 0., 0., 3., ..., 0., 0., 0.], + ..., + [ 0., 0., 0., ..., 98., 0., 0.], + [ 0., 0., 0., ..., 0., 99., 0.], + [ 0., 0., 0., ..., 0., 0., 100.]]) + + Constraints. + + >>> Y = np.eye(n, 3) + + Initial guess for eigenvectors, should have linearly independent + columns. Column dimension = number of requested eigenvalues. + + >>> X = np.random.rand(n, 3) + + Preconditioner -- inverse of A (as an abstract linear operator). + + >>> invA = spdiags([1./vals[0]], 0, n, n) + >>> def precond( x ): + ... return invA * x + >>> M = LinearOperator(matvec=precond, shape=(n, n), dtype=float) + + Here, ``invA`` could of course have been used directly as a preconditioner. + Let us then solve the problem: + + >>> eigs, vecs = lobpcg(A, X, Y=Y, M=M, largest=False) + >>> eigs + array([4., 5., 6.]) + + Note that the vectors passed in Y are the eigenvectors of the 3 smallest + eigenvalues. The results returned are orthogonal to those. + + Notes + ----- + If both retLambdaHistory and retResidualNormsHistory are True, + the return tuple has the following format + (lambda, V, lambda history, residual norms history). + + In the following ``n`` denotes the matrix size and ``m`` the number + of required eigenvalues (smallest or largest). + + The LOBPCG code internally solves eigenproblems of the size 3``m`` on every + iteration by calling the "standard" dense eigensolver, so if ``m`` is not + small enough compared to ``n``, it does not make sense to call the LOBPCG + code, but rather one should use the "standard" eigensolver, + e.g. numpy or scipy function in this case. + If one calls the LOBPCG algorithm for 5``m``>``n``, + it will most likely break internally, so the code tries to call + the standard function instead. + + It is not that n should be large for the LOBPCG to work, but rather the + ratio ``n``/``m`` should be large. It you call LOBPCG with ``m``=1 + and ``n``=10, it works though ``n`` is small. The method is intended + for extremely large ``n``/``m``, see e.g., reference [28] in + https://arxiv.org/abs/0705.2626 + + The convergence speed depends basically on two factors: + + 1. How well relatively separated the seeking eigenvalues are from the rest + of the eigenvalues. One can try to vary ``m`` to make this better. + + 2. How well conditioned the problem is. This can be changed by using proper + preconditioning. For example, a rod vibration test problem (under tests + directory) is ill-conditioned for large ``n``, so convergence will be + slow, unless efficient preconditioning is used. For this specific + problem, a good simple preconditioner function would be a linear solve + for A, which is easy to code since A is tridiagonal. + + *Acknowledgements* + + lobpcg.py code was written by Robert Cimrman. + Many thanks belong to Andrew Knyazev, the author of the algorithm, + for lots of advice and support. + + References + ---------- + .. [1] A. V. Knyazev (2001), + Toward the Optimal Preconditioned Eigensolver: Locally Optimal + Block Preconditioned Conjugate Gradient Method. 
+ SIAM Journal on Scientific Computing 23, no. 2, + pp. 517-541. http://dx.doi.org/10.1137/S1064827500366124 + + .. [2] A. V. Knyazev, I. Lashuk, M. E. Argentati, and E. Ovchinnikov + (2007), Block Locally Optimal Preconditioned Eigenvalue Xolvers + (BLOPEX) in hypre and PETSc. https://arxiv.org/abs/0705.2626 + + .. [3] A. V. Knyazev's C and MATLAB implementations: + https://bitbucket.org/joseroman/blopex + """ + blockVectorX = X + blockVectorY = Y + residualTolerance = tol + maxIterations = maxiter + + if blockVectorY is not None: + sizeY = blockVectorY.shape[1] + else: + sizeY = 0 + + # Block size. + if len(blockVectorX.shape) != 2: + raise ValueError('expected rank-2 array for argument X') + + n, sizeX = blockVectorX.shape + + if verbosityLevel: + aux = "Solving " + if B is None: + aux += "standard" + else: + aux += "generalized" + aux += " eigenvalue problem with" + if M is None: + aux += "out" + aux += " preconditioning\n\n" + aux += "matrix size %d\n" % n + aux += "block size %d\n\n" % sizeX + if blockVectorY is None: + aux += "No constraints\n\n" + else: + if sizeY > 1: + aux += "%d constraints\n\n" % sizeY + else: + aux += "%d constraint\n\n" % sizeY + print(aux) + + A = _makeOperator(A, (n, n)) + B = _makeOperator(B, (n, n)) + M = _makeOperator(M, (n, n)) + + if (n - sizeY) < (5 * sizeX): + # warn('The problem size is small compared to the block size.' \ + # ' Using dense eigensolver instead of LOBPCG.') + + sizeX = min(sizeX, n) + + if blockVectorY is not None: + raise NotImplementedError('The dense eigensolver ' + 'does not support constraints.') + + # Define the closed range of indices of eigenvalues to return. + if largest: + eigvals = (n - sizeX, n-1) + else: + eigvals = (0, sizeX-1) + + A_dense = A(np.eye(n, dtype=A.dtype)) + B_dense = None if B is None else B(np.eye(n, dtype=B.dtype)) + + vals, vecs = eigh(A_dense, B_dense, eigvals=eigvals, + check_finite=False) + if largest: + # Reverse order to be compatible with eigs() in 'LM' mode. + vals = vals[::-1] + vecs = vecs[:, ::-1] + + return vals, vecs + + if (residualTolerance is None) or (residualTolerance <= 0.0): + residualTolerance = np.sqrt(1e-15) * n + + # Apply constraints to X. + if blockVectorY is not None: + + if B is not None: + blockVectorBY = B(blockVectorY) + else: + blockVectorBY = blockVectorY + + # gramYBY is a dense array. + gramYBY = np.dot(blockVectorY.T.conj(), blockVectorBY) + try: + # gramYBY is a Cholesky factor from now on... + gramYBY = cho_factor(gramYBY) + except LinAlgError: + raise ValueError('cannot handle linearly dependent constraints') + + _applyConstraints(blockVectorX, gramYBY, blockVectorBY, blockVectorY) + + ## + # B-orthonormalize X. + blockVectorX, blockVectorBX = _b_orthonormalize(B, blockVectorX) + + ## + # Compute the initial Ritz vectors: solve the eigenproblem. + blockVectorAX = A(blockVectorX) + gramXAX = np.dot(blockVectorX.T.conj(), blockVectorAX) + + _lambda, eigBlockVector = eigh(gramXAX, check_finite=False) + ii = _get_indx(_lambda, sizeX, largest) + _lambda = _lambda[ii] + + eigBlockVector = np.asarray(eigBlockVector[:, ii]) + blockVectorX = np.dot(blockVectorX, eigBlockVector) + blockVectorAX = np.dot(blockVectorAX, eigBlockVector) + if B is not None: + blockVectorBX = np.dot(blockVectorBX, eigBlockVector) + + ## + # Active index set. + activeMask = np.ones((sizeX,), dtype=bool) + + lambdaHistory = [_lambda] + residualNormsHistory = [] + + previousBlockSize = sizeX + ident = np.eye(sizeX, dtype=A.dtype) + ident0 = np.eye(sizeX, dtype=A.dtype) + + ## + # Main iteration loop. 
+ + blockVectorP = None # set during iteration + blockVectorAP = None + blockVectorBP = None + + iterationNumber = -1 + while iterationNumber < maxIterations: + iterationNumber += 1 + if verbosityLevel > 0: + print('iteration %d' % iterationNumber) + + if B is not None: + aux = blockVectorBX * _lambda[np.newaxis, :] + + else: + aux = blockVectorX * _lambda[np.newaxis, :] + + blockVectorR = blockVectorAX - aux + + aux = np.sum(blockVectorR.conjugate() * blockVectorR, 0) + residualNorms = np.sqrt(aux) + + residualNormsHistory.append(residualNorms) + + ii = np.where(residualNorms > residualTolerance, True, False) + activeMask = activeMask & ii + if verbosityLevel > 2: + print(activeMask) + + currentBlockSize = activeMask.sum() + if currentBlockSize != previousBlockSize: + previousBlockSize = currentBlockSize + ident = np.eye(currentBlockSize, dtype=A.dtype) + + if currentBlockSize == 0: + break + + if verbosityLevel > 0: + print('current block size:', currentBlockSize) + print('eigenvalue:', _lambda) + print('residual norms:', residualNorms) + if verbosityLevel > 10: + print(eigBlockVector) + + activeBlockVectorR = _as2d(blockVectorR[:, activeMask]) + + if iterationNumber > 0: + activeBlockVectorP = _as2d(blockVectorP[:, activeMask]) + activeBlockVectorAP = _as2d(blockVectorAP[:, activeMask]) + if B is not None: + activeBlockVectorBP = _as2d(blockVectorBP[:, activeMask]) + + if M is not None: + # Apply preconditioner T to the active residuals. + activeBlockVectorR = M(activeBlockVectorR) + + ## + # Apply constraints to the preconditioned residuals. + if blockVectorY is not None: + _applyConstraints(activeBlockVectorR, + gramYBY, blockVectorBY, blockVectorY) + + ## + # B-orthonormalize the preconditioned residuals. + + aux = _b_orthonormalize(B, activeBlockVectorR) + activeBlockVectorR, activeBlockVectorBR = aux + + activeBlockVectorAR = A(activeBlockVectorR) + + if iterationNumber > 0: + if B is not None: + aux = _b_orthonormalize(B, activeBlockVectorP, + activeBlockVectorBP, retInvR=True) + activeBlockVectorP, activeBlockVectorBP, invR = aux + activeBlockVectorAP = np.dot(activeBlockVectorAP, invR) + + else: + aux = _b_orthonormalize(B, activeBlockVectorP, retInvR=True) + activeBlockVectorP, _, invR = aux + activeBlockVectorAP = np.dot(activeBlockVectorAP, invR) + + ## + # Perform the Rayleigh Ritz Procedure: + # Compute symmetric Gram matrices: + + if B is not None: + xaw = np.dot(blockVectorX.T.conj(), activeBlockVectorAR) + waw = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorAR) + xbw = np.dot(blockVectorX.T.conj(), activeBlockVectorBR) + + if iterationNumber > 0: + xap = np.dot(blockVectorX.T.conj(), activeBlockVectorAP) + wap = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorAP) + pap = np.dot(activeBlockVectorP.T.conj(), activeBlockVectorAP) + xbp = np.dot(blockVectorX.T.conj(), activeBlockVectorBP) + wbp = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorBP) + + gramA = bmat([[np.diag(_lambda), xaw, xap], + [xaw.T.conj(), waw, wap], + [xap.T.conj(), wap.T.conj(), pap]]) + + gramB = bmat([[ident0, xbw, xbp], + [xbw.T.conj(), ident, wbp], + [xbp.T.conj(), wbp.T.conj(), ident]]) + else: + gramA = bmat([[np.diag(_lambda), xaw], + [xaw.T.conj(), waw]]) + gramB = bmat([[ident0, xbw], + [xbw.T.conj(), ident]]) + + else: + xaw = np.dot(blockVectorX.T.conj(), activeBlockVectorAR) + waw = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorAR) + xbw = np.dot(blockVectorX.T.conj(), activeBlockVectorR) + + if iterationNumber > 0: + xap = np.dot(blockVectorX.T.conj(), 
activeBlockVectorAP) + wap = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorAP) + pap = np.dot(activeBlockVectorP.T.conj(), activeBlockVectorAP) + xbp = np.dot(blockVectorX.T.conj(), activeBlockVectorP) + wbp = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorP) + + gramA = bmat([[np.diag(_lambda), xaw, xap], + [xaw.T.conj(), waw, wap], + [xap.T.conj(), wap.T.conj(), pap]]) + + gramB = bmat([[ident0, xbw, xbp], + [xbw.T.conj(), ident, wbp], + [xbp.T.conj(), wbp.T.conj(), ident]]) + else: + gramA = bmat([[np.diag(_lambda), xaw], + [xaw.T.conj(), waw]]) + gramB = bmat([[ident0, xbw], + [xbw.T.conj(), ident]]) + + if verbosityLevel > 0: + _report_nonhermitian(gramA, 3, -1, 'gramA') + _report_nonhermitian(gramB, 3, -1, 'gramB') + + if verbosityLevel > 10: + _save(gramA, 'gramA') + _save(gramB, 'gramB') + + # Solve the generalized eigenvalue problem. + _lambda, eigBlockVector = eigh(gramA, gramB, check_finite=False) + ii = _get_indx(_lambda, sizeX, largest) + + if verbosityLevel > 10: + print(ii) + print(_lambda) + + _lambda = _lambda[ii] + eigBlockVector = eigBlockVector[:, ii] + + lambdaHistory.append(_lambda) + + if verbosityLevel > 10: + print('lambda:', _lambda) +# # Normalize eigenvectors! +# aux = np.sum( eigBlockVector.conjugate() * eigBlockVector, 0 ) +# eigVecNorms = np.sqrt( aux ) +# eigBlockVector = eigBlockVector / eigVecNorms[np.newaxis, :] +# eigBlockVector, aux = _b_orthonormalize( B, eigBlockVector ) + + if verbosityLevel > 10: + print(eigBlockVector) + + # Compute Ritz vectors. + if B is not None: + if iterationNumber > 0: + eigBlockVectorX = eigBlockVector[:sizeX] + eigBlockVectorR = eigBlockVector[sizeX:sizeX+currentBlockSize] + eigBlockVectorP = eigBlockVector[sizeX+currentBlockSize:] + + pp = np.dot(activeBlockVectorR, eigBlockVectorR) + pp += np.dot(activeBlockVectorP, eigBlockVectorP) + + app = np.dot(activeBlockVectorAR, eigBlockVectorR) + app += np.dot(activeBlockVectorAP, eigBlockVectorP) + + bpp = np.dot(activeBlockVectorBR, eigBlockVectorR) + bpp += np.dot(activeBlockVectorBP, eigBlockVectorP) + else: + eigBlockVectorX = eigBlockVector[:sizeX] + eigBlockVectorR = eigBlockVector[sizeX:] + + pp = np.dot(activeBlockVectorR, eigBlockVectorR) + app = np.dot(activeBlockVectorAR, eigBlockVectorR) + bpp = np.dot(activeBlockVectorBR, eigBlockVectorR) + + if verbosityLevel > 10: + print(pp) + print(app) + print(bpp) + + blockVectorX = np.dot(blockVectorX, eigBlockVectorX) + pp + blockVectorAX = np.dot(blockVectorAX, eigBlockVectorX) + app + blockVectorBX = np.dot(blockVectorBX, eigBlockVectorX) + bpp + + blockVectorP, blockVectorAP, blockVectorBP = pp, app, bpp + + else: + if iterationNumber > 0: + eigBlockVectorX = eigBlockVector[:sizeX] + eigBlockVectorR = eigBlockVector[sizeX:sizeX+currentBlockSize] + eigBlockVectorP = eigBlockVector[sizeX+currentBlockSize:] + + pp = np.dot(activeBlockVectorR, eigBlockVectorR) + pp += np.dot(activeBlockVectorP, eigBlockVectorP) + + app = np.dot(activeBlockVectorAR, eigBlockVectorR) + app += np.dot(activeBlockVectorAP, eigBlockVectorP) + else: + eigBlockVectorX = eigBlockVector[:sizeX] + eigBlockVectorR = eigBlockVector[sizeX:] + + pp = np.dot(activeBlockVectorR, eigBlockVectorR) + app = np.dot(activeBlockVectorAR, eigBlockVectorR) + + if verbosityLevel > 10: + print(pp) + print(app) + + blockVectorX = np.dot(blockVectorX, eigBlockVectorX) + pp + blockVectorAX = np.dot(blockVectorAX, eigBlockVectorX) + app + + blockVectorP, blockVectorAP = pp, app + + if B is not None: + aux = blockVectorBX * _lambda[np.newaxis, :] + + else: + aux 
= blockVectorX * _lambda[np.newaxis, :] + + blockVectorR = blockVectorAX - aux + + aux = np.sum(blockVectorR.conjugate() * blockVectorR, 0) + residualNorms = np.sqrt(aux) + + if verbosityLevel > 0: + print('final eigenvalue:', _lambda) + print('final residual norms:', residualNorms) + + if retLambdaHistory: + if retResidualNormsHistory: + return _lambda, blockVectorX, lambdaHistory, residualNormsHistory + else: + return _lambda, blockVectorX, lambdaHistory + else: + if retResidualNormsHistory: + return _lambda, blockVectorX, residualNormsHistory + else: + return _lambda, blockVectorX diff --git a/sklearn/feature_selection/tests/test_variance_threshold.py b/sklearn/feature_selection/tests/test_variance_threshold.py index fba4478a28e2f..53a90ace37a40 100644 --- a/sklearn/feature_selection/tests/test_variance_threshold.py +++ b/sklearn/feature_selection/tests/test_variance_threshold.py @@ -38,7 +38,9 @@ def test_zero_variance_floating_point_error(): # See #13691 data = [[-0.13725701]] * 10 - assert np.var(data) != 0 + if np.var(data) == 0: + pytest.skip('This test is not valid for this platform, as it relies ' + 'on numerical instabilities.') for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]: msg = "No feature in X meets the variance threshold 0.00000" with pytest.raises(ValueError, match=msg): diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index 9562eb54adb3d..ebabc5c311f6b 100644 --- a/sklearn/impute/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -65,21 +65,22 @@ def _check_statistics(X, X_true, assert_ae(X_trans, X_true, err_msg=err_msg.format(True)) -def test_imputation_shape(): +@pytest.mark.parametrize("strategy", + ['mean', 'median', 'most_frequent', "constant"]) +def test_imputation_shape(strategy): # Verify the shapes of the imputed matrix for different strategies. 
X = np.random.randn(10, 2) X[::2] = np.nan - for strategy in ['mean', 'median', 'most_frequent', "constant"]: - imputer = SimpleImputer(strategy=strategy) - X_imputed = imputer.fit_transform(sparse.csr_matrix(X)) - assert X_imputed.shape == (10, 2) - X_imputed = imputer.fit_transform(X) - assert X_imputed.shape == (10, 2) - - iterative_imputer = IterativeImputer(initial_strategy=strategy) - X_imputed = iterative_imputer.fit_transform(X) - assert X_imputed.shape == (10, 2) + imputer = SimpleImputer(strategy=strategy) + X_imputed = imputer.fit_transform(sparse.csr_matrix(X)) + assert X_imputed.shape == (10, 2) + X_imputed = imputer.fit_transform(X) + assert X_imputed.shape == (10, 2) + + iterative_imputer = IterativeImputer(initial_strategy=strategy) + X_imputed = iterative_imputer.fit_transform(X) + assert X_imputed.shape == (10, 2) @pytest.mark.parametrize("strategy", ["const", 101, None]) diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index ddaf0f5e63d0f..cfc487c6ffe66 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -1099,13 +1099,14 @@ def test_dtype_match(solver): X_32 = X_64.astype(np.float32) y_32 = y_64.astype(np.float32) + tol = 2 * np.finfo(np.float32).resolution # Check type consistency 32bits - ridge_32 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=1e-10,) + ridge_32 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=tol) ridge_32.fit(X_32, y_32) coef_32 = ridge_32.coef_ # Check type consistency 64 bits - ridge_64 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=1e-10,) + ridge_64 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=tol) ridge_64.fit(X_64, y_64) coef_64 = ridge_64.coef_ diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index a6d5af54f9bc4..42227db8a72ad 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -10,13 +10,14 @@ import numpy as np from scipy import sparse from scipy.linalg import eigh -from scipy.sparse.linalg import eigsh, lobpcg +from scipy.sparse.linalg import eigsh from scipy.sparse.csgraph import connected_components from scipy.sparse.csgraph import laplacian as csgraph_laplacian from ..base import BaseEstimator from ..utils import check_random_state, check_array, check_symmetric from ..utils.extmath import _deterministic_vector_sign_flip +from ..utils.fixes import lobpcg from ..metrics.pairwise import rbf_kernel from ..neighbors import kneighbors_graph diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 7d2c60d34b02d..5b402c17ee86f 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1411,6 +1411,21 @@ class RandomizedSearchCV(BaseSearchCV): A generator over parameter settings, constructed from param_distributions. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import RandomizedSearchCV + >>> from scipy.stats import uniform + >>> iris = load_iris() + >>> logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200) + >>> distributions = dict(C=uniform(loc=0, scale=4), + ... 
penalty=['l2', 'l1']) + >>> clf = RandomizedSearchCV(logistic, distributions, random_state=0) + >>> search = clf.fit(iris.data, iris.target) + >>> search.best_params_ + {'C': 2..., 'penalty': 'l1'} """ _required_parameters = ["estimator", "param_distributions"] diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 102b789eb093d..6dd0abdb99e9f 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -135,15 +135,15 @@ def test_transform_1d_behavior(): assert_raises(ValueError, est.transform, X) -def test_numeric_stability(): +@pytest.mark.parametrize('i', range(1, 9)) +def test_numeric_stability(i): X_init = np.array([2., 4., 6., 8., 10.]).reshape(-1, 1) Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1) # Test up to discretizing nano units - for i in range(1, 9): - X = X_init / 10**i - Xt = KBinsDiscretizer(n_bins=2, encode='ordinal').fit_transform(X) - assert_array_equal(Xt_expected, Xt) + X = X_init / 10**i + Xt = KBinsDiscretizer(n_bins=2, encode='ordinal').fit_transform(X) + assert_array_equal(Xt_expected, Xt) def test_invalid_encode_option(): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 159b89846cb39..0bec5c3911681 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2506,4 +2506,12 @@ def check_fit_idempotent(name, estimator_orig): for method in check_methods: if hasattr(estimator, method): new_result = getattr(estimator, method)(X_test) - assert_allclose_dense_sparse(result[method], new_result) + if np.issubdtype(new_result.dtype, np.floating): + tol = 2*np.finfo(new_result.dtype).eps + else: + tol = 2*np.finfo(np.float64).eps + assert_allclose_dense_sparse( + result[method], new_result, + atol=max(tol, 1e-9), rtol=max(tol, 1e-7), + err_msg="Idempotency check failed for method {}".format(method) + ) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 712b6826a2b75..2481eb39c9d0c 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -38,6 +38,12 @@ def _parse_version(version_string): except ImportError: from scipy.misc import comb, logsumexp # noqa +if sp_version >= (1, 3): + from scipy.sparse.linalg import lobpcg +else: + # Backport of lobpcg functionality from scipy 1.3.0, can be removed + # once support for sp_version < (1, 3) is dropped + from ..externals._lobpcg import lobpcg # noqa if sp_version >= (0, 19): def _argmax(arr_or_spmatrix, axis=None): From 828b662266a653eddc8c1be18420c48ba732ea49 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Mon, 1 Jul 2019 14:02:10 +0200 Subject: [PATCH 04/11] Add resample in imports --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 011907df5ec39..3650867b752cc 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -7,7 +7,7 @@ from timeit import default_timer as time from ...base import (BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier) -from ...utils import check_X_y, check_random_state, check_array +from ...utils import check_X_y, check_random_state, check_array, resample from ...utils.validation import check_is_fitted from ...utils.multiclass import check_classification_targets 
from ...metrics import check_scoring From d5fd250cc743c0b73e7b520da9017f10b62d3787 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Mon, 1 Jul 2019 16:14:53 +0200 Subject: [PATCH 05/11] Add tests for small trainset computation --- .../tests/test_gradient_boosting.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 86b3eeb239c3d..155c20edb5099 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -190,3 +190,29 @@ def test_zero_division_hessians(data): X, y = data gb = HistGradientBoostingClassifier(learning_rate=100, max_iter=10) gb.fit(X, y) + + +@pytest.mark.parametrize('GradientBoosting, data', [ + (HistGradientBoostingClassifier, + make_classification(n_samples=10001, random_state=0, n_features=2, + n_informative=2, n_redundant=0)), + (HistGradientBoostingRegressor, + make_regression(n_samples=10001, random_state=0, n_features=2, + n_informative=2))] +) +def test_small_trainset(GradientBoosting, data): + # Make sure that a small trainset has the expected length (10k samples) + X, y = data + gb = GradientBoosting(random_state=42) + X_small, y_small = gb._get_small_trainset(X, y, seed=42) + assert X_small.shape[0] == 10000 + assert y_small.shape[0] == 10000 + + +def test_stratification_small_trainset(): + # Make sure that the small trainset is stratified + X, y = make_classification(n_samples=20000, n_features=2, + n_informative=2, n_redundant=0) + gb = HistGradientBoostingClassifier(random_state=42) + X_small_train, y_small_train = gb._get_small_trainset(X, y, seed=42) + np.testing.assert_almost_equal(y.mean(), y_small_train.mean(), decimal=3) From 058383d6676fe7567d86b67b8853d8f1c7ce0020 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Mon, 1 Jul 2019 16:57:09 +0200 Subject: [PATCH 06/11] Make an imbalanced dataset with a deterministic balance. 
--- .../tests/test_gradient_boosting.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 155c20edb5099..6a9e95e91efae 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -211,8 +211,14 @@ def test_small_trainset(GradientBoosting, data): def test_stratification_small_trainset(): # Make sure that the small trainset is stratified - X, y = make_classification(n_samples=20000, n_features=2, - n_informative=2, n_redundant=0) + n_samples = 20000 + class_one_prop = 0.1 + rng = np.random.RandomState(42) + X = rng.randn(n_samples).reshape(n_samples, 1) + y = np.asarray( + [0] * int(n_samples * (1 - class_one_prop)) + + [1] * int(n_samples * class_one_prop) + ) gb = HistGradientBoostingClassifier(random_state=42) X_small_train, y_small_train = gb._get_small_trainset(X, y, seed=42) - np.testing.assert_almost_equal(y.mean(), y_small_train.mean(), decimal=3) + np.testing.assert_equal(y_small_train.mean(), class_one_prop) From 4a02b0127391ebe3ad00053237e4949caff09387 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Mon, 1 Jul 2019 17:13:48 +0200 Subject: [PATCH 07/11] Use assert instead of assert_equal --- .../_hist_gradient_boosting/tests/test_gradient_boosting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 6a9e95e91efae..ea5457e9f9f6e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -221,4 +221,4 @@ def test_stratification_small_trainset(): ) gb = HistGradientBoostingClassifier(random_state=42) X_small_train, y_small_train = gb._get_small_trainset(X, y, seed=42) - np.testing.assert_equal(y_small_train.mean(), class_one_prop) + assert y_small_train.mean() == class_one_prop From 6ff6fa8de70e20e459b0bf16b4ff1adcbf9cf244 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Mon, 1 Jul 2019 17:26:27 +0200 Subject: [PATCH 08/11] Merge both tests --- .../tests/test_gradient_boosting.py | 35 +++++-------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index ea5457e9f9f6e..2b33095b13a64 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -192,33 +192,16 @@ def test_zero_division_hessians(data): gb.fit(X, y) -@pytest.mark.parametrize('GradientBoosting, data', [ - (HistGradientBoostingClassifier, - make_classification(n_samples=10001, random_state=0, n_features=2, - n_informative=2, n_redundant=0)), - (HistGradientBoostingRegressor, - make_regression(n_samples=10001, random_state=0, n_features=2, - n_informative=2))] -) -def test_small_trainset(GradientBoosting, data): - # Make sure that a small trainset has the expected length (10k samples) - X, y = data - gb = GradientBoosting(random_state=42) - X_small, y_small = gb._get_small_trainset(X, y, seed=42) - assert X_small.shape[0] == 10000 - assert y_small.shape[0] == 10000 - - -def 
test_stratification_small_trainset(): - # Make sure that the small trainset is stratified +def test_small_trainset(): + # Make sure that the small trainset is stratified and has the expected + # length (10k samples) n_samples = 20000 class_one_prop = 0.1 rng = np.random.RandomState(42) X = rng.randn(n_samples).reshape(n_samples, 1) - y = np.asarray( - [0] * int(n_samples * (1 - class_one_prop)) - + [1] * int(n_samples * class_one_prop) - ) - gb = HistGradientBoostingClassifier(random_state=42) - X_small_train, y_small_train = gb._get_small_trainset(X, y, seed=42) - assert y_small_train.mean() == class_one_prop + y = rng.binomial(1, p=0.1, size=n_samples) + gb = HistGradientBoostingClassifier() + X_small, y_small = gb._get_small_trainset(X, y, seed=42) + assert X_small.shape[0] == 10000 + assert y_small.shape[0] == 10000 + assert y_small.mean() == pytest.approx(class_one_prop, rel=0, abs=1e-2) From 98755bed1129a87fbdc6a4f772810ac2fdd1a94c Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 2 Jul 2019 08:49:48 +0200 Subject: [PATCH 09/11] Make the class distribution deterministic and add more classes --- .../tests/test_gradient_boosting.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 2b33095b13a64..93ca437f7fdeb 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -7,6 +7,7 @@ from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.utils import shuffle X_classification, y_classification = make_classification(random_state=0) @@ -196,12 +197,26 @@ def test_small_trainset(): # Make sure that the small trainset is stratified and has the expected # length (10k samples) n_samples = 20000 - class_one_prop = 0.1 + original_prop = {0: 0.1, 1: 0.2, 2: 0.3, 3: 0.4} rng = np.random.RandomState(42) X = rng.randn(n_samples).reshape(n_samples, 1) - y = rng.binomial(1, p=0.1, size=n_samples) + y = [[class_] * int(prop * n_samples) for (class_, prop) + in original_prop.items()] + y = shuffle(np.concatenate(y)) gb = HistGradientBoostingClassifier() + + # Compute the small training set X_small, y_small = gb._get_small_trainset(X, y, seed=42) + + # Compute the class distribution in the small training set + unique, counts = np.unique(y_small, return_counts=True) + small_prop = {class_: count / 10000 for (class_, count) + in zip(unique, counts)} + + # Test that the small training set has the expected length assert X_small.shape[0] == 10000 assert y_small.shape[0] == 10000 - assert y_small.mean() == pytest.approx(class_one_prop, rel=0, abs=1e-2) + + # Test that the class distributions in the whole dataset and in the small + # training set are identical + assert small_prop == original_prop From dedfa4b64ea6ba656d5a3759aa1f3630fa210567 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 2 Jul 2019 17:00:10 +0200 Subject: [PATCH 10/11] Use pytest.approx to compare the dictionaries --- .../tests/test_gradient_boosting.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 
93ca437f7fdeb..8c3f2188b7434 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -197,11 +197,11 @@ def test_small_trainset(): # Make sure that the small trainset is stratified and has the expected # length (10k samples) n_samples = 20000 - original_prop = {0: 0.1, 1: 0.2, 2: 0.3, 3: 0.4} + original_distrib = {0: 0.1, 1: 0.2, 2: 0.3, 3: 0.4} rng = np.random.RandomState(42) X = rng.randn(n_samples).reshape(n_samples, 1) y = [[class_] * int(prop * n_samples) for (class_, prop) - in original_prop.items()] + in original_distrib.items()] y = shuffle(np.concatenate(y)) gb = HistGradientBoostingClassifier() @@ -210,8 +210,8 @@ def test_small_trainset(): # Compute the class distribution in the small training set unique, counts = np.unique(y_small, return_counts=True) - small_prop = {class_: count / 10000 for (class_, count) - in zip(unique, counts)} + small_distrib = {class_: count / 10000 for (class_, count) + in zip(unique, counts)} # Test that the small training set has the expected length assert X_small.shape[0] == 10000 @@ -219,4 +219,4 @@ def test_small_trainset(): # Test that the class distributions in the whole dataset and in the small # training set are identical - assert small_prop == original_prop + assert small_distrib == pytest.approx(original_distrib) From 92de4948f7f07e44de40e11aa84584623593120a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 8 Jul 2019 09:53:25 +0200 Subject: [PATCH 11/11] DOC add changelog entry for stratified GBDT training loss --- doc/whats_new/v0.22.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 1089284a9f6a9..1aea9bba2ce8c 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -60,6 +60,11 @@ Changelog parameter called `warm_start` that enables warm starting. :pr:`14012` by :user:`Johann Faouzi `. +- |Enhancement| :class:`ensemble.HistGradientBoostingClassifier` the training + loss or score is now monitored on a class-wise stratified subsample to + preserve the class balance of the original training set. :pr:`14194` + by :user:`Johann Faouzi `. + :mod:`sklearn.linear_model` ...........................
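
For reference, below is a minimal standalone sketch of the stratified subsampling strategy that patches 01-02 introduce and that the tests added in patches 05-10 exercise. The free function `get_small_trainset` is purely illustrative (it mirrors the private `_get_small_trainset` method patched above and is not part of the patch series itself); the `stratify_on_y` flag stands in for the `is_classifier(self)` check in the real code.

import numpy as np
from sklearn.utils import resample

def get_small_trainset(X, y, seed, subsample_size=10000, stratify_on_y=True):
    # Return at most `subsample_size` rows; stratify on y (classifiers only)
    # so the class balance of the full training set is preserved.
    if X.shape[0] <= subsample_size:
        return X, y
    indices = np.arange(X.shape[0])
    stratify = y if stratify_on_y else None
    indices = resample(indices, n_samples=subsample_size, replace=False,
                       random_state=seed, stratify=stratify)
    return np.ascontiguousarray(X[indices]), y[indices]

# The subsample keeps the original class proportions (10% of class 1 here):
rng = np.random.RandomState(42)
X = rng.randn(20000, 1)
y = np.concatenate([np.zeros(18000), np.ones(2000)])
X_small, y_small = get_small_trainset(X, y, seed=42)
print(X_small.shape, round(y_small.mean(), 3))  # (10000, 1) ~0.1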