From c56c2af28efbf31de3bb5ffe694c58f09ee86e29 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 17:43:44 -0400 Subject: [PATCH 01/11] simple deprecations and removals --- doc/modules/classes.rst | 21 +- sklearn/cluster/hierarchical.py | 12 +- sklearn/covariance/graph_lasso_.py | 6 - sklearn/covariance/tests/test_graph_lasso.py | 24 +- .../covariance/tests/test_graphical_lasso.py | 24 +- sklearn/datasets/mlcomp.py | 114 --- sklearn/decomposition/fastica_.py | 11 +- sklearn/decomposition/online_lda.py | 38 +- sklearn/decomposition/sparse_pca.py | 21 +- sklearn/discriminant_analysis.py | 15 +- sklearn/feature_extraction/hashing.py | 22 +- sklearn/feature_extraction/text.py | 13 +- sklearn/gaussian_process/gpr.py | 12 - sklearn/kernel_approximation.py | 7 +- sklearn/linear_model/least_angle.py | 7 - sklearn/linear_model/randomized_l1.py | 670 ------------------ .../linear_model/tests/test_randomized_l1.py | 219 ------ sklearn/manifold/t_sne.py | 5 - sklearn/metrics/pairwise.py | 10 +- sklearn/neighbors/approximate.py | 589 --------------- sklearn/neighbors/tests/test_approximate.py | 498 ------------- sklearn/preprocessing/__init__.py | 2 - .../preprocessing/_function_transformer.py | 37 +- sklearn/preprocessing/data.py | 58 +- sklearn/semi_supervised/label_propagation.py | 19 +- sklearn/tests/test_discriminant_analysis.py | 14 - sklearn/utils/arpack.py | 23 - sklearn/utils/extmath.py | 42 -- sklearn/utils/graph.py | 14 - sklearn/utils/random.py | 100 --- sklearn/utils/sparsetools/__init__.py | 13 - sklearn/utils/sparsetools/setup.py | 15 - sklearn/utils/sparsetools/tests/__init__.py | 0 sklearn/utils/stats.py | 7 - sklearn/utils/tests/test_extmath.py | 28 - sklearn/utils/tests/test_stats.py | 9 - sklearn/utils/tests/test_utils.py | 50 -- 37 files changed, 43 insertions(+), 2726 deletions(-) delete mode 100644 sklearn/datasets/mlcomp.py delete mode 100644 sklearn/linear_model/randomized_l1.py delete mode 100644 sklearn/linear_model/tests/test_randomized_l1.py delete mode 100644 sklearn/neighbors/approximate.py delete mode 100644 sklearn/neighbors/tests/test_approximate.py delete mode 100644 sklearn/utils/arpack.py delete mode 100644 sklearn/utils/sparsetools/__init__.py delete mode 100644 sklearn/utils/sparsetools/setup.py delete mode 100644 sklearn/utils/sparsetools/tests/__init__.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 57ccfb5cff704..04d60cecea3d7 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1512,23 +1512,4 @@ To be removed in 0.22 :template: deprecated_function.rst covariance.graph_lasso - datasets.fetch_mldata - - -To be removed in 0.21 ---------------------- - -.. autosummary:: - :toctree: generated/ - :template: deprecated_class.rst - - linear_model.RandomizedLasso - linear_model.RandomizedLogisticRegression - neighbors.LSHForest - -.. 
autosummary:: - :toctree: generated/ - :template: deprecated_function.rst - - datasets.load_mlcomp - linear_model.lasso_stability_path + datasets.fetch_mldata \ No newline at end of file diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 1d6755fd72060..a23542ff5a97f 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -339,9 +339,8 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False): # single average and complete linkage -def linkage_tree(X, connectivity=None, n_components='deprecated', - n_clusters=None, linkage='complete', affinity="euclidean", - return_distance=False): +def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete', + affinity="euclidean", return_distance=False): """Linkage agglomerative clustering based on a Feature matrix. The inertia matrix uses a Heapq-based representation. @@ -362,9 +361,6 @@ def linkage_tree(X, connectivity=None, n_components='deprecated', be symmetric and only the upper triangular half is used. Default is None, i.e, the Ward algorithm is unstructured. - n_components : int (optional) - The number of connected components in the graph. - n_clusters : int (optional) Stop early the construction of the tree at n_clusters. This is useful to decrease computation time if the number of clusters is @@ -420,10 +416,6 @@ def linkage_tree(X, connectivity=None, n_components='deprecated', -------- ward_tree : hierarchical clustering with ward linkage """ - if n_components != 'deprecated': - warnings.warn("n_components was deprecated in 0.19" - "will be removed in 0.21", DeprecationWarning) - X = np.asarray(X) if X.ndim == 1: X = np.reshape(X, (-1, 1)) diff --git a/sklearn/covariance/graph_lasso_.py b/sklearn/covariance/graph_lasso_.py index b10e3c7f3f828..3280aacbf6c8a 100644 --- a/sklearn/covariance/graph_lasso_.py +++ b/sklearn/covariance/graph_lasso_.py @@ -584,12 +584,6 @@ def __init__(self, alphas=4, n_refinements=4, cv='warn', tol=1e-4, self.cv = cv self.n_jobs = n_jobs - @property - @deprecated("Attribute grid_scores was deprecated in version 0.19 and " - "will be removed in 0.21. Use ``grid_scores_`` instead") - def grid_scores(self): - return self.grid_scores_ - def fit(self, X, y=None): """Fits the GraphicalLasso covariance model to X. diff --git a/sklearn/covariance/tests/test_graph_lasso.py b/sklearn/covariance/tests/test_graph_lasso.py index 8c07536363614..33c724df781d4 100644 --- a/sklearn/covariance/tests/test_graph_lasso.py +++ b/sklearn/covariance/tests/test_graph_lasso.py @@ -140,26 +140,4 @@ def test_graph_lasso_cv(random_state=1): sys.stdout = orig_stdout # Smoke test with specified alphas - GraphLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) - - -@ignore_warnings(category=DeprecationWarning) -@pytest.mark.filterwarnings('ignore: You should specify a value') # 0.22 -def test_deprecated_grid_scores(random_state=1): - dim = 5 - n_samples = 6 - random_state = check_random_state(random_state) - prec = make_sparse_spd_matrix(dim, alpha=.96, - random_state=random_state) - cov = linalg.inv(prec) - X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples) - graph_lasso = GraphLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1) - graph_lasso.fit(X) - - depr_message = ("Attribute grid_scores was deprecated in version " - "0.19 and will be removed in 0.21. 
Use " - "``grid_scores_`` instead") - - assert_warns_message(DeprecationWarning, depr_message, - lambda: graph_lasso.grid_scores) - assert_equal(graph_lasso.grid_scores, graph_lasso.grid_scores_) + GraphLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) \ No newline at end of file diff --git a/sklearn/covariance/tests/test_graphical_lasso.py b/sklearn/covariance/tests/test_graphical_lasso.py index f1d6aab6a9b26..25e2f191d3ec8 100644 --- a/sklearn/covariance/tests/test_graphical_lasso.py +++ b/sklearn/covariance/tests/test_graphical_lasso.py @@ -136,26 +136,4 @@ def test_graphical_lasso_cv(random_state=1): sys.stdout = orig_stdout # Smoke test with specified alphas - GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) - - -@pytest.mark.filterwarnings('ignore: You should specify a value') # 0.22 -@pytest.mark.skipif(not PY3_OR_LATER, - reason='On Python 2 DeprecationWarning is not issued for some unkown reason.') -def test_deprecated_grid_scores(random_state=1): - dim = 5 - n_samples = 6 - random_state = check_random_state(random_state) - prec = make_sparse_spd_matrix(dim, alpha=.96, - random_state=random_state) - cov = linalg.inv(prec) - X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples) - graphical_lasso = GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1) - graphical_lasso.fit(X) - - depr_message = ("Attribute grid_scores was deprecated in version " - "0.19 and will be removed in 0.21. Use " - "``grid_scores_`` instead") - - with pytest.warns(DeprecationWarning, match=depr_message): - assert_equal(graphical_lasso.grid_scores, graphical_lasso.grid_scores_) + GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) \ No newline at end of file diff --git a/sklearn/datasets/mlcomp.py b/sklearn/datasets/mlcomp.py deleted file mode 100644 index 9adb7bbc1c06e..0000000000000 --- a/sklearn/datasets/mlcomp.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) 2010 Olivier Grisel -# License: BSD 3 clause -"""Glue code to load http://mlcomp.org data as a scikit.learn dataset""" - -import os -import numbers -from sklearn.datasets.base import load_files -from sklearn.utils import deprecated - - -def _load_document_classification(dataset_path, metadata, set_=None, **kwargs): - if set_ is not None: - dataset_path = os.path.join(dataset_path, set_) - return load_files(dataset_path, metadata.get('description'), **kwargs) - - -LOADERS = { - 'DocumentClassification': _load_document_classification, - # TODO: implement the remaining domain formats -} - - -@deprecated("since the http://mlcomp.org/ website will shut down " - "in March 2017, the load_mlcomp function was deprecated " - "in version 0.19 and will be removed in 0.21.") -def load_mlcomp(name_or_id, set_="raw", mlcomp_root=None, **kwargs): - r"""Load a datasets as downloaded from http://mlcomp.org - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - - name_or_id : int or str - The integer id or the string name metadata of the MLComp - dataset to load - - set\_ : str, default='raw' - Select the portion to load: 'train', 'test' or 'raw' - - mlcomp_root : str, optional - The filesystem path to the root folder where MLComp datasets - are stored, if mlcomp_root is None, the MLCOMP_DATASETS_HOME - environment variable is looked up instead. - - **kwargs : domain specific kwargs to be passed to the dataset loader. 
- - Returns - ------- - - data : Bunch - Dictionary-like object, the interesting attributes are: - 'filenames', the files holding the raw to learn, 'target', the - classification labels (integer index), 'target_names', - the meaning of the labels, and 'DESCR', the full description of the - dataset. - - Note on the lookup process: depending on the type of name_or_id, - will choose between integer id lookup or metadata name lookup by - looking at the unzipped archives and metadata file. - - TODO: implement zip dataset loading too - """ - - if mlcomp_root is None: - try: - mlcomp_root = os.environ['MLCOMP_DATASETS_HOME'] - except KeyError: - raise ValueError("MLCOMP_DATASETS_HOME env variable is undefined") - - mlcomp_root = os.path.expanduser(mlcomp_root) - mlcomp_root = os.path.abspath(mlcomp_root) - mlcomp_root = os.path.normpath(mlcomp_root) - - if not os.path.exists(mlcomp_root): - raise ValueError("Could not find folder: " + mlcomp_root) - - # dataset lookup - if isinstance(name_or_id, numbers.Integral): - # id lookup - dataset_path = os.path.join(mlcomp_root, str(name_or_id)) - else: - # assume name based lookup - dataset_path = None - expected_name_line = "name: " + name_or_id - for dataset in os.listdir(mlcomp_root): - metadata_file = os.path.join(mlcomp_root, dataset, 'metadata') - if not os.path.exists(metadata_file): - continue - with open(metadata_file) as f: - for line in f: - if line.strip() == expected_name_line: - dataset_path = os.path.join(mlcomp_root, dataset) - break - if dataset_path is None: - raise ValueError("Could not find dataset with metadata line: " + - expected_name_line) - - # loading the dataset metadata - metadata = dict() - metadata_file = os.path.join(dataset_path, 'metadata') - if not os.path.exists(metadata_file): - raise ValueError(dataset_path + ' is not a valid MLComp dataset') - with open(metadata_file) as f: - for line in f: - if ":" in line: - key, value = line.split(":", 1) - metadata[key.strip()] = value.strip() - - format = metadata.get('format', 'unknow') - loader = LOADERS.get(format) - if loader is None: - raise ValueError("No loader implemented for format: " + format) - return loader(dataset_path, metadata, set_=set_, **kwargs) diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index f64d4787b3f71..2eead18b2678d 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -553,7 +553,7 @@ def fit(self, X, y=None): self._fit(X, compute_sources=False) return self - def transform(self, X, y='deprecated', copy=True): + def transform(self, X, copy=True): """Recover the sources from X (apply the unmixing matrix). Parameters @@ -561,9 +561,7 @@ def transform(self, X, y='deprecated', copy=True): X : array-like, shape (n_samples, n_features) Data to transform, where n_samples is the number of samples and n_features is the number of features. - y : (ignored) - .. deprecated:: 0.19 - This parameter will be removed in 0.21. + copy : bool (optional) If False, data passed to fit are overwritten. Defaults to True. 
@@ -571,11 +569,6 @@ def transform(self, X, y='deprecated', copy=True): ------- X_new : array-like, shape (n_samples, n_components) """ - if not isinstance(y, string_types) or y != 'deprecated': - warnings.warn("The parameter y on transform() is " - "deprecated since 0.19 and will be removed in 0.21", - DeprecationWarning) - check_is_fitted(self, 'mixing_') X = check_array(X, copy=copy, dtype=FLOAT_DTYPES) diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index 5b48ea1a26b30..51c199e50c2d7 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -230,11 +230,6 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin): If None, the random number generator is the RandomState instance used by `np.random`. - n_topics : int, optional (default=None) - This parameter has been renamed to n_components and will - be removed in version 0.21. - .. deprecated:: 0.19 - Attributes ---------- components_ : array, [n_components, n_features] @@ -286,7 +281,7 @@ def __init__(self, n_components=10, doc_topic_prior=None, learning_decay=.7, learning_offset=10., max_iter=10, batch_size=128, evaluate_every=-1, total_samples=1e6, perp_tol=1e-1, mean_change_tol=1e-3, max_doc_update_iter=100, - n_jobs=None, verbose=0, random_state=None, n_topics=None): + n_jobs=None, verbose=0, random_state=None): self.n_components = n_components self.doc_topic_prior = doc_topic_prior self.topic_word_prior = topic_word_prior @@ -303,21 +298,12 @@ def __init__(self, n_components=10, doc_topic_prior=None, self.n_jobs = n_jobs self.verbose = verbose self.random_state = random_state - self.n_topics = n_topics def _check_params(self): """Check model parameters.""" - if self.n_topics is not None: - self._n_components = self.n_topics - warnings.warn("n_topics has been renamed to n_components in " - "version 0.19 and will be removed in 0.21", - DeprecationWarning) - else: - self._n_components = self.n_components - - if self._n_components <= 0: + if self.n_components <= 0: raise ValueError("Invalid 'n_components' parameter: %r" - % self._n_components) + % self.n_components) if self.total_samples <= 0: raise ValueError("Invalid 'total_samples' parameter: %r" @@ -339,12 +325,12 @@ def _init_latent_vars(self, n_features): self.n_iter_ = 0 if self.doc_topic_prior is None: - self.doc_topic_prior_ = 1. / self._n_components + self.doc_topic_prior_ = 1. / self.n_components else: self.doc_topic_prior_ = self.doc_topic_prior if self.topic_word_prior is None: - self.topic_word_prior_ = 1. / self._n_components + self.topic_word_prior_ = 1. / self.n_components else: self.topic_word_prior_ = self.topic_word_prior @@ -352,7 +338,7 @@ def _init_latent_vars(self, n_features): init_var = 1. 
/ init_gamma # In the literature, this is called `lambda` self.components_ = self.random_state_.gamma( - init_gamma, init_var, (self._n_components, n_features)) + init_gamma, init_var, (self.n_components, n_features)) # In the literature, this is `exp(E[log(beta)])` self.exp_dirichlet_component_ = np.exp( @@ -711,7 +697,7 @@ def _loglikelihood(prior, distr, dirichlet_distr, size): # compute E[log p(theta | alpha) - log q(theta | gamma)] score += _loglikelihood(doc_topic_prior, doc_topic_distr, - dirichlet_doc_topic, self._n_components) + dirichlet_doc_topic, self.n_components) # Compensate for the subsampling of the population of documents if sub_sampling: @@ -781,7 +767,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, raise ValueError("Number of samples in X and doc_topic_distr" " do not match.") - if n_components != self._n_components: + if n_components != self.n_components: raise ValueError("Number of topics does not match.") current_samples = X.shape[0] @@ -795,7 +781,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, return np.exp(-1.0 * perword_bound) - def perplexity(self, X, doc_topic_distr='deprecated', sub_sampling=False): + def perplexity(self, X, sub_sampling=False): """Calculate approximate perplexity for data X. Perplexity is defined as exp(-1. * log-likelihood per word) @@ -823,10 +809,4 @@ def perplexity(self, X, doc_topic_distr='deprecated', sub_sampling=False): score : float Perplexity score. """ - if doc_topic_distr != 'deprecated': - warnings.warn("Argument 'doc_topic_distr' is deprecated and is " - "being ignored as of 0.19. Support for this " - "argument will be removed in 0.21.", - DeprecationWarning) - return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling) diff --git a/sklearn/decomposition/sparse_pca.py b/sklearn/decomposition/sparse_pca.py index 95c9ab8960e64..5bc2107f7f31c 100644 --- a/sklearn/decomposition/sparse_pca.py +++ b/sklearn/decomposition/sparse_pca.py @@ -197,7 +197,7 @@ def fit(self, X, y=None): self.error_ = E return self - def transform(self, X, ridge_alpha='deprecated'): + def transform(self, X): """Least Squares projection of the data onto the sparse components. To avoid instability issues in case the system is under-determined, @@ -213,14 +213,6 @@ def transform(self, X, ridge_alpha='deprecated'): Test data to be transformed, must have the same number of features as the data used to train the model. - ridge_alpha : float, default: 0.01 - Amount of ridge shrinkage to apply in order to improve - conditioning. - - .. deprecated:: 0.19 - This parameter will be removed in 0.21. - Specify ``ridge_alpha`` in the ``SparsePCA`` constructor. - Returns ------- X_new array, shape (n_samples, n_components) @@ -229,20 +221,11 @@ def transform(self, X, ridge_alpha='deprecated'): check_is_fitted(self, 'components_') X = check_array(X) - if ridge_alpha != 'deprecated': - warnings.warn("The ridge_alpha parameter on transform() is " - "deprecated since 0.19 and will be removed in 0.21. 
" - "Specify ridge_alpha in the SparsePCA constructor.", - DeprecationWarning) - if ridge_alpha is None: - ridge_alpha = self.ridge_alpha - else: - ridge_alpha = self.ridge_alpha if self.normalize_components: X = X - self.mean_ - U = ridge_regression(self.components_.T, X.T, ridge_alpha, + U = ridge_regression(self.components_.T, X.T, self.ridge_alpha, solver='cholesky') if not self.normalize_components: diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index a635792c6f6ca..bf6b3a4f44631 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -614,20 +614,12 @@ class QuadraticDiscriminantAnalysis(BaseEstimator, ClassifierMixin): """ def __init__(self, priors=None, reg_param=0., store_covariance=False, - tol=1.0e-4, store_covariances=None): + tol=1.0e-4): self.priors = np.asarray(priors) if priors is not None else None self.reg_param = reg_param - self.store_covariances = store_covariances self.store_covariance = store_covariance self.tol = tol - @property - @deprecated("Attribute ``covariances_`` was deprecated in version" - " 0.19 and will be removed in 0.21. Use " - "``covariance_`` instead") - def covariances_(self): - return self.covariance_ - def fit(self, X, y): """Fit the model according to the given training data and parameters. @@ -662,10 +654,7 @@ def fit(self, X, y): cov = None store_covariance = self.store_covariance or self.store_covariances - if self.store_covariances: - warnings.warn("'store_covariances' was renamed to store_covariance" - " in version 0.19 and will be removed in 0.21.", - DeprecationWarning) + if store_covariance: cov = [] means = [] diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py index 744a073090bad..f670e9cbec89e 100644 --- a/sklearn/feature_extraction/hashing.py +++ b/sklearn/feature_extraction/hashing.py @@ -57,8 +57,7 @@ class FeatureHasher(BaseEstimator, TransformerMixin): feature_name should be a string, while value should be a number. In the case of "string", a value of 1 is implied. The feature_name is hashed to find the appropriate column for the - feature. The value's sign might be flipped in the output (but see - non_negative, below). + feature. The value's sign might be flipped in the output. dtype : numpy type, optional, default np.float64 The type of feature values. Passed to scipy.sparse matrix constructors as the dtype argument. Do not set this to bool, np.boolean or any @@ -68,15 +67,6 @@ class FeatureHasher(BaseEstimator, TransformerMixin): approximately conserve the inner product in the hashed space even for small n_features. This approach is similar to sparse random projection. - non_negative : boolean, optional, default False - When True, an absolute value is applied to the features matrix prior to - returning it. When used in conjunction with alternate_sign=True, this - significantly reduces the inner product preservation property. - - .. deprecated:: 0.19 - This option will be removed in 0.21. 
- - Examples -------- >>> from sklearn.feature_extraction import FeatureHasher @@ -94,18 +84,12 @@ class FeatureHasher(BaseEstimator, TransformerMixin): """ def __init__(self, n_features=(2 ** 20), input_type="dict", - dtype=np.float64, alternate_sign=True, non_negative=False): + dtype=np.float64, alternate_sign=True): self._validate_params(n_features, input_type) - if non_negative: - warnings.warn("the option non_negative=True has been deprecated" - " in 0.19 and will be removed" - " in version 0.21.", DeprecationWarning) - self.dtype = dtype self.input_type = input_type self.n_features = n_features self.alternate_sign = alternate_sign - self.non_negative = non_negative @staticmethod def _validate_params(n_features, input_type): @@ -175,6 +159,4 @@ def transform(self, raw_X): shape=(n_samples, self.n_features)) X.sum_duplicates() # also sorts the indices - if self.non_negative: - np.abs(X.data, X.data) return X diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 05f60d2805c7c..a4de38d959db1 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -491,13 +491,6 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin): .. versionadded:: 0.19 - non_negative : boolean, optional, default False - When True, an absolute value is applied to the features matrix prior to - returning it. When used in conjunction with alternate_sign=True, this - significantly reduces the inner product preservation property. - - .. deprecated:: 0.19 - This option will be removed in 0.21. dtype : type, optional Type of the matrix returned by fit_transform() or transform(). @@ -526,7 +519,7 @@ def __init__(self, input='content', encoding='utf-8', stop_words=None, token_pattern=r"(?u)\b\w\w+\b", ngram_range=(1, 1), analyzer='word', n_features=(2 ** 20), binary=False, norm='l2', alternate_sign=True, - non_negative=False, dtype=np.float64): + dtype=np.float64): self.input = input self.encoding = encoding self.decode_error = decode_error @@ -542,7 +535,6 @@ def __init__(self, input='content', encoding='utf-8', self.binary = binary self.norm = norm self.alternate_sign = alternate_sign - self.non_negative = non_negative self.dtype = dtype def partial_fit(self, X, y=None): @@ -630,8 +622,7 @@ def fit_transform(self, X, y=None): def _get_hasher(self): return FeatureHasher(n_features=self.n_features, input_type='string', dtype=self.dtype, - alternate_sign=self.alternate_sign, - non_negative=self.non_negative) + alternate_sign=self.alternate_sign) def _document_frequency(X): diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py index ac2c0a46b6866..ebe8dd3b65ade 100644 --- a/sklearn/gaussian_process/gpr.py +++ b/sklearn/gaussian_process/gpr.py @@ -158,18 +158,6 @@ def __init__(self, kernel=None, alpha=1e-10, self.copy_X_train = copy_X_train self.random_state = random_state - @property - @deprecated("Attribute rng was deprecated in version 0.19 and " - "will be removed in 0.21.") - def rng(self): - return self._rng - - @property - @deprecated("Attribute y_train_mean was deprecated in version 0.19 and " - "will be removed in 0.21.") - def y_train_mean(self): - return self._y_train_mean - def fit(self, X, y): """Fit Gaussian process regression model. 
diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 79d915fa1e2df..585f453e389b2 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -619,10 +619,7 @@ def _get_kernel_params(self): if (self.gamma is not None or self.coef0 is not None or self.degree is not None): - warnings.warn( - "Passing gamma, coef0 or degree to Nystroem when using a" - " callable kernel is deprecated in version 0.19 and will" - " raise an error in 0.21, as they are ignored. Use " - "kernel_params instead.", DeprecationWarning) + raise ValueError("Don't pass gamma, coef0 or degree to " + "Nystroem if using a callable kernel.") return params
diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index ce13b99b6aae5..0e923a424c221 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -1185,13 +1185,6 @@ def fit(self, X, y): Xy=None, fit_path=True) return self - @property - @deprecated("Attribute alpha is deprecated in 0.19 and " - "will be removed in 0.21. See ``alpha_`` instead") - def alpha(self): - # impedance matching for the above Lars.fit (should not be documented) - return self.alpha_ - class LassoLarsCV(LarsCV): """Cross-validated Lasso, using the LARS algorithm
diff --git a/sklearn/linear_model/randomized_l1.py b/sklearn/linear_model/randomized_l1.py deleted file mode 100644 index 40ebe3c57826b..0000000000000 --- a/sklearn/linear_model/randomized_l1.py +++ /dev/null @@ -1,670 +0,0 @@ -""" -Randomized Lasso/Logistic: feature selection based on Lasso and -sparse Logistic Regression -""" - -# Author: Gael Varoquaux, Alexandre Gramfort -# -# License: BSD 3 clause - -import warnings -import itertools -from abc import ABCMeta, abstractmethod - -import numpy as np -from scipy.sparse import issparse -from scipy import sparse -from scipy.interpolate import interp1d - -from .base import _preprocess_data -from ..base import BaseEstimator -from ..externals import six -from ..utils import Memory, Parallel, delayed -from ..feature_selection.base import SelectorMixin -from ..utils import (as_float_array, check_random_state, check_X_y, safe_mask, - deprecated) -from ..utils.validation import check_is_fitted -from .least_angle import lars_path, LassoLarsIC -from .logistic import LogisticRegression -from ..exceptions import ConvergenceWarning - - -############################################################################### -# Randomized linear model: feature selection - -def _resample_model(estimator_func, X, y, scaling=.5, n_resampling=200, - n_jobs=None, verbose=False, pre_dispatch='3*n_jobs', - random_state=None, sample_fraction=.75, **params): - random_state = check_random_state(random_state) - # We are generating 1 - weights, and not weights - n_samples, n_features = X.shape - - if not (0 < scaling < 1): - raise ValueError( - "'scaling' should be between 0 and 1. Got %r instead." % scaling) - - scaling = 1. 
- scaling - scores_ = 0.0 - for active_set in Parallel(n_jobs=n_jobs, verbose=verbose, - pre_dispatch=pre_dispatch)( - delayed(estimator_func)( - X, y, weights=scaling * random_state.randint( - 0, 2, size=(n_features,)), - mask=(random_state.rand(n_samples) < sample_fraction), - verbose=max(0, verbose - 1), - **params) - for _ in range(n_resampling)): - scores_ += active_set - - scores_ /= n_resampling - return scores_ - - -@deprecated("The class BaseRandomizedLinearModel is deprecated in 0.19" - " and will be removed in 0.21.") -class BaseRandomizedLinearModel(six.with_metaclass(ABCMeta, BaseEstimator, - SelectorMixin)): - """Base class to implement randomized linear models for feature selection - - This implements the strategy by Meinshausen and Buhlman: - stability selection with randomized sampling, and random re-weighting of - the penalty. - """ - - @abstractmethod - def __init__(self): - pass - - _preprocess_data = staticmethod(_preprocess_data) - - def fit(self, X, y): - """Fit the model using X, y as training data. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training data. - - y : array-like, shape = [n_samples] - Target values. Will be cast to X's dtype if necessary - - Returns - ------- - self : object - Returns an instance of self. - """ - X, y = check_X_y(X, y, ['csr', 'csc'], y_numeric=True, - ensure_min_samples=2, estimator=self) - X = as_float_array(X, copy=False) - n_samples, n_features = X.shape - - X, y, X_offset, y_offset, X_scale = \ - self._preprocess_data(X, y, self.fit_intercept, self.normalize) - - estimator_func, params = self._make_estimator_and_params(X, y) - memory = self.memory - if memory is None: - memory = Memory(cachedir=None, verbose=0) - elif isinstance(memory, six.string_types): - memory = Memory(cachedir=memory, verbose=0) - elif not isinstance(memory, Memory): - raise ValueError("'memory' should either be a string or" - " a sklearn.utils.Memory" - " instance, got 'memory={!r}' instead.".format( - type(memory))) - - scores_ = memory.cache( - _resample_model, ignore=['verbose', 'n_jobs', 'pre_dispatch'] - )( - estimator_func, X, y, - scaling=self.scaling, n_resampling=self.n_resampling, - n_jobs=self.n_jobs, verbose=self.verbose, - pre_dispatch=self.pre_dispatch, random_state=self.random_state, - sample_fraction=self.sample_fraction, **params) - - if scores_.ndim == 1: - scores_ = scores_[:, np.newaxis] - self.all_scores_ = scores_ - self.scores_ = np.max(self.all_scores_, axis=1) - return self - - def _make_estimator_and_params(self, X, y): - """Return the parameters passed to the estimator""" - raise NotImplementedError - - def _get_support_mask(self): - """Get the boolean mask indicating which features are selected. - - Returns - ------- - support : boolean array of shape [# input features] - An element is True iff its corresponding feature is selected - for retention. 
- """ - check_is_fitted(self, 'scores_') - return self.scores_ > self.selection_threshold - - -############################################################################### -# Randomized lasso: regression settings - -def _randomized_lasso(X, y, weights, mask, alpha=1., verbose=False, - precompute=False, eps=np.finfo(np.float).eps, - max_iter=500): - X = X[safe_mask(X, mask)] - y = y[mask] - - # Center X and y to avoid fit the intercept - X -= X.mean(axis=0) - y -= y.mean() - - alpha = np.atleast_1d(np.asarray(alpha, dtype=np.float64)) - - X = (1 - weights) * X - - with warnings.catch_warnings(): - warnings.simplefilter('ignore', ConvergenceWarning) - alphas_, _, coef_ = lars_path(X, y, - Gram=precompute, copy_X=False, - copy_Gram=False, alpha_min=np.min(alpha), - method='lasso', verbose=verbose, - max_iter=max_iter, eps=eps) - - if len(alpha) > 1: - if len(alphas_) > 1: # np.min(alpha) < alpha_min - interpolator = interp1d(alphas_[::-1], coef_[:, ::-1], - bounds_error=False, fill_value=0.) - scores = (interpolator(alpha) != 0.0) - else: - scores = np.zeros((X.shape[1], len(alpha)), dtype=np.bool) - else: - scores = coef_[:, -1] != 0.0 - return scores - - -@deprecated("The class RandomizedLasso is deprecated in 0.19" - " and will be removed in 0.21.") -class RandomizedLasso(BaseRandomizedLinearModel): - """Randomized Lasso. - - Randomized Lasso works by subsampling the training data and - computing a Lasso estimate where the penalty of a random subset of - coefficients has been scaled. By performing this double - randomization several times, the method assigns high scores to - features that are repeatedly selected across randomizations. This - is known as stability selection. In short, features selected more - often are considered good features. - - Parameters - ---------- - alpha : float, 'aic', or 'bic', optional - The regularization parameter alpha parameter in the Lasso. - Warning: this is not the alpha parameter in the stability selection - article which is scaling. - - scaling : float, optional - The s parameter used to randomly scale the penalty of different - features. - Should be between 0 and 1. - - sample_fraction : float, optional - The fraction of samples to be used in each randomized design. - Should be between 0 and 1. If 1, all samples are used. - - n_resampling : int, optional - Number of randomized models. - - selection_threshold : float, optional - The score above which features should be selected. - - fit_intercept : boolean, optional - whether to calculate the intercept for this model. If set - to false, no intercept will be used in calculations - (e.g. data is expected to be already centered). - - verbose : boolean or integer, optional - Sets the verbosity amount - - normalize : boolean, optional, default True - If True, the regressors X will be normalized before regression. - This parameter is ignored when `fit_intercept` is set to False. - When the regressors are normalized, note that this makes the - hyperparameters learned more robust and almost independent of - the number of samples. The same property is not valid for - standardized data. However, if you wish to standardize, please - use `preprocessing.StandardScaler` before calling `fit` on an - estimator with `normalize=False`. - - precompute : True | False | 'auto' | array-like - Whether to use a precomputed Gram matrix to speed up calculations. - If set to 'auto' let us decide. 
- The Gram matrix can also be passed as argument, but it will be used - only for the selection of parameter alpha, if alpha is 'aic' or 'bic'. - - max_iter : integer, optional - Maximum number of iterations to perform in the Lars algorithm. - - eps : float, optional - The machine-precision regularization in the computation of the - Cholesky diagonal factors. Increase this for very ill-conditioned - systems. Unlike the 'tol' parameter in some iterative - optimization-based algorithms, this parameter does not control - the tolerance of the optimization. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - n_jobs : int or None, optional (default=None) - Number of CPUs to use during the resampling. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - memory : None, str or object with the joblib.Memory interface, optional \ - (default=None) - Used for internal caching. By default, no caching is done. - If a string is given, it is the path to the caching directory. - - Attributes - ---------- - scores_ : array, shape = [n_features] - Feature scores between 0 and 1. - - all_scores_ : array, shape = [n_features, n_reg_parameter] - Feature scores between 0 and 1 for all values of the regularization \ - parameter. The reference article suggests ``scores_`` is the max of \ - ``all_scores_``. 
- - Examples - -------- - >>> from sklearn.linear_model import RandomizedLasso - >>> randomized_lasso = RandomizedLasso() # doctest: +SKIP - - References - ---------- - Stability selection - Nicolai Meinshausen, Peter Buhlmann - Journal of the Royal Statistical Society: Series B - Volume 72, Issue 4, pages 417-473, September 2010 - DOI: 10.1111/j.1467-9868.2010.00740.x - - See also - -------- - RandomizedLogisticRegression, Lasso, ElasticNet - """ - def __init__(self, alpha='aic', scaling=.5, sample_fraction=.75, - n_resampling=200, selection_threshold=.25, - fit_intercept=True, verbose=False, - normalize=True, precompute='auto', - max_iter=500, - eps=np.finfo(np.float).eps, random_state=None, - n_jobs=None, pre_dispatch='3*n_jobs', - memory=None): - self.alpha = alpha - self.scaling = scaling - self.sample_fraction = sample_fraction - self.n_resampling = n_resampling - self.fit_intercept = fit_intercept - self.max_iter = max_iter - self.verbose = verbose - self.normalize = normalize - self.precompute = precompute - self.eps = eps - self.random_state = random_state - self.n_jobs = n_jobs - self.selection_threshold = selection_threshold - self.pre_dispatch = pre_dispatch - self.memory = memory - - def _make_estimator_and_params(self, X, y): - alpha = self.alpha - if isinstance(alpha, six.string_types) and alpha in ('aic', 'bic'): - model = LassoLarsIC(precompute=self.precompute, - criterion=self.alpha, - max_iter=self.max_iter, - eps=self.eps) - model.fit(X, y) - self.alpha_ = alpha = model.alpha_ - - precompute = self.precompute - # A precomputed Gram array is useless, since _randomized_lasso - # change X a each iteration - if hasattr(precompute, '__array__'): - precompute = 'auto' - assert precompute in (True, False, None, 'auto') - return _randomized_lasso, dict(alpha=alpha, max_iter=self.max_iter, - eps=self.eps, - precompute=precompute) - - -############################################################################### -# Randomized logistic: classification settings - -def _randomized_logistic(X, y, weights, mask, C=1., verbose=False, - fit_intercept=True, tol=1e-3): - X = X[safe_mask(X, mask)] - y = y[mask] - if issparse(X): - size = len(weights) - weight_dia = sparse.dia_matrix((1 - weights, 0), (size, size)) - X = X * weight_dia - else: - X *= (1 - weights) - - C = np.atleast_1d(np.asarray(C, dtype=np.float64)) - if C.ndim > 1: - raise ValueError("C should be 1-dimensional array-like, " - "but got a {}-dimensional array-like instead: {}." - .format(C.ndim, C)) - - scores = np.zeros((X.shape[1], len(C)), dtype=np.bool) - - for this_C, this_scores in zip(C, scores.T): - # XXX : would be great to do it with a warm_start ... - clf = LogisticRegression(C=this_C, tol=tol, penalty='l1', dual=False, - fit_intercept=fit_intercept, - solver='liblinear', multi_class='ovr') - clf.fit(X, y) - this_scores[:] = np.any( - np.abs(clf.coef_) > 10 * np.finfo(np.float).eps, axis=0) - return scores - - -@deprecated("The class RandomizedLogisticRegression is deprecated in 0.19" - " and will be removed in 0.21.") -class RandomizedLogisticRegression(BaseRandomizedLinearModel): - """Randomized Logistic Regression - - Randomized Logistic Regression works by subsampling the training - data and fitting a L1-penalized LogisticRegression model where the - penalty of a random subset of coefficients has been scaled. By - performing this double randomization several times, the method - assigns high scores to features that are repeatedly selected across - randomizations. This is known as stability selection. 
In short, - features selected more often are considered good features. - - Parameters - ---------- - C : float or array-like of shape [n_reg_parameter], optional, default=1 - The regularization parameter C in the LogisticRegression. - When C is an array, fit will take each regularization parameter in C - one by one for LogisticRegression and store results for each one - in ``all_scores_``, where columns and rows represent corresponding - reg_parameters and features. - - scaling : float, optional, default=0.5 - The s parameter used to randomly scale the penalty of different - features. - Should be between 0 and 1. - - sample_fraction : float, optional, default=0.75 - The fraction of samples to be used in each randomized design. - Should be between 0 and 1. If 1, all samples are used. - - n_resampling : int, optional, default=200 - Number of randomized models. - - selection_threshold : float, optional, default=0.25 - The score above which features should be selected. - - tol : float, optional, default=1e-3 - tolerance for stopping criteria of LogisticRegression - - fit_intercept : boolean, optional, default=True - whether to calculate the intercept for this model. If set - to false, no intercept will be used in calculations - (e.g. data is expected to be already centered). - - verbose : boolean or integer, optional - Sets the verbosity amount - - normalize : boolean, optional, default True - If True, the regressors X will be normalized before regression. - This parameter is ignored when `fit_intercept` is set to False. - When the regressors are normalized, note that this makes the - hyperparameters learnt more robust and almost independent of the number - of samples. The same property is not valid for standardized data. - However, if you wish to standardize, please use - `preprocessing.StandardScaler` before calling `fit` on an estimator - with `normalize=False`. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - n_jobs : int or None, optional (default=None) - Number of CPUs to use during the resampling. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - memory : None, str or object with the joblib.Memory interface, optional \ - (default=None) - Used for internal caching. By default, no caching is done. - If a string is given, it is the path to the caching directory. - - Attributes - ---------- - scores_ : array, shape = [n_features] - Feature scores between 0 and 1. - - all_scores_ : array, shape = [n_features, n_reg_parameter] - Feature scores between 0 and 1 for all values of the regularization \ - parameter. 
The reference article suggests ``scores_`` is the max \ - of ``all_scores_``. - - Examples - -------- - >>> from sklearn.linear_model import RandomizedLogisticRegression - >>> randomized_logistic = RandomizedLogisticRegression() # doctest: +SKIP - - References - ---------- - Stability selection - Nicolai Meinshausen, Peter Buhlmann - Journal of the Royal Statistical Society: Series B - Volume 72, Issue 4, pages 417-473, September 2010 - DOI: 10.1111/j.1467-9868.2010.00740.x - - See also - -------- - RandomizedLasso, LogisticRegression - """ - def __init__(self, C=1, scaling=.5, sample_fraction=.75, - n_resampling=200, - selection_threshold=.25, tol=1e-3, - fit_intercept=True, verbose=False, - normalize=True, - random_state=None, - n_jobs=None, pre_dispatch='3*n_jobs', - memory=None): - self.C = C - self.scaling = scaling - self.sample_fraction = sample_fraction - self.n_resampling = n_resampling - self.fit_intercept = fit_intercept - self.verbose = verbose - self.normalize = normalize - self.tol = tol - self.random_state = random_state - self.n_jobs = n_jobs - self.selection_threshold = selection_threshold - self.pre_dispatch = pre_dispatch - self.memory = memory - - def _make_estimator_and_params(self, X, y): - params = dict(C=self.C, tol=self.tol, - fit_intercept=self.fit_intercept) - return _randomized_logistic, params - - def _preprocess_data(self, X, y, fit_intercept, normalize=False): - """Center the data in X but not in y""" - X, _, X_offset, _, X_scale = _preprocess_data(X, y, fit_intercept, - normalize=normalize) - return X, y, X_offset, y, X_scale - - -############################################################################### -# Stability paths -def _lasso_stability_path(X, y, mask, weights, eps): - "Inner loop of lasso_stability_path" - X = X * weights[np.newaxis, :] - X = X[safe_mask(X, mask), :] - y = y[mask] - - alpha_max = np.max(np.abs(np.dot(X.T, y))) / X.shape[0] - alpha_min = eps * alpha_max # set for early stopping in path - with warnings.catch_warnings(): - warnings.simplefilter('ignore', ConvergenceWarning) - alphas, _, coefs = lars_path(X, y, method='lasso', verbose=False, - alpha_min=alpha_min) - # Scale alpha by alpha_max - alphas /= alphas[0] - # Sort alphas in ascending order - alphas = alphas[::-1] - coefs = coefs[:, ::-1] - # Get rid of the alphas that are too small - mask = alphas >= eps - # We also want to keep the first one: it should be close to the OLS - # solution - mask[0] = True - alphas = alphas[mask] - coefs = coefs[:, mask] - return alphas, coefs - - -@deprecated("The function lasso_stability_path is deprecated in 0.19" - " and will be removed in 0.21.") -def lasso_stability_path(X, y, scaling=0.5, random_state=None, - n_resampling=200, n_grid=100, - sample_fraction=0.75, - eps=4 * np.finfo(np.float).eps, n_jobs=None, - verbose=False): - """Stability path based on randomized Lasso estimates - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - training data. - - y : array-like, shape = [n_samples] - target values. - - scaling : float, optional, default=0.5 - The alpha parameter in the stability selection article used to - randomly scale the features. Should be between 0 and 1. - - random_state : int, RandomState instance or None, optional, default=None - The generator used to randomize the design. 
If int, random_state is - the seed used by the random number generator; If RandomState instance, - random_state is the random number generator; If None, the random number - generator is the RandomState instance used by `np.random`. - - n_resampling : int, optional, default=200 - Number of randomized models. - - n_grid : int, optional, default=100 - Number of grid points. The path is linearly reinterpolated - on a grid between 0 and 1 before computing the scores. - - sample_fraction : float, optional, default=0.75 - The fraction of samples to be used in each randomized design. - Should be between 0 and 1. If 1, all samples are used. - - eps : float, optional - Smallest value of alpha / alpha_max considered - - n_jobs : int or None, optional (default=None) - Number of CPUs to use during the resampling. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - verbose : boolean or integer, optional - Sets the verbosity amount - - Returns - ------- - alphas_grid : array, shape ~ [n_grid] - The grid points between 0 and 1: alpha/alpha_max - - scores_path : array, shape = [n_features, n_grid] - The scores for each feature along the path. - """ - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo']) - rng = check_random_state(random_state) - - if not (0 < scaling < 1): - raise ValueError("Parameter 'scaling' should be between 0 and 1." - " Got %r instead." % scaling) - - n_samples, n_features = X.shape - - paths = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(_lasso_stability_path)( - X, y, mask=rng.rand(n_samples) < sample_fraction, - weights=1. - scaling * rng.randint(0, 2, size=(n_features,)), - eps=eps) - for k in range(n_resampling)) - - all_alphas = sorted(list(set(itertools.chain(*[p[0] for p in paths])))) - # Take approximately n_grid values - stride = int(max(1, int(len(all_alphas) / float(n_grid)))) - all_alphas = all_alphas[::stride] - if not all_alphas[-1] == 1: - all_alphas.append(1.) 
- all_alphas = np.array(all_alphas) - scores_path = np.zeros((n_features, len(all_alphas))) - - for alphas, coefs in paths: - if alphas[0] != 0: - alphas = np.r_[0, alphas] - coefs = np.c_[np.ones((n_features, 1)), coefs] - if alphas[-1] != all_alphas[-1]: - alphas = np.r_[alphas, all_alphas[-1]] - coefs = np.c_[coefs, np.zeros((n_features, 1))] - scores_path += (interp1d(alphas, coefs, - kind='nearest', bounds_error=False, - fill_value=0, axis=-1)(all_alphas) != 0) - - scores_path /= n_resampling - return all_alphas, scores_path diff --git a/sklearn/linear_model/tests/test_randomized_l1.py b/sklearn/linear_model/tests/test_randomized_l1.py deleted file mode 100644 index 564fbd4e7827d..0000000000000 --- a/sklearn/linear_model/tests/test_randomized_l1.py +++ /dev/null @@ -1,219 +0,0 @@ -# Authors: Alexandre Gramfort -# License: BSD 3 clause - -from tempfile import mkdtemp -import shutil - -import numpy as np -from scipy import sparse - -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_raises_regex -from sklearn.utils.testing import assert_allclose -from sklearn.utils.testing import ignore_warnings -from sklearn.utils.testing import assert_warns_message - -from sklearn.linear_model.randomized_l1 import(lasso_stability_path, - RandomizedLasso, - RandomizedLogisticRegression) - -from sklearn.datasets import load_diabetes, load_iris -from sklearn.feature_selection import f_regression, f_classif -from sklearn.preprocessing import StandardScaler -from sklearn.linear_model.base import _preprocess_data - -diabetes = load_diabetes() -X = diabetes.data -y = diabetes.target -X = StandardScaler().fit_transform(X) -X = X[:, [2, 3, 6, 7, 8]] - -# test that the feature score of the best features -F, _ = f_regression(X, y) - - -@ignore_warnings(category=DeprecationWarning) -def test_lasso_stability_path(): - # Check lasso stability path - # Load diabetes data and add noisy features - scaling = 0.3 - coef_grid, scores_path = lasso_stability_path(X, y, scaling=scaling, - random_state=42, - n_resampling=30) - - assert_array_equal(np.argsort(F)[-3:], - np.argsort(np.sum(scores_path, axis=1))[-3:]) - - -@ignore_warnings(category=DeprecationWarning) -def test_randomized_lasso_error_memory(): - scaling = 0.3 - selection_threshold = 0.5 - tempdir = 5 - clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42, - scaling=scaling, - selection_threshold=selection_threshold, - memory=tempdir) - assert_raises_regex(ValueError, "'memory' should either be a string or" - " a sklearn.utils.Memory instance", - clf.fit, X, y) - - -@ignore_warnings(category=DeprecationWarning) -def test_randomized_lasso(): - # Check randomized lasso - scaling = 0.3 - selection_threshold = 0.5 - n_resampling = 20 - - # or with 1 alpha - clf = RandomizedLasso(verbose=False, alpha=1, random_state=42, - scaling=scaling, n_resampling=n_resampling, - selection_threshold=selection_threshold) - feature_scores = clf.fit(X, y).scores_ - assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:]) - - # or with many alphas - clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42, - scaling=scaling, n_resampling=n_resampling, - selection_threshold=selection_threshold) - feature_scores = clf.fit(X, y).scores_ - assert_equal(clf.all_scores_.shape, (X.shape[1], 2)) - assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:]) - # test caching - try: - tempdir = 
mkdtemp() - clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42, - scaling=scaling, - selection_threshold=selection_threshold, - memory=tempdir) - feature_scores = clf.fit(X, y).scores_ - assert_equal(clf.all_scores_.shape, (X.shape[1], 2)) - assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:]) - finally: - shutil.rmtree(tempdir) - - X_r = clf.transform(X) - X_full = clf.inverse_transform(X_r) - assert_equal(X_r.shape[1], np.sum(feature_scores > selection_threshold)) - assert_equal(X_full.shape, X.shape) - - clf = RandomizedLasso(verbose=False, alpha='aic', random_state=42, - scaling=scaling, n_resampling=100) - feature_scores = clf.fit(X, y).scores_ - assert_allclose(feature_scores, [1., 1., 1., 0.225, 1.], rtol=0.2) - - clf = RandomizedLasso(verbose=False, scaling=-0.1) - assert_raises(ValueError, clf.fit, X, y) - - clf = RandomizedLasso(verbose=False, scaling=1.1) - assert_raises(ValueError, clf.fit, X, y) - - -@ignore_warnings(category=DeprecationWarning) -def test_randomized_lasso_precompute(): - # Check randomized lasso for different values of precompute - n_resampling = 20 - alpha = 1 - random_state = 42 - - G = np.dot(X.T, X) - - clf = RandomizedLasso(alpha=alpha, random_state=random_state, - precompute=G, n_resampling=n_resampling) - feature_scores_1 = clf.fit(X, y).scores_ - - for precompute in [True, False, None, 'auto']: - clf = RandomizedLasso(alpha=alpha, random_state=random_state, - precompute=precompute, n_resampling=n_resampling) - feature_scores_2 = clf.fit(X, y).scores_ - assert_array_equal(feature_scores_1, feature_scores_2) - - -@ignore_warnings(category=DeprecationWarning) -def test_randomized_logistic(): - # Check randomized sparse logistic regression - iris = load_iris() - X = iris.data[:, [0, 2]] - y = iris.target - X = X[y != 2] - y = y[y != 2] - - F, _ = f_classif(X, y) - - scaling = 0.3 - clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42, - scaling=scaling, n_resampling=50, - tol=1e-3) - X_orig = X.copy() - feature_scores = clf.fit(X, y).scores_ - assert_array_equal(X, X_orig) # fit does not modify X - assert_array_equal(np.argsort(F), np.argsort(feature_scores)) - - clf = RandomizedLogisticRegression(verbose=False, C=[1., 0.5], - random_state=42, scaling=scaling, - n_resampling=50, tol=1e-3) - feature_scores = clf.fit(X, y).scores_ - assert_array_equal(np.argsort(F), np.argsort(feature_scores)) - - clf = RandomizedLogisticRegression(verbose=False, C=[[1., 0.5]]) - assert_raises(ValueError, clf.fit, X, y) - - -@ignore_warnings(category=DeprecationWarning) -def test_randomized_logistic_sparse(): - # Check randomized sparse logistic regression on sparse data - iris = load_iris() - X = iris.data[:, [0, 2]] - y = iris.target - X = X[y != 2] - y = y[y != 2] - - # center here because sparse matrices are usually not centered - # labels should not be centered - X, _, _, _, _ = _preprocess_data(X, y, True, True) - - X_sp = sparse.csr_matrix(X) - - F, _ = f_classif(X, y) - - scaling = 0.3 - clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42, - scaling=scaling, n_resampling=50, - tol=1e-3) - feature_scores = clf.fit(X, y).scores_ - clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42, - scaling=scaling, n_resampling=50, - tol=1e-3) - feature_scores_sp = clf.fit(X_sp, y).scores_ - assert_array_equal(feature_scores, feature_scores_sp) - - -def test_warning_raised(): - - scaling = 0.3 - selection_threshold = 0.5 - tempdir = 5 - assert_warns_message(DeprecationWarning, "The 
function" - " lasso_stability_path is " - "deprecated in 0.19 and will be removed in 0.21.", - lasso_stability_path, X, y, scaling=scaling, - random_state=42, n_resampling=30) - - assert_warns_message(DeprecationWarning, "Class RandomizedLasso is" - " deprecated; The class RandomizedLasso is " - "deprecated in 0.19 and will be removed in 0.21.", - RandomizedLasso, verbose=False, alpha=[1, 0.8], - random_state=42, scaling=scaling, - selection_threshold=selection_threshold, - memory=tempdir) - - assert_warns_message(DeprecationWarning, "The class" - " RandomizedLogisticRegression is " - "deprecated in 0.19 and will be removed in 0.21.", - RandomizedLogisticRegression, - verbose=False, C=1., random_state=42, - scaling=scaling, n_resampling=50, - tol=1e-3) diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 1c69036d0d27a..5ddda56491564 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -805,11 +805,6 @@ def _fit(self, X, skip_num_points=0): neighbors=neighbors_nn, skip_num_points=skip_num_points) - @property - @deprecated("Attribute n_iter_final was deprecated in version 0.19 and " - "will be removed in 0.21. Use ``n_iter_`` instead") - def n_iter_final(self): - return self.n_iter_ def _tsne(self, P, degrees_of_freedom, n_samples, X_embedded, neighbors=None, skip_num_points=0): diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 526d4d9f3d512..5aba68f861253 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -443,8 +443,7 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", batch_size=batch_size)[0] -def manhattan_distances(X, Y=None, sum_over_features=True, - size_threshold=None): +def manhattan_distances(X, Y=None, sum_over_features=True): """ Compute the L1 distances between the vectors in X and Y. With sum_over_features equal to False it returns the componentwise @@ -465,9 +464,6 @@ def manhattan_distances(X, Y=None, sum_over_features=True, else it returns the componentwise L1 pairwise-distances. Not supported for sparse matrix inputs. - size_threshold : int, default=5e8 - Unused parameter. - Returns ------- D : array @@ -497,10 +493,6 @@ def manhattan_distances(X, Y=None, sum_over_features=True, array([[1., 1.], [1., 1.]]) """ - if size_threshold is not None: - warnings.warn('Use of the "size_threshold" is deprecated ' - 'in 0.19 and it will be removed version ' - '0.21 of scikit-learn', DeprecationWarning) X, Y = check_pairwise_arrays(X, Y) if issparse(X) or issparse(Y): diff --git a/sklearn/neighbors/approximate.py b/sklearn/neighbors/approximate.py deleted file mode 100644 index 650af47e0d81b..0000000000000 --- a/sklearn/neighbors/approximate.py +++ /dev/null @@ -1,589 +0,0 @@ -"""Approximate nearest neighbor search""" -# Author: Maheshakya Wijewardena -# Joel Nothman - -import numpy as np -import warnings - -from scipy import sparse - -from .base import KNeighborsMixin, RadiusNeighborsMixin -from ..base import BaseEstimator -from ..utils.validation import check_array -from ..utils import check_random_state -from ..metrics.pairwise import pairwise_distances - -from ..random_projection import GaussianRandomProjection - -__all__ = ["LSHForest"] - -HASH_DTYPE = '>u4' -MAX_HASH_SIZE = np.dtype(HASH_DTYPE).itemsize * 8 - - -def _find_matching_indices(tree, bin_X, left_mask, right_mask): - """Finds indices in sorted array of integers. - - Most significant h bits in the binary representations of the - integers are matched with the items' most significant h bits. 
- """ - left_index = np.searchsorted(tree, bin_X & left_mask) - right_index = np.searchsorted(tree, bin_X | right_mask, - side='right') - return left_index, right_index - - -def _find_longest_prefix_match(tree, bin_X, hash_size, - left_masks, right_masks): - """Find the longest prefix match in tree for each query in bin_X - - Most significant bits are considered as the prefix. - """ - hi = np.empty_like(bin_X, dtype=np.intp) - hi.fill(hash_size) - lo = np.zeros_like(bin_X, dtype=np.intp) - res = np.empty_like(bin_X, dtype=np.intp) - - left_idx, right_idx = _find_matching_indices(tree, bin_X, - left_masks[hi], - right_masks[hi]) - found = right_idx > left_idx - res[found] = lo[found] = hash_size - - r = np.arange(bin_X.shape[0]) - kept = r[lo < hi] # indices remaining in bin_X mask - while kept.shape[0]: - mid = (lo.take(kept) + hi.take(kept)) // 2 - - left_idx, right_idx = _find_matching_indices(tree, - bin_X.take(kept), - left_masks[mid], - right_masks[mid]) - found = right_idx > left_idx - mid_found = mid[found] - lo[kept[found]] = mid_found + 1 - res[kept[found]] = mid_found - hi[kept[~found]] = mid[~found] - - kept = r[lo < hi] - - return res - - -class ProjectionToHashMixin(object): - """Turn a transformed real-valued array into a hash""" - @staticmethod - def _to_hash(projected): - if projected.shape[1] % 8 != 0: - raise ValueError('Require reduced dimensionality to be a multiple ' - 'of 8 for hashing') - # XXX: perhaps non-copying operation better - out = np.packbits((projected > 0).astype(int)).view(dtype=HASH_DTYPE) - return out.reshape(projected.shape[0], -1) - - def fit_transform(self, X, y=None): - """ - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vectors, where n_samples is the number of samples and - n_features is the number of predictors. - """ - - self.fit(X) - return self.transform(X) - - def transform(self, X): - """ - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vectors, where n_samples is the number of samples and - n_features is the number of predictors. - """ - return self._to_hash(super(ProjectionToHashMixin, self).transform(X)) - - -class GaussianRandomProjectionHash(ProjectionToHashMixin, - GaussianRandomProjection): - """Use GaussianRandomProjection to produce a cosine LSH fingerprint - - Parameters - ---------- - - n_components : int or 'auto', optional (default = 32) - Dimensionality of the target projection space. - - n_components can be automatically adjusted according to the - number of samples in the dataset and the bound given by the - Johnson-Lindenstrauss lemma. In that case the quality of the - embedding is controlled by the ``eps`` parameter. - - It should be noted that Johnson-Lindenstrauss lemma can yield - very conservative estimated of the required number of components - as it makes no assumption on the structure of the dataset. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. 
- """ - def __init__(self, - n_components=32, - random_state=None): - super(GaussianRandomProjectionHash, self).__init__( - n_components=n_components, - random_state=random_state) - - -def _array_of_arrays(list_of_arrays): - """Creates an array of array from list of arrays.""" - out = np.empty(len(list_of_arrays), dtype=object) - out[:] = list_of_arrays - return out - - -class LSHForest(BaseEstimator, KNeighborsMixin, RadiusNeighborsMixin): - """Performs approximate nearest neighbor search using LSH forest. - - LSH Forest: Locality Sensitive Hashing forest [1] is an alternative - method for vanilla approximate nearest neighbor search methods. - LSH forest data structure has been implemented using sorted - arrays and binary search and 32 bit fixed-length hashes. - Random projection is used as the hash family which approximates - cosine distance. - - The cosine distance is defined as ``1 - cosine_similarity``: the lowest - value is 0 (identical point) but it is bounded above by 2 for the farthest - points. Its value does not depend on the norm of the vector points but - only on their relative angles. - - Parameters - ---------- - - n_estimators : int (default = 10) - Number of trees in the LSH Forest. - - radius : float, optinal (default = 1.0) - Radius from the data point to its neighbors. This is the parameter - space to use by default for the :meth:`radius_neighbors` queries. - - n_candidates : int (default = 50) - Minimum number of candidates evaluated per estimator, assuming enough - items meet the `min_hash_match` constraint. - - n_neighbors : int (default = 5) - Number of neighbors to be returned from query function when - it is not provided to the :meth:`kneighbors` method. - - min_hash_match : int (default = 4) - lowest hash length to be searched when candidate selection is - performed for nearest neighbors. - - radius_cutoff_ratio : float, optional (default = 0.9) - A value ranges from 0 to 1. Radius neighbors will be searched until - the ratio between total neighbors within the radius and the total - candidates becomes less than this value unless it is terminated by - hash length reaching `min_hash_match`. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - Attributes - ---------- - - hash_functions_ : list of GaussianRandomProjectionHash objects - Hash function g(p,x) for a tree is an array of 32 randomly generated - float arrays with the same dimension as the data set. This array is - stored in GaussianRandomProjectionHash object and can be obtained - from ``components_`` attribute. - - trees_ : array, shape (n_estimators, n_samples) - Each tree (corresponding to a hash function) contains an array of - sorted hashed values. The array representation may change in future - versions. - - original_indices_ : array, shape (n_estimators, n_samples) - Original indices of sorted hashed values in the fitted index. - - References - ---------- - - .. [1] M. Bawa, T. Condie and P. Ganesan, "LSH Forest: Self-Tuning - Indexes for Similarity Search", WWW '05 Proceedings of the - 14th international conference on World Wide Web, 651-660, - 2005. 
- - Examples - -------- - >>> from sklearn.neighbors import LSHForest - - >>> X_train = [[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1], [6, 10, 2]] - >>> X_test = [[9, 1, 6], [3, 1, 10], [7, 10, 3]] - >>> lshf = LSHForest(random_state=42) # doctest: +SKIP - >>> lshf.fit(X_train) # doctest: +SKIP - LSHForest(min_hash_match=4, n_candidates=50, n_estimators=10, - n_neighbors=5, radius=1.0, radius_cutoff_ratio=0.9, - random_state=42) - >>> distances, indices = lshf.kneighbors(X_test, n_neighbors=2) - ... # doctest: +SKIP - >>> distances # doctest: +SKIP - array([[0.069..., 0.149...], - [0.229..., 0.481...], - [0.004..., 0.014...]]) - >>> indices # doctest: +SKIP - array([[1, 2], - [2, 0], - [4, 0]]) - - """ - - def __init__(self, n_estimators=10, radius=1.0, n_candidates=50, - n_neighbors=5, min_hash_match=4, radius_cutoff_ratio=.9, - random_state=None): - self.n_estimators = n_estimators - self.radius = radius - self.random_state = random_state - self.n_candidates = n_candidates - self.n_neighbors = n_neighbors - self.min_hash_match = min_hash_match - self.radius_cutoff_ratio = radius_cutoff_ratio - - warnings.warn("LSHForest has poor performance and has been deprecated " - "in 0.19. It will be removed in version 0.21.", - DeprecationWarning) - - def _compute_distances(self, query, candidates): - """Computes the cosine distance. - - Distance is from the query to points in the candidates array. - Returns argsort of distances in the candidates - array and sorted distances. - """ - if candidates.shape == (0,): - # needed since _fit_X[np.array([])] doesn't work if _fit_X sparse - return np.empty(0, dtype=np.int), np.empty(0, dtype=float) - - if sparse.issparse(self._fit_X): - candidate_X = self._fit_X[candidates] - else: - candidate_X = self._fit_X.take(candidates, axis=0, mode='clip') - distances = pairwise_distances(query, candidate_X, - metric='cosine')[0] - distance_positions = np.argsort(distances) - distances = distances.take(distance_positions, mode='clip', axis=0) - return distance_positions, distances - - def _generate_masks(self): - """Creates left and right masks for all hash lengths.""" - tri_size = MAX_HASH_SIZE + 1 - # Called once on fitting, output is independent of hashes - left_mask = np.tril(np.ones((tri_size, tri_size), dtype=int))[:, 1:] - right_mask = left_mask[::-1, ::-1] - - self._left_mask = np.packbits(left_mask).view(dtype=HASH_DTYPE) - self._right_mask = np.packbits(right_mask).view(dtype=HASH_DTYPE) - - def _get_candidates(self, query, max_depth, bin_queries, n_neighbors): - """Performs the Synchronous ascending phase. - - Returns an array of candidates, their distance ranks and - distances. 
- """ - index_size = self._fit_X.shape[0] - # Number of candidates considered including duplicates - # XXX: not sure whether this is being calculated correctly wrt - # duplicates from different iterations through a single tree - n_candidates = 0 - candidate_set = set() - min_candidates = self.n_candidates * self.n_estimators - while (max_depth > self.min_hash_match and - (n_candidates < min_candidates or - len(candidate_set) < n_neighbors)): - - left_mask = self._left_mask[max_depth] - right_mask = self._right_mask[max_depth] - for i in range(self.n_estimators): - start, stop = _find_matching_indices(self.trees_[i], - bin_queries[i], - left_mask, right_mask) - n_candidates += stop - start - candidate_set.update( - self.original_indices_[i][start:stop].tolist()) - max_depth -= 1 - - candidates = np.fromiter(candidate_set, count=len(candidate_set), - dtype=np.intp) - # For insufficient candidates, candidates are filled. - # Candidates are filled from unselected indices uniformly. - if candidates.shape[0] < n_neighbors: - warnings.warn( - "Number of candidates is not sufficient to retrieve" - " %i neighbors with" - " min_hash_match = %i. Candidates are filled up" - " uniformly from unselected" - " indices." % (n_neighbors, self.min_hash_match)) - remaining = np.setdiff1d(np.arange(0, index_size), candidates) - to_fill = n_neighbors - candidates.shape[0] - candidates = np.concatenate((candidates, remaining[:to_fill])) - - ranks, distances = self._compute_distances(query, - candidates.astype(int)) - - return (candidates[ranks[:n_neighbors]], - distances[:n_neighbors]) - - def _get_radius_neighbors(self, query, max_depth, bin_queries, radius): - """Finds radius neighbors from the candidates obtained. - - Their distances from query are smaller than radius. - Returns radius neighbors and distances. - """ - ratio_within_radius = 1 - threshold = 1 - self.radius_cutoff_ratio - total_candidates = np.array([], dtype=int) - total_neighbors = np.array([], dtype=int) - total_distances = np.array([], dtype=float) - - while (max_depth > self.min_hash_match and - ratio_within_radius > threshold): - left_mask = self._left_mask[max_depth] - right_mask = self._right_mask[max_depth] - candidates = [] - for i in range(self.n_estimators): - start, stop = _find_matching_indices(self.trees_[i], - bin_queries[i], - left_mask, right_mask) - candidates.extend( - self.original_indices_[i][start:stop].tolist()) - candidates = np.setdiff1d(candidates, total_candidates) - total_candidates = np.append(total_candidates, candidates) - ranks, distances = self._compute_distances(query, candidates) - m = np.searchsorted(distances, radius, side='right') - positions = np.searchsorted(total_distances, distances[:m]) - total_neighbors = np.insert(total_neighbors, positions, - candidates[ranks[:m]]) - total_distances = np.insert(total_distances, positions, - distances[:m]) - ratio_within_radius = (total_neighbors.shape[0] / - float(total_candidates.shape[0])) - max_depth = max_depth - 1 - return total_neighbors, total_distances - - def fit(self, X, y=None): - """Fit the LSH forest on the data. - - This creates binary hashes of input data points by getting the - dot product of input points and hash_function then - transforming the projection into a binary string array based - on the sign (positive/negative) of the projection. - A sorted array of binary hashes is created. - - Parameters - ---------- - X : array_like or sparse (CSR) matrix, shape (n_samples, n_features) - List of n_features-dimensional data points. 
Each row - corresponds to a single data point. - - Returns - ------- - self : object - """ - - self._fit_X = check_array(X, accept_sparse='csr') - - # Creates a g(p,x) for each tree - self.hash_functions_ = [] - self.trees_ = [] - self.original_indices_ = [] - - rng = check_random_state(self.random_state) - int_max = np.iinfo(np.int32).max - - for i in range(self.n_estimators): - # This is g(p,x) for a particular tree. - # Builds a single tree. Hashing is done on an array of data points. - # `GaussianRandomProjection` is used for hashing. - # `n_components=hash size and n_features=n_dim. - hasher = GaussianRandomProjectionHash(MAX_HASH_SIZE, - rng.randint(0, int_max)) - hashes = hasher.fit_transform(self._fit_X)[:, 0] - original_index = np.argsort(hashes) - bin_hashes = hashes[original_index] - self.original_indices_.append(original_index) - self.trees_.append(bin_hashes) - self.hash_functions_.append(hasher) - - self._generate_masks() - - return self - - def _query(self, X): - """Performs descending phase to find maximum depth.""" - # Calculate hashes of shape (n_samples, n_estimators, [hash_size]) - bin_queries = np.asarray([hasher.transform(X)[:, 0] - for hasher in self.hash_functions_]) - bin_queries = np.rollaxis(bin_queries, 1) - - # descend phase - depths = [_find_longest_prefix_match(tree, tree_queries, MAX_HASH_SIZE, - self._left_mask, self._right_mask) - for tree, tree_queries in zip(self.trees_, - np.rollaxis(bin_queries, 1))] - - return bin_queries, np.max(depths, axis=0) - - def kneighbors(self, X, n_neighbors=None, return_distance=True): - """Returns n_neighbors of approximate nearest neighbors. - - Parameters - ---------- - X : array_like or sparse (CSR) matrix, shape (n_samples, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single query. - - n_neighbors : int, optional (default = None) - Number of neighbors required. If not provided, this will - return the number specified at the initialization. - - return_distance : boolean, optional (default = True) - Returns the distances of neighbors if set to True. - - Returns - ------- - dist : array, shape (n_samples, n_neighbors) - Array representing the cosine distances to each point, - only present if return_distance=True. - - ind : array, shape (n_samples, n_neighbors) - Indices of the approximate nearest points in the population - matrix. - """ - if not hasattr(self, 'hash_functions_'): - raise ValueError("estimator should be fitted.") - - if n_neighbors is None: - n_neighbors = self.n_neighbors - - X = check_array(X, accept_sparse='csr') - - neighbors, distances = [], [] - bin_queries, max_depth = self._query(X) - for i in range(X.shape[0]): - - neighs, dists = self._get_candidates(X[[i]], max_depth[i], - bin_queries[i], - n_neighbors) - neighbors.append(neighs) - distances.append(dists) - - if return_distance: - return np.array(distances), np.array(neighbors) - else: - return np.array(neighbors) - - def radius_neighbors(self, X, radius=None, return_distance=True): - """Finds the neighbors within a given radius of a point or points. - - Return the indices and distances of some points from the dataset - lying in a ball with size ``radius`` around the points of the query - array. Points lying on the boundary are included in the results. - - The result points are *not* necessarily sorted by distance to their - query point. - - LSH Forest being an approximate method, some true neighbors from the - indexed dataset might be missing from the results. 
- - Parameters - ---------- - X : array_like or sparse (CSR) matrix, shape (n_samples, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single query. - - radius : float - Limiting distance of neighbors to return. - (default is the value passed to the constructor). - - return_distance : boolean, optional (default = False) - Returns the distances of neighbors if set to True. - - Returns - ------- - dist : array, shape (n_samples,) of arrays - Each element is an array representing the cosine distances - to some points found within ``radius`` of the respective query. - Only present if ``return_distance=True``. - - ind : array, shape (n_samples,) of arrays - Each element is an array of indices for neighbors within ``radius`` - of the respective query. - """ - if not hasattr(self, 'hash_functions_'): - raise ValueError("estimator should be fitted.") - - if radius is None: - radius = self.radius - - X = check_array(X, accept_sparse='csr') - - neighbors, distances = [], [] - bin_queries, max_depth = self._query(X) - for i in range(X.shape[0]): - - neighs, dists = self._get_radius_neighbors(X[[i]], max_depth[i], - bin_queries[i], radius) - neighbors.append(neighs) - distances.append(dists) - - if return_distance: - return _array_of_arrays(distances), _array_of_arrays(neighbors) - else: - return _array_of_arrays(neighbors) - - def partial_fit(self, X, y=None): - """ - Inserts new data into the already fitted LSH Forest. - Cost is proportional to new total size, so additions - should be batched. - - Parameters - ---------- - X : array_like or sparse (CSR) matrix, shape (n_samples, n_features) - New data point to be inserted into the LSH Forest. - """ - X = check_array(X, accept_sparse='csr') - if not hasattr(self, 'hash_functions_'): - return self.fit(X) - - if X.shape[1] != self._fit_X.shape[1]: - raise ValueError("Number of features in X and" - " fitted array does not match.") - n_samples = X.shape[0] - n_indexed = self._fit_X.shape[0] - - for i in range(self.n_estimators): - bin_X = self.hash_functions_[i].transform(X)[:, 0] - # gets the position to be added in the tree. - positions = self.trees_[i].searchsorted(bin_X) - # adds the hashed value into the tree. - self.trees_[i] = np.insert(self.trees_[i], - positions, bin_X) - # add the entry into the original_indices_. - self.original_indices_[i] = np.insert(self.original_indices_[i], - positions, - np.arange(n_indexed, - n_indexed + - n_samples)) - - # adds the entry into the input_array. - if sparse.issparse(X) or sparse.issparse(self._fit_X): - self._fit_X = sparse.vstack((self._fit_X, X)) - else: - self._fit_X = np.row_stack((self._fit_X, X)) - - return self diff --git a/sklearn/neighbors/tests/test_approximate.py b/sklearn/neighbors/tests/test_approximate.py deleted file mode 100644 index 1536271897625..0000000000000 --- a/sklearn/neighbors/tests/test_approximate.py +++ /dev/null @@ -1,498 +0,0 @@ -""" -Testing for the approximate neighbor search using -Locality Sensitive Hashing Forest module -(sklearn.neighbors.LSHForest). 
-""" - -# Author: Maheshakya Wijewardena, Joel Nothman - -import numpy as np -import scipy.sparse as sp - -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_almost_equal -from sklearn.utils.testing import assert_array_almost_equal -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_array_less -from sklearn.utils.testing import assert_greater -from sklearn.utils.testing import assert_true -from sklearn.utils.testing import assert_not_equal -from sklearn.utils.testing import assert_warns_message -from sklearn.utils.testing import ignore_warnings - -from sklearn.metrics.pairwise import pairwise_distances -from sklearn.neighbors import LSHForest -from sklearn.neighbors import NearestNeighbors - - -def test_lsh_forest_deprecation(): - assert_warns_message(DeprecationWarning, - "LSHForest has poor performance and has been " - "deprecated in 0.19. It will be removed " - "in version 0.21.", LSHForest) - - -def test_neighbors_accuracy_with_n_candidates(): - # Checks whether accuracy increases as `n_candidates` increases. - n_candidates_values = np.array([.1, 50, 500]) - n_samples = 100 - n_features = 10 - n_iter = 10 - n_points = 5 - rng = np.random.RandomState(42) - accuracies = np.zeros(n_candidates_values.shape[0], dtype=float) - X = rng.rand(n_samples, n_features) - - for i, n_candidates in enumerate(n_candidates_values): - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - n_candidates=n_candidates, random_state=0) - ignore_warnings(lshf.fit)(X) - for j in range(n_iter): - query = X[rng.randint(0, n_samples)].reshape(1, -1) - - neighbors = lshf.kneighbors(query, n_neighbors=n_points, - return_distance=False) - distances = pairwise_distances(query, X, metric='cosine') - ranks = np.argsort(distances)[0, :n_points] - - intersection = np.intersect1d(ranks, neighbors).shape[0] - ratio = intersection / float(n_points) - accuracies[i] = accuracies[i] + ratio - - accuracies[i] = accuracies[i] / float(n_iter) - # Sorted accuracies should be equal to original accuracies - print('accuracies:', accuracies) - assert_true(np.all(np.diff(accuracies) >= 0), - msg="Accuracies are not non-decreasing.") - # Highest accuracy should be strictly greater than the lowest - assert_true(np.ptp(accuracies) > 0, - msg="Highest accuracy is not strictly greater than lowest.") - - -def test_neighbors_accuracy_with_n_estimators(): - # Checks whether accuracy increases as `n_estimators` increases. 
- n_estimators = np.array([1, 10, 100]) - n_samples = 100 - n_features = 10 - n_iter = 10 - n_points = 5 - rng = np.random.RandomState(42) - accuracies = np.zeros(n_estimators.shape[0], dtype=float) - X = rng.rand(n_samples, n_features) - - for i, t in enumerate(n_estimators): - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - n_candidates=500, n_estimators=t) - ignore_warnings(lshf.fit)(X) - for j in range(n_iter): - query = X[rng.randint(0, n_samples)].reshape(1, -1) - neighbors = lshf.kneighbors(query, n_neighbors=n_points, - return_distance=False) - distances = pairwise_distances(query, X, metric='cosine') - ranks = np.argsort(distances)[0, :n_points] - - intersection = np.intersect1d(ranks, neighbors).shape[0] - ratio = intersection / float(n_points) - accuracies[i] = accuracies[i] + ratio - - accuracies[i] = accuracies[i] / float(n_iter) - # Sorted accuracies should be equal to original accuracies - assert_true(np.all(np.diff(accuracies) >= 0), - msg="Accuracies are not non-decreasing.") - # Highest accuracy should be strictly greater than the lowest - assert_true(np.ptp(accuracies) > 0, - msg="Highest accuracy is not strictly greater than lowest.") - - -@ignore_warnings -def test_kneighbors(): - # Checks whether desired number of neighbors are returned. - # It is guaranteed to return the requested number of neighbors - # if `min_hash_match` is set to 0. Returned distances should be - # in ascending order. - n_samples = 12 - n_features = 2 - n_iter = 10 - rng = np.random.RandomState(42) - X = rng.rand(n_samples, n_features) - - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - min_hash_match=0) - # Test unfitted estimator - assert_raises(ValueError, lshf.kneighbors, X[0]) - - ignore_warnings(lshf.fit)(X) - - for i in range(n_iter): - n_neighbors = rng.randint(0, n_samples) - query = X[rng.randint(0, n_samples)].reshape(1, -1) - neighbors = lshf.kneighbors(query, n_neighbors=n_neighbors, - return_distance=False) - # Desired number of neighbors should be returned. - assert_equal(neighbors.shape[1], n_neighbors) - - # Multiple points - n_queries = 5 - queries = X[rng.randint(0, n_samples, n_queries)] - distances, neighbors = lshf.kneighbors(queries, - n_neighbors=1, - return_distance=True) - assert_equal(neighbors.shape[0], n_queries) - assert_equal(distances.shape[0], n_queries) - # Test only neighbors - neighbors = lshf.kneighbors(queries, n_neighbors=1, - return_distance=False) - assert_equal(neighbors.shape[0], n_queries) - # Test random point(not in the data set) - query = rng.randn(n_features).reshape(1, -1) - lshf.kneighbors(query, n_neighbors=1, - return_distance=False) - # Test n_neighbors at initialization - neighbors = lshf.kneighbors(query, return_distance=False) - assert_equal(neighbors.shape[1], 5) - # Test `neighbors` has an integer dtype - assert_true(neighbors.dtype.kind == 'i', - msg="neighbors are not in integer dtype.") - - -def test_radius_neighbors(): - # Checks whether Returned distances are less than `radius` - # At least one point should be returned when the `radius` is set - # to mean distance from the considering point to other points in - # the database. - # Moreover, this test compares the radius neighbors of LSHForest - # with the `sklearn.neighbors.NearestNeighbors`. 
- n_samples = 12 - n_features = 2 - n_iter = 10 - rng = np.random.RandomState(42) - X = rng.rand(n_samples, n_features) - - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)() - # Test unfitted estimator - assert_raises(ValueError, lshf.radius_neighbors, X[0]) - - ignore_warnings(lshf.fit)(X) - - for i in range(n_iter): - # Select a random point in the dataset as the query - query = X[rng.randint(0, n_samples)].reshape(1, -1) - - # At least one neighbor should be returned when the radius is the - # mean distance from the query to the points of the dataset. - mean_dist = np.mean(pairwise_distances(query, X, metric='cosine')) - neighbors = lshf.radius_neighbors(query, radius=mean_dist, - return_distance=False) - - assert_equal(neighbors.shape, (1,)) - assert_equal(neighbors.dtype, object) - assert_greater(neighbors[0].shape[0], 0) - # All distances to points in the results of the radius query should - # be less than mean_dist - distances, neighbors = lshf.radius_neighbors(query, - radius=mean_dist, - return_distance=True) - assert_array_less(distances[0], mean_dist) - - # Multiple points - n_queries = 5 - queries = X[rng.randint(0, n_samples, n_queries)] - distances, neighbors = lshf.radius_neighbors(queries, - return_distance=True) - - # dists and inds should not be 1D arrays or arrays of variable lengths - # hence the use of the object dtype. - assert_equal(distances.shape, (n_queries,)) - assert_equal(distances.dtype, object) - assert_equal(neighbors.shape, (n_queries,)) - assert_equal(neighbors.dtype, object) - - # Compare with exact neighbor search - query = X[rng.randint(0, n_samples)].reshape(1, -1) - mean_dist = np.mean(pairwise_distances(query, X, metric='cosine')) - nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X) - - distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist) - distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist) - - # The following fails on some platforms. See #10244 - - # # Radius-based queries do not sort the result points and the order - # # depends on the method, the random_state and the dataset order. - # # We need to sort the results ourselves before performing any comparison. - # sorted_dists_exact = np.sort(distances_exact[0]) - # sorted_dists_approx = np.sort(distances_approx[0]) - # - # # Distances to exact neighbors are less than or equal to approximate - # # counterparts as the approximate radius query might have missed some - # # closer neighbors. - # - # assert_true(np.all(np.less_equal(sorted_dists_exact, - # sorted_dists_approx))) - - -@ignore_warnings -def test_radius_neighbors_boundary_handling(): - X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]] - n_points = len(X) - - # Build an exact nearest neighbors model as reference model to ensure - # consistency between exact and approximate methods - nnbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X) - - # Build a LSHForest model with hyperparameter values that always guarantee - # exact results on this toy dataset. 
- lsfh = ignore_warnings(LSHForest, category=DeprecationWarning)( - min_hash_match=0, n_candidates=n_points, random_state=42).fit(X) - - # define a query aligned with the first axis - query = [[1., 0.]] - - # Compute the exact cosine distances of the query to the four points of - # the dataset - dists = pairwise_distances(query, X, metric='cosine').ravel() - - # The first point is almost aligned with the query (very small angle), - # the cosine distance should therefore be almost null: - assert_almost_equal(dists[0], 0, decimal=5) - - # The second point form an angle of 45 degrees to the query vector - assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4)) - - # The third point is orthogonal from the query vector hence at a distance - # exactly one: - assert_almost_equal(dists[2], 1) - - # The last point is almost colinear but with opposite sign to the query - # therefore it has a cosine 'distance' very close to the maximum possible - # value of 2. - assert_almost_equal(dists[3], 2, decimal=5) - - # If we query with a radius of one, all the samples except the last sample - # should be included in the results. This means that the third sample - # is lying on the boundary of the radius query: - exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1) - approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1) - - assert_array_equal(np.sort(exact_idx[0]), [0, 1, 2]) - assert_array_equal(np.sort(approx_idx[0]), [0, 1, 2]) - assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-1]) - assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-1]) - - # If we perform the same query with a slightly lower radius, the third - # point of the dataset that lay on the boundary of the previous query - # is now rejected: - eps = np.finfo(np.float64).eps - exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1 - eps) - approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1 - eps) - - assert_array_equal(np.sort(exact_idx[0]), [0, 1]) - assert_array_equal(np.sort(approx_idx[0]), [0, 1]) - assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-2]) - assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-2]) - - -def test_distances(): - # Checks whether returned neighbors are from closest to farthest. - n_samples = 12 - n_features = 2 - n_iter = 10 - rng = np.random.RandomState(42) - X = rng.rand(n_samples, n_features) - - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)() - ignore_warnings(lshf.fit)(X) - - for i in range(n_iter): - n_neighbors = rng.randint(0, n_samples) - query = X[rng.randint(0, n_samples)].reshape(1, -1) - distances, neighbors = lshf.kneighbors(query, - n_neighbors=n_neighbors, - return_distance=True) - - # Returned neighbors should be from closest to farthest, that is - # increasing distance values. - assert_true(np.all(np.diff(distances[0]) >= 0)) - - # Note: the radius_neighbors method does not guarantee the order of - # the results. - - -def test_fit(): - # Checks whether `fit` method sets all attribute values correctly. 
- n_samples = 12 - n_features = 2 - n_estimators = 5 - rng = np.random.RandomState(42) - X = rng.rand(n_samples, n_features) - - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - n_estimators=n_estimators) - ignore_warnings(lshf.fit)(X) - - # _input_array = X - assert_array_equal(X, lshf._fit_X) - # A hash function g(p) for each tree - assert_equal(n_estimators, len(lshf.hash_functions_)) - # Hash length = 32 - assert_equal(32, lshf.hash_functions_[0].components_.shape[0]) - # Number of trees_ in the forest - assert_equal(n_estimators, len(lshf.trees_)) - # Each tree has entries for every data point - assert_equal(n_samples, len(lshf.trees_[0])) - # Original indices after sorting the hashes - assert_equal(n_estimators, len(lshf.original_indices_)) - # Each set of original indices in a tree has entries for every data point - assert_equal(n_samples, len(lshf.original_indices_[0])) - - -def test_partial_fit(): - # Checks whether inserting array is consistent with fitted data. - # `partial_fit` method should set all attribute values correctly. - n_samples = 12 - n_samples_partial_fit = 3 - n_features = 2 - rng = np.random.RandomState(42) - X = rng.rand(n_samples, n_features) - X_partial_fit = rng.rand(n_samples_partial_fit, n_features) - - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)() - - # Test unfitted estimator - ignore_warnings(lshf.partial_fit)(X) - assert_array_equal(X, lshf._fit_X) - - ignore_warnings(lshf.fit)(X) - - # Insert wrong dimension - assert_raises(ValueError, lshf.partial_fit, - np.random.randn(n_samples_partial_fit, n_features - 1)) - - ignore_warnings(lshf.partial_fit)(X_partial_fit) - - # size of _input_array = samples + 1 after insertion - assert_equal(lshf._fit_X.shape[0], - n_samples + n_samples_partial_fit) - # size of original_indices_[1] = samples + 1 - assert_equal(len(lshf.original_indices_[0]), - n_samples + n_samples_partial_fit) - # size of trees_[1] = samples + 1 - assert_equal(len(lshf.trees_[1]), - n_samples + n_samples_partial_fit) - - -def test_hash_functions(): - # Checks randomness of hash functions. - # Variance and mean of each hash function (projection vector) - # should be different from flattened array of hash functions. - # If hash functions are not randomly built (seeded with - # same value), variances and means of all functions are equal. - n_samples = 12 - n_features = 2 - n_estimators = 5 - rng = np.random.RandomState(42) - X = rng.rand(n_samples, n_features) - - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - n_estimators=n_estimators, - random_state=rng.randint(0, np.iinfo(np.int32).max)) - ignore_warnings(lshf.fit)(X) - - hash_functions = [] - for i in range(n_estimators): - hash_functions.append(lshf.hash_functions_[i].components_) - - for i in range(n_estimators): - assert_not_equal(np.var(hash_functions), - np.var(lshf.hash_functions_[i].components_)) - - for i in range(n_estimators): - assert_not_equal(np.mean(hash_functions), - np.mean(lshf.hash_functions_[i].components_)) - - -def test_candidates(): - # Checks whether candidates are sufficient. - # This should handle the cases when number of candidates is 0. - # User should be warned when number of candidates is less than - # requested number of neighbors. 
- X_train = np.array([[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1], - [6, 10, 2]], dtype=np.float32) - X_test = np.array([7, 10, 3], dtype=np.float32).reshape(1, -1) - - # For zero candidates - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - min_hash_match=32) - ignore_warnings(lshf.fit)(X_train) - - message = ("Number of candidates is not sufficient to retrieve" - " %i neighbors with" - " min_hash_match = %i. Candidates are filled up" - " uniformly from unselected" - " indices." % (3, 32)) - assert_warns_message(UserWarning, message, lshf.kneighbors, - X_test, n_neighbors=3) - distances, neighbors = lshf.kneighbors(X_test, n_neighbors=3) - assert_equal(distances.shape[1], 3) - - # For candidates less than n_neighbors - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - min_hash_match=31) - ignore_warnings(lshf.fit)(X_train) - - message = ("Number of candidates is not sufficient to retrieve" - " %i neighbors with" - " min_hash_match = %i. Candidates are filled up" - " uniformly from unselected" - " indices." % (5, 31)) - assert_warns_message(UserWarning, message, lshf.kneighbors, - X_test, n_neighbors=5) - distances, neighbors = lshf.kneighbors(X_test, n_neighbors=5) - assert_equal(distances.shape[1], 5) - - -def test_graphs(): - # Smoke tests for graph methods. - n_samples_sizes = [5, 10, 20] - n_features = 3 - rng = np.random.RandomState(42) - - for n_samples in n_samples_sizes: - X = rng.rand(n_samples, n_features) - lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - min_hash_match=0) - ignore_warnings(lshf.fit)(X) - - kneighbors_graph = lshf.kneighbors_graph(X) - radius_neighbors_graph = lshf.radius_neighbors_graph(X) - - assert_equal(kneighbors_graph.shape[0], n_samples) - assert_equal(kneighbors_graph.shape[1], n_samples) - assert_equal(radius_neighbors_graph.shape[0], n_samples) - assert_equal(radius_neighbors_graph.shape[1], n_samples) - - -def test_sparse_input(): - X1 = sp.rand(50, 100, random_state=0) - X2 = sp.rand(10, 100, random_state=1) - forest_sparse = ignore_warnings(LSHForest, category=DeprecationWarning)( - radius=1, random_state=0).fit(X1) - forest_dense = ignore_warnings(LSHForest, category=DeprecationWarning)( - radius=1, random_state=0).fit(X1.A) - - d_sparse, i_sparse = forest_sparse.kneighbors(X2, return_distance=True) - d_dense, i_dense = forest_dense.kneighbors(X2.A, return_distance=True) - - assert_almost_equal(d_sparse, d_dense) - assert_almost_equal(i_sparse, i_dense) - - d_sparse, i_sparse = forest_sparse.radius_neighbors(X2, - return_distance=True) - d_dense, i_dense = forest_dense.radius_neighbors(X2.A, - return_distance=True) - assert_equal(d_sparse.shape, d_dense.shape) - for a, b in zip(d_sparse, d_dense): - assert_almost_equal(a, b) - for a, b in zip(i_sparse, i_dense): - assert_almost_equal(a, b) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 15905bf37d2e5..d1d69bde6f4a8 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -37,8 +37,6 @@ from .imputation import Imputer -# stub, remove in version 0.21 -from .data import CategoricalEncoder # noqa __all__ = [ 'Binarizer', diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 0c79543338212..93afcc646e3fb 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -136,7 +136,7 @@ def fit(self, X, y=None): self._check_inverse_transform(X) return self - def 
transform(self, X, y='deprecated'): + def transform(self, X): """Transform X using the forward function. Parameters @@ -144,22 +144,14 @@ def transform(self, X, y='deprecated'): X : array-like, shape (n_samples, n_features) Input array. - y : (ignored) - .. deprecated::0.19 - Returns ------- X_out : array-like, shape (n_samples, n_features) Transformed input. """ - if not isinstance(y, string_types) or y != 'deprecated': - warnings.warn("The parameter y on transform() is " - "deprecated since 0.19 and will be removed in 0.21", - DeprecationWarning) - - return self._transform(X, y=y, func=self.func, kw_args=self.kw_args) + return self._transform(X, func=self.func, kw_args=self.kw_args) - def inverse_transform(self, X, y='deprecated'): + def inverse_transform(self, X): """Transform X using the inverse function. Parameters @@ -167,35 +159,18 @@ def inverse_transform(self, X, y='deprecated'): X : array-like, shape (n_samples, n_features) Input array. - y : (ignored) - .. deprecated::0.19 - Returns ------- X_out : array-like, shape (n_samples, n_features) Transformed input. """ - if not isinstance(y, string_types) or y != 'deprecated': - warnings.warn("The parameter y on inverse_transform() is " - "deprecated since 0.19 and will be removed in 0.21", - DeprecationWarning) - return self._transform(X, y=y, func=self.inverse_func, + return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args) - def _transform(self, X, y=None, func=None, kw_args=None): + def _transform(self, X, func=None, kw_args=None): X = self._check_input(X) if func is None: func = _identity - if (not isinstance(self.pass_y, string_types) or - self.pass_y != 'deprecated'): - # We do this to know if pass_y was set to False / True - pass_y = self.pass_y - warnings.warn("The parameter pass_y is deprecated since 0.19 and " - "will be removed in 0.21", DeprecationWarning) - else: - pass_y = False - - return func(X, *((y,) if pass_y else ()), - **(kw_args if kw_args else {})) + return func(X, **(kw_args if kw_args else {})) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 0a33f9140f902..9b3eaa98e4c08 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -718,24 +718,16 @@ def partial_fit(self, X, y=None): return self - def transform(self, X, y='deprecated', copy=None): + def transform(self, X, copy=None): """Perform standardization by centering and scaling Parameters ---------- X : array-like, shape [n_samples, n_features] The data used to scale along the features axis. - y : (ignored) - .. deprecated:: 0.19 - This parameter will be removed in 0.21. copy : bool, optional (default: None) Copy the input X or not. """ - if not isinstance(y, string_types) or y != 'deprecated': - warnings.warn("The parameter y on transform() is " - "deprecated since 0.19 and will be removed in 0.21", - DeprecationWarning) - check_is_fitted(self, 'scale_') copy = copy if copy is not None else self.copy @@ -1655,7 +1647,7 @@ def fit(self, X, y=None): X = check_array(X, accept_sparse='csr') return self - def transform(self, X, y='deprecated', copy=None): + def transform(self, X, copy=None): """Scale each non zero row of X to unit norm Parameters @@ -1663,17 +1655,9 @@ def transform(self, X, y='deprecated', copy=None): X : {array-like, sparse matrix}, shape [n_samples, n_features] The data to normalize, row by row. scipy.sparse matrices should be in CSR format to avoid an un-necessary copy. - y : (ignored) - .. deprecated:: 0.19 - This parameter will be removed in 0.21. 
copy : bool, optional (default: None) Copy the input X or not. """ - if not isinstance(y, string_types) or y != 'deprecated': - warnings.warn("The parameter y on transform() is " - "deprecated since 0.19 and will be removed in 0.21", - DeprecationWarning) - copy = copy if copy is not None else self.copy X = check_array(X, accept_sparse='csr') return normalize(X, norm=self.norm, axis=1, copy=copy) @@ -1794,7 +1778,7 @@ def fit(self, X, y=None): check_array(X, accept_sparse='csr') return self - def transform(self, X, y='deprecated', copy=None): + def transform(self, X, copy=None): """Binarize each element of X Parameters @@ -1803,17 +1787,10 @@ def transform(self, X, y='deprecated', copy=None): The data to binarize, element by element. scipy.sparse matrices should be in CSR format to avoid an un-necessary copy. - y : (ignored) - .. deprecated:: 0.19 - This parameter will be removed in 0.21. + copy : bool Copy the input X or not. """ - if not isinstance(y, string_types) or y != 'deprecated': - warnings.warn("The parameter y on transform() is " - "deprecated since 0.19 and will be removed in 0.21", - DeprecationWarning) - copy = copy if copy is not None else self.copy return binarize(X, threshold=self.threshold, copy=copy) @@ -1872,16 +1849,14 @@ def fit(self, K, y=None): self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples return self - def transform(self, K, y='deprecated', copy=True): + def transform(self, K, copy=True): """Center kernel matrix. Parameters ---------- K : numpy array of shape [n_samples1, n_samples2] Kernel matrix. - y : (ignored) - .. deprecated:: 0.19 - This parameter will be removed in 0.21. + copy : boolean, optional, default True Set to False to perform inplace computation. @@ -1889,11 +1864,6 @@ def transform(self, K, y='deprecated', copy=True): ------- K_new : numpy array of shape [n_samples1, n_samples2] """ - if not isinstance(y, string_types) or y != 'deprecated': - warnings.warn("The parameter y on transform() is " - "deprecated since 0.19 and will be removed in 0.21", - DeprecationWarning) - check_is_fitted(self, 'K_fit_all_') K = check_array(K, copy=copy, dtype=FLOAT_DTYPES) @@ -2902,18 +2872,4 @@ def power_transform(X, method='box-cox', standardize=True, copy=True): Royal Statistical Society B, 26, 211-252 (1964). """ pt = PowerTransformer(method=method, standardize=standardize, copy=copy) - return pt.fit_transform(X) - - -class CategoricalEncoder: - """ - CategoricalEncoder briefly existed in 0.20dev. Its functionality - has been rolled into the OneHotEncoder and OrdinalEncoder. - This stub will be removed in version 0.21. - """ - - def __init__(*args, **kwargs): - raise RuntimeError( - "CategoricalEncoder briefly existed in 0.20dev. Its functionality " - "has been rolled into the OneHotEncoder and OrdinalEncoder. " - "This stub will be removed in version 0.21.") + return pt.fit_transform(X) \ No newline at end of file diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index ff32005399fe2..081e54fbb0dfb 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -322,13 +322,6 @@ class LabelPropagation(BaseLabelPropagation): n_neighbors : integer > 0 Parameter for knn kernel - alpha : float - Clamping factor. - - .. deprecated:: 0.19 - This parameter will be removed in 0.21. - 'alpha' is fixed to zero in 'LabelPropagation'. 
- max_iter : integer Change maximum number of iterations allowed @@ -388,10 +381,10 @@ class LabelPropagation(BaseLabelPropagation): _variant = 'propagation' def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, - alpha=None, max_iter=1000, tol=1e-3, n_jobs=None): + max_iter=1000, tol=1e-3, n_jobs=None): super(LabelPropagation, self).__init__( - kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, alpha=alpha, - max_iter=max_iter, tol=tol, n_jobs=n_jobs) + kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, + max_iter=max_iter, tol=tol, n_jobs=n_jobs, alpha=None) def _build_graph(self): """Matrix representing a fully connected graph between each sample @@ -410,12 +403,6 @@ class distributions will exceed 1 (normalization may be desired). return affinity_matrix def fit(self, X, y): - if self.alpha is not None: - warnings.warn( - "alpha is deprecated since 0.19 and will be removed in 0.21.", - DeprecationWarning - ) - self.alpha = None return super(LabelPropagation, self).fit(X, y) diff --git a/sklearn/tests/test_discriminant_analysis.py b/sklearn/tests/test_discriminant_analysis.py index 6e509949b0a88..4cb8f5d148b04 100644 --- a/sklearn/tests/test_discriminant_analysis.py +++ b/sklearn/tests/test_discriminant_analysis.py @@ -316,20 +316,6 @@ def test_qda_store_covariance(): ) -def test_qda_deprecation(): - # Test the deprecation - clf = QuadraticDiscriminantAnalysis(store_covariances=True) - assert_warns_message(DeprecationWarning, "'store_covariances' was renamed" - " to store_covariance in version 0.19 and will be " - "removed in 0.21.", clf.fit, X, y) - - # check that covariance_ (and covariances_ with warning) is stored - assert_warns_message(DeprecationWarning, "Attribute ``covariances_`` was " - "deprecated in version 0.19 and will be removed " - "in 0.21. Use ``covariance_`` instead", getattr, clf, - 'covariances_') - - def test_qda_regularization(): # the default is reg_param=0. and will cause issues # when there is a constant variable diff --git a/sklearn/utils/arpack.py b/sklearn/utils/arpack.py deleted file mode 100644 index 0343f7243ebdb..0000000000000 --- a/sklearn/utils/arpack.py +++ /dev/null @@ -1,23 +0,0 @@ -# Remove this module in version 0.21 - -from scipy.sparse.linalg import eigs as _eigs, eigsh as _eigsh, svds as _svds - -from .deprecation import deprecated - - -@deprecated("sklearn.utils.arpack.eigs was deprecated in version 0.19 and " - "will be removed in 0.21. Use scipy.sparse.linalg.eigs instead.") -def eigs(A, *args, **kwargs): - return _eigs(A, *args, **kwargs) - - -@deprecated("sklearn.utils.arpack.eigsh was deprecated in version 0.19 and " - "will be removed in 0.21. Use scipy.sparse.linalg.eigsh instead.") -def eigsh(A, *args, **kwargs): - return _eigsh(A, *args, **kwargs) - - -@deprecated("sklearn.utils.arpack.svds was deprecated in version 0.19 and " - "will be removed in 0.21. Use scipy.sparse.linalg.svds instead.") -def svds(A, *args, **kwargs): - return _svds(A, *args, **kwargs) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 95e464f071644..07a83a17377b5 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -26,17 +26,6 @@ from .validation import check_array -@deprecated("sklearn.utils.extmath.norm was deprecated in version 0.19 " - "and will be removed in 0.21. Use scipy.linalg.norm instead.") -def norm(x): - """Compute the Euclidean or Frobenius norm of x. - - Returns the Euclidean norm when x is a vector, the Frobenius norm when x - is a matrix (2-d array). More precise than sqrt(squared_norm(x)). 
- """ - return linalg.norm(x) - - def squared_norm(x): """Squared Euclidean or Frobenius norm of x. @@ -119,12 +108,6 @@ def _impose_f_order(X): return check_array(X, copy=False, order='F'), False -@deprecated("sklearn.utils.extmath.fast_dot was deprecated in version 0.19 " - "and will be removed in 0.21. Use the equivalent np.dot instead.") -def fast_dot(a, b, out=None): - return np.dot(a, b, out) - - def density(w, **kwargs): """Compute density of a sparse vector @@ -388,25 +371,6 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto', return U[:, :n_components], s[:n_components], V[:n_components, :] -@deprecated("sklearn.utils.extmath.logsumexp was deprecated in version 0.19 " - "and will be removed in 0.21. Use scipy.misc.logsumexp instead.") -def logsumexp(arr, axis=0): - """Computes the sum of arr assuming arr is in the log domain. - Returns log(sum(exp(arr))) while minimizing the possibility of - over/underflow. - Examples - -------- - >>> import numpy as np - >>> from sklearn.utils.extmath import logsumexp - >>> a = np.arange(10) - >>> np.log(np.sum(np.exp(a))) - 9.458... - >>> logsumexp(a) # doctest: +SKIP - 9.458... - """ - return scipy_logsumexp(arr, axis) - - def weighted_mode(a, w, axis=0): """Returns an array of the weighted modal (most common) value in a @@ -480,12 +444,6 @@ def weighted_mode(a, w, axis=0): return mostfrequent, oldcounts -@deprecated("sklearn.utils.extmath.pinvh was deprecated in version 0.19 " - "and will be removed in 0.21. Use scipy.linalg.pinvh instead.") -def pinvh(a, cond=None, rcond=None, lower=True): - return linalg.pinvh(a, cond, rcond, lower) - - def cartesian(arrays, out=None): """Generate a cartesian product of input arrays. diff --git a/sklearn/utils/graph.py b/sklearn/utils/graph.py index 8bbebbd377451..17caa4fa2cb0d 100644 --- a/sklearn/utils/graph.py +++ b/sklearn/utils/graph.py @@ -68,17 +68,3 @@ def single_source_shortest_path_length(graph, source, cutoff=None): break level += 1 return seen # return all path lengths as dictionary - - -@deprecated("sklearn.utils.graph.connected_components was deprecated in " - "version 0.19 and will be removed in 0.21. Use " - "scipy.sparse.csgraph.connected_components instead.") -def connected_components(*args, **kwargs): - return csgraph.connected_components(*args, **kwargs) - - -@deprecated("sklearn.utils.graph.graph_laplacian was deprecated in version " - "0.19 and will be removed in 0.21. Use " - "scipy.sparse.csgraph.laplacian instead.") -def graph_laplacian(*args, **kwargs): - return csgraph.laplacian(*args, **kwargs) diff --git a/sklearn/utils/random.py b/sklearn/utils/random.py index 24ddf4680c742..29d465fff8705 100644 --- a/sklearn/utils/random.py +++ b/sklearn/utils/random.py @@ -13,106 +13,6 @@ __all__ = ['sample_without_replacement', 'choice'] -# This is a backport of np.random.choice from numpy 1.7 -# The function can be removed when we bump the requirements to >=1.7 -@deprecated("sklearn.utils.random.choice was deprecated in version 0.19 " - "and will be removed in 0.21. Use np.random.choice or " - "np.random.RandomState.choice instead.") -def choice(a, size=None, replace=True, p=None, random_state=None): - """ - choice(a, size=None, replace=True, p=None) - - Generates a random sample from a given 1-D array - - .. versionadded:: 1.7.0 - - Parameters - ----------- - a : 1-D array-like or int - If an ndarray, a random sample is generated from its elements. 
- If an int, the random sample is generated as if a was np.arange(n) - - size : int or tuple of ints, optional - Output shape. Default is None, in which case a single value is - returned. - - replace : boolean, optional - Whether the sample is with or without replacement. - - p : 1-D array-like, optional - The probabilities associated with each entry in a. - If not given the sample assumes a uniform distribution over all - entries in a. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - - Returns - -------- - samples : 1-D ndarray, shape (size,) - The generated random samples - - Raises - ------- - ValueError - If a is an int and less than zero, if a or p are not 1-dimensional, - if a is an array-like of size 0, if p is not a vector of - probabilities, if a and p have different lengths, or if - replace=False and the sample size is greater than the population - size - - See Also - --------- - randint, shuffle, permutation - - Examples - --------- - Generate a uniform random sample from np.arange(5) of size 3: - - >>> np.random.choice(5, 3) # doctest: +SKIP - array([0, 3, 4]) - >>> #This is equivalent to np.random.randint(0,5,3) - - Generate a non-uniform random sample from np.arange(5) of size 3: - - >>> np.random.choice(5, 3, p=[0.1, 0, 0.3, 0.6, 0]) # doctest: +SKIP - array([3, 3, 0]) - - Generate a uniform random sample from np.arange(5) of size 3 without - replacement: - - >>> np.random.choice(5, 3, replace=False) # doctest: +SKIP - array([3,1,0]) - >>> #This is equivalent to np.random.shuffle(np.arange(5))[:3] - - Generate a non-uniform random sample from np.arange(5) of size - 3 without replacement: - - >>> np.random.choice(5, 3, replace=False, p=[0.1, 0, 0.3, 0.6, 0]) - ... # doctest: +SKIP - array([2, 3, 0]) - - Any of the above can be repeated with an arbitrary array-like - instead of just integers. For instance: - - >>> aa_milne_arr = ['pooh', 'rabbit', 'piglet', 'Christopher'] - >>> np.random.choice(aa_milne_arr, 5, p=[0.5, 0.1, 0.1, 0.3]) - ... # doctest: +SKIP - array(['pooh', 'pooh', 'pooh', 'Christopher', 'piglet'], - dtype='|S11') - - """ - if random_state is not None: - random_state = check_random_state(random_state) - return random_state.choice(a, size, replace, p) - else: - return np.random.choice(a, size, replace, p) - - def random_choice_csc(n_samples, classes, class_probability=None, random_state=None): """Generate a sparse random matrix given column class distributions diff --git a/sklearn/utils/sparsetools/__init__.py b/sklearn/utils/sparsetools/__init__.py deleted file mode 100644 index a86598410e7fe..0000000000000 --- a/sklearn/utils/sparsetools/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Remove in version 0.21 - -from scipy.sparse.csgraph import connected_components as \ - scipy_connected_components - -from sklearn.utils.deprecation import deprecated - - -@deprecated("sklearn.utils.sparsetools.connected_components was deprecated in " - "version 0.19 and will be removed in 0.21. 
Use " - "scipy.sparse.csgraph.connected_components instead.") -def connected_components(*args, **kwargs): - return scipy_connected_components(*args, **kwargs) diff --git a/sklearn/utils/sparsetools/setup.py b/sklearn/utils/sparsetools/setup.py deleted file mode 100644 index 1ff3097b0db73..0000000000000 --- a/sklearn/utils/sparsetools/setup.py +++ /dev/null @@ -1,15 +0,0 @@ -# Remove in version 0.21 - - -def configuration(parent_package='', top_path=None): - from numpy.distutils.misc_util import Configuration - - config = Configuration('sparsetools', parent_package, top_path) - config.add_subpackage('tests') - - return config - - -if __name__ == '__main__': - from numpy.distutils.core import setup - setup(**configuration(top_path='').todict()) diff --git a/sklearn/utils/sparsetools/tests/__init__.py b/sklearn/utils/sparsetools/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/sklearn/utils/stats.py b/sklearn/utils/stats.py index 82b8912b78824..458669e23eb3a 100644 --- a/sklearn/utils/stats.py +++ b/sklearn/utils/stats.py @@ -5,13 +5,6 @@ from sklearn.utils.deprecation import deprecated -# Remove in sklearn 0.21 -@deprecated("sklearn.utils.stats.rankdata was deprecated in version 0.19 and " - "will be removed in 0.21. Use scipy.stats.rankdata instead.") -def rankdata(*args, **kwargs): - return scipy_rankdata(*args, **kwargs) - - def _weighted_percentile(array, sample_weight, percentile=50): """ Compute the weighted ``percentile`` of ``array`` with ``sample_weight``. diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 3de67e5a2130c..07431ed11c3bf 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -88,18 +88,6 @@ def test_random_weights(): assert_array_almost_equal(score.ravel(), w[:, :5].sum(1)) -@ignore_warnings # Test deprecated backport to be removed in 0.21 -def test_logsumexp(): - # Try to add some smallish numbers in logspace - x = np.array([1e-40] * 1000000) - logx = np.log(x) - assert_almost_equal(np.exp(logsumexp(logx)), x.sum()) - - X = np.vstack([x, x]) - logX = np.vstack([logx, logx]) - assert_array_almost_equal(np.exp(logsumexp(logX, axis=0)), X.sum(axis=0)) - assert_array_almost_equal(np.exp(logsumexp(logX, axis=1)), X.sum(axis=1)) - def check_randomized_svd_low_rank(dtype): # Check that extmath.randomized_svd is consistent with linalg.svd @@ -179,22 +167,6 @@ def test_randomized_svd_low_rank_all_dtypes(dtype): check_randomized_svd_low_rank(dtype) -@ignore_warnings # extmath.norm is deprecated to be removed in 0.21 -def test_norm_squared_norm(): - X = np.random.RandomState(42).randn(50, 63) - X *= 100 # check stability - X += 200 - - assert_almost_equal(np.linalg.norm(X.ravel()), norm(X)) - assert_almost_equal(norm(X) ** 2, squared_norm(X), decimal=6) - assert_almost_equal(np.linalg.norm(X), np.sqrt(squared_norm(X)), decimal=6) - # Check the warning with an int array and np.dot potential overflow - assert_warns_message( - UserWarning, 'Array type is integer, np.dot may ' - 'overflow. 
Data should be float type to avoid this issue', - squared_norm, X.astype(int)) - - @pytest.mark.parametrize('dtype', (np.float32, np.float64)) def test_row_norms(dtype): diff --git a/sklearn/utils/tests/test_stats.py b/sklearn/utils/tests/test_stats.py index 36e3bf72b609b..b6b43644e476e 100644 --- a/sklearn/utils/tests/test_stats.py +++ b/sklearn/utils/tests/test_stats.py @@ -12,12 +12,3 @@ ([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]), ([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 2.0]), ) - - -@pytest.mark.parametrize("values, method, expected", _cases) -def test_cases_rankdata(values, method, expected): - - # Test deprecated backport to be removed in 0.21 - with ignore_warnings(): - r = rankdata(values, method=method) - assert_array_equal(r, expected) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index c2474c58c13f7..840e08524c384 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -108,56 +108,6 @@ def test_safe_mask(): assert_equal(X_csr[mask].shape[0], 3) -@ignore_warnings # Test deprecated backport to be removed in 0.21 -def test_pinvh_simple_real(): - a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 10]], dtype=np.float64) - a = np.dot(a, a.T) - a_pinv = pinvh(a) - assert_almost_equal(np.dot(a, a_pinv), np.eye(3)) - - -@ignore_warnings # Test deprecated backport to be removed in 0.21 -def test_pinvh_nonpositive(): - a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float64) - a = np.dot(a, a.T) - u, s, vt = np.linalg.svd(a) - s[0] *= -1 - a = np.dot(u * s, vt) # a is now symmetric non-positive and singular - a_pinv = pinv2(a) - a_pinvh = pinvh(a) - assert_almost_equal(a_pinv, a_pinvh) - - -@ignore_warnings # Test deprecated backport to be removed in 0.21 -def test_pinvh_simple_complex(): - a = (np.array([[1, 2, 3], [4, 5, 6], [7, 8, 10]]) - + 1j * np.array([[10, 8, 7], [6, 5, 4], [3, 2, 1]])) - a = np.dot(a, a.conj().T) - a_pinv = pinvh(a) - assert_almost_equal(np.dot(a, a_pinv), np.eye(3)) - - -@ignore_warnings # Test deprecated backport to be removed in 0.21 -def test_arpack_eigsh_initialization(): - # Non-regression test that shows null-space computation is better with - # initialization of eigsh from [-1,1] instead of [0,1] - random_state = check_random_state(42) - - A = random_state.rand(50, 50) - A = np.dot(A.T, A) # create s.p.d. matrix - A = laplacian(A) + 1e-7 * np.identity(A.shape[0]) - k = 5 - - # Test if eigsh is working correctly - # New initialization [-1,1] (as in original ARPACK) - # Was [0,1] before, with which this test could fail - v0 = random_state.uniform(-1, 1, A.shape[0]) - w, _ = eigsh(A, k=k, sigma=0.0, v0=v0) - - # Eigenvalues of s.p.d. 
matrix should be nonnegative, w[0] is smallest - assert_greater_equal(w[0], 0) - - def test_column_or_1d(): EXAMPLES = [ ("binary", ["spam", "egg", "spam"]), From acf7659fdf565708db8ef971c8008befade19a05 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 18:11:32 -0400 Subject: [PATCH 02/11] fixed some inits and test imports --- sklearn/datasets/__init__.py | 2 -- sklearn/linear_model/__init__.py | 6 ------ sklearn/neighbors/__init__.py | 2 -- sklearn/utils/tests/test_extmath.py | 3 +-- sklearn/utils/tests/test_graph.py | 26 -------------------------- sklearn/utils/tests/test_stats.py | 14 -------------- sklearn/utils/tests/test_utils.py | 2 -- 7 files changed, 1 insertion(+), 54 deletions(-) delete mode 100644 sklearn/utils/tests/test_graph.py delete mode 100644 sklearn/utils/tests/test_stats.py diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py index c7d78e633493d..77dac99c1d970 100644 --- a/sklearn/datasets/__init__.py +++ b/sklearn/datasets/__init__.py @@ -17,7 +17,6 @@ from .base import clear_data_home from .covtype import fetch_covtype from .kddcup99 import fetch_kddcup99 -from .mlcomp import load_mlcomp from .lfw import fetch_lfw_pairs from .lfw import fetch_lfw_people from .twenty_newsgroups import fetch_20newsgroups @@ -75,7 +74,6 @@ 'load_iris', 'load_breast_cancer', 'load_linnerud', - 'load_mlcomp', 'load_sample_image', 'load_sample_images', 'load_svmlight_file', diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index f3100d45e2e66..2e01990ccce8c 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -31,9 +31,6 @@ from .passive_aggressive import PassiveAggressiveRegressor from .perceptron import Perceptron -from .randomized_l1 import (RandomizedLasso, RandomizedLogisticRegression, - lasso_stability_path) - from .ransac import RANSACRegressor from .theil_sen import TheilSenRegressor @@ -65,8 +62,6 @@ 'PassiveAggressiveClassifier', 'PassiveAggressiveRegressor', 'Perceptron', - 'RandomizedLasso', - 'RandomizedLogisticRegression', 'Ridge', 'RidgeCV', 'RidgeClassifier', @@ -78,7 +73,6 @@ 'enet_path', 'lars_path', 'lasso_path', - 'lasso_stability_path', 'logistic_regression_path', 'orthogonal_mp', 'orthogonal_mp_gram', diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 93c1bbbba0ba8..51116b3f470e6 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -12,7 +12,6 @@ from .regression import KNeighborsRegressor, RadiusNeighborsRegressor from .nearest_centroid import NearestCentroid from .kde import KernelDensity -from .approximate import LSHForest from .lof import LocalOutlierFactor from .base import VALID_METRICS, VALID_METRICS_SPARSE @@ -28,7 +27,6 @@ 'kneighbors_graph', 'radius_neighbors_graph', 'KernelDensity', - 'LSHForest', 'LocalOutlierFactor', 'VALID_METRICS', 'VALID_METRICS_SPARSE'] diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 07431ed11c3bf..d22ec5b886c89 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -27,8 +27,7 @@ from sklearn.utils.fixes import np_version from sklearn.utils.extmath import density -from sklearn.utils.extmath import logsumexp -from sklearn.utils.extmath import norm, squared_norm +from sklearn.utils.extmath import squared_norm from sklearn.utils.extmath import randomized_svd from sklearn.utils.extmath import row_norms from sklearn.utils.extmath import weighted_mode diff --git 
a/sklearn/utils/tests/test_graph.py b/sklearn/utils/tests/test_graph.py deleted file mode 100644 index ae1ce4a56cb8e..0000000000000 --- a/sklearn/utils/tests/test_graph.py +++ /dev/null @@ -1,26 +0,0 @@ -# Author: Gael Varoquaux -# License: BSD 3 clause - -import numpy as np -from scipy import sparse - -from sklearn.utils.graph import graph_laplacian -from sklearn.utils.testing import ignore_warnings - - -@ignore_warnings(category=DeprecationWarning) -def test_graph_laplacian(): - for mat in (np.arange(10) * np.arange(10)[:, np.newaxis], - np.ones((7, 7)), - np.eye(19), - np.vander(np.arange(4)) + np.vander(np.arange(4)).T,): - sp_mat = sparse.csr_matrix(mat) - for normed in (True, False): - laplacian = graph_laplacian(mat, normed=normed) - n_nodes = mat.shape[0] - if not normed: - np.testing.assert_array_almost_equal(laplacian.sum(axis=0), - np.zeros(n_nodes)) - np.testing.assert_array_almost_equal(laplacian.T, laplacian) - np.testing.assert_array_almost_equal( - laplacian, graph_laplacian(sp_mat, normed=normed).toarray()) diff --git a/sklearn/utils/tests/test_stats.py b/sklearn/utils/tests/test_stats.py deleted file mode 100644 index b6b43644e476e..0000000000000 --- a/sklearn/utils/tests/test_stats.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest -from sklearn.utils.testing import assert_array_equal, ignore_warnings - -from sklearn.utils.stats import rankdata - - -_cases = ( - # values, method, expected - ([100], 'max', [1.0]), - ([100, 100, 100], 'max', [3.0, 3.0, 3.0]), - ([100, 300, 200], 'max', [1.0, 3.0, 2.0]), - ([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]), - ([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 2.0]), -) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 840e08524c384..ce69b70cb1cbb 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -22,8 +22,6 @@ from sklearn.utils import gen_even_slices from sklearn.utils import get_chunk_n_rows from sklearn.utils import is_scalar_nan -from sklearn.utils.extmath import pinvh -from sklearn.utils.arpack import eigsh from sklearn.utils.mocking import MockDataFrame from sklearn import config_context From cd48d6d43ce317d4c1805ecac89119948105ac4b Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 18:14:49 -0400 Subject: [PATCH 03/11] remove some more deprecation tests --- sklearn/decomposition/tests/test_sparse_pca.py | 7 ------- sklearn/discriminant_analysis.py | 9 ++------- .../tests/test_feature_hasher.py | 14 +++++++------- sklearn/metrics/tests/test_pairwise.py | 4 ---- 4 files changed, 9 insertions(+), 25 deletions(-) diff --git a/sklearn/decomposition/tests/test_sparse_pca.py b/sklearn/decomposition/tests/test_sparse_pca.py index 5365ccb8f0d36..11ef39869d02a 100644 --- a/sklearn/decomposition/tests/test_sparse_pca.py +++ b/sklearn/decomposition/tests/test_sparse_pca.py @@ -78,13 +78,6 @@ def test_fit_transform(norm_comp): spca_lasso.fit(Y) assert_array_almost_equal(spca_lasso.components_, spca_lars.components_) - # Test that deprecated ridge_alpha parameter throws warning - warning_msg = "The ridge_alpha parameter on transform()" - assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform, - Y, ridge_alpha=0.01) - assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform, - Y, ridge_alpha=None) - @pytest.mark.filterwarnings("ignore:normalize_components") @pytest.mark.parametrize("norm_comp", [False, True]) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 
bf6b3a4f44631..ff8b6833cc557 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -567,9 +567,6 @@ class QuadraticDiscriminantAnalysis(BaseEstimator, ClassifierMixin): .. versionadded:: 0.17 - store_covariances : boolean - Deprecated, use `store_covariance`. - Attributes ---------- covariance_ : list of array-like, shape = [n_features, n_features] @@ -602,8 +599,7 @@ class QuadraticDiscriminantAnalysis(BaseEstimator, ClassifierMixin): >>> clf.fit(X, y) ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0, - store_covariance=False, - store_covariances=None, tol=0.0001) + store_covariance=False, tol=0.0001) >>> print(clf.predict([[-0.8, -1]])) [1] @@ -653,8 +649,7 @@ def fit(self, X, y): self.priors_ = self.priors cov = None - store_covariance = self.store_covariance or self.store_covariances - + store_covariance = self.store_covariance if store_covariance: cov = [] means = [] diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index 77a21ff4364a7..41fc027a2b1b4 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -33,7 +33,7 @@ def test_feature_hasher_strings(): it = (x for x in raw_X) # iterable - h = FeatureHasher(n_features, non_negative=True, input_type="string") + h = FeatureHasher(n_features, input_type="string") X = h.transform(it) assert_equal(X.shape[0], len(raw_X)) @@ -120,11 +120,11 @@ def test_hasher_alternate_sign(): input_type='string').fit_transform(X) assert Xt.data.min() < 0 and Xt.data.max() > 0 - Xt = FeatureHasher(alternate_sign=True, non_negative=True, + Xt = FeatureHasher(alternate_sign=True, input_type='string').fit_transform(X) assert Xt.data.min() > 0 - Xt = FeatureHasher(alternate_sign=False, non_negative=True, + Xt = FeatureHasher(alternate_sign=False, input_type='string').fit_transform(X) assert Xt.data.min() > 0 Xt_2 = FeatureHasher(alternate_sign=False, non_negative=False, @@ -144,11 +144,11 @@ def test_hash_collisions(): # with an opposite sign and cancel out assert abs(Xt.data[0]) < len(X[0]) - Xt = FeatureHasher(alternate_sign=True, non_negative=True, + Xt = FeatureHasher(alternate_sign=True, n_features=1, input_type='string').fit_transform(X) assert abs(Xt.data[0]) < len(X[0]) - Xt = FeatureHasher(alternate_sign=False, non_negative=True, + Xt = FeatureHasher(alternate_sign=False, n_features=1, input_type='string').fit_transform(X) assert Xt.data[0] == len(X[0]) @@ -159,12 +159,12 @@ def test_hasher_negative(): Xt = FeatureHasher(alternate_sign=False, non_negative=False, input_type="pair").fit_transform(X) assert_true(Xt.data.min() < 0 and Xt.data.max() > 0) - Xt = FeatureHasher(alternate_sign=False, non_negative=True, + Xt = FeatureHasher(alternate_sign=False, input_type="pair").fit_transform(X) assert_true(Xt.data.min() > 0) Xt = FeatureHasher(alternate_sign=True, non_negative=False, input_type="pair").fit_transform(X) assert_true(Xt.data.min() < 0 and Xt.data.max() > 0) - Xt = FeatureHasher(alternate_sign=True, non_negative=True, + Xt = FeatureHasher(alternate_sign=True, input_type="pair").fit_transform(X) assert_true(Xt.data.min() > 0) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index e28453ee70086..571e764a2c48a 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -82,10 +82,6 @@ def test_pairwise_distances(): 
assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) - # Using size_threshold argument should raise - # a deprecation warning - assert_warns(DeprecationWarning, - manhattan_distances, X, Y, size_threshold=10) # Test cosine as a string metric versus cosine callable # The string "cosine" uses sklearn.metric, # while the function cosine is scipy.spatial From d4450f3f761937a2defcc8b9e802e76f4dace5b2 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 18:46:50 -0400 Subject: [PATCH 04/11] many more test fixes --- sklearn/cluster/tests/test_hierarchical.py | 15 ------------- .../decomposition/tests/test_online_lda.py | 21 +------------------ .../tests/test_feature_hasher.py | 10 ++++----- sklearn/kernel_approximation.py | 4 ++-- sklearn/preprocessing/tests/test_encoders.py | 7 +------ .../tests/test_function_transformer.py | 11 ++++------ .../tests/test_label_propagation.py | 13 ------------ sklearn/utils/setup.py | 1 - 8 files changed, 13 insertions(+), 69 deletions(-) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 6f03f9aa32106..2456f61c872c5 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -38,21 +38,6 @@ from sklearn.datasets import make_moons, make_circles -def test_deprecation_of_n_components_in_linkage_tree(): - rng = np.random.RandomState(0) - X = rng.randn(50, 100) - # Test for warning of deprecation of n_components in linkage_tree - children, n_nodes, n_leaves, parent = assert_warns(DeprecationWarning, - linkage_tree, - X.T, - n_components=10) - children_t, n_nodes_t, n_leaves_t, parent_t = linkage_tree(X.T) - assert_array_equal(children, children_t) - assert_equal(n_nodes, n_nodes_t) - assert_equal(n_leaves, n_leaves_t) - assert_equal(parent, parent_t) - - def test_linkage_misc(): # Misc tests on linkage rng = np.random.RandomState(42) diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py index f3354cba375c3..655b367e0735a 100644 --- a/sklearn/decomposition/tests/test_online_lda.py +++ b/sklearn/decomposition/tests/test_online_lda.py @@ -347,19 +347,6 @@ def test_lda_fit_perplexity(): assert_almost_equal(perplexity1, perplexity2) -def test_doc_topic_distr_deprecation(): - # Test that the appropriate warning message is displayed when a user - # attempts to pass the doc_topic_distr argument to the perplexity method - n_components, X = _build_sparse_mtx() - lda = LatentDirichletAllocation(n_components=n_components, max_iter=1, - learning_method='batch', - total_samples=100, random_state=0) - distr1 = lda.fit_transform(X) - distr2 = None - assert_warns(DeprecationWarning, lda.perplexity, X, distr1) - assert_warns(DeprecationWarning, lda.perplexity, X, distr2) - - def test_lda_empty_docs(): """Test LDA on empty document (all-zero rows).""" Z = np.zeros((5, 4)) @@ -414,10 +401,4 @@ def check_verbosity(verbose, evaluate_every, expected_lines, def test_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities): check_verbosity(verbose, evaluate_every, expected_lines, - expected_perplexities) - - -def test_lda_n_topics_deprecation(): - n_components, X = _build_sparse_mtx() - lda = LatentDirichletAllocation(n_topics=10, learning_method='batch') - assert_warns(DeprecationWarning, lda.fit, X) + expected_perplexities) \ No newline at end of file diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py 
b/sklearn/feature_extraction/tests/test_feature_hasher.py index 41fc027a2b1b4..dff5f090c9b28 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -116,7 +116,7 @@ def test_hasher_zeros(): def test_hasher_alternate_sign(): X = [list("Thequickbrownfoxjumped")] - Xt = FeatureHasher(alternate_sign=True, non_negative=False, + Xt = FeatureHasher(alternate_sign=True, input_type='string').fit_transform(X) assert Xt.data.min() < 0 and Xt.data.max() > 0 @@ -127,7 +127,7 @@ def test_hasher_alternate_sign(): Xt = FeatureHasher(alternate_sign=False, input_type='string').fit_transform(X) assert Xt.data.min() > 0 - Xt_2 = FeatureHasher(alternate_sign=False, non_negative=False, + Xt_2 = FeatureHasher(alternate_sign=False, input_type='string').fit_transform(X) # With initially positive features, the non_negative option should # have no impact when alternate_sign=False @@ -138,7 +138,7 @@ def test_hasher_alternate_sign(): def test_hash_collisions(): X = [list("Thequickbrownfoxjumped")] - Xt = FeatureHasher(alternate_sign=True, non_negative=False, + Xt = FeatureHasher(alternate_sign=True, n_features=1, input_type='string').fit_transform(X) # check that some of the hashed tokens are added # with an opposite sign and cancel out @@ -156,13 +156,13 @@ def test_hash_collisions(): @ignore_warnings(category=DeprecationWarning) def test_hasher_negative(): X = [{"foo": 2, "bar": -4, "baz": -1}.items()] - Xt = FeatureHasher(alternate_sign=False, non_negative=False, + Xt = FeatureHasher(alternate_sign=False, input_type="pair").fit_transform(X) assert_true(Xt.data.min() < 0 and Xt.data.max() > 0) Xt = FeatureHasher(alternate_sign=False, input_type="pair").fit_transform(X) assert_true(Xt.data.min() > 0) - Xt = FeatureHasher(alternate_sign=True, non_negative=False, + Xt = FeatureHasher(alternate_sign=True, input_type="pair").fit_transform(X) assert_true(Xt.data.min() < 0 and Xt.data.max() > 0) Xt = FeatureHasher(alternate_sign=True, diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 585f453e389b2..d44c67b417764 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -619,7 +619,7 @@ def _get_kernel_params(self): if (self.gamma is not None or self.coef0 is not None or self.degree is not None): - raise ValueErrror("Don't pass gamma, coef0 or degree to " - "Nystroem if using a callable kernel.") + raise ValueError("Don't pass gamma, coef0 or degree to " + "Nystroem if using a callable kernel.") return params diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 67169432defdc..13dfe08201c1e 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -607,9 +607,4 @@ def test_encoder_dtypes_pandas(): def test_one_hot_encoder_warning(): enc = OneHotEncoder() X = [['Male', 1], ['Female', 3]] - np.testing.assert_no_warnings(enc.fit_transform, X) - - -def test_categorical_encoder_stub(): - from sklearn.preprocessing import CategoricalEncoder - assert_raises(RuntimeError, CategoricalEncoder, encoding='ordinal') + np.testing.assert_no_warnings(enc.fit_transform, X) \ No newline at end of file diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index 464581e5e9c2c..b65d6614e2fbd 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py 
@@ -50,20 +50,17 @@ def test_delegate_to_func(): # reset the argument stores. args_store[:] = [] # python2 compatible inplace list clear. kwargs_store.clear() - y = object() - transformed = assert_warns_message( - DeprecationWarning, "pass_y is deprecated", - FunctionTransformer( + FunctionTransformer( _make_func(args_store, kwargs_store), - pass_y=True, validate=False).transform, X, y) + validate=False).transform(X) assert_array_equal(transformed, X, err_msg='transform should have returned X unchanged') - # The function should have received X and y. + # The function should have received X assert_equal( args_store, - [X, y], + [X], 'Incorrect positional arguments passed to func: {args}'.format( args=args_store, ), diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 51b725030cb64..ef594fccb7076 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -66,19 +66,6 @@ def test_predict_proba(): np.array([[0.5, 0.5]])) -def test_alpha_deprecation(): - X, y = make_classification(n_samples=100) - y[::3] = -1 - - lp_default = label_propagation.LabelPropagation(kernel='rbf', gamma=0.1) - lp_default_y = lp_default.fit(X, y).transduction_ - - lp_0 = label_propagation.LabelPropagation(alpha=0, kernel='rbf', gamma=0.1) - lp_0_y = assert_warns(DeprecationWarning, lp_0.fit, X, y).transduction_ - - assert_array_equal(lp_default_y, lp_0_y) - - def test_label_spreading_closed_form(): n_classes = 2 X, y = make_classification(n_classes=n_classes, n_samples=200, diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index 9590692b0dff0..13d772a5a53b7 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -9,7 +9,6 @@ def configuration(parent_package='', top_path=None): from numpy.distutils.misc_util import Configuration config = Configuration('utils', parent_package, top_path) - config.add_subpackage('sparsetools') cblas_libs, blas_info = get_blas_info() cblas_compile_args = blas_info.pop('extra_compile_args', []) From 3d1fae5fa26a7ff2a85dc71925a35326c8f3a620 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 18:51:28 -0400 Subject: [PATCH 05/11] more test fixes --- sklearn/decomposition/online_lda.py | 6 ------ sklearn/preprocessing/tests/test_function_transformer.py | 6 +++--- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index 51c199e50c2d7..cfeba87ef3ce3 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -795,12 +795,6 @@ def perplexity(self, X, sub_sampling=False): X : array-like or sparse matrix, [n_samples, n_features] Document word matrix. - doc_topic_distr : None or array, shape=(n_samples, n_components) - Document topic distribution. - This argument is deprecated and is currently being ignored. - - .. deprecated:: 0.19 - sub_sampling : bool Do sub-sampling or not. diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index b65d6614e2fbd..663b4e6b4c9f0 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -50,9 +50,9 @@ def test_delegate_to_func(): # reset the argument stores. args_store[:] = [] # python2 compatible inplace list clear. 
kwargs_store.clear() - FunctionTransformer( - _make_func(args_store, kwargs_store), - validate=False).transform(X) + transformed = FunctionTransformer( + _make_func(args_store, kwargs_store), + validate=False).transform(X) assert_array_equal(transformed, X, err_msg='transform should have returned X unchanged') From b2b0136934870e66123cea7e76414b05eda9f954 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 18:52:31 -0400 Subject: [PATCH 06/11] undo non-negative stuff for now, seems annoying --- sklearn/feature_extraction/hashing.py | 22 +++++++++++++++-- .../tests/test_feature_hasher.py | 24 +++++++++---------- sklearn/feature_extraction/text.py | 13 ++++++++-- 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py index f670e9cbec89e..744a073090bad 100644 --- a/sklearn/feature_extraction/hashing.py +++ b/sklearn/feature_extraction/hashing.py @@ -57,7 +57,8 @@ class FeatureHasher(BaseEstimator, TransformerMixin): feature_name should be a string, while value should be a number. In the case of "string", a value of 1 is implied. The feature_name is hashed to find the appropriate column for the - feature. The value's sign might be flipped in the output. + feature. The value's sign might be flipped in the output (but see + non_negative, below). dtype : numpy type, optional, default np.float64 The type of feature values. Passed to scipy.sparse matrix constructors as the dtype argument. Do not set this to bool, np.boolean or any @@ -67,6 +68,15 @@ class FeatureHasher(BaseEstimator, TransformerMixin): approximately conserve the inner product in the hashed space even for small n_features. This approach is similar to sparse random projection. + non_negative : boolean, optional, default False + When True, an absolute value is applied to the features matrix prior to + returning it. When used in conjunction with alternate_sign=True, this + significantly reduces the inner product preservation property. + + .. deprecated:: 0.19 + This option will be removed in 0.21. 
+ + Examples -------- >>> from sklearn.feature_extraction import FeatureHasher @@ -84,12 +94,18 @@ class FeatureHasher(BaseEstimator, TransformerMixin): """ def __init__(self, n_features=(2 ** 20), input_type="dict", - dtype=np.float64, alternate_sign=True): + dtype=np.float64, alternate_sign=True, non_negative=False): self._validate_params(n_features, input_type) + if non_negative: + warnings.warn("the option non_negative=True has been deprecated" + " in 0.19 and will be removed" + " in version 0.21.", DeprecationWarning) + self.dtype = dtype self.input_type = input_type self.n_features = n_features self.alternate_sign = alternate_sign + self.non_negative = non_negative @staticmethod def _validate_params(n_features, input_type): @@ -159,4 +175,6 @@ def transform(self, raw_X): shape=(n_samples, self.n_features)) X.sum_duplicates() # also sorts the indices + if self.non_negative: + np.abs(X.data, X.data) return X diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index dff5f090c9b28..77a21ff4364a7 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -33,7 +33,7 @@ def test_feature_hasher_strings(): it = (x for x in raw_X) # iterable - h = FeatureHasher(n_features, input_type="string") + h = FeatureHasher(n_features, non_negative=True, input_type="string") X = h.transform(it) assert_equal(X.shape[0], len(raw_X)) @@ -116,18 +116,18 @@ def test_hasher_zeros(): def test_hasher_alternate_sign(): X = [list("Thequickbrownfoxjumped")] - Xt = FeatureHasher(alternate_sign=True, + Xt = FeatureHasher(alternate_sign=True, non_negative=False, input_type='string').fit_transform(X) assert Xt.data.min() < 0 and Xt.data.max() > 0 - Xt = FeatureHasher(alternate_sign=True, + Xt = FeatureHasher(alternate_sign=True, non_negative=True, input_type='string').fit_transform(X) assert Xt.data.min() > 0 - Xt = FeatureHasher(alternate_sign=False, + Xt = FeatureHasher(alternate_sign=False, non_negative=True, input_type='string').fit_transform(X) assert Xt.data.min() > 0 - Xt_2 = FeatureHasher(alternate_sign=False, + Xt_2 = FeatureHasher(alternate_sign=False, non_negative=False, input_type='string').fit_transform(X) # With initially positive features, the non_negative option should # have no impact when alternate_sign=False @@ -138,17 +138,17 @@ def test_hasher_alternate_sign(): def test_hash_collisions(): X = [list("Thequickbrownfoxjumped")] - Xt = FeatureHasher(alternate_sign=True, + Xt = FeatureHasher(alternate_sign=True, non_negative=False, n_features=1, input_type='string').fit_transform(X) # check that some of the hashed tokens are added # with an opposite sign and cancel out assert abs(Xt.data[0]) < len(X[0]) - Xt = FeatureHasher(alternate_sign=True, + Xt = FeatureHasher(alternate_sign=True, non_negative=True, n_features=1, input_type='string').fit_transform(X) assert abs(Xt.data[0]) < len(X[0]) - Xt = FeatureHasher(alternate_sign=False, + Xt = FeatureHasher(alternate_sign=False, non_negative=True, n_features=1, input_type='string').fit_transform(X) assert Xt.data[0] == len(X[0]) @@ -156,15 +156,15 @@ def test_hash_collisions(): @ignore_warnings(category=DeprecationWarning) def test_hasher_negative(): X = [{"foo": 2, "bar": -4, "baz": -1}.items()] - Xt = FeatureHasher(alternate_sign=False, + Xt = FeatureHasher(alternate_sign=False, non_negative=False, input_type="pair").fit_transform(X) assert_true(Xt.data.min() < 0 and Xt.data.max() > 0) - Xt = 
FeatureHasher(alternate_sign=False, + Xt = FeatureHasher(alternate_sign=False, non_negative=True, input_type="pair").fit_transform(X) assert_true(Xt.data.min() > 0) - Xt = FeatureHasher(alternate_sign=True, + Xt = FeatureHasher(alternate_sign=True, non_negative=False, input_type="pair").fit_transform(X) assert_true(Xt.data.min() < 0 and Xt.data.max() > 0) - Xt = FeatureHasher(alternate_sign=True, + Xt = FeatureHasher(alternate_sign=True, non_negative=True, input_type="pair").fit_transform(X) assert_true(Xt.data.min() > 0) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index a4de38d959db1..05f60d2805c7c 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -491,6 +491,13 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin): .. versionadded:: 0.19 + non_negative : boolean, optional, default False + When True, an absolute value is applied to the features matrix prior to + returning it. When used in conjunction with alternate_sign=True, this + significantly reduces the inner product preservation property. + + .. deprecated:: 0.19 + This option will be removed in 0.21. dtype : type, optional Type of the matrix returned by fit_transform() or transform(). @@ -519,7 +526,7 @@ def __init__(self, input='content', encoding='utf-8', stop_words=None, token_pattern=r"(?u)\b\w\w+\b", ngram_range=(1, 1), analyzer='word', n_features=(2 ** 20), binary=False, norm='l2', alternate_sign=True, - dtype=np.float64): + non_negative=False, dtype=np.float64): self.input = input self.encoding = encoding self.decode_error = decode_error @@ -535,6 +542,7 @@ def __init__(self, input='content', encoding='utf-8', self.binary = binary self.norm = norm self.alternate_sign = alternate_sign + self.non_negative = non_negative self.dtype = dtype def partial_fit(self, X, y=None): @@ -622,7 +630,8 @@ def fit_transform(self, X, y=None): def _get_hasher(self): return FeatureHasher(n_features=self.n_features, input_type='string', dtype=self.dtype, - alternate_sign=self.alternate_sign) + alternate_sign=self.alternate_sign, + non_negative=self.non_negative) def _document_frequency(X): From a750abc6266ab7412aaf3d5969cec22918be3dcb Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 18:52:47 -0400 Subject: [PATCH 07/11] fix kernel_approximation test --- sklearn/tests/test_kernel_approximation.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sklearn/tests/test_kernel_approximation.py b/sklearn/tests/test_kernel_approximation.py index 8a2208b20af99..71ec07452abeb 100644 --- a/sklearn/tests/test_kernel_approximation.py +++ b/sklearn/tests/test_kernel_approximation.py @@ -245,13 +245,3 @@ def logging_histogram_kernel(x, y, log): n_components=(n_samples - 1), kernel_params={'log': kernel_log}).fit(X) assert_equal(len(kernel_log), n_samples * (n_samples - 1) / 2) - - def linear_kernel(X, Y): - return np.dot(X, Y.T) - - # if degree, gamma or coef0 is passed, we raise a warning - msg = "Passing gamma, coef0 or degree to Nystroem" - params = ({'gamma': 1}, {'coef0': 1}, {'degree': 2}) - for param in params: - ny = Nystroem(kernel=linear_kernel, **param) - assert_warns_message(DeprecationWarning, msg, ny.fit, X) From 5069dcf2196ffcbd79683ee3aa825d7e8ef7cc05 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 19:10:22 -0400 Subject: [PATCH 08/11] remove unused imports --- sklearn/covariance/tests/test_graph_lasso.py | 5 +---- sklearn/decomposition/fastica_.py | 1 - sklearn/decomposition/online_lda.py 
| 1 - sklearn/discriminant_analysis.py | 1 - sklearn/gaussian_process/gpr.py | 1 - sklearn/linear_model/least_angle.py | 2 +- sklearn/manifold/t_sne.py | 2 -- sklearn/preprocessing/_function_transformer.py | 1 - sklearn/preprocessing/data.py | 3 +-- sklearn/tests/test_calibration.py | 3 +-- sklearn/tests/test_discriminant_analysis.py | 1 - sklearn/tests/test_kernel_approximation.py | 1 - sklearn/utils/extmath.py | 3 +-- sklearn/utils/graph.py | 2 -- sklearn/utils/random.py | 3 +-- sklearn/utils/stats.py | 2 -- sklearn/utils/tests/test_utils.py | 5 +---- 17 files changed, 7 insertions(+), 30 deletions(-) diff --git a/sklearn/covariance/tests/test_graph_lasso.py b/sklearn/covariance/tests/test_graph_lasso.py index 33c724df781d4..d368356100a4f 100644 --- a/sklearn/covariance/tests/test_graph_lasso.py +++ b/sklearn/covariance/tests/test_graph_lasso.py @@ -9,7 +9,6 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_less -from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import ignore_warnings from sklearn.covariance import (graph_lasso, GraphLasso, GraphLassoCV, @@ -19,8 +18,6 @@ from sklearn.utils import check_random_state from sklearn import datasets -from numpy.testing import assert_equal - @ignore_warnings(category=DeprecationWarning) def test_graph_lasso(random_state=0): @@ -140,4 +137,4 @@ def test_graph_lasso_cv(random_state=1): sys.stdout = orig_stdout # Smoke test with specified alphas - GraphLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) \ No newline at end of file + GraphLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index 2eead18b2678d..693d46d31fab5 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -18,7 +18,6 @@ from ..exceptions import ConvergenceWarning from ..externals import six from ..externals.six import moves -from ..externals.six import string_types from ..utils import check_array, as_float_array, check_random_state from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index cfeba87ef3ce3..4c0f8625771c7 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -14,7 +14,6 @@ import numpy as np import scipy.sparse as sp from scipy.special import gammaln -import warnings from ..base import BaseEstimator, TransformerMixin from ..utils import (check_random_state, check_array, diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index ff8b6833cc557..e0084741e583f 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -12,7 +12,6 @@ from __future__ import print_function import warnings import numpy as np -from .utils import deprecated from scipy import linalg from .externals.six import string_types from .externals.six.moves import xrange diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py index ebe8dd3b65ade..c5ff9674bc575 100644 --- a/sklearn/gaussian_process/gpr.py +++ b/sklearn/gaussian_process/gpr.py @@ -15,7 +15,6 @@ from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C from sklearn.utils import check_random_state from sklearn.utils.validation import check_X_y, check_array -from sklearn.utils.deprecation import deprecated from sklearn.exceptions import ConvergenceWarning diff --git 
a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 0e923a424c221..bdee91f468737 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -20,7 +20,7 @@ from .base import LinearModel from ..base import RegressorMixin -from ..utils import arrayfuncs, as_float_array, check_X_y, deprecated +from ..utils import arrayfuncs, as_float_array, check_X_y from ..model_selection import check_cv from ..exceptions import ConvergenceWarning from ..utils import Parallel, delayed diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 5ddda56491564..213d75c2a4730 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -26,7 +26,6 @@ from . import _utils from . import _barnes_hut_tsne from ..externals.six import string_types -from ..utils import deprecated MACHINE_EPSILON = np.finfo(np.double).eps @@ -805,7 +804,6 @@ def _fit(self, X, skip_num_points=0): neighbors=neighbors_nn, skip_num_points=skip_num_points) - def _tsne(self, P, degrees_of_freedom, n_samples, X_embedded, neighbors=None, skip_num_points=0): """Runs t-SNE.""" diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 93afcc646e3fb..66034f6740a8e 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -3,7 +3,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.testing import assert_allclose_dense_sparse -from ..externals.six import string_types def _identity(X): diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 9b3eaa98e4c08..23eecacfda163 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -21,7 +21,6 @@ from ..base import BaseEstimator, TransformerMixin from ..externals import six -from ..externals.six import string_types from ..utils import check_array from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var @@ -2872,4 +2871,4 @@ def power_transform(X, method='box-cox', standardize=True, copy=True): Royal Statistical Society B, 26, 211-252 (1964). 
""" pt = PowerTransformer(method=method, standardize=standardize, copy=copy) - return pt.fit_transform(X) \ No newline at end of file + return pt.fit_transform(X) diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index e454633a3a294..8b18e5aafeb93 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -11,8 +11,7 @@ assert_greater, assert_almost_equal, assert_greater_equal, assert_array_equal, - assert_raises, - ignore_warnings) + assert_raises) from sklearn.datasets import make_classification, make_blobs from sklearn.naive_bayes import MultinomialNB from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor diff --git a/sklearn/tests/test_discriminant_analysis.py b/sklearn/tests/test_discriminant_analysis.py index 4cb8f5d148b04..789b274f8f7bf 100644 --- a/sklearn/tests/test_discriminant_analysis.py +++ b/sklearn/tests/test_discriminant_analysis.py @@ -9,7 +9,6 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_warns -from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import assert_greater from sklearn.utils.testing import ignore_warnings diff --git a/sklearn/tests/test_kernel_approximation.py b/sklearn/tests/test_kernel_approximation.py index 71ec07452abeb..be0d249f1a4a7 100644 --- a/sklearn/tests/test_kernel_approximation.py +++ b/sklearn/tests/test_kernel_approximation.py @@ -5,7 +5,6 @@ from sklearn.utils.testing import assert_not_equal from sklearn.utils.testing import assert_array_almost_equal, assert_raises from sklearn.utils.testing import assert_less_equal -from sklearn.utils.testing import assert_warns_message from sklearn.metrics.pairwise import kernel_metrics from sklearn.kernel_approximation import RBFSampler diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 07a83a17377b5..80bcfd5585c2c 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -17,9 +17,8 @@ import numpy as np from scipy import linalg, sparse -from . import check_random_state, deprecated +from . 
import check_random_state from .fixes import np_version -from .fixes import logsumexp as scipy_logsumexp from ._logistic_sigmoid import _log_logistic_sigmoid from ..externals.six.moves import xrange from .sparsefuncs_fast import csr_row_norms diff --git a/sklearn/utils/graph.py b/sklearn/utils/graph.py index 17caa4fa2cb0d..b030af2fed81c 100644 --- a/sklearn/utils/graph.py +++ b/sklearn/utils/graph.py @@ -11,10 +11,8 @@ # License: BSD 3 clause from scipy import sparse -from scipy.sparse import csgraph from .graph_shortest_path import graph_shortest_path # noqa -from .deprecation import deprecated ############################################################################### diff --git a/sklearn/utils/random.py b/sklearn/utils/random.py index 29d465fff8705..61be8214dd1f1 100644 --- a/sklearn/utils/random.py +++ b/sklearn/utils/random.py @@ -8,9 +8,8 @@ from sklearn.utils import check_random_state from ._random import sample_without_replacement -from .deprecation import deprecated -__all__ = ['sample_without_replacement', 'choice'] +__all__ = ['sample_without_replacement'] def random_choice_csc(n_samples, classes, class_probability=None, diff --git a/sklearn/utils/stats.py b/sklearn/utils/stats.py index 458669e23eb3a..ff770afa55ad6 100644 --- a/sklearn/utils/stats.py +++ b/sklearn/utils/stats.py @@ -1,8 +1,6 @@ import numpy as np -from scipy.stats import rankdata as scipy_rankdata from sklearn.utils.extmath import stable_cumsum -from sklearn.utils.deprecation import deprecated def _weighted_percentile(array, sample_weight, percentile=50): diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index ce69b70cb1cbb..2c3b22a4f38f7 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -4,13 +4,10 @@ import pytest import numpy as np import scipy.sparse as sp -from scipy.linalg import pinv2 -from scipy.sparse.csgraph import laplacian from sklearn.utils.testing import (assert_equal, assert_raises, assert_true, - assert_almost_equal, assert_array_equal, + assert_array_equal, SkipTest, assert_raises_regex, - assert_greater_equal, ignore_warnings, assert_warns_message, assert_no_warnings) from sklearn.utils import check_random_state from sklearn.utils import deprecated From cfa9216bfdbd622f8a03ab3a67c56825189caf32 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 1 Oct 2018 19:13:41 -0400 Subject: [PATCH 09/11] remove more unused imports --- sklearn/covariance/tests/test_graphical_lasso.py | 6 +----- sklearn/decomposition/tests/test_kernel_pca.py | 2 +- sklearn/decomposition/tests/test_online_lda.py | 3 +-- sklearn/decomposition/tests/test_pca.py | 2 -- sklearn/linear_model/tests/test_huber.py | 1 - sklearn/metrics/tests/test_classification.py | 1 - sklearn/metrics/tests/test_pairwise.py | 1 - sklearn/metrics/tests/test_score_objects.py | 1 - sklearn/model_selection/tests/test_split.py | 1 - sklearn/neighbors/tests/test_kd_tree.py | 2 +- sklearn/neighbors/tests/test_lof.py | 2 +- sklearn/preprocessing/tests/test_data.py | 1 - sklearn/utils/tests/test_extmath.py | 3 --- 13 files changed, 5 insertions(+), 21 deletions(-) diff --git a/sklearn/covariance/tests/test_graphical_lasso.py b/sklearn/covariance/tests/test_graphical_lasso.py index 25e2f191d3ec8..47f15f4a762ac 100644 --- a/sklearn/covariance/tests/test_graphical_lasso.py +++ b/sklearn/covariance/tests/test_graphical_lasso.py @@ -8,7 +8,6 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_less -from sklearn.utils.testing 
import assert_warns_message from sklearn.covariance import (graphical_lasso, GraphicalLasso, GraphicalLassoCV, empirical_covariance) @@ -16,9 +15,6 @@ from sklearn.externals.six.moves import StringIO from sklearn.utils import check_random_state from sklearn import datasets -from sklearn.utils.fixes import PY3_OR_LATER - -from numpy.testing import assert_equal def test_graphical_lasso(random_state=0): @@ -136,4 +132,4 @@ def test_graphical_lasso_cv(random_state=1): sys.stdout = orig_stdout # Smoke test with specified alphas - GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) \ No newline at end of file + GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index b0f2c5aeae52a..040f9e49d590b 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -4,7 +4,7 @@ from sklearn.utils.testing import (assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, - assert_raises, ignore_warnings) + assert_raises) from sklearn.decomposition import PCA, KernelPCA from sklearn.datasets import make_circles diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py index 655b367e0735a..0abc2efe75ec2 100644 --- a/sklearn/decomposition/tests/test_online_lda.py +++ b/sklearn/decomposition/tests/test_online_lda.py @@ -19,7 +19,6 @@ from sklearn.utils.testing import assert_greater_equal from sklearn.utils.testing import assert_raises_regexp from sklearn.utils.testing import if_safe_multiprocessing_with_blas -from sklearn.utils.testing import assert_warns from sklearn.exceptions import NotFittedError from sklearn.externals.six.moves import xrange @@ -401,4 +400,4 @@ def check_verbosity(verbose, evaluate_every, expected_lines, def test_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities): check_verbosity(verbose, evaluate_every, expected_lines, - expected_perplexities) \ No newline at end of file + expected_perplexities) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 7484367127157..c852e4bed0e58 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -13,7 +13,6 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_no_warnings -from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_less @@ -685,7 +684,6 @@ def test_svd_solver_auto(): assert_array_almost_equal(pca.components_, pca_test.components_) - @pytest.mark.parametrize('svd_solver', solver_list) def test_pca_sparse_input(svd_solver): X = np.random.RandomState(0).rand(5, 4) diff --git a/sklearn/linear_model/tests/test_huber.py b/sklearn/linear_model/tests/test_huber.py index d7658396b3f22..3bc77ee8c1778 100644 --- a/sklearn/linear_model/tests/test_huber.py +++ b/sklearn/linear_model/tests/test_huber.py @@ -4,7 +4,6 @@ import numpy as np from scipy import optimize, sparse -from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 
c07f9d66aa0f9..8e18af7128350 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -14,7 +14,6 @@ from sklearn.datasets import make_multilabel_classification from sklearn.preprocessing import label_binarize from sklearn.utils.validation import check_random_state -from sklearn.utils.testing import assert_dict_equal from sklearn.utils.testing import assert_raises, clean_warning_registry from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_equal diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 571e764a2c48a..62aaec5fdc9a6 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -17,7 +17,6 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regexp from sklearn.utils.testing import assert_true -from sklearn.utils.testing import assert_warns from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_warns_message diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index da04b4215dce0..9033a2b2d86ee 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -17,7 +17,6 @@ from sklearn.utils.testing import assert_false from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_not_equal -from sklearn.utils.testing import assert_warns_message from sklearn.base import BaseEstimator from sklearn.metrics import (f1_score, r2_score, roc_auc_score, fbeta_score, diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 28286bf2402fd..637b4dca5537f 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -49,7 +49,6 @@ from sklearn.linear_model import Ridge from sklearn.model_selection._split import _validate_shuffle_split -from sklearn.model_selection._split import _CVIterableWrapper from sklearn.model_selection._split import _build_repr from sklearn.model_selection._split import CV_WARNING from sklearn.model_selection._split import NSPLIT_WARNING diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 18d2138021605..0b9c612624cff 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -8,7 +8,7 @@ nodeheap_sort, DTYPE, ITYPE) from sklearn.neighbors.dist_metrics import DistanceMetric from sklearn.utils import check_random_state -from sklearn.utils.testing import SkipTest, assert_allclose +from sklearn.utils.testing import assert_allclose rng = np.random.RandomState(42) V = rng.random_sample((3, 3)) diff --git a/sklearn/neighbors/tests/test_lof.py b/sklearn/neighbors/tests/test_lof.py index ed57a1d0fba29..ef833024d5cb8 100644 --- a/sklearn/neighbors/tests/test_lof.py +++ b/sklearn/neighbors/tests/test_lof.py @@ -14,7 +14,7 @@ from sklearn.metrics import roc_auc_score from sklearn.utils import check_random_state -from sklearn.utils.testing import assert_greater, ignore_warnings +from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_warns_message diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index f4d0b5af9799f..3279387dcce7a 
100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -6,7 +6,6 @@ from __future__ import division import warnings -import re import itertools import numpy as np diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index d22ec5b886c89..7586bbfd1eeb1 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -23,11 +23,9 @@ from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import skip_if_32bit from sklearn.utils.testing import SkipTest -from sklearn.utils.testing import ignore_warnings from sklearn.utils.fixes import np_version from sklearn.utils.extmath import density -from sklearn.utils.extmath import squared_norm from sklearn.utils.extmath import randomized_svd from sklearn.utils.extmath import row_norms from sklearn.utils.extmath import weighted_mode @@ -87,7 +85,6 @@ def test_random_weights(): assert_array_almost_equal(score.ravel(), w[:, :5].sum(1)) - def check_randomized_svd_low_rank(dtype): # Check that extmath.randomized_svd is consistent with linalg.svd n_samples = 100 From 45289e88bf4ba83453a9fd20200c0979499c34b9 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 2 Oct 2018 12:33:48 -0400 Subject: [PATCH 10/11] fix no newline at end of file in test_encoders.py --- sklearn/preprocessing/tests/test_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 13dfe08201c1e..d3833ed97c79d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -607,4 +607,4 @@ def test_encoder_dtypes_pandas(): def test_one_hot_encoder_warning(): enc = OneHotEncoder() X = [['Male', 1], ['Female', 3]] - np.testing.assert_no_warnings(enc.fit_transform, X) \ No newline at end of file + np.testing.assert_no_warnings(enc.fit_transform, X) From 3afc42bb8512119be5c99e270421746c72fb73dc Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 2 Oct 2018 13:39:07 -0400 Subject: [PATCH 11/11] fix kernel approximation test --- sklearn/tests/test_kernel_approximation.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/tests/test_kernel_approximation.py b/sklearn/tests/test_kernel_approximation.py index be0d249f1a4a7..c2ba50f3728ff 100644 --- a/sklearn/tests/test_kernel_approximation.py +++ b/sklearn/tests/test_kernel_approximation.py @@ -1,5 +1,6 @@ import numpy as np from scipy.sparse import csr_matrix +import pytest from sklearn.utils.testing import assert_array_equal, assert_equal, assert_true from sklearn.utils.testing import assert_not_equal @@ -244,3 +245,14 @@ def logging_histogram_kernel(x, y, log): n_components=(n_samples - 1), kernel_params={'log': kernel_log}).fit(X) assert_equal(len(kernel_log), n_samples * (n_samples - 1) / 2) + + def linear_kernel(X, Y): + return np.dot(X, Y.T) + + # if degree, gamma or coef0 is passed, we raise a warning + msg = "Don't pass gamma, coef0 or degree to Nystroem" + params = ({'gamma': 1}, {'coef0': 1}, {'degree': 2}) + for param in params: + ny = Nystroem(kernel=linear_kernel, **param) + with pytest.raises(ValueError, match=msg): + ny.fit(X)
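
For reference, a minimal usage sketch (illustration only, not part of the patch series) of the behaviour the re-added Nystroem test asserts: with a callable kernel, passing gamma, coef0 or degree is now rejected at fit time with the corrected ValueError. The kernel function and data below are made up for the example.

    import numpy as np
    from sklearn.kernel_approximation import Nystroem

    def linear_kernel(X, Y):
        # example callable kernel; any callable triggers the check
        return np.dot(X, Y.T)

    X = np.random.RandomState(0).rand(10, 4)

    # Combining a callable kernel with gamma (or coef0/degree) raises ValueError.
    try:
        Nystroem(kernel=linear_kernel, gamma=1, n_components=5).fit(X)
    except ValueError as exc:
        print(exc)  # "Don't pass gamma, coef0 or degree to Nystroem if using a callable kernel."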