diff --git a/sklearn/cluster/_feature_agglomeration.py b/sklearn/cluster/_feature_agglomeration.py
index c6daf4540ef27..b2b28497aedfa 100644
--- a/sklearn/cluster/_feature_agglomeration.py
+++ b/sklearn/cluster/_feature_agglomeration.py
@@ -10,10 +10,12 @@
 from ..base import TransformerMixin
 from ..utils import check_array
 from ..utils.validation import check_is_fitted
+from scipy.sparse import issparse
 
 ###############################################################################
 # Mixin class for feature agglomeration.
 
+
 class AgglomerationTransform(TransformerMixin):
     """
     A class for feature agglomeration via the transform interface
@@ -40,14 +42,21 @@ def transform(self, X):
 
         pooling_func = self.pooling_func
         X = check_array(X)
-        nX = []
         if len(self.labels_) != X.shape[1]:
             raise ValueError("X has a different number of features than "
                              "during fitting.")
-
-        for l in np.unique(self.labels_):
-            nX.append(pooling_func(X[:, self.labels_ == l], axis=1))
-        return np.array(nX).T
+        if pooling_func == np.mean and not issparse(X):
+            size = np.bincount(self.labels_)
+            n_samples = X.shape[0]
+            # a fast way to compute the mean of grouped features
+            nX = np.array([np.bincount(self.labels_, X[i, :]) / size
+                           for i in range(n_samples)])
+        else:
+            nX = []
+            for l in np.unique(self.labels_):
+                nX.append(pooling_func(X[:, self.labels_ == l], axis=1))
+            nX = np.array(nX).T
+        return nX
 
     def inverse_transform(self, Xred):
         """
diff --git a/sklearn/cluster/tests/test_feature_agglomeration.py b/sklearn/cluster/tests/test_feature_agglomeration.py
new file mode 100644
index 0000000000000..98d5dfc4b72ca
--- /dev/null
+++ b/sklearn/cluster/tests/test_feature_agglomeration.py
@@ -0,0 +1,43 @@
+"""
+Tests for sklearn.cluster._feature_agglomeration
+"""
+# Authors: Sergul Aydore 2017
+import numpy as np
+from sklearn.cluster import FeatureAgglomeration
+from sklearn.utils.testing import assert_true
+from sklearn.utils.testing import assert_array_almost_equal
+
+
+def test_feature_agglomeration():
+    n_clusters = 1
+    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)
+
+    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
+                                      pooling_func=np.mean)
+    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
+                                        pooling_func=np.median)
+    agglo_mean.fit(X)
+    agglo_median.fit(X)
+    assert_true(np.size(np.unique(agglo_mean.labels_)) == n_clusters)
+    assert_true(np.size(np.unique(agglo_median.labels_)) == n_clusters)
+    assert_true(np.size(agglo_mean.labels_) == X.shape[1])
+    assert_true(np.size(agglo_median.labels_) == X.shape[1])
+
+    # Test transform
+    Xt_mean = agglo_mean.transform(X)
+    Xt_median = agglo_median.transform(X)
+    assert_true(Xt_mean.shape[1] == n_clusters)
+    assert_true(Xt_median.shape[1] == n_clusters)
+    assert_true(Xt_mean == np.array([1 / 3.]))
+    assert_true(Xt_median == np.array([0.]))
+
+    # Test inverse transform
+    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
+    X_full_median = agglo_median.inverse_transform(Xt_median)
+    assert_true(np.unique(X_full_mean[0]).size == n_clusters)
+    assert_true(np.unique(X_full_median[0]).size == n_clusters)
+
+    assert_array_almost_equal(agglo_mean.transform(X_full_mean),
+                              Xt_mean)
+    assert_array_almost_equal(agglo_median.transform(X_full_median),
+                              Xt_median)
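
For context (not part of the patch): the fast path above replaces per-cluster column slicing with one weighted np.bincount per row, since np.bincount(labels, weights=row) sums the row's entries within each cluster, and dividing by the cluster sizes gives the grouped mean. A minimal standalone sketch of that equivalence, using made-up X and labels_ arrays:

import numpy as np

# Hypothetical assignment of 6 features to 3 clusters (every cluster is
# non-empty, so the division by cluster sizes below is safe).
labels_ = np.array([0, 1, 0, 2, 1, 2])
X = np.arange(12, dtype=float).reshape(2, 6)  # (n_samples, n_features)

# Fast path from the patch: weighted bincount per row / cluster sizes.
size = np.bincount(labels_)
fast = np.array([np.bincount(labels_, X[i, :]) / size
                 for i in range(X.shape[0])])

# Generic path from the patch: pool each cluster's columns explicitly.
slow = np.array([np.mean(X[:, labels_ == l], axis=1)
                 for l in np.unique(labels_)]).T

assert np.allclose(fast, slow)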