Commit b9474d0
Made the following changes:
1. Made radius a property.
2. Added a test for compute_labels.
3. Minor changes to the docs and to split_subcluster.
4. Renamed n_cluster to clusterer.
1 parent: ce6a01d

2 files changed: +66 -51

sklearn/cluster/birch.py (+50 -48)
@@ -1,5 +1,6 @@
 # Authors: Manoj Kumar <[email protected]>
 #          Alexandre Gramfort <[email protected]>
+#          Joel Nothman <[email protected]>
 # License: BSD 3 clause
 from __future__ import division

@@ -117,11 +118,11 @@ class _CFNode(object):
         the final subclusters.

     init_centroids_ : ndarray, shape (branching_factor + 1, n_features)
-        manipulate init_centroids throughout rather than centroids_ since
+        manipulate init_centroids_ throughout rather than centroids_ since
         the centroids are just a view of the init_centroids_ .

     init_sq_norm_ : ndarray, shape (branching_factor + 1,)
-        manipulate squared_norms throughout. similar to init_centroids_.
+        manipulate init_sq_norm_ throughout. similar to init_centroids_.

     centroids_ : ndarray
         view of init_centroids_.
@@ -164,12 +165,9 @@ def update_split_subclusters(self, subcluster,
         """
         n_samples = len(self.subclusters_)
         ind = self.subclusters_.index(subcluster)
-        del self.subclusters_[ind]
-        self.init_centroids_[ind:n_samples - 1, :] = \
-            self.init_centroids_[ind + 1:n_samples, :]
-        self.init_sq_norm_[ind:n_samples - 1] = \
-            self.init_sq_norm_[ind + 1:n_samples]
-        self.append_subcluster(new_subcluster1)
+        self.subclusters_[ind] = new_subcluster1
+        self.init_centroids_[ind] = new_subcluster1.centroid_
+        self.init_sq_norm_[ind] = new_subcluster1.sq_norm_
         self.append_subcluster(new_subcluster2)

     def insert_cf_subcluster(self, subcluster):
@@ -316,6 +314,7 @@ def merge_subcluster(self, nominee_cluster, threshold):
             return True
         return False

+    @property
     def radius(self):
         """Return radius of the subcluster"""
         dot_product = -2 * np.dot(self.linear_sum_, self.centroid_)
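For reference, the one-line radius computation above rests on a clustering-feature identity: with LS = sum(x_i), SS = sum(||x_i||^2) and centroid c = LS / n, the mean squared distance to the centroid is (SS - 2 * LS.c) / n + ||c||^2. A minimal standalone sketch checking that identity — the local names mirror, but are not, the subcluster's attributes, and it assumes squared_sum_ and n_samples_ are tracked alongside linear_sum_ as in the surrounding class:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(20, 3)                        # points of one subcluster (made up)
n_samples = len(X)
linear_sum = X.sum(axis=0)                 # LS
squared_sum = (X ** 2).sum()               # SS
centroid = linear_sum / n_samples
sq_norm = np.dot(centroid, centroid)       # ||c||^2

dot_product = -2 * np.dot(linear_sum, centroid)
radius = np.sqrt((squared_sum + dot_product) / n_samples + sq_norm)

# Same value, computed directly from the points.
brute_force = np.sqrt(((X - centroid) ** 2).sum(axis=1).mean())
assert np.isclose(radius, brute_force)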
@@ -349,10 +348,12 @@ class Birch(BaseEstimator, TransformerMixin, ClusterMixin):

     n_clusters : int, instance of sklearn.cluster model, default None
         Number of clusters after the final clustering step, which treats the
-        subclusters from the leaves as new samples. By default the global
-        clustering step is AgglomerativeClustering with n_clusters set to
-        3. By default, this final clustering step is not performed and the
-        subclusters are returned as they are.
+        subclusters from the leaves as new samples. By default, this final
+        clustering step is not performed and the subclusters are returned
+        as they are. If a model is provided, the model is fit treating
+        the subclusters as new samples and the initial data is mapped to the
+        label of the closest subcluster. If an int is provided, the model
+        fit is AgglomerativeClustering with n_clusters set to the int.

     compute_labels : bool, default True
         Whether or not to compute labels for each fit.
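The rewritten docstring describes three modes for n_clusters. A short usage sketch; the blob data and the choice of 5 clusters are illustrative only:

from sklearn.cluster import AgglomerativeClustering, Birch
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=5, random_state=0)

# None: skip the global step; each leaf subcluster keeps its own label.
Birch(n_clusters=None).fit(X)

# int: the subcluster centroids are re-clustered with
# AgglomerativeClustering(n_clusters=5).
Birch(n_clusters=5).fit(X)

# estimator: any model with fit_predict is fit on the subcluster centroids;
# each sample then gets the label of its closest subcluster.
Birch(n_clusters=AgglomerativeClustering(n_clusters=5)).fit(X)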
@@ -486,14 +487,35 @@ def _get_leaves(self):
         """
         leaf_ptr = self.dummy_leaf_.next_leaf_
         leaves = []
-        while leaf_ptr:
+        while leaf_ptr is not None:
             leaves.append(leaf_ptr)
             leaf_ptr = leaf_ptr.next_leaf_
         return leaves

+    def partial_fit(self, X=None, y=None):
+        """
+        Online learning. Prevents rebuilding of CFTree from scratch.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features), None
+            Input data. If X is not provided, only the global clustering
+            step is done.
+        """
+        self.partial_fit_, self.fit_ = True, False
+        if X is None:
+            # Perform just the final global clustering step.
+            self._global_clustering()
+            return self
+        else:
+            self._check_fit(X)
+            return self._fit(X)
+
     def _check_fit(self, X):
         is_fitted = hasattr(self, 'subcluster_centers_')
-        has_partial_fit = hasattr(self, 'partial_fit')
+
+        # Called by partial_fit, before fitting.
+        has_partial_fit = hasattr(self, 'partial_fit_')

         # Should raise an error if one does not fit before predicting.
         if not has_partial_fit and not is_fitted:
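A sketch of the online path the relocated partial_fit enables, consistent with its docstring: chunks grow the CF-tree in place, and a final call with X=None runs only the global clustering step. The chunking and blob parameters here are made up for illustration:

import numpy as np
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=4, random_state=0)
brc = Birch(n_clusters=4)
for chunk in np.array_split(X, 4):
    brc.partial_fit(chunk)    # each call grows the existing CF-tree

brc.partial_fit(X=None)       # X=None runs only the global clustering step
labels = brc.predict(X)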
@@ -502,7 +524,7 @@ def _check_fit(self, X):
         if is_fitted and X.shape[1] != self.subcluster_centers_.shape[1]:
             raise ValueError(
                 "Training data and predicted data do "
-                "not have same no. of features.")
+                "not have same number of features.")

     def predict(self, X):
         """
@@ -527,30 +549,12 @@ def predict(self, X):
         reduced_distance += self._subcluster_norms
         return self.subcluster_labels_[np.argmin(reduced_distance, axis=1)]

-    def partial_fit(self, X=None, y=None):
-        """
-        Online learning. Prevents rebuilding of CFTree from scratch.
-
-        Parameters
-        ----------
-        X : {array-like, sparse matrix}, shape (n_samples, n_features), None
-            Input data. If X is not provided, only the global clustering
-            step is done.
-        """
-        self.partial_fit_, self.fit_ = True, False
-        if X is None:
-            # Perform just the final global clustering step.
-            self._global_clustering()
-            return self
-        else:
-            self._check_fit(X)
-            return self._fit(X)
-
     def transform(self, X, y=None):
         """
         Transform X into subcluster centroids dimension.

-        Each dimension represents the distance between each cluster centroid.
+        Each dimension represents the distance from the sample point to each
+        cluster centroid.

         Parameters
         ----------
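A sketch of what the clarified transform docstring promises: an (n_samples, n_subclusters) matrix whose entry (i, j) is the distance from sample i to subcluster centroid j. The data and shapes here are illustrative assumptions, not taken from the patch:

from sklearn.cluster import Birch
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=50, centers=3, random_state=0)
brc = Birch(n_clusters=None).fit(X)
D = brc.transform(X)
assert D.shape == (50, brc.subcluster_centers_.shape[0])
# D[i, j]: distance from X[i] to subcluster_centers_[j].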
@@ -570,30 +574,28 @@ def _global_clustering(self, X=None):
         """
         Global clustering for the subclusters obtained after fitting
         """
-        clusters = self.n_clusters
+        clusterer = self.n_clusters
         centroids = self.subcluster_centers_
         compute_labels = (X is not None) and self.compute_labels

         # Preprocessing for the global clustering.
         not_enough_centroids = False
-        if hasattr(clusters, 'fit_predict'):
-            n_cluster = clusters
-        elif isinstance(clusters, int):
-            n_cluster = AgglomerativeClustering(
-                n_clusters=clusters)
+        if isinstance(clusterer, int):
+            clusterer = AgglomerativeClustering(
+                n_clusters=self.n_clusters)
             # There is no need to perform the global clustering step.
-            if len(centroids) < clusters:
+            if len(centroids) < self.n_clusters:
                 not_enough_centroids = True
-        elif clusters is not None:
+        elif (clusterer is not None and not
+              hasattr(clusterer, 'fit_predict')):
             raise ValueError("n_clusters should be an instance of "
                              "ClusterMixin or an int")

         # To use in predict to avoid recalculation.
-        if compute_labels:
-            self._subcluster_norms = row_norms(
-                self.subcluster_centers_, squared=True)
+        self._subcluster_norms = row_norms(
+            self.subcluster_centers_, squared=True)

-        if self.n_clusters is None or not_enough_centroids:
+        if clusterer is None or not_enough_centroids:
             self.subcluster_labels_ = np.arange(len(centroids))
             if not_enough_centroids:
                 warnings.warn(
@@ -604,7 +606,7 @@ def _global_clustering(self, X=None):
             # The global clustering step that clusters the subclusters of
             # the leaves. It assumes the centroids of the subclusters as
             # samples and finds the final centroids.
-            self.subcluster_labels_ = n_cluster.fit_predict(
+            self.subcluster_labels_ = clusterer.fit_predict(
                 self.subcluster_centers_)

             if compute_labels:
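Note the _subcluster_norms change above: it is now computed on every fit rather than only when compute_labels is set, presumably because predict needs it regardless of that flag (the new test_compute_label_predict below exercises exactly this). predict ranks subclusters by the "reduced" distance -2*x.c + ||c||^2, which shares its argmin with the full squared distance since ||x||^2 is constant per sample. A standalone check of that identity with made-up arrays:

import numpy as np
from sklearn.utils.extmath import row_norms

rng = np.random.RandomState(0)
X = rng.rand(10, 3)    # samples (made up)
C = rng.rand(4, 3)     # subcluster centroids (made up)

subcluster_norms = row_norms(C, squared=True)       # ||c||^2 per centroid
reduced = -2 * np.dot(X, C.T) + subcluster_norms    # drops constant ||x||^2
full = ((X[:, np.newaxis, :] - C[np.newaxis, :, :]) ** 2).sum(axis=-1)
assert np.array_equal(np.argmin(reduced, axis=1), np.argmin(full, axis=1))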

sklearn/cluster/tests/test_birch.py (+16 -3)
@@ -14,6 +14,7 @@

 from sklearn.utils.testing import assert_greater_equal
 from sklearn.utils.testing import assert_equal
+from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_raises
@@ -34,8 +35,6 @@ def test_n_samples_leaves_roots():

 def test_partial_fit():
     """Test that fit is equivalent to calling partial_fit multiple times"""
-    # Test that same subcluster centres are obtained after calling partial
-    # fit twice
     X, y = make_blobs(n_samples=100)
     brc = Birch(n_clusters=3)
     brc.fit(X)
@@ -75,6 +74,7 @@ def test_n_clusters():
     X, y = make_blobs(n_samples=100, centers=10)
     brc1 = Birch(n_clusters=10)
     brc1.fit(X)
+    assert_greater(len(brc1.subcluster_centers_), 10)
     assert_equal(len(np.unique(brc1.labels_)), 10)

     # Test that n_clusters = Agglomerative Clustering gives
@@ -144,7 +144,7 @@ def check_threshold(birch_instance, threshold):
     while current_leaf:
         subclusters = current_leaf.subclusters_
         for sc in subclusters:
-            assert_greater_equal(threshold, sc.radius())
+            assert_greater_equal(threshold, sc.radius)
         current_leaf = current_leaf.next_leaf_

@@ -158,3 +158,16 @@ def test_threshold():
     brc = Birch(threshold=5.0, n_clusters=None)
     brc.fit(X)
     check_threshold(brc, 5.)
+
+
+def test_compute_label_predict():
+    """Test predict is invariant of the param 'compute_labels'"""
+    X, y = make_blobs(n_samples=80, centers=4)
+    brc1 = Birch(threshold=0.5, n_clusters=None, compute_labels=True)
+    brc1.fit(X)
+    brc1_labels = brc1.predict(X)
+
+    brc2 = Birch(threshold=0.5, n_clusters=None, compute_labels=False)
+    brc2.fit(X)
+    brc2_labels = brc2.predict(X)
+    assert_almost_equal(v_measure_score(brc1_labels, brc2_labels), 1.0)
