1
1
# Authors: Manoj Kumar <[email protected] >
2
2
# Alexandre Gramfort <[email protected] >
3
+ # Joel Nothman <[email protected] >
3
4
# License: BSD 3 clause
4
5
from __future__ import division
5
6
@@ -117,11 +118,11 @@ class _CFNode(object):
117
118
the final subclusters.
118
119
119
120
init_centroids_ : ndarray, shape (branching_factor + 1, n_features)
120
- manipulate init_centroids throughout rather than centroids_ since
121
+ manipulate init_centroids_ throughout rather than centroids_ since
121
122
the centroids are just a view of the init_centroids_.
122
123
123
124
init_sq_norm_ : ndarray, shape (branching_factor + 1,)
124
- manipulate squared_norms throughout. similar to init_centroids_.
125
+ manipulate init_sq_norm_ throughout, similarly to init_centroids_.
125
126
126
127
centroids_ : ndarray
127
128
view of init_centroids_.
@@ -164,12 +165,9 @@ def update_split_subclusters(self, subcluster,
164
165
"""
165
166
n_samples = len (self .subclusters_ )
166
167
ind = self .subclusters_ .index (subcluster )
167
- del self .subclusters_ [ind ]
168
- self .init_centroids_ [ind :n_samples - 1 , :] = \
169
- self .init_centroids_ [ind + 1 :n_samples , :]
170
- self .init_sq_norm_ [ind :n_samples - 1 ] = \
171
- self .init_sq_norm_ [ind + 1 :n_samples ]
172
- self .append_subcluster (new_subcluster1 )
168
+ self .subclusters_ [ind ] = new_subcluster1
169
+ self .init_centroids_ [ind ] = new_subcluster1 .centroid_
170
+ self .init_sq_norm_ [ind ] = new_subcluster1 .sq_norm_
173
171
self .append_subcluster (new_subcluster2 )
174
172
175
173
def insert_cf_subcluster (self , subcluster ):
@@ -316,6 +314,7 @@ def merge_subcluster(self, nominee_cluster, threshold):
316
314
return True
317
315
return False
318
316
317
+ @property
319
318
def radius (self ):
320
319
"""Return radius of the subcluster"""
321
320
dot_product = - 2 * np .dot (self .linear_sum_ , self .centroid_ )
@@ -349,10 +348,12 @@ class Birch(BaseEstimator, TransformerMixin, ClusterMixin):
349
348
350
349
n_clusters : int, instance of sklearn.cluster model, default None
351
350
Number of clusters after the final clustering step, which treats the
352
- subclusters from the leaves as new samples. By default the global
353
- clustering step is AgglomerativeClustering with n_clusters set to
354
- 3. By default, this final clustering step is not performed and the
355
- subclusters are returned as they are.
351
+ subclusters from the leaves as new samples. By default, this final
352
+ clustering step is not performed and the subclusters are returned
353
+ as they are. If a model is provided, the model is fit treating
354
+ the subclusters as new samples and the initial data is mapped to the
355
+ label of the closest subcluster. If an int is provided, the model
356
+ fit is an AgglomerativeClustering with n_clusters set to that int.
356
357
357
358
compute_labels : bool, default True
358
359
Whether or not to compute labels for each fit.
@@ -486,14 +487,35 @@ def _get_leaves(self):
486
487
"""
487
488
leaf_ptr = self .dummy_leaf_ .next_leaf_
488
489
leaves = []
489
- while leaf_ptr :
490
+ while leaf_ptr is not None :
490
491
leaves .append (leaf_ptr )
491
492
leaf_ptr = leaf_ptr .next_leaf_
492
493
return leaves
493
494
495
+ def partial_fit (self , X = None , y = None ):
496
+ """
497
+ Online learning. Prevents rebuilding of CFTree from scratch.
498
+
499
+ Parameters
500
+ ----------
501
+ X : {array-like, sparse matrix}, shape (n_samples, n_features), None
502
+ Input data. If X is not provided, only the global clustering
503
+ step is done.
504
+ """
505
+ self .partial_fit_ , self .fit_ = True , False
506
+ if X is None :
507
+ # Perform just the final global clustering step.
508
+ self ._global_clustering ()
509
+ return self
510
+ else :
511
+ self ._check_fit (X )
512
+ return self ._fit (X )
513
+
494
514
def _check_fit (self , X ):
495
515
is_fitted = hasattr (self , 'subcluster_centers_' )
496
- has_partial_fit = hasattr (self , 'partial_fit' )
516
+
517
+ # Called by partial_fit, before fitting.
518
+ has_partial_fit = hasattr (self , 'partial_fit_' )
497
519
498
520
# Should raise an error if one does not fit before predicting.
499
521
if not has_partial_fit and not is_fitted :
@@ -502,7 +524,7 @@ def _check_fit(self, X):
502
524
if is_fitted and X .shape [1 ] != self .subcluster_centers_ .shape [1 ]:
503
525
raise ValueError (
504
526
"Training data and predicted data do "
505
- "not have same no. of features." )
527
+ "not have same number of features." )
506
528
507
529
def predict (self , X ):
508
530
"""
@@ -527,30 +549,12 @@ def predict(self, X):
527
549
reduced_distance += self ._subcluster_norms
528
550
return self .subcluster_labels_ [np .argmin (reduced_distance , axis = 1 )]
529
551
530
- def partial_fit (self , X = None , y = None ):
531
- """
532
- Online learning. Prevents rebuilding of CFTree from scratch.
533
-
534
- Parameters
535
- ----------
536
- X : {array-like, sparse matrix}, shape (n_samples, n_features), None
537
- Input data. If X is not provided, only the global clustering
538
- step is done.
539
- """
540
- self .partial_fit_ , self .fit_ = True , False
541
- if X is None :
542
- # Perform just the final global clustering step.
543
- self ._global_clustering ()
544
- return self
545
- else :
546
- self ._check_fit (X )
547
- return self ._fit (X )
548
-
549
552
def transform (self , X , y = None ):
550
553
"""
551
554
Transform X into subcluster centroids dimension.
552
555
553
- Each dimension represents the distance between each cluster centroid.
556
+ Each dimension represents the distance from the sample point to each
557
+ cluster centroid.
554
558
555
559
Parameters
556
560
----------
@@ -570,30 +574,28 @@ def _global_clustering(self, X=None):
570
574
"""
571
575
Global clustering for the subclusters obtained after fitting
572
576
"""
573
- clusters = self .n_clusters
577
+ clusterer = self .n_clusters
574
578
centroids = self .subcluster_centers_
575
579
compute_labels = (X is not None ) and self .compute_labels
576
580
577
581
# Preprocessing for the global clustering.
578
582
not_enough_centroids = False
579
- if hasattr (clusters , 'fit_predict' ):
580
- n_cluster = clusters
581
- elif isinstance (clusters , int ):
582
- n_cluster = AgglomerativeClustering (
583
- n_clusters = clusters )
583
+ if isinstance (clusterer , int ):
584
+ clusterer = AgglomerativeClustering (
585
+ n_clusters = self .n_clusters )
584
586
# There is no need to perform the global clustering step.
585
- if len (centroids ) < clusters :
587
+ if len (centroids ) < self . n_clusters :
586
588
not_enough_centroids = True
587
- elif clusters is not None :
589
+ elif (clusterer is not None and not
590
+ hasattr (clusterer , 'fit_predict' )):
588
591
raise ValueError ("n_clusters should be an instance of "
589
592
"ClusterMixin or an int" )
590
593
591
594
# To use in predict to avoid recalculation.
592
- if compute_labels :
593
- self ._subcluster_norms = row_norms (
594
- self .subcluster_centers_ , squared = True )
595
+ self ._subcluster_norms = row_norms (
596
+ self .subcluster_centers_ , squared = True )
595
597
596
- if self . n_clusters is None or not_enough_centroids :
598
+ if clusterer is None or not_enough_centroids :
597
599
self .subcluster_labels_ = np .arange (len (centroids ))
598
600
if not_enough_centroids :
599
601
warnings .warn (
@@ -604,7 +606,7 @@ def _global_clustering(self, X=None):
604
606
# The global clustering step that clusters the subclusters of
605
607
# the leaves. It assumes the centroids of the subclusters as
606
608
# samples and finds the final centroids.
607
- self .subcluster_labels_ = n_cluster .fit_predict (
609
+ self .subcluster_labels_ = clusterer .fit_predict (
608
610
self .subcluster_centers_ )
609
611
610
612
if compute_labels :
0 commit comments