From 8c02326132141581b5ae4003450f9d73e84f0495 Mon Sep 17 00:00:00 2001 From: scampion Date: Tue, 11 Jan 2011 16:26:18 +0100 Subject: [PATCH 1/7] Self-Organizing Map algorithm added with Calinksi cluster quality mesure --- examples/cluster/som_digits.py | 76 +++++++++++ scikits/learn/cluster/som_.py | 171 ++++++++++++++++++++++++ scikits/learn/cluster/tests/test_som.py | 43 ++++++ 3 files changed, 290 insertions(+) create mode 100644 examples/cluster/som_digits.py create mode 100644 scikits/learn/cluster/som_.py create mode 100644 scikits/learn/cluster/tests/test_som.py diff --git a/examples/cluster/som_digits.py b/examples/cluster/som_digits.py new file mode 100644 index 0000000000000..54f162b76a3c1 --- /dev/null +++ b/examples/cluster/som_digits.py @@ -0,0 +1,76 @@ +""" +=========================================================== +A demo of Self-Organising Map and KMeans on the handwritten +digits data +=========================================================== +""" +from __future__ import division +print __doc__ + +from time import time +import numpy as np + +from scikits.learn.cluster import KMeans +from scikits.learn.cluster import SOM +from scikits.learn.cluster import calinski_index + +from scikits.learn.datasets import load_digits +from scikits.learn.pca import PCA +from scikits.learn.preprocessing import scale + +def display(labels,digits,nbclusters): + r = {0:[],1:[],2:[],3:[],4:[],5:[],6:[],7:[],8:[],9:[]} + for i,v in enumerate(labels): + r[digits.target[i]].append(v) + + for k,v in r.items(): + s = set(v) + print 'target %i | nb cluster %i |' % (k,len(s)),s + +np.random.seed(42) + +digits = load_digits() +data = scale(digits.data) + +n_samples, n_features = data.shape +n_digits = len(np.unique(digits.target)) + +print "n_digits: %d" % n_digits +print "n_features: %d" % n_features +print "n_samples: %d" % n_samples +print + +print "Self-Organizing Map " +t0 = time() +grid_width = 4 +som = SOM(w=grid_width,n_init=n_samples*5,learning_rate=1) +som.fit(data) +print "done in %0.3fs" % (time() - t0) +print + +display(som.labels_,digits,grid_width**2) +C = calinski_index(data,som.labels_,som.neurons_) +print 'calinski index %0.2f | %0.2f%%' % (C,100*(C/(1+C))) +print + +print '*'*80 + +print "KMeans " +t0 = time() +#km = KMeans(init='k-means++', k=n_digits, n_init=10).fit(data) +km = KMeans(init='k-means++', k=grid_width**2, n_init=10) +km.fit(data) +print "done in %0.3fs" % (time() - t0) +print + +display(km.labels_,digits,n_digits) +C = calinski_index(data,km.labels_,km.cluster_centers_) +print 'calinski index %0.2f | %0.2f%%' % (C,100*(C/(1+C))) + + + + + + + + diff --git a/scikits/learn/cluster/som_.py b/scikits/learn/cluster/som_.py new file mode 100644 index 0000000000000..90a43dc10d70b --- /dev/null +++ b/scikits/learn/cluster/som_.py @@ -0,0 +1,171 @@ +""" + Self-organizing map +""" +# Authors: Sebastien Campion +# License: BSD +from __future__ import division +import warnings +from ..base import BaseEstimator +import numpy as np +import math +import Image + +################################################################################ +def save_rgb(neurons,ofile,thumb_size=32): + '''Function to save map using 3 dim, as RGB''' + assert neurons.shape[-1] == 3 + tsize = (thumb_size,thumb_size) + size = tuple([v * thumb_size for v in neurons.shape[0:2] ]) + im = Image.new('RGB',size) + for x in range(neurons.shape[0]): + for y in range(neurons.shape[1]): + color = tuple([int(c) for c in neurons[x][y]]) + t = Image.new('RGB',tsize,color) + im.paste(t,(x*thumb_size,y*thumb_size)) + im.save(ofile) + +################################################################################ +class Kohonen2DMap(): + def __init__(self,size,dim,neurons=None): + #self.log = logging.getLogger('kohonen.map') + self.dim = dim + self.size = size + self.neurons = neurons + if neurons == None : + self.neurons = np.random.rand(size,size,dim)#/10 + self.iteration = 0 + + def bmu(self,data): + assert data.shape[0] == self.neurons.shape[-1] + data = np.resize(data,self.neurons.shape) + dists = np.sum((data-self.neurons)** 2,axis=-1) + min = dists.argmin() + #w = np.unravel_index(min,dists.shape) + return divmod(min,self.size) + + def learn(self,datas,nbiter,learning_rate=1,callback=None): + '''Given an sample of datas, we randomly choose one of them for each + iteration. + A good ratio, nb datas = 2 or 3 x nbiter''' + self.iteration = 0 + indices = np.random.random_integers(0,len(datas)-1,nbiter) + for i in indices: + l = nbiter/self.size + lr = learning_rate * math.exp(-self.iteration/l) + self._learn_vector(datas[i], nbiter, lr) + self.iteration += 1 + if callback != None: + callback(self,self.iteration) + + def _learn_vector(self, data, nbiter, lr): + w = self.bmu(data) + radius = self.radius_of_the_neighbordhood(nbiter) + for n in self.neurons_in_radius(w,radius): + nx,ny = n + wt = self.neurons[nx][ny] + dr = self.dist(w,n,radius) + self.neurons[nx][ny] = wt + dr*lr*(data-wt) + #self.log.debug(('nod',n,'l_rate',lr,'d_radius',dr)) + #self.log.debug(('bmu',w,'iter',self.iteration,'radius',radius)) + + def dist(self,w,n,radius): + wx,wy = w + nx,ny = n + d = (wx-nx)**2 + (wy-ny)**2 + #offcial paper implementation : return math.exp(-d/2*radius**2) + return math.exp(-d/radius) + + def neurons_in_radius(self,w,radius): + wi,wj = w + r = [] + for i in range(self.neurons.shape[0]): + for j in range(self.neurons.shape[1]): + if math.sqrt((i-wi)**2 + (j-wj)**2) < radius: + r.append((i,j)) + return r + + def radius_of_the_neighbordhood(self,nbiter): + l = nbiter/self.size + return self.size * math.exp(-self.iteration/l) + + +################################################################################ + +class SOM(BaseEstimator): + """ Self-Organizing Map + + Parameters + ---------- + + data : ndarray + A M by N array of M observations in N dimensions or a length + M array of M one-dimensional observations. + + w : int + Width and height of the square mape as well as the number of + centroids to generate. If init initialization string is + 'matrix', or if a ndarray is given instead, it is + interpreted as initial cluster to use instead. + + n_iter : int + Number of iterations of the som algrithm to run + + learning_rate : float + Learning rate + + init : {'random', 'matrix'} + Method for initialization, defaults to 'random': + + 'random': randomly points choosed + + 'matrix': interpret the w parameter as a w by M array + of initial centroids. + + Methods + ------- + + fit(X): + Compute SOM + + Attributes + ---------- + + neurons_: array, [(x,y), n_features] + Coordinates of neurons and value + + labels_: + Labels of each point + + Notes + ------ + + """ + + def __init__(self, w=16, init='random', n_init=64,learning_rate=1): + self.w = w + self.init = init + self.n_init = n_init + self.learning_rate = learning_rate + self.callback = None + + def fit(self, X, **params): + """ Compute som""" + X = np.asanyarray(X) + self._set_params(**params) + + neurons = None + dim = X.shape[-1] + + if self.init == 'matrix': + assert len(self.w.shape) == 3 + neurons = self.w + self.w = neurons.shape[0] + + + km = Kohonen2DMap(self.w,dim,neurons) + km.learn(X,self.n_init,self.learning_rate,callback=self.callback) + self.neurons_ = km.neurons + self.labels_ = [km.bmu(x) for x in X] + + return self + diff --git a/scikits/learn/cluster/tests/test_som.py b/scikits/learn/cluster/tests/test_som.py new file mode 100644 index 0000000000000..2c697f170a727 --- /dev/null +++ b/scikits/learn/cluster/tests/test_som.py @@ -0,0 +1,43 @@ +""" +Testing for SOM + +""" +import Image +import numpy as np +from numpy.testing import assert_equal + +from ..som_ import SOM,save_rgb +from .common import generate_clustered_data + +n_clusters = 4 +X = generate_clustered_data(n_clusters=n_clusters, std=.1) + +def test_som(): + np.random.seed(1) + som = SOM().fit(X,w=4,n_init=32,learning_rate=0.4) + labels = som.labels_ + + assert_equal(np.unique(labels).shape[0],4) + assert_equal(np.unique(labels[:20]).shape[0], 1) + assert_equal(np.unique(labels[20:40]).shape[0], 1) + assert_equal(np.unique(labels[40:60]).shape[0], 1) + assert_equal(np.unique(labels[60:]).shape[0], 1) + +def test_color_map(): + train = np.array([[0,0,0], #black + [255,255,255], #white + [255,0,0], #red + [0,255,0], #green + [0,0,255], #blue + [255,255,0], #yellow + [0,255,255], #cyan + [255,0,255] #magenta + ]) + w = np.random.rand(16,16,3)*255 + som = SOM(w,n_init=1024,init='matrix',learning_rate=1) + save_rgb(w,'init.jpg') + som = SOM(w,n_init=1024,init='matrix',learning_rate=1) + som.fit(train) + save_rgb(som.neurons_,'color_map.jpg') + + From 5e79af60f6b7d57cf648a10bc7d2fd46e068652d Mon Sep 17 00:00:00 2001 From: scampion Date: Wed, 12 Jan 2011 17:58:10 +0100 Subject: [PATCH 2/7] merge SOM and Kohonen class, improve variable name, add and example using color and calinski function to measure clustering quality --- examples/cluster/plot_som_colormap.py | 53 +++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 examples/cluster/plot_som_colormap.py diff --git a/examples/cluster/plot_som_colormap.py b/examples/cluster/plot_som_colormap.py new file mode 100644 index 0000000000000..ce7090039c2c3 --- /dev/null +++ b/examples/cluster/plot_som_colormap.py @@ -0,0 +1,53 @@ +""" +=========================================================== +A demo of SelfOrganisingMap with colored neurons +=========================================================== + +""" +print __doc__ + +from time import time +import numpy as np +from scikits.learn.cluster import SelfOrganisingMap +import pylab as pl +import Image + +def map2image(neurons,thumb_size=32): + '''Function to save som using 3 dim, as RGB''' + assert neurons.shape[-1] == 3 + tsize = (thumb_size,thumb_size) + size = tuple([v * thumb_size for v in neurons.shape[0:2] ]) + im = Image.new('RGB',size) + for x in range(neurons.shape[0]): + for y in range(neurons.shape[1]): + color = tuple([int(c) for c in neurons[x][y]]) + t = Image.new('RGB',tsize,color) + im.paste(t,(x*thumb_size,y*thumb_size)) + return im + +train = np.array([[0,0,0], #black + [255,255,255], #white + [255,0,0], #red + [0,255,0], #green + [0,0,255], #blue + [255,255,0], #yellow + [0,255,255], #cyan + [255,0,255] #magenta + ]) + + + +init = np.random.rand(16,16,3)*255 + +pl.subplot(1, 2, 1) +pl.imshow(map2image(init)) +pl.title('Initial map') + +som = SelfOrganisingMap(init,n_iterations=1024, + init='matrix',learning_rate=1) +som.fit(train) + +pl.subplot(1, 2, 2) +pl.imshow(map2image(som.neurons_)) +pl.title('Organized Map') +pl.show() From d5dc95c75f9c6c180753cae6ef865e3fce107cf4 Mon Sep 17 00:00:00 2001 From: scampion Date: Thu, 13 Jan 2011 09:47:44 +0100 Subject: [PATCH 3/7] remove PIL module, s/SelfOrganising/SelfOrganizing/ merge SOM and Kohonen class, improve variable name, add and example using color and calinski function to measure clustering quality --- examples/cluster/plot_som_colormap.py | 37 ++--- examples/cluster/som_digits.py | 20 +-- scikits/learn/cluster/__init__.py | 12 ++ scikits/learn/cluster/som_.py | 189 ++++++++++-------------- scikits/learn/cluster/tests/test_som.py | 25 +--- 5 files changed, 117 insertions(+), 166 deletions(-) diff --git a/examples/cluster/plot_som_colormap.py b/examples/cluster/plot_som_colormap.py index ce7090039c2c3..2397afe726b60 100644 --- a/examples/cluster/plot_som_colormap.py +++ b/examples/cluster/plot_som_colormap.py @@ -5,25 +5,18 @@ """ print __doc__ - -from time import time -import numpy as np -from scikits.learn.cluster import SelfOrganisingMap import pylab as pl -import Image +from matplotlib.colors import ListedColormap, NoNorm, rgb2hex +import numpy as np +from scikits.learn.cluster import SelfOrganizingMap -def map2image(neurons,thumb_size=32): - '''Function to save som using 3 dim, as RGB''' +def plot(neurons): assert neurons.shape[-1] == 3 - tsize = (thumb_size,thumb_size) - size = tuple([v * thumb_size for v in neurons.shape[0:2] ]) - im = Image.new('RGB',size) - for x in range(neurons.shape[0]): - for y in range(neurons.shape[1]): - color = tuple([int(c) for c in neurons[x][y]]) - t = Image.new('RGB',tsize,color) - im.paste(t,(x*thumb_size,y*thumb_size)) - return im + h,w,d = neurons.shape + hexmap = np.apply_along_axis(rgb2hex,1, + neurons.reshape(-1,3)/256) + index = np.arange(h*w).reshape(h,w) + pl.pcolor(index,cmap=ListedColormap(hexmap),norm=NoNorm()) train = np.array([[0,0,0], #black [255,255,255], #white @@ -39,15 +32,17 @@ def map2image(neurons,thumb_size=32): init = np.random.rand(16,16,3)*255 -pl.subplot(1, 2, 1) -pl.imshow(map2image(init)) +pl.subplot(1, 2, 1,aspect='equal') +plot(init) pl.title('Initial map') -som = SelfOrganisingMap(init,n_iterations=1024, +som = SelfOrganizingMap(init,n_iterations=1024, init='matrix',learning_rate=1) som.fit(train) -pl.subplot(1, 2, 2) -pl.imshow(map2image(som.neurons_)) +pl.subplot(1, 2, 2,aspect='equal') +plot(som.neurons_) pl.title('Organized Map') +F = pl.gcf() +F.set_size_inches( (40,20) ) pl.show() diff --git a/examples/cluster/som_digits.py b/examples/cluster/som_digits.py index 54f162b76a3c1..ff9a9c544ce56 100644 --- a/examples/cluster/som_digits.py +++ b/examples/cluster/som_digits.py @@ -1,8 +1,7 @@ """ -=========================================================== -A demo of Self-Organising Map and KMeans on the handwritten -digits data -=========================================================== +======================================================================= +A demo of Self-Organising Map and KMeans on the handwritten digits data +======================================================================= """ from __future__ import division print __doc__ @@ -11,11 +10,10 @@ import numpy as np from scikits.learn.cluster import KMeans -from scikits.learn.cluster import SOM +from scikits.learn.cluster import SelfOrganizingMap from scikits.learn.cluster import calinski_index from scikits.learn.datasets import load_digits -from scikits.learn.pca import PCA from scikits.learn.preprocessing import scale def display(labels,digits,nbclusters): @@ -43,7 +41,8 @@ def display(labels,digits,nbclusters): print "Self-Organizing Map " t0 = time() grid_width = 4 -som = SOM(w=grid_width,n_init=n_samples*5,learning_rate=1) +som = SelfOrganizingMap(size=grid_width,n_iterations=n_samples*5, + learning_rate=1) som.fit(data) print "done in %0.3fs" % (time() - t0) print @@ -53,8 +52,6 @@ def display(labels,digits,nbclusters): print 'calinski index %0.2f | %0.2f%%' % (C,100*(C/(1+C))) print -print '*'*80 - print "KMeans " t0 = time() #km = KMeans(init='k-means++', k=n_digits, n_init=10).fit(data) @@ -69,8 +66,3 @@ def display(labels,digits,nbclusters): - - - - - diff --git a/scikits/learn/cluster/__init__.py b/scikits/learn/cluster/__init__.py index cedab2e2f8e0d..1f2b9f90a0cc8 100644 --- a/scikits/learn/cluster/__init__.py +++ b/scikits/learn/cluster/__init__.py @@ -6,4 +6,16 @@ from .mean_shift_ import mean_shift, MeanShift, estimate_bandwidth from .affinity_propagation_ import affinity_propagation, AffinityPropagation from .k_means_ import k_means, KMeans +from .som_ import SelfOrganizingMap +import numpy as np + +def calinski_index(X,labels,centroids): + mean = np.mean(X,axis=0) + B = np.sum([ (c - mean)**2 for c in centroids]) + W = np.sum([ (x-centroids[labels[i]])**2 + for i,x in enumerate(X)]) + c = len(centroids) + n = len(X) + return (B /(c-1))/(W/ (n-c)) + diff --git a/scikits/learn/cluster/som_.py b/scikits/learn/cluster/som_.py index 90a43dc10d70b..b5fd08a3ff6de 100644 --- a/scikits/learn/cluster/som_.py +++ b/scikits/learn/cluster/som_.py @@ -1,113 +1,34 @@ """ Self-organizing map + + Reference : (to check) + Kohonen, T.; , "The self-organizing map," + Proceedings of the IEEE , vol.78, no.9, pp.1464-1480, Sep 1990 """ # Authors: Sebastien Campion # License: BSD from __future__ import division -import warnings from ..base import BaseEstimator import numpy as np import math -import Image - -################################################################################ -def save_rgb(neurons,ofile,thumb_size=32): - '''Function to save map using 3 dim, as RGB''' - assert neurons.shape[-1] == 3 - tsize = (thumb_size,thumb_size) - size = tuple([v * thumb_size for v in neurons.shape[0:2] ]) - im = Image.new('RGB',size) - for x in range(neurons.shape[0]): - for y in range(neurons.shape[1]): - color = tuple([int(c) for c in neurons[x][y]]) - t = Image.new('RGB',tsize,color) - im.paste(t,(x*thumb_size,y*thumb_size)) - im.save(ofile) - -################################################################################ -class Kohonen2DMap(): - def __init__(self,size,dim,neurons=None): - #self.log = logging.getLogger('kohonen.map') - self.dim = dim - self.size = size - self.neurons = neurons - if neurons == None : - self.neurons = np.random.rand(size,size,dim)#/10 - self.iteration = 0 - - def bmu(self,data): - assert data.shape[0] == self.neurons.shape[-1] - data = np.resize(data,self.neurons.shape) - dists = np.sum((data-self.neurons)** 2,axis=-1) - min = dists.argmin() - #w = np.unravel_index(min,dists.shape) - return divmod(min,self.size) - - def learn(self,datas,nbiter,learning_rate=1,callback=None): - '''Given an sample of datas, we randomly choose one of them for each - iteration. - A good ratio, nb datas = 2 or 3 x nbiter''' - self.iteration = 0 - indices = np.random.random_integers(0,len(datas)-1,nbiter) - for i in indices: - l = nbiter/self.size - lr = learning_rate * math.exp(-self.iteration/l) - self._learn_vector(datas[i], nbiter, lr) - self.iteration += 1 - if callback != None: - callback(self,self.iteration) - def _learn_vector(self, data, nbiter, lr): - w = self.bmu(data) - radius = self.radius_of_the_neighbordhood(nbiter) - for n in self.neurons_in_radius(w,radius): - nx,ny = n - wt = self.neurons[nx][ny] - dr = self.dist(w,n,radius) - self.neurons[nx][ny] = wt + dr*lr*(data-wt) - #self.log.debug(('nod',n,'l_rate',lr,'d_radius',dr)) - #self.log.debug(('bmu',w,'iter',self.iteration,'radius',radius)) - - def dist(self,w,n,radius): - wx,wy = w - nx,ny = n - d = (wx-nx)**2 + (wy-ny)**2 - #offcial paper implementation : return math.exp(-d/2*radius**2) - return math.exp(-d/radius) - - def neurons_in_radius(self,w,radius): - wi,wj = w - r = [] - for i in range(self.neurons.shape[0]): - for j in range(self.neurons.shape[1]): - if math.sqrt((i-wi)**2 + (j-wj)**2) < radius: - r.append((i,j)) - return r - - def radius_of_the_neighbordhood(self,nbiter): - l = nbiter/self.size - return self.size * math.exp(-self.iteration/l) - - -################################################################################ - -class SOM(BaseEstimator): +class SelfOrganizingMap(BaseEstimator): """ Self-Organizing Map Parameters ---------- - data : ndarray + X : ndarray A M by N array of M observations in N dimensions or a length M array of M one-dimensional observations. - w : int - Width and height of the square mape as well as the number of + size : int + Width and height of the square map as well as the number of centroids to generate. If init initialization string is 'matrix', or if a ndarray is given instead, it is interpreted as initial cluster to use instead. - n_iter : int + n_iterations : int Number of iterations of the som algrithm to run learning_rate : float @@ -140,32 +61,82 @@ class SOM(BaseEstimator): ------ """ - - def __init__(self, w=16, init='random', n_init=64,learning_rate=1): - self.w = w + def __init__(self,size=16,init='random',n_iterations=64,learning_rate=1, + callback=None ): + self.size = size self.init = init - self.n_init = n_init + self.n_iterations = n_iterations self.learning_rate = learning_rate - self.callback = None + self.callback = callback def fit(self, X, **params): - """ Compute som""" + """Given an sample of X, we randomly choose one of them for each + iteration. + A good ratio, nb X = 2 or 3 x nbiter""" X = np.asanyarray(X) self._set_params(**params) - - neurons = None - dim = X.shape[-1] - - if self.init == 'matrix': - assert len(self.w.shape) == 3 - neurons = self.w - self.w = neurons.shape[0] - - - km = Kohonen2DMap(self.w,dim,neurons) - km.learn(X,self.n_init,self.learning_rate,callback=self.callback) - self.neurons_ = km.neurons - self.labels_ = [km.bmu(x) for x in X] - + self.dim = X.shape[-1] + self.neurons_ = None + + #init neurons_ + if self.init == 'random': + self.neurons_ = np.random.rand(self.size,self.size,self.dim) + elif self.init == 'matrix': + assert len(self.size.shape) == 3 + self.neurons_ = self.size + self.size = self.neurons_.shape[0] + + #iteration loop + self.iteration = 0 + indices = np.random.random_integers(0,len(X)-1,self.n_iterations) + for i in indices: + l = self.n_iterations/self.size + lr = self.learning_rate * math.exp(-self.iteration/l) + self._learn_vector(X[i],lr) + self.iteration += 1 + if self.callback != None: + self.callback(self,self.iteration) + + #assign labels + self.labels_ = [self.bmu(x) for x in X] return self + + def _learn_vector(self, vector, lr): + winner = self.bmu(vector) + radius = self.radius_of_the_neighbordhood() + for n in self.neurons_in_radius(winner,radius): + nx,ny = n + wt = self.neurons_[nx][ny] + dr = self.dist(winner,n,radius) + self.neurons_[nx][ny] = wt + dr*lr*(vector - wt) + def bmu(self,vector): + """ + best matching unit + """ + assert vector.shape[0] == self.neurons_.shape[-1] + vector = np.resize(vector,self.neurons_.shape) + dists = np.sum((vector-self.neurons_)** 2,axis=-1) + min = dists.argmin() + #w = np.unravel_index(min,dists.shape) + return divmod(min,self.size) + + def dist(self,w,n,radius): + wx,wy = w + nx,ny = n + d = (wx-nx)**2 + (wy-ny)**2 + #offcial paper implementation : return math.exp(-d/2*radius**2) + return math.exp(-d/radius) + + def neurons_in_radius(self,winner,radius): + wi,wj = winner + r = [] + for i in range(self.neurons_.shape[0]): + for j in range(self.neurons_.shape[1]): + if math.sqrt((i-wi)**2 + (j-wj)**2) < radius: + r.append((i,j)) + return r + + def radius_of_the_neighbordhood(self): + l = self.n_iterations/self.size + return self.size * math.exp(-self.iteration/l) diff --git a/scikits/learn/cluster/tests/test_som.py b/scikits/learn/cluster/tests/test_som.py index 2c697f170a727..e99bd05dee8d2 100644 --- a/scikits/learn/cluster/tests/test_som.py +++ b/scikits/learn/cluster/tests/test_som.py @@ -2,11 +2,10 @@ Testing for SOM """ -import Image import numpy as np from numpy.testing import assert_equal -from ..som_ import SOM,save_rgb +from ..som_ import SelfOrganizingMap from .common import generate_clustered_data n_clusters = 4 @@ -14,7 +13,8 @@ def test_som(): np.random.seed(1) - som = SOM().fit(X,w=4,n_init=32,learning_rate=0.4) + som = SelfOrganizingMap(size=2,n_iterations=10,learning_rate=1) + som.fit(X) labels = som.labels_ assert_equal(np.unique(labels).shape[0],4) @@ -22,22 +22,3 @@ def test_som(): assert_equal(np.unique(labels[20:40]).shape[0], 1) assert_equal(np.unique(labels[40:60]).shape[0], 1) assert_equal(np.unique(labels[60:]).shape[0], 1) - -def test_color_map(): - train = np.array([[0,0,0], #black - [255,255,255], #white - [255,0,0], #red - [0,255,0], #green - [0,0,255], #blue - [255,255,0], #yellow - [0,255,255], #cyan - [255,0,255] #magenta - ]) - w = np.random.rand(16,16,3)*255 - som = SOM(w,n_init=1024,init='matrix',learning_rate=1) - save_rgb(w,'init.jpg') - som = SOM(w,n_init=1024,init='matrix',learning_rate=1) - som.fit(train) - save_rgb(som.neurons_,'color_map.jpg') - - From 4e313371d1b3031148c8bdc0c123b75193a86e09 Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Sat, 15 Jan 2011 22:34:54 -0500 Subject: [PATCH 4/7] review SOM code (PEP8, cosmits and inline comments) --- examples/cluster/plot_som_colormap.py | 55 ++++++----- examples/cluster/som_digits.py | 34 +++---- scikits/learn/cluster/som_.py | 122 ++++++++++++------------ scikits/learn/cluster/tests/test_som.py | 7 +- 4 files changed, 110 insertions(+), 108 deletions(-) diff --git a/examples/cluster/plot_som_colormap.py b/examples/cluster/plot_som_colormap.py index 2397afe726b60..434a508f47c85 100644 --- a/examples/cluster/plot_som_colormap.py +++ b/examples/cluster/plot_som_colormap.py @@ -1,8 +1,10 @@ """ =========================================================== -A demo of SelfOrganisingMap with colored neurons +A demo of SelfOrganisingMap with colored neurons =========================================================== +XXX : add description of example. + """ print __doc__ import pylab as pl @@ -10,39 +12,36 @@ import numpy as np from scikits.learn.cluster import SelfOrganizingMap -def plot(neurons): - assert neurons.shape[-1] == 3 - h,w,d = neurons.shape - hexmap = np.apply_along_axis(rgb2hex,1, - neurons.reshape(-1,3)/256) - index = np.arange(h*w).reshape(h,w) - pl.pcolor(index,cmap=ListedColormap(hexmap),norm=NoNorm()) - -train = np.array([[0,0,0], #black - [255,255,255], #white - [255,0,0], #red - [0,255,0], #green - [0,0,255], #blue - [255,255,0], #yellow - [0,255,255], #cyan - [255,0,255] #magenta - ]) - - -init = np.random.rand(16,16,3)*255 - -pl.subplot(1, 2, 1,aspect='equal') +def plot(neurons): + assert neurons.shape[-1] == 3 + h, w, d = neurons.shape + hexmap = np.apply_along_axis(rgb2hex, 1, neurons.reshape(-1, 3) / 256) + index = np.arange(h * w).reshape(h, w) + pl.pcolor(index, cmap=ListedColormap(hexmap), norm=NoNorm()) + +train = np.array([[0, 0, 0], # black + [255, 255, 255], # white + [255, 0, 0], # red + [0, 255, 0], # green + [0, 0, 255], # blue + [255, 255, 0], # yellow + [0, 255, 255], # cyan + [255, 0, 255]]) # magenta + +init = np.random.rand(16, 16, 3) * 255 + +pl.subplot(1, 2, 1, aspect='equal') plot(init) pl.title('Initial map') -som = SelfOrganizingMap(init,n_iterations=1024, - init='matrix',learning_rate=1) +som = SelfOrganizingMap(init, n_iterations=1024, + init='matrix', learning_rate=1) som.fit(train) -pl.subplot(1, 2, 2,aspect='equal') +pl.subplot(1, 2, 2, aspect='equal') plot(som.neurons_) -pl.title('Organized Map') +pl.title('Organized Map') F = pl.gcf() -F.set_size_inches( (40,20) ) +F.set_size_inches((40, 20)) pl.show() diff --git a/examples/cluster/som_digits.py b/examples/cluster/som_digits.py index ff9a9c544ce56..c2e0a37efd199 100644 --- a/examples/cluster/som_digits.py +++ b/examples/cluster/som_digits.py @@ -2,6 +2,9 @@ ======================================================================= A demo of Self-Organising Map and KMeans on the handwritten digits data ======================================================================= + +XXX : Should add text to describe what the example and what to expect +from the output. Would it be possible to plot something? """ from __future__ import division print __doc__ @@ -16,14 +19,16 @@ from scikits.learn.datasets import load_digits from scikits.learn.preprocessing import scale -def display(labels,digits,nbclusters): - r = {0:[],1:[],2:[],3:[],4:[],5:[],6:[],7:[],8:[],9:[]} - for i,v in enumerate(labels): + +def display(labels, digits, n_clusters): + # XXX : n_clusters unused + r = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []} + for i, v in enumerate(labels): r[digits.target[i]].append(v) - for k,v in r.items(): + for k, v in r.items(): s = set(v) - print 'target %i | nb cluster %i |' % (k,len(s)),s + print 'target %i | nb cluster %i |' % (k, len(s)), s np.random.seed(42) @@ -41,15 +46,15 @@ def display(labels,digits,nbclusters): print "Self-Organizing Map " t0 = time() grid_width = 4 -som = SelfOrganizingMap(size=grid_width,n_iterations=n_samples*5, +som = SelfOrganizingMap(size=grid_width, n_iterations=n_samples*5, learning_rate=1) som.fit(data) print "done in %0.3fs" % (time() - t0) print -display(som.labels_,digits,grid_width**2) -C = calinski_index(data,som.labels_,som.neurons_) -print 'calinski index %0.2f | %0.2f%%' % (C,100*(C/(1+C))) +display(som.labels_, digits, grid_width**2) +C = calinski_index(data, som.labels_, som.neurons_) +print 'calinski index %0.2f | %0.2f%%' % (C, 100 * (C / (1 + C))) print print "KMeans " @@ -58,11 +63,8 @@ def display(labels,digits,nbclusters): km = KMeans(init='k-means++', k=grid_width**2, n_init=10) km.fit(data) print "done in %0.3fs" % (time() - t0) -print - -display(km.labels_,digits,n_digits) -C = calinski_index(data,km.labels_,km.cluster_centers_) -print 'calinski index %0.2f | %0.2f%%' % (C,100*(C/(1+C))) - - +print +display(km.labels_, digits, n_digits) +C = calinski_index(data, km.labels_, km.cluster_centers_) +print 'calinski index %0.2f | %0.2f%%' % (C, 100 * (C / (1 + C))) diff --git a/scikits/learn/cluster/som_.py b/scikits/learn/cluster/som_.py index b5fd08a3ff6de..18d6d3bccb08d 100644 --- a/scikits/learn/cluster/som_.py +++ b/scikits/learn/cluster/som_.py @@ -1,26 +1,26 @@ """ - Self-organizing map - + Self-organizing map + Reference : (to check) Kohonen, T.; , "The self-organizing map," - Proceedings of the IEEE , vol.78, no.9, pp.1464-1480, Sep 1990 + Proceedings of the IEEE , vol.78, no.9, pp.1464-1480, Sep 1990 """ # Authors: Sebastien Campion # License: BSD from __future__ import division -from ..base import BaseEstimator +import math import numpy as np -import math +from ..base import BaseEstimator + class SelfOrganizingMap(BaseEstimator): - """ Self-Organizing Map + """Self-Organizing Map Parameters ---------- - X : ndarray A M by N array of M observations in N dimensions or a length - M array of M one-dimensional observations. + M array of N one-dimensional observations. size : int Width and height of the square map as well as the number of @@ -44,13 +44,11 @@ class SelfOrganizingMap(BaseEstimator): Methods ------- - fit(X): Compute SOM Attributes ---------- - neurons_: array, [(x,y), n_features] Coordinates of neurons and value @@ -61,8 +59,9 @@ class SelfOrganizingMap(BaseEstimator): ------ """ - def __init__(self,size=16,init='random',n_iterations=64,learning_rate=1, - callback=None ): + + def __init__(self, size=16, init='random', n_iterations=64, + learning_rate=1, callback=None): self.size = size self.init = init self.n_iterations = n_iterations @@ -70,73 +69,74 @@ def __init__(self,size=16,init='random',n_iterations=64,learning_rate=1, self.callback = callback def fit(self, X, **params): - """Given an sample of X, we randomly choose one of them for each + """Given an sample of X, we randomly choose one of them for each iteration. A good ratio, nb X = 2 or 3 x nbiter""" X = np.asanyarray(X) self._set_params(**params) - self.dim = X.shape[-1] - self.neurons_ = None - - #init neurons_ - if self.init == 'random': - self.neurons_ = np.random.rand(self.size,self.size,self.dim) - elif self.init == 'matrix': - assert len(self.size.shape) == 3 - self.neurons_ = self.size + self.dim = X.shape[-1] # XXX : could we avoid storing dim in self? + self.neurons_ = None + + # init neurons_ + if self.init == 'random': + self.neurons_ = np.random.rand(self.size, self.size, self.dim) + elif self.init == 'matrix': + # XXX : untested + assert len(self.size.shape) == 3 + self.neurons_ = self.size self.size = self.neurons_.shape[0] - - #iteration loop - self.iteration = 0 - indices = np.random.random_integers(0,len(X)-1,self.n_iterations) - for i in indices: - l = self.n_iterations/self.size - lr = self.learning_rate * math.exp(-self.iteration/l) - self._learn_vector(X[i],lr) - self.iteration += 1 + + # iteration loop + self.iteration = 0 # XXX : could we avoid storing iteration in self? + indices = np.random.random_integers(0, len(X)-1, self.n_iterations) + for i in indices: + l = self.n_iterations / self.size + lr = self.learning_rate * math.exp(-self.iteration / l) + self._learn_vector(X[i], lr) + self.iteration += 1 if self.callback != None: - self.callback(self,self.iteration) - - #assign labels + self.callback(self, self.iteration) + + # assign labels self.labels_ = [self.bmu(x) for x in X] return self def _learn_vector(self, vector, lr): winner = self.bmu(vector) radius = self.radius_of_the_neighbordhood() - for n in self.neurons_in_radius(winner,radius): - nx,ny = n + for n in self.neurons_in_radius(winner, radius): + nx, ny = n wt = self.neurons_[nx][ny] - dr = self.dist(winner,n,radius) - self.neurons_[nx][ny] = wt + dr*lr*(vector - wt) - - def bmu(self,vector): - """ - best matching unit + dr = self.dist(winner, n, radius) + self.neurons_[nx][ny] = wt + dr * lr * (vector - wt) + + def bmu(self, vector): + """Best matching unit """ - assert vector.shape[0] == self.neurons_.shape[-1] - vector = np.resize(vector,self.neurons_.shape) - dists = np.sum((vector-self.neurons_)** 2,axis=-1) + assert vector.shape[0] == self.neurons_.shape[-1] + vector = np.resize(vector, self.neurons_.shape) + dists = np.sum((vector - self.neurons_)**2, axis=-1) min = dists.argmin() #w = np.unravel_index(min,dists.shape) - return divmod(min,self.size) - - def dist(self,w,n,radius): - wx,wy = w - nx,ny = n - d = (wx-nx)**2 + (wy-ny)**2 - #offcial paper implementation : return math.exp(-d/2*radius**2) - return math.exp(-d/radius) - - def neurons_in_radius(self,winner,radius): - wi,wj = winner + return divmod(min, self.size) + + def dist(self, w, n, radius): + wx, wy = w + nx, ny = n + d = (wx - nx)**2 + (wy - ny)**2 + # offcial paper implementation : return math.exp(-d/2*radius**2) + return math.exp(-d / radius) + + def neurons_in_radius(self, winner, radius): + wi, wj = winner r = [] + # XXX : should be vertorized with numpy for i in range(self.neurons_.shape[0]): for j in range(self.neurons_.shape[1]): - if math.sqrt((i-wi)**2 + (j-wj)**2) < radius: - r.append((i,j)) + if math.sqrt((i - wi)**2 + (j - wj)**2) < radius: + r.append((i, j)) return r - + def radius_of_the_neighbordhood(self): - l = self.n_iterations/self.size - return self.size * math.exp(-self.iteration/l) + l = self.n_iterations / self.size + return self.size * math.exp(-self.iteration / l) diff --git a/scikits/learn/cluster/tests/test_som.py b/scikits/learn/cluster/tests/test_som.py index e99bd05dee8d2..cc40acfb05896 100644 --- a/scikits/learn/cluster/tests/test_som.py +++ b/scikits/learn/cluster/tests/test_som.py @@ -1,5 +1,5 @@ """ -Testing for SOM +Testing for SOM """ import numpy as np @@ -11,13 +11,14 @@ n_clusters = 4 X = generate_clustered_data(n_clusters=n_clusters, std=.1) + def test_som(): np.random.seed(1) - som = SelfOrganizingMap(size=2,n_iterations=10,learning_rate=1) + som = SelfOrganizingMap(size=2, n_iterations=10, learning_rate=1) som.fit(X) labels = som.labels_ - assert_equal(np.unique(labels).shape[0],4) + assert_equal(np.unique(labels).shape[0], 4) assert_equal(np.unique(labels[:20]).shape[0], 1) assert_equal(np.unique(labels[20:40]).shape[0], 1) assert_equal(np.unique(labels[40:60]).shape[0], 1) From 7d9c7e979b8d99cf3bfe95d65a85a53f666b100f Mon Sep 17 00:00:00 2001 From: Sebastien Campion Date: Thu, 20 Jan 2011 12:57:07 +0100 Subject: [PATCH 5/7] numpy vectorized function + some changes on variable scope --- scikits/learn/cluster/som_.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/scikits/learn/cluster/som_.py b/scikits/learn/cluster/som_.py index 18d6d3bccb08d..adb4dab8003db 100644 --- a/scikits/learn/cluster/som_.py +++ b/scikits/learn/cluster/som_.py @@ -74,12 +74,12 @@ def fit(self, X, **params): A good ratio, nb X = 2 or 3 x nbiter""" X = np.asanyarray(X) self._set_params(**params) - self.dim = X.shape[-1] # XXX : could we avoid storing dim in self? self.neurons_ = None + dim = X.shape[-1] # init neurons_ if self.init == 'random': - self.neurons_ = np.random.rand(self.size, self.size, self.dim) + self.neurons_ = np.random.rand(self.size, self.size, dim) elif self.init == 'matrix': # XXX : untested assert len(self.size.shape) == 3 @@ -87,23 +87,23 @@ def fit(self, X, **params): self.size = self.neurons_.shape[0] # iteration loop - self.iteration = 0 # XXX : could we avoid storing iteration in self? + iteration = 0 indices = np.random.random_integers(0, len(X)-1, self.n_iterations) for i in indices: l = self.n_iterations / self.size - lr = self.learning_rate * math.exp(-self.iteration / l) - self._learn_vector(X[i], lr) - self.iteration += 1 + lr = self.learning_rate * math.exp(-iteration / l) + self._learn_vector(X[i], lr, iteration) + iteration += 1 if self.callback != None: - self.callback(self, self.iteration) + self.callback(self, iteration) # assign labels self.labels_ = [self.bmu(x) for x in X] return self - def _learn_vector(self, vector, lr): + def _learn_vector(self, vector, lr, iteration): winner = self.bmu(vector) - radius = self.radius_of_the_neighbordhood() + radius = self.radius_of_the_neighbordhood(iteration) for n in self.neurons_in_radius(winner, radius): nx, ny = n wt = self.neurons_[nx][ny] @@ -129,14 +129,17 @@ def dist(self, w, n, radius): def neurons_in_radius(self, winner, radius): wi, wj = winner - r = [] - # XXX : should be vertorized with numpy + x = y = np.arange(self.size) + xx, yy = np.meshgrid(x, y) + v = np.sqrt((xx - wi)**2 + (yy - wj)**2) < radius + return np.c_[np.nonzero(v)] + for i in range(self.neurons_.shape[0]): for j in range(self.neurons_.shape[1]): if math.sqrt((i - wi)**2 + (j - wj)**2) < radius: r.append((i, j)) return r - def radius_of_the_neighbordhood(self): + def radius_of_the_neighbordhood(self, iteration): l = self.n_iterations / self.size - return self.size * math.exp(-self.iteration / l) + return self.size * math.exp(-iteration / l) From e4453b198a7d0f0c8b5531f1312cc2425079d8ff Mon Sep 17 00:00:00 2001 From: scampion Date: Fri, 21 Jan 2011 09:23:16 +0100 Subject: [PATCH 6/7] small fix, clean unused part of code --- examples/cluster/som_digits.py | 1 - scikits/learn/cluster/som_.py | 6 ------ 2 files changed, 7 deletions(-) diff --git a/examples/cluster/som_digits.py b/examples/cluster/som_digits.py index c2e0a37efd199..f9c82a5b9f2d6 100644 --- a/examples/cluster/som_digits.py +++ b/examples/cluster/som_digits.py @@ -59,7 +59,6 @@ def display(labels, digits, n_clusters): print "KMeans " t0 = time() -#km = KMeans(init='k-means++', k=n_digits, n_init=10).fit(data) km = KMeans(init='k-means++', k=grid_width**2, n_init=10) km.fit(data) print "done in %0.3fs" % (time() - t0) diff --git a/scikits/learn/cluster/som_.py b/scikits/learn/cluster/som_.py index adb4dab8003db..3b9201b80e27a 100644 --- a/scikits/learn/cluster/som_.py +++ b/scikits/learn/cluster/som_.py @@ -134,12 +134,6 @@ def neurons_in_radius(self, winner, radius): v = np.sqrt((xx - wi)**2 + (yy - wj)**2) < radius return np.c_[np.nonzero(v)] - for i in range(self.neurons_.shape[0]): - for j in range(self.neurons_.shape[1]): - if math.sqrt((i - wi)**2 + (j - wj)**2) < radius: - r.append((i, j)) - return r - def radius_of_the_neighbordhood(self, iteration): l = self.n_iterations / self.size return self.size * math.exp(-iteration / l) From 690f47dc352616df4e66908ca9298f3869e3c131 Mon Sep 17 00:00:00 2001 From: Sebastien Campion Date: Fri, 4 Feb 2011 12:02:59 +0100 Subject: [PATCH 7/7] Added test, renamed calinski to pseudo_F and update few docs --- examples/cluster/plot_som_colormap.py | 3 +- examples/cluster/som_digits.py | 49 ++++++++++++------------- scikits/learn/cluster/__init__.py | 15 +++++++- scikits/learn/cluster/som_.py | 5 +-- scikits/learn/cluster/tests/test_som.py | 19 +++++++++- 5 files changed, 58 insertions(+), 33 deletions(-) diff --git a/examples/cluster/plot_som_colormap.py b/examples/cluster/plot_som_colormap.py index 434a508f47c85..19f326d156cea 100644 --- a/examples/cluster/plot_som_colormap.py +++ b/examples/cluster/plot_som_colormap.py @@ -3,7 +3,8 @@ A demo of SelfOrganisingMap with colored neurons =========================================================== -XXX : add description of example. +Example for SOM clustering using 3 dimensionals vectors (RGB) +with 8 colors (black, white, red, green, blue, yellow, cyan, magenta) """ print __doc__ diff --git a/examples/cluster/som_digits.py b/examples/cluster/som_digits.py index f9c82a5b9f2d6..db64fc892819d 100644 --- a/examples/cluster/som_digits.py +++ b/examples/cluster/som_digits.py @@ -3,8 +3,9 @@ A demo of Self-Organising Map and KMeans on the handwritten digits data ======================================================================= -XXX : Should add text to describe what the example and what to expect -from the output. Would it be possible to plot something? +Comparing various SOM and Kmeans clustering on the handwritten digits data +with the pseudo_F index + """ from __future__ import division print __doc__ @@ -14,35 +15,30 @@ from scikits.learn.cluster import KMeans from scikits.learn.cluster import SelfOrganizingMap -from scikits.learn.cluster import calinski_index - +from scikits.learn.cluster import pseudo_F from scikits.learn.datasets import load_digits from scikits.learn.preprocessing import scale - - -def display(labels, digits, n_clusters): - # XXX : n_clusters unused - r = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []} - for i, v in enumerate(labels): - r[digits.target[i]].append(v) - - for k, v in r.items(): - s = set(v) - print 'target %i | nb cluster %i |' % (k, len(s)), s - +from scikits.learn.metrics import confusion_matrix + np.random.seed(42) +################################################################################ +# Load dataset + digits = load_digits() data = scale(digits.data) - n_samples, n_features = data.shape n_digits = len(np.unique(digits.target)) -print "n_digits: %d" % n_digits -print "n_features: %d" % n_features -print "n_samples: %d" % n_samples +print "Digits dataset" +print "n_digits : %d" % n_digits +print "n_features : %d" % n_features +print "n_samples : %d" % n_samples print +################################################################################ +# Digits dataset clustering using Self-Organizing Map + print "Self-Organizing Map " t0 = time() grid_width = 4 @@ -52,11 +48,13 @@ def display(labels, digits, n_clusters): print "done in %0.3fs" % (time() - t0) print -display(som.labels_, digits, grid_width**2) -C = calinski_index(data, som.labels_, som.neurons_) -print 'calinski index %0.2f | %0.2f%%' % (C, 100 * (C / (1 + C))) +F = pseudo_F(data, som.labels_, som.neurons_) +print 'pseudo_F %0.2f | %0.2f%%' % (F, 100 * (F / (1 + F))) print +################################################################################ +# Digits dataset clustering using Kmeans + print "KMeans " t0 = time() km = KMeans(init='k-means++', k=grid_width**2, n_init=10) @@ -64,6 +62,5 @@ def display(labels, digits, n_clusters): print "done in %0.3fs" % (time() - t0) print -display(km.labels_, digits, n_digits) -C = calinski_index(data, km.labels_, km.cluster_centers_) -print 'calinski index %0.2f | %0.2f%%' % (C, 100 * (C / (1 + C))) +F = pseudo_F(data, km.labels_, km.cluster_centers_) +print 'pseudo_F %0.2f | %0.2f%%' % (F, 100 * (F / (1 + F))) diff --git a/scikits/learn/cluster/__init__.py b/scikits/learn/cluster/__init__.py index 1f2b9f90a0cc8..c0647f9685654 100644 --- a/scikits/learn/cluster/__init__.py +++ b/scikits/learn/cluster/__init__.py @@ -10,11 +10,22 @@ import numpy as np -def calinski_index(X,labels,centroids): +def pseudo_F(X, labels, centroids): + ''' + The pseudo F statistic : + + pseudo F = [( [(T - PG)/(G - 1)])/( [(PG)/(n - G)])] + + The pseudo F statistic was suggested by Calinski and Harabasz (1974) + + Calinski, T. and J. Harabasz. 1974. + A dendrite method for cluster analysis. Commun. Stat. 3: 1-27. + http://dx.doi.org/10.1080/03610927408827101 + ''' mean = np.mean(X,axis=0) B = np.sum([ (c - mean)**2 for c in centroids]) W = np.sum([ (x-centroids[labels[i]])**2 - for i,x in enumerate(X)]) + for i, x in enumerate(X)]) c = len(centroids) n = len(X) return (B /(c-1))/(W/ (n-c)) diff --git a/scikits/learn/cluster/som_.py b/scikits/learn/cluster/som_.py index 3b9201b80e27a..772588bd8c369 100644 --- a/scikits/learn/cluster/som_.py +++ b/scikits/learn/cluster/som_.py @@ -39,8 +39,8 @@ class SelfOrganizingMap(BaseEstimator): 'random': randomly points choosed - 'matrix': interpret the w parameter as a w by M array - of initial centroids. + 'matrix': interpret the size parameter as a size by M array + of initial neurons. Methods ------- @@ -81,7 +81,6 @@ def fit(self, X, **params): if self.init == 'random': self.neurons_ = np.random.rand(self.size, self.size, dim) elif self.init == 'matrix': - # XXX : untested assert len(self.size.shape) == 3 self.neurons_ = self.size self.size = self.neurons_.shape[0] diff --git a/scikits/learn/cluster/tests/test_som.py b/scikits/learn/cluster/tests/test_som.py index cc40acfb05896..080648f2643fb 100644 --- a/scikits/learn/cluster/tests/test_som.py +++ b/scikits/learn/cluster/tests/test_som.py @@ -9,7 +9,8 @@ from .common import generate_clustered_data n_clusters = 4 -X = generate_clustered_data(n_clusters=n_clusters, std=.1) +n_features = 2 +X = generate_clustered_data(n_clusters=n_clusters, n_features=2, std=.1) def test_som(): @@ -23,3 +24,19 @@ def test_som(): assert_equal(np.unique(labels[20:40]).shape[0], 1) assert_equal(np.unique(labels[40:60]).shape[0], 1) assert_equal(np.unique(labels[60:]).shape[0], 1) + +def test_som_init_matrix(): + np.random.seed(1) + random_ind = np.random.randint(0, X.shape[0], size=n_clusters) + init_map = X[random_ind].reshape(2,2,n_features) + + som = SelfOrganizingMap(size=init_map, init='matrix', + n_iterations=2000, learning_rate=0.1) + + som.fit(X) + labels = som.labels_ + assert_equal(np.unique(labels).shape[0], 4) + assert_equal(np.unique(labels[:20]).shape[0], 1) + assert_equal(np.unique(labels[20:40]).shape[0], 1) + assert_equal(np.unique(labels[40:60]).shape[0], 1) + assert_equal(np.unique(labels[60:]).shape[0], 1)